def test_Game_update_won(state, expected):
    # arrange
    game = Game()
    game.state = np.reshape(state, game.board_shape)
    # act
    game._update_won()
    # assert
    assert game.won == expected

def test_Game_determine_reward(won, expected):
    # arrange
    game = Game()
    game.won = won
    marker = 1
    # act
    reward = game.determine_reward(marker)
    # assert
    assert reward == expected

def test_NeuralPlayer_policy(net):
    game = Game()
    lr = 0.25
    agent = NeuralPlayer(net, lr)
    marker = 1
    state = np.reshape((0, 1, -1, 0, 1, 0, -1, 0, 0), game.board_shape)
    game.state = state
    move_values = agent._policy(marker, game)
    assert isinstance(move_values, list)

def test_NeuralPlayer_play(net):
    game = Game()
    lr = 0.25
    agent = NeuralPlayer(net, lr)
    marker = 1
    state = np.reshape((0, 1, -1, 0, 1, 0, -1, 0, 0), game.board_shape)
    game.state = state
    actions = state_to_actions(tuple(state.flatten()), game.ind_to_loc, game.empty_marker)
    loc = agent.play(marker, game)
    assert isinstance(loc, tuple)
    assert loc in actions

def test_Game_mark(loc, marker, expected):
    # arrange
    game = Game()
    prev_turn = 1
    game.turn = prev_turn
    game.state[1, 1] = -1
    prev_mark = game.state[loc[0], loc[1]]
    # act
    valid, _ = game.mark(loc, marker)
    expected_turn = int(marker * -1) if valid else prev_turn
    expected_mark = marker if valid else prev_mark
    # assert
    assert valid == expected
    assert game.turn == expected_turn
    assert game.state[loc[0], loc[1]] == expected_mark

def test_NeuralPlayer_equivalent_states_to_reward(net):
    game = Game()
    lr = 0.25
    agent = NeuralPlayer(net, lr)
    state = np.reshape((0, 1, -1, 0, 1, 0, -1, 0, 0), game.board_shape)
    equiv_states, equiv_transforms = agent._equivalent_states_to_reward(state)
    assert len(equiv_states) == len(equiv_transforms)

def test_NeuralPlayer_adjust_state_for_marker(net, marker, expected):
    game = Game()
    lr = 0.25
    agent = NeuralPlayer(net, lr)
    state = np.reshape((0, 1, -1, 0, 1, 0, -1, 0, 0), game.board_shape)
    expected_mod = np.reshape(expected, game.board_shape)
    state_mod = agent._adjust_state_for_marker(state, marker)
    assert (state_mod == expected_mod).all()

def test_NeuralPlayer_state_values(net):
    game = Game()
    lr = 0.25
    agent = NeuralPlayer(net, lr)
    state = np.reshape((0, 1, -1, 0, 1, 0, -1, 0, 0), game.board_shape)
    expected_len = game.board_shape[0] * game.board_shape[1]
    values = agent._state_values(state)
    assert isinstance(values, torch.Tensor)
    assert len(values) == expected_len

def test_TablePlayer_play(value_map):
    # arrange
    player = TablePlayer(value_map)
    marker = 1
    game = Game()
    # act
    loc = player.play(marker, game)
    # assert
    assert isinstance(loc, tuple)

def test_NeuralPlayer_reward_move(net):
    game = Game()
    lr = 0.25
    agent = NeuralPlayer(net, lr)
    state = np.reshape((0, 1, -1, 0, 1, 0, -1, 0, 0), game.board_shape)
    marker = 1
    move = (2, 1)
    reward = 1
    temp_disc = 1
    reward_mods = agent._reward_move(state, marker, move, reward, temp_disc, game.ind_to_loc)
    assert isinstance(reward_mods, list)

def test_one_hot_state():
    game = Game()
    state = np.reshape((0, 1, -1, 0, 1, 0, -1, 0, 0), game.board_shape)
    marker_order = [-1, 0, 1]
    expected_size = len(marker_order) * state.size
    expected_ohe = np.array([
        0, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 0, 0, 1, 0, 1, 0, 1, 1,
        0, 1, 0, 0, 1, 0, 0, 0, 0
    ], dtype=np.int8)
    ohe = one_hot_state(state, marker_order)
    assert ohe.size == expected_size
    assert (ohe == expected_ohe).all()

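# The expected array in test_one_hot_state pins down the encoding: for each
# marker in marker_order, the flattened board is compared against that marker
# and the resulting 0/1 indicator planes are concatenated. A minimal sketch
# consistent with that expectation (the name below is hypothetical; the
# project's real one_hot_state may be implemented differently):
def one_hot_state_sketch(state: np.ndarray, marker_order: list) -> np.ndarray:
    # one indicator plane per marker, flattened and concatenated in marker_order
    flat = state.flatten()
    planes = [(flat == m).astype(np.int8) for m in marker_order]
    return np.concatenate(planes)
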
def test_NeuralPlayer_process_state_reward(net):
    game = Game()
    lr = 0.25
    agent = NeuralPlayer(net, lr)
    state = np.reshape((0, 1, -1, 0, 1, 0, -1, 0, 0), game.board_shape)
    transform = {'func': None, 'args': {}}
    move = (2, 1)
    reward = 1
    temp_disc = 1
    equiv = False
    mod = agent._process_state_reward(state, transform, move, reward, temp_disc,
                                      equiv, game.ind_to_loc)
    assert isinstance(mod, ValueMod)

def test_NeuralPlayer_update_value_with_reward(net, value, reward):
    game = Game()
    lr = 0.25
    agent = NeuralPlayer(net, lr)
    temp_disc = 0.5
    updated = agent._update_value_with_reward(value, reward, lr, temp_disc)
    assert updated >= 0
    assert updated <= 1
    if reward == 0:
        assert updated == value
    elif reward > 0:
        assert updated > value
    elif reward < 0:
        assert updated < value

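# The assertions in test_NeuralPlayer_update_value_with_reward describe the
# update only by its properties: unchanged at zero reward, pushed up by a
# positive reward, pushed down by a negative one, and kept inside [0, 1].
# A hedged sketch of one rule with those properties (a move-toward-target
# update; the agent's actual formula is not shown here and may differ):
def update_value_with_reward_sketch(value: float, reward: float,
                                    learning_rate: float, temp_disc: float) -> float:
    if reward == 0:
        return value
    target = 1.0 if reward > 0 else 0.0  # treat wins as 1, losses as 0
    updated = value + learning_rate * temp_disc * (target - value)
    return min(max(updated, 0.0), 1.0)
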
def test_NeuralPlayer_process_reward_lose(net):
    game = Game()
    lr = 0.25
    agent = NeuralPlayer(net, lr)
    marker = 1
    agent.buffer = [
        MoveRecord(state=np.reshape((0, -1, -1, 0, 0, 1, 0, 0, 1), game.board_shape),
                   move=(1, 1), marker=marker)
    ]
    reward = -1
    reward_mods = agent.process_reward(reward, game.ind_to_loc)
    assert len(agent.reward_record) > 0

def test_NeuralPlayer_calc_target_values(net):
    game = Game()
    lr = 0.25
    agent = NeuralPlayer(net, lr)
    n_vals = game.board_shape[0] * game.board_shape[1]
    rand_vals = list(np.random.rand(n_vals, 1))
    values = torch.tensor(rand_vals, dtype=float)
    move_ind = 5
    valid_inds = [1, 2, 3]
    current = values[move_ind].item()
    updated = current * 1.1
    targets = agent._calc_target_values(values, current, updated, move_ind, valid_inds)
    assert np.isclose(torch.sum(targets).item(), 1)

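# test_NeuralPlayer_calc_target_values only checks that the returned targets
# sum to 1, so the construction below is a guess at one consistent recipe:
# keep the network's values, overwrite the played move with its updated value,
# zero out indices that are not legal moves, and renormalize. The project's
# _calc_target_values may distribute the probability mass differently; the
# `current` argument is kept only to mirror the signature and goes unused here.
def calc_target_values_sketch(values: torch.Tensor, current: float, updated: float,
                              move_ind: int, valid_inds: list) -> torch.Tensor:
    targets = values.detach().clone().flatten()
    keep = set(valid_inds) | {move_ind}
    mask = torch.tensor([1.0 if i in keep else 0.0 for i in range(len(targets))],
                        dtype=targets.dtype)
    targets = targets * mask
    targets[move_ind] = updated
    return targets / targets.sum()  # normalize so the targets sum to 1
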
def test_NeuralPlayer_process_reward_no_reward(net):
    game = Game()
    lr = 0.25
    agent = NeuralPlayer(net, lr)
    marker = 1
    agent.buffer = [
        MoveRecord(state=np.reshape((0, -1, -1, 0, 0, 1, 0, 0, 1), game.board_shape),
                   move=(0, 0), marker=marker),
        MoveRecord(state=np.reshape((1, -1, -1, 0, 0, 1, -1, 0, 1), game.board_shape),
                   move=(2, 1), marker=marker)
    ]
    reward = 0
    expected_mods = []
    reward_mods = agent.process_reward(reward, game.ind_to_loc)
    assert reward_mods == expected_mods

def initialize_value_map(init_val: float) -> dict:
    """Initialize a value map.

    Args:
        init_val: float, initial value

    Returns:
        init_value_map: dict, value map
    """
    prod_combs = product(Game.valid_markers + [Game.empty_marker],
                         repeat=Game.board_shape[0]**2)
    valid_combs = [pc for pc in prod_combs if abs(sum(pc)) < 2]

    # drop states that are duplicates under swapping the two markers
    non_dupes = []
    for vc in valid_combs:
        swap = tuple([elem * -1 for elem in vc])
        if swap not in non_dupes:
            non_dupes.append(vc)

    # drop states that are duplicates under rotation or reflection
    combs = []
    for nd in non_dupes:
        c_box = np.reshape(nd, Game.board_shape)
        rot90 = np.rot90(c_box)
        if tuple(rot90.flatten()) in combs:
            continue
        rot180 = np.rot90(c_box, k=2)
        if tuple(rot180.flatten()) in combs:
            continue
        rot270 = np.rot90(c_box, k=3)
        if tuple(rot270.flatten()) in combs:
            continue
        lr = np.fliplr(c_box)
        if tuple(lr.flatten()) in combs:
            continue
        ud = np.flipud(c_box)
        if tuple(ud.flatten()) in combs:
            continue
        combs.append(nd)

    # can't have more than one valid won state
    states = []
    for c in combs:
        game = Game()
        game.state = np.reshape(c, Game.board_shape)
        try:
            game._update_won()
            states.append(c)
        except ValueError:
            pass

    init_value_map = {
        s: {
            m: {
                a: init_val
                for a in state_to_actions(s, Game.ind_to_loc, Game.empty_marker)
            }
            for m in [-1, 1]
        }
        for s in states
    }

    for s in init_value_map:
        game = Game()
        game.state = np.reshape(s, game.board_shape)
        game._update_won()
        for m in init_value_map[s]:
            # won state: no actions, just reward value
            if game.won in game.valid_markers:
                init_value_map[s][m] = 1 if m == game.won else 0
            # full board: no actions, just initial value
            elif len(init_value_map[s][m]) == 0:
                init_value_map[s][m] = INITIAL_VALUE
            # cannot be marker's turn: no actions
            # NOTE: I don't explicitly reverse transform a marker swap,
            #     so can't assume markers will match
            # elif sum(s) == m:
            #     init_value_map[s][m] = {}

    return init_value_map

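# For reference, the nested comprehension above keys the map by flattened
# board state, then by marker, then by available action location. A small
# usage sketch (assumes this module's Game and state_to_actions are in scope;
# 0.5 is just an arbitrary initial value):
#
#     value_map = initialize_value_map(0.5)
#     empty_board = (0,) * 9
#     # every legal action from the empty board starts at the initial value
#     assert value_map[empty_board][1][(0, 0)] == 0.5
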
                previous=current, new=updated)
            reward_mods.append(mod)
        return reward_mods


if __name__ == '__main__':
    init_value_map = initialize_value_map(INITIAL_VALUE)

    agent = TablePlayer(init_value_map)
    competitor = TablePlayer(init_value_map)

    # train against a player who is learning how to beat you
    trains = []
    for _ in range(100000):
        game = Game()
        play_game(game, agent, competitor)
        trains.append(game.won)
    trains_mv = moving_value_frequencies(trains)
    plot_outcome_frequencies(
        trains_mv,
        order=[1, 0, -1],
        labels=['Agent Wins', 'Tie', 'Competitor Wins']
    )

    # test against a random player to see how much we've learned
    agent.explore = False
    agent.learning_rate = 0
    rando = TablePlayer(init_value_map)
    rando.learning_rate = 0
    tests = []

def net():
    return linear_net(Game())

            mods = self._reward_move(entry.state, entry.marker, entry.move,
                                     reward, temporal_discount, ind_to_loc)
            reward_mods.extend(mods)
            temporal_discount *= self.temporal_discount_rate
        self.reward_record = reward_mods


if __name__ == '__main__':
    lr = 0.25
    nn_lr = 1e-3
    temp_rate = 0.8
    layers = [2, 1, 0.5]  # [1]
    drop_prob = 0.0  # 0.05
    agent = NeuralPlayer(
        linear_net(Game(), hidden_layers=layers, drop_prob=drop_prob),
        lr=lr, temp_rate=temp_rate, nn_lr=nn_lr
    )
    competitor = NeuralPlayer(
        linear_net(Game(), hidden_layers=layers, drop_prob=drop_prob),
        lr=lr, temp_rate=temp_rate, nn_lr=nn_lr
    )
    rando = RandomPlayer()

    n = 50000
    outcomes = []