def wythoff_dqn2(epsilon=0.1,
                 gamma=0.5,
                 learning_rate=1e-6,
                 num_episodes=100,
                 batch_size=20,
                 memory_capacity=100,
                 game='Wythoff10x10',
                 network='DQN_xy1',
                 anneal=False,
                 tensorboard=None,
                 update_every=5,
                 double=False,
                 double_update=10,
                 save=False,
                 save_model=False,
                 monitor=None,
                 return_none=False,
                 debug=False,
                 device='cpu',
                 clip_grad=False,
                 progress=False,
                 zero=False,
                 seed=None):
    """Learn Wythoff's with a DQN; one network chooses every move.

    Each episode is one game. Every move is picked e-greedily from the
    Q values of ``player``; the resulting transitions are pushed into a
    replay memory and, once ``episode >= batch_size``, one call to
    ``train_dqn`` is made per episode.

    Parameters
    ----------
    epsilon : float
        Exploration rate handed to ``e_greedy``.
    gamma : float
        Discount, forwarded to ``train_dqn``.
    learning_rate : float
        Learning rate of the SGD optimizer.
    num_episodes, batch_size, memory_capacity, update_every : int-like
        Cast with ``int()`` on entry. ``update_every`` is the logging /
        monitoring interval in episodes.
    game : str
        Environment name given to ``create_env``.
    network : str or class
        Anything exposing ``forward`` is used as the model class itself;
        otherwise the name is looked up on ``azad.models``.
    anneal : bool
        If True, epsilon is scaled by ``1 / log(episode + e)``.
    tensorboard : str or None
        Directory for tensorboard logs and board images.
    double : bool
        Keep a target network, re-synced every ``double_update`` episodes.
    save : str or False
        Path prefix; the result is written to ``save + ".pytorch"``.
    save_model, zero
        Accepted but not used by this function.
    monitor : iterable of str or None
        Names of local variables whose ``float()`` value is recorded every
        ``update_every`` episodes (they are read through ``locals()``).
    return_none : bool
        Return None instead of the result dict.
    debug, progress : bool
        Print diagnostics.
    device : str
        Torch device for the networks.
    clip_grad : bool
        Forwarded to ``train_dqn``.
    seed : int or None
        Seeds the env and NumPy's global RNG.

    Returns
    -------
    dict or None
        ``{"player": state_dict, "score": score}`` plus ``"target"`` (when
        ``double``) and ``"monitored"`` (when monitoring without ``save``).
    """
    # ------------------------------------------------------------------------
    # Arg sanity
    num_episodes = int(num_episodes)
    batch_size = int(batch_size)
    memory_capacity = int(memory_capacity)
    update_every = int(update_every)

    # Logs...
    if tensorboard is not None:
        os.makedirs(tensorboard, exist_ok=True)
        writer = SummaryWriter(log_dir=tensorboard)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env...
    env = create_env(game, monitor=tensorboard is not None)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Init
    #
    # Scores
    score = 0
    total_reward = 0

    # Agents, etc
    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)

    # `network` is either a model class (anything with `forward`) or the
    # name of a model in azad.models.
    if hasattr(network, "forward"):
        Model = network
    else:
        Model = getattr(azad.models, network)

    player = Model().to(device)
    target = Model().to(device)
    if double:
        target.load_state_dict(player.state_dict())
        target.eval()
    else:
        target = None

    if debug:
        print("---------------------------------------")
        print("Setting up....")
        print(f">>> Device: {device}")
        print(f">>> Network is {player}")
        print(f">>> Memory capacity {memory_capacity} ({batch_size})")

    memory = ReplayMemory(memory_capacity)
    optimizer = optim.SGD(player.parameters(), learning_rate)
    moves = MoveCount(m, n)
    opts = OptimalCount(0)

    # NOTE: several locals below (total_reward, prediction_error, advantage,
    # ...) look unused but can be requested by name through `monitor`, which
    # reads them from locals(). Do not remove them.

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # Re-init
        transitions = []
        state = env.reset()
        x, y, board, available = state
        moves.update((x, y))

        if debug:
            print("---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            epsilon_e = epsilon

        # -------------------------------------------------------------------
        # Play a game
        steps = 1
        done = False
        while not done:
            # Choose a move
            Qs = build_Qs(player,
                          state,
                          available,
                          device=device,
                          mode="numpy")
            move_i = e_greedy(Qs, epsilon=epsilon_e, mode='numpy')
            move = available[move_i]
            moves.update(move)

            # Analyze it: the running score only changes on turns where an
            # optimal (cold) move was actually on offer.
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                score += (best - score) / (episode + 1)

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next

            # Track value statistics
            total_reward += reward
            Q = Qs[move_i]
            prediction_error = Qs.max() - Q
            advantage = Q - Qs[np.nonzero(Qs)].mean()

            # Save transitions, as tensors to be used at training time
            transitions.append([
                # S
                torch.tensor((x, y)).unsqueeze(0).unsqueeze(1).float(),
                # A
                torch.tensor(move).unsqueeze(0),
                # S'
                torch.tensor(
                    (x_next, y_next)).unsqueeze(0).unsqueeze(1).float(),
                # R
                torch.tensor([reward]).unsqueeze(0).float(),
            ])

            # -
            if debug:
                print(f">>> position: {(x, y)}")
                print(f">>> num available: {len(available)}")
                print(f">>> available: {available}")
                print(f">>> Qs (filtered): {Qs}")
                print(f">>> new position: ({x_next}, {y_next})")

            # Shift states
            state = deepcopy(state_next)
            board = deepcopy(board_next)
            available = deepcopy(available_next)
            x = deepcopy(x_next)
            y = deepcopy(y_next)
            steps += 1

        # ----------------------------------------------------------------
        # Learn from the game
        #
        # The second-to-last transition belongs to the loser: give it the
        # negated final reward.
        if steps > 2:
            transitions[-2][3] = transitions[-1][3] * -1

        # Update the memories using the transitions from this game
        for transition in transitions:
            memory.push(*transition)

        if debug:
            print(f">>> final transitions: {transitions[-2:]}")

        # Bypass if we don't have enough in memory to learn
        if episode < batch_size:
            continue

        # Learn, sampling a batch of transitions from memory
        player, loss = train_dqn(batch_size,
                                 player,
                                 memory,
                                 optimizer,
                                 device,
                                 target=target,
                                 gamma=gamma,
                                 clip_grad=clip_grad)

        # Update target net, if in double mode and time is right.
        if double and (episode % double_update == 0):
            target.load_state_dict(player.state_dict())

        # ----------------------------------------------------------------
        # Logs...
        if progress:
            print("---")
        if progress or debug:
            print(f">>> episode: {episode}")
            print(f">>> loss {loss}")
            print(f">>> Q(last,a): {Q}")
            print(f">>> epsilon: {epsilon_e}")
            print(f">>> score: {score}")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', reward, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('loss', loss, episode)
            writer.add_scalar('steps', steps, episode)
            writer.add_scalar('score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(cold,
                               vmin=0,
                               vmax=1,
                               path=tensorboard,
                               name='cold_board.png')
            writer.add_image('cold_positions',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'cold_board.png'))),
                             0,
                             dataformats='HWC')

            # Extract all value boards, and find extrema
            values = torch.zeros((len(all_possible_moves), m, n))
            for i, a in enumerate(all_possible_moves):
                sample_hat = np.asarray(create_board(a[0], a[1], m, n))
                sample_hat = torch.from_numpy(sample_hat)
                sample_hat = sample_hat.unsqueeze(0).unsqueeze(1).float()
                # The network lives on `device`; move the input there and
                # bring the output back to the CPU-side `values` buffer.
                sample_values = player(sample_hat.to(device)).detach().cpu()
                values[i, :, :] = sample_values.float().reshape(m, n)

            mean_values = torch.mean(values, 0)
            max_values, _ = torch.max(values, 0)
            min_values, _ = torch.min(values, 0)

            # Log
            writer.add_scalar('Q_mean', torch.mean(mean_values), episode)
            writer.add_scalar('Q_min', torch.mean(min_values), episode)
            writer.add_scalar('Q_max', torch.mean(max_values), episode)

            # Plot mean, max and min boards (in that order)
            board_plots = (
                ('mean player', 'player_mean_values.png', mean_values),
                ('max player', 'player_max_values.png', max_values),
                ('min player', 'player_min_values.png', min_values),
            )
            for image_tag, image_name, board_values in board_plots:
                plot_wythoff_board(board_values.numpy(),
                                   vmin=board_values.numpy().min(),
                                   vmax=board_values.numpy().max(),
                                   path=tensorboard,
                                   name=image_name)
                writer.add_image(image_tag,
                                 torch.from_numpy(
                                     skimage.io.imread(
                                         os.path.join(tensorboard,
                                                      image_name))),
                                 0,
                                 dataformats='HWC')

            # Plot move count
            plot_wythoff_board(moves.count,
                               vmax=moves.count.max() / 10,
                               vmin=0,
                               path=tensorboard,
                               name='moves.png')
            writer.add_image('moves',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard, 'moves.png'))),
                             0,
                             dataformats='HWC')

        if monitor and (int(episode) % update_every) == 0:
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if monitor and save:
        save_monitored(save, monitored)
    if tensorboard:
        writer.close()

    result = {"player": player.state_dict(), "score": score}
    if target is not None:
        result['target'] = target.state_dict()
    if save:
        torch.save(result, save + ".pytorch")
    if monitor and not save:
        result["monitored"] = monitored
    if return_none:
        result = None

    return result
def wythoff_dqn1(epsilon=0.1,
                 gamma=0.8,
                 learning_rate=0.1,
                 num_episodes=10,
                 batch_size=100,
                 memory_capacity=10000,
                 game='Wythoff10x10',
                 network='DQN',
                 anneal=False,
                 tensorboard=None,
                 update_every=5,
                 self_play=False,
                 save=False,
                 save_model=False,
                 monitor=None,
                 return_none=False,
                 debug=False,
                 progress=False,
                 seed=None):
    """Learn to play Wythoff's with two DQNs and e-greedy exploration.

    A ``player`` and an ``opponent`` network alternate moves (the player
    moves first). Each has its own optimizer and replay memory; with
    ``self_play`` both share one memory.

    Parameters
    ----------
    epsilon : float
        Exploration rate handed to ``e_greedy``.
    gamma : float
        Discount, forwarded to ``train_dqn``.
    learning_rate : float
        Learning rate of both Adam optimizers.
    num_episodes, batch_size, memory_capacity, update_every : int-like
        Cast with ``int()`` on entry. Training starts once
        ``episode >= batch_size``.
    game : str
        Environment name given to ``create_env``.
    network : {'DQN', 'DQN_mlp'}
        Which model class to build for both agents.
    anneal : bool
        If True, epsilon is scaled by ``1 / log(episode + e)``.
    tensorboard : str or None
        Directory for tensorboard logs and board images.
    self_play : bool
        Share one replay memory between player and opponent.
    save : str or False
        Path prefix used by ``save_model`` and ``monitor``.
    save_model : bool
        Save both networks to ``save + ".pytorch"``.
    monitor : iterable of str or None
        Names of local variables recorded (as floats, via ``locals()``)
        every ``update_every`` episodes and written with ``save_monitored``.
    return_none : bool
        Return None instead of the result tuple.
    debug, progress : bool
        Print diagnostics.
    seed : int or None
        Seeds the env and NumPy's global RNG.

    Returns
    -------
    tuple or None
        ``((player, opponent), (score / episode, total_reward))``.

    Raises
    ------
    ValueError
        If ``network`` is not one of the supported names.
    """
    # ------------------------------------------------------------------------
    # Init
    num_episodes = int(num_episodes)
    batch_size = int(batch_size)
    memory_capacity = int(memory_capacity)
    update_every = int(update_every)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Logs...
    if tensorboard is not None:
        os.makedirs(tensorboard, exist_ok=True)
        writer = SummaryWriter(log_dir=tensorboard)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env...
    env = create_env(game, monitor=tensorboard is not None)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Init
    #
    # Scores
    score = 0
    total_reward = 0

    # Agents, etc
    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)

    if network == 'DQN':
        player = DQN(m, n, num_actions=len(all_possible_moves))
        opponent = DQN(m, n, num_actions=len(all_possible_moves))
    elif network == 'DQN_mlp':
        player = DQN_mlp(m, n, num_actions=len(all_possible_moves))
        opponent = DQN_mlp(m, n, num_actions=len(all_possible_moves))
    else:
        raise ValueError("network must DQN or DQN_mlp")

    if debug:
        print("---------------------------------------")
        print("Setting up....")
        print(f">>> Network is {player}")
        print(f">>> Memory capacity {memory_capacity} ({batch_size})")

    player_memory = ReplayMemory(memory_capacity)
    opponent_memory = ReplayMemory(memory_capacity)
    if self_play:
        player_memory = opponent_memory

    player_optimizer = optim.Adam(player.parameters(), learning_rate)
    opponent_optimizer = optim.Adam(opponent.parameters(), learning_rate)

    moves = MoveCount(m, n)

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # Re-init
        #
        # Scores
        steps = 1
        done = False
        mover = 'opponent'  # This will shift to player on the first move.
        transitions = []

        # Worlds
        state = env.reset()
        x, y, board, available = state
        board = tuple(flatten_board(board))
        moves.update((x, y))

        if debug:
            print("---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            epsilon_e = epsilon

        # -------------------------------------------------------------------
        # Play a game
        while not done:
            # Choose a mover
            mover = shift_mover(mover)
            memory = shift_memory(mover, player_memory, opponent_memory)
            model = shift_model(mover, player, opponent)

            # Convert board to a model(state)
            state_hat = torch.from_numpy(np.array(board).reshape(m, n))
            state_hat = state_hat.unsqueeze(0).unsqueeze(1).float()

            # Get and filter Qs
            Qs = model(state_hat).float().detach()  # torch
            Qs = Qs.numpy().squeeze()

            mask = build_mask(available, m, n).flatten()
            Qs *= mask

            # Choose a move
            index = np.nonzero(mask)[0].tolist()
            move_i = e_greedy(Qs, epsilon=epsilon_e, index=index, mode='numpy')

            # Re-index move_i to match 'available' index
            move_a = index.index(move_i)
            move = available[move_a]

            # Analyze it...
            if move in locate_cold_moves(x, y, available):
                score += (1 - score) / episode

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next
            total_reward += reward

            # Save transitions, as tensors to be used at training time
            moves.update(move)

            state_hat_next = torch.from_numpy(
                np.array(board_next).reshape(m, n))
            state_hat_next = state_hat_next.unsqueeze(0).unsqueeze(1).float()

            transitions.append([
                state_hat.float(),
                torch.from_numpy(mask),
                torch.tensor(move_i),
                state_hat_next.float(),
                torch.tensor([reward]).unsqueeze(0).float()
            ])

            # Shift states
            state = deepcopy(state_next)
            board = deepcopy(board_next)
            available = deepcopy(available_next)
            x = deepcopy(x_next)
            y = deepcopy(y_next)
            steps += 1

            # -
            if debug:
                print(f">>> {mover}: {move}")
                print(f">>> new position: ({x_next}, {y_next})")

        # ----------------------------------------------------------------
        # Learn from the game
        #
        # The second-to-last transition belongs to the loser: give it the
        # negated final reward.
        if steps > 2:
            transitions[-2][4] = transitions[-1][4] * -1

        # Even-indexed transitions are the player's, odd-indexed ones the
        # opponent's. The mask is unpacked as `mask_t` so the board
        # coordinate `x` is no longer overwritten here.
        for s, mask_t, a, sn, r in transitions[0::2]:
            player_memory.push(s.to(device), mask_t.to(device), a.to(device),
                               sn.to(device), r.to(device))
        for s, mask_t, a, sn, r in transitions[1::2]:
            opponent_memory.push(s.to(device), mask_t.to(device),
                                 a.to(device), sn.to(device), r.to(device))

        # Bypass if we don't have enough in memory to learn
        if episode < batch_size:
            continue

        # Learn, sampling batches of transitions from memory
        player, player_loss = train_dqn(batch_size,
                                        player,
                                        player_memory,
                                        player_optimizer,
                                        device,
                                        gamma=gamma)
        opponent, opponent_loss = train_dqn(batch_size,
                                            opponent,
                                            opponent_memory,
                                            opponent_optimizer,
                                            device,
                                            gamma=gamma)

        # ----------------------------------------------------------------
        # Logs...
        if progress:
            print("---")
        if progress or debug:
            print(f">>> episode: {episode}")
            print(f">>> winner: {mover}")
            print(f">>> Q: {Qs}")
            print(f">>> max(Q): {Qs.max()}")
            print(f">>> min(Q): {Qs.min()}")
            print(f">>> stdev(Q): {Qs.std()}")
            print(
                f">>> loss (player: {player_loss}, opponent: {opponent_loss})")
            print(f">>> player score: {score}")
            print(f">>> epsilon: {epsilon_e}")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', reward, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('player_loss', player_loss, episode)
            writer.add_scalar('opponent_loss', opponent_loss, episode)
            writer.add_scalar('steps', steps, episode)
            writer.add_scalar('score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(cold,
                               vmin=0,
                               vmax=1,
                               path=tensorboard,
                               name='cold_board.png')
            writer.add_image('cold_positions',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'cold_board.png'))),
                             0,
                             dataformats='HWC')

            # Extract all value boards. Each board position must be fed to
            # the network itself; the previous code evaluated the last game
            # state (`state_hat`) for every position.
            values = torch.zeros((len(all_possible_moves), m, n))
            for i, a in enumerate(all_possible_moves):
                example = create_board(a[0], a[1], m, n)
                example_hat = torch.from_numpy(
                    np.asarray(example).reshape(m, n))
                example_hat = example_hat.unsqueeze(0).unsqueeze(1).float()
                values[i, :, :] = player(example_hat).detach().float().reshape(
                    m, n)

            mean_values = torch.mean(values, 0)

            # Log
            writer.add_scalar('Q_mean', torch.mean(mean_values), episode)

            # Plot mean
            plot_wythoff_board(mean_values.numpy(),
                               vmin=mean_values.numpy().min(),
                               vmax=mean_values.numpy().max(),
                               path=tensorboard,
                               name='player_mean_values.png')
            writer.add_image('mean player',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'player_mean_values.png'))),
                             0,
                             dataformats='HWC')

            # Plot move count
            plot_wythoff_board(moves.count,
                               vmax=moves.count.max() / 10,
                               vmin=0,
                               path=tensorboard,
                               name='moves.png')
            writer.add_image('moves',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard, 'moves.png'))),
                             0,
                             dataformats='HWC')

        if monitor and (int(episode) % update_every) == 0:
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if save_model:
        state = {
            'stumbler_player_dict': player,
            'stumbler_opponent_dict': opponent
        }
        torch.save(state, save + ".pytorch")
    if monitor:
        save_monitored(save, monitored)
    if tensorboard:
        writer.close()

    result = (player, opponent), (score / episode, total_reward)
    if return_none:
        result = None

    return result
def evaluate_dqn2(path,
                  game='Wythoff15x15',
                  num_episodes=100,
                  opponent='self',
                  model_name="player",
                  network='DQN_xy2',
                  monitor=None,
                  save=None,
                  debug=False,
                  return_none=False,
                  seed=None):
    """Evaluate transfer on a frozen DQN model.

    The model stored under ``model_name`` in the checkpoint at ``path`` is
    loaded onto the CPU and plays ``num_episodes`` games as player 0, who
    always moves first. Player 1 is chosen by ``opponent``.

    Parameters
    ----------
    path : str
        Checkpoint readable by ``torch.load``; must contain ``model_name``.
    game : str
        Environment name given to ``create_env``.
    num_episodes : int-like
        Number of games (cast with ``int()``).
    opponent : {'self', 'random', 'optimal', 'efficient'}
        'self' plays greedily from the same model; 'random' picks any
        available move; 'optimal' picks a random cold move when one exists
        (otherwise a random move); 'efficient' picks the cold move with the
        smallest coordinate sum (otherwise a random move).
    model_name : str
        Key of the state dict inside the checkpoint.
    network : str or class
        Anything exposing ``forward`` is used as the model class itself;
        otherwise the name is looked up on ``azad.models``.
    monitor : iterable of str or None
        Names of local variables recorded (as floats, via ``locals()``)
        after every game.
    save : str or None
        If given with ``monitor``, monitored values go to ``save_monitored``.
    debug : bool
        Print diagnostics.
    return_none : bool
        Return None instead of the result dict.
    seed : int or None
        Seeds the env, NumPy's global RNG and Python's ``random`` module
        (the latter drives the non-'self' opponents).

    Returns
    -------
    dict or None
        ``{"total_reward": ...}`` plus ``"monitored"`` when monitoring
        without ``save``.

    Raises
    ------
    ValueError
        If ``opponent`` is not one of the supported names.
    """
    # ------------------------------------------------------------------------
    # Arg sanity
    num_episodes = int(num_episodes)
    opponents = ('self', 'random', 'optimal', 'efficient')
    if opponent not in opponents:
        raise ValueError(f"opponent must be {opponents}")

    # Logs
    if monitor is not None:
        monitored = create_monitored(monitor)

    # Init
    total_reward = 0
    opts = OptimalCount(0)

    # Env
    env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)
    # The opponents below draw from Python's `random`, so it has to be
    # seeded as well for a seeded evaluation to be reproducible.
    random.seed(seed)

    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)

    # Load the final model to evaluate
    result = torch.load(path, map_location=torch.device('cpu'))
    state_dict = result[model_name]

    # `network` is either a model class (anything with `forward`) or the
    # name of a model in azad.models.
    if hasattr(network, "forward"):
        Model = network
    else:
        Model = getattr(azad.models, network)

    model = Model().to("cpu")
    model.load_state_dict(state_dict)

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # The model under evaluation (player 0) moves first
        player = 0

        # Init this game
        state = env.reset()
        x, y, board, available = state

        if debug:
            print("---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # ---
        # Play a game
        done = False
        steps = 0
        score = 0  # NOTE(review): reset every game, yet averaged by episode
        while not done:
            # Optimal moves
            colds = locate_cold_moves(x, y, available)

            # Build value array
            Qs = build_Qs(model, state, available, device="cpu", mode="numpy")

            # Choose a move, based on the player code and the opponent type.
            # Player 0 is always the final model we are evaluating.
            if player == 0 or opponent == 'self':
                move_i = e_greedy(Qs, epsilon=0, mode='numpy')
                move = available[move_i]
            elif opponent == 'random':
                move = random.choice(available)
            elif opponent == 'optimal':
                if len(colds) > 0:
                    move = random.choice(colds)
                else:
                    move = random.choice(available)
            elif opponent == 'efficient':
                if len(colds) > 0:
                    distances = [sum(c) for c in colds]
                    move_i = np.argmin(distances)
                    move = colds[move_i]
                else:
                    move = random.choice(available)

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next

            # Analyze it if it was the player
            best = 0.0
            if (player == 0) and cold_move_available(x, y, available):
                if move in colds:
                    best = 1.0
                score += (best - score) / (episode + 1)

            # -
            if debug:
                print(f">>> available: {available}")
                print(f">>> Qs: {Qs}")
                print(f">>> new position: ({x_next}, {y_next})")

            # Shift for next play? (the mover is left unchanged on the final
            # move, so `player` names the winner below)
            if not done:
                state = deepcopy(state_next)
                board = deepcopy(board_next)
                available = deepcopy(available_next)
                x = deepcopy(x_next)
                y = deepcopy(y_next)
                player = shift_player(player)

            steps += 1

            # Tabulate wins, and totals (p1)
            if done:
                winner = player
            if player == 0 and done:
                total_reward += reward

        # ---
        # Save into monitored after each game
        if monitor:
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # ---
    # Tournament over....
    # Build a result, focused on player
    result = {}
    result["total_reward"] = total_reward

    # Save? The end.
    if monitor and save is not None:
        save_monitored(save, monitored)
    if monitor and not save:
        result["monitored"] = monitored

    if return_none:
        return None
    else:
        return result
def wythoff_stumbler(num_episodes=10,
                     epsilon=0.1,
                     gamma=0.8,
                     learning_rate=0.1,
                     game='Wythoff10x10',
                     model=None,
                     opponent=None,
                     anneal=False,
                     bias_board=None,
                     influence=0.0,
                     score=0.0,
                     total_reward=0.0,
                     tensorboard=None,
                     update_every=5,
                     initial=0,
                     self_play=False,
                     save=False,
                     load_model=None,
                     save_model=False,
                     monitor=None,
                     return_none=False,
                     debug=False,
                     seed=None):
    """Learn to play Wythoff's w/ e-greedy random exploration.

    Note: Learning is based on a player-opponent joint action formalism
    and tabular Q-learning. ``model`` and ``opponent`` are dicts mapping a
    flattened board (tuple) to an array of Q values, one per available
    move; both are updated in place after every game.

    Parameters
    ----------
    num_episodes : int
        Number of games; episodes are numbered from ``initial``.
    epsilon : float
        Exploration rate handed to ``epsilon_greedy``.
    gamma, learning_rate : float
        Discount and step size of the tabular update.
    game : str
        Environment name given to ``create_env``.
    model, opponent : dict or None
        Existing Q tables to continue from; fresh ones when None.
    anneal : bool
        If True, epsilon is scaled by ``1 / log(episode + e)``.
    bias_board, influence
        Forwarded to ``add_bias_board`` when choosing a move.
    score, total_reward : float
        Starting values of the running statistics.
    tensorboard : str or None
        Directory for tensorboard logs and board images.
    update_every : int
        Logging / monitoring interval in episodes.
    initial : int
        Index of the first episode.
    self_play
        Accepted but not used by this function.
    save : str or False
        Path prefix used by ``save_model`` and ``monitor``.
    load_model : str or None
        If given, tables are replaced by ``load_stumbler``'s result.
    save_model : bool
        Save both tables to ``save + ".pytorch"``.
    monitor : iterable of str or None
        Names of local variables recorded (as floats, via ``locals()``)
        every ``update_every`` episodes and written with ``save_monitored``.
    return_none : bool
        Return None instead of the result tuple.
    debug : bool
        Print diagnostics.
    seed : int or None
        Seeds the env and NumPy's global RNG.

    Returns
    -------
    tuple or None
        ``((model, opponent), (score, total_reward))``.
    """
    # ------------------------------------------------------------------------
    # Init env
    if tensorboard is not None:
        os.makedirs(tensorboard, exist_ok=True)
        writer = SummaryWriter(log_dir=tensorboard)

    # Create env
    env = create_env(game, monitor=tensorboard is not None)
    env.seed(seed)
    np.random.seed(seed)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # ------------------------------------------------------------------------
    # Init Agents
    default_Q = 0.0
    m, n, board, available = peek(env)
    if model is None:
        model = {}
    if opponent is None:
        opponent = {}

    # Override from file?
    if load_model is not None:
        if debug:
            print(">>> Loadiing model/opponent from {}".format(load_model))
        model, opponent = load_stumbler(model, opponent, load_model)

    # ------------------------------------------------------------------------
    for episode in range(initial, initial + num_episodes):
        # Re-init
        steps = 1
        x, y, board, available = env.reset()
        board = tuple(flatten_board(board))

        if debug:
            print("---------------------------------------")
            print(">>> NEW GAME ({}).".format(episode))
            print(">>> Initial position ({}, {})".format(x, y))
            print(">>> Initial moves {}".format(available))
            print("---------------------------------------")

        # Game trace. t_state / t_available start with the initial position,
        # so entry i is the situation in which move i was chosen.
        t_state = [
            board,
        ]
        t_available = [available]
        t_move = []
        t_move_i = []
        t_reward = []

        # -------------------------------------------------------------------
        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            # Was `episode`, which used the episode index as the
            # exploration rate.
            epsilon_e = epsilon

        # -------------------------------------------------------------------
        # Play!
        done = False
        player_win = False
        while not done:
            # PLAYER CHOOSES A MOVE
            try:
                Qs_episode = add_bias_board(model[board], available,
                                            bias_board, influence)
                move_i = epsilon_greedy(
                    Qs_episode, epsilon=epsilon_e, mode='numpy')
            except KeyError:
                # Unseen board: create its Q row and move at random
                model[board] = np.ones(len(available)) * default_Q
                move_i = np.random.randint(0, len(available))

            move = available[move_i]

            # Analyze it...
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                score += (best - score) / (episode + 1)

            # PLAY THE MOVE
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            steps += 1

            # Log....
            if debug:
                print(">>> PLAYER move {}".format(move))

            t_state.append(board)
            t_move.append(move)
            t_available.append(available)
            t_move_i.append(move_i)
            t_reward.append(reward)

            if done:
                player_win = True

                # Pad the trace with the terminal entry so the i + 2
                # look-ahead in the learning loops stays in range.
                t_state.append(board)
                t_move.append(move)
                t_available.append(available)
                t_move_i.append(move_i)
                t_reward.append(reward)

            # ----------------------------------------------------------------
            if not done:
                # OPPONENT CHOOSES A MOVE
                try:
                    Qs_episode = add_bias_board(opponent[board], available,
                                                bias_board, influence)
                    move_i = epsilon_greedy(
                        Qs_episode, epsilon=epsilon_e, mode='numpy')
                except KeyError:
                    opponent[board] = np.ones(len(available)) * default_Q
                    move_i = np.random.randint(0, len(available))

                move = available[move_i]

                # PLAY THE MOVE
                (x, y, board, available), reward, done, _ = env.step(move)
                board = tuple(flatten_board(board))
                steps += 1

                # Log....
                if debug:
                    print(">>> OPPONENT move {}".format(move))

                t_state.append(board)
                t_move.append(move)
                t_available.append(available)
                t_move_i.append(move_i)
                t_reward.append(reward)

                if done:
                    # Same terminal padding as for the player
                    t_state.append(board)
                    t_move.append(move)
                    t_available.append(available)
                    t_move_i.append(move_i)
                    t_reward.append(reward)

        # ----------------------------------------------------------------
        # Learn by unrolling the last game...

        # PLAYER (model): even-indexed moves
        s_idx = np.arange(0, steps - 1, 2)
        for i in s_idx:
            # States and actions
            s = t_state[i]
            next_s = t_state[i + 2]
            m_i = t_move_i[i]

            # Value and reward
            Q = model[s][m_i]

            try:
                max_Q = model[next_s].max()
            except KeyError:
                # NOTE(review): the new row is sized from t_available[i],
                # not from the moves available in next_s — confirm intent.
                model[next_s] = np.ones(len(t_available[i])) * default_Q
                max_Q = model[next_s].max()

            if player_win:
                r = t_reward[i]
            else:
                r = -1 * t_reward[i + 1]

            # Update running reward total for player
            total_reward += r

            # Loss and learn
            next_Q = r + (gamma * max_Q)
            loss = next_Q - Q
            model[s][m_i] = Q + (learning_rate * loss)

        # OPPONENT: odd-indexed moves
        s_idx = np.arange(1, steps - 1, 2)
        for i in s_idx:
            # States and actions
            s = t_state[i]
            next_s = t_state[i + 2]
            m_i = t_move_i[i]

            # Value and reward
            Q = opponent[s][m_i]

            try:
                max_Q = opponent[next_s].max()
            except KeyError:
                opponent[next_s] = np.ones(len(t_available[i])) * default_Q
                max_Q = opponent[next_s].max()

            if not player_win:
                r = t_reward[i]
            else:
                r = -1 * t_reward[i + 1]

            # Loss and learn
            next_Q = r + (gamma * max_Q)
            loss = next_Q - Q
            opponent[s][m_i] = Q + (learning_rate * loss)

        # ----------------------------------------------------------------
        # Update the log
        if debug:
            print(">>> Reward {}; Loss(Q {}, next_Q {}) -> {}".format(
                r, Q, next_Q, loss))

            if done and (r > 0):
                print("*** WIN ***")
            if done and (r < 0):
                print("*** OPPONENT WIN ***")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', r, episode)
            writer.add_scalar('Q', Q, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('stumber_error', loss, episode)
            writer.add_scalar('stumber_steps', steps, episode)
            writer.add_scalar('stumbler_score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(
                cold, vmin=0, vmax=1, path=tensorboard, name='cold_board.png')
            writer.add_image(
                'cold_positions',
                skimage.io.imread(os.path.join(tensorboard, 'cold_board.png')))

            # Agent max(Q) boards
            values = expected_value(m, n, model)
            plot_wythoff_board(
                values, path=tensorboard, name='player_max_values.png')
            writer.add_image(
                'player',
                skimage.io.imread(
                    os.path.join(tensorboard, 'player_max_values.png')))

            values = expected_value(m, n, opponent)
            plot_wythoff_board(
                values, path=tensorboard, name='opponent_max_values.png')
            writer.add_image(
                'opponent',
                skimage.io.imread(
                    os.path.join(tensorboard, 'opponent_max_values.png')))

        if monitor and (int(episode) % update_every) == 0:
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if save_model:
        state = {
            'stumbler_player_dict': model,
            'stumbler_opponent_dict': opponent
        }
        torch.save(state, save + ".pytorch")

    if monitor:
        save_monitored(save, monitored)

    if tensorboard:
        writer.close()

    result = (model, opponent), (score, total_reward)
    if return_none:
        result = None

    return result
def evaluate_wythoff(stumbler=None,
                     strategist=None,
                     stumbler_game='Wythoff10x10',
                     strategist_game='Wythoff50x50',
                     random_stumbler=False,
                     load_model=None,
                     save=None,
                     return_none=False,
                     num_episodes=100,
                     debug=False):
    """Compare stumblers to strategists.

    A stumbler (tabular Q model) and a strategist (hot/cold bias board)
    take turns on the ``strategist_game`` board; the stumbler moves first.

    Parameters
    ----------
    stumbler : dict or None
        Maps a flattened ``stumbler_game`` board (tuple) to move values.
        None is treated as an empty table, i.e. the stumbler plays
        randomly.
    strategist : object or None
        Passed to ``create_bias_board``; with None an all-zero table the
        shape of the board is used.
    stumbler_game, strategist_game : str
        Environment names given to ``create_env``.
    random_stumbler : bool
        Replace every stumbler move with a uniformly random one.
    load_model : str or None
        If given, stumbler and strategist come from ``load_for_eval``.
    save : str or None
        File the (wins, stumbler_score, strategist_score) row is written to.
    return_none : bool
        Return None instead of the result tuple.
    num_episodes : int-like
        Number of games (cast with ``int()``).
    debug : bool
        Print diagnostics.

    Returns
    -------
    wins : float
        The fraction of games won by the strategist.
    stumbler_score : float
        Running score of the stumbler's cold-move choices.
    strategist_score : float
        Running score of the strategist's cold-move choices.
    """
    # ------------------------------------------------------------------------
    num_episodes = int(num_episodes)

    if load_model is not None:
        stumbler, _, strategist = load_for_eval(load_model)

    # With no table at all every lookup below must fall through to the
    # random-move branch; indexing None would raise TypeError instead.
    if stumbler is None:
        stumbler = {}

    # Init boards, etc
    # Strategist
    env = create_env(strategist_game, monitor=False)
    m, n, board, _ = peek(env)
    if strategist is not None:
        hot_cold_table = create_bias_board(m, n, strategist)
    else:
        hot_cold_table = np.zeros_like(board)

    # Stumbler
    o, p, _, _ = peek(create_env(stumbler_game, monitor=False))

    # ------------------------------------------------------------------------
    # A stumbler and a strategist take turns playing a (m,n) game of wythoffs
    wins = 0.0
    strategist_score = 0.0
    stumbler_score = 0.0
    for episode in range(num_episodes):
        # Start the game, and process the result
        x, y, board, available = env.reset()
        board = tuple(flatten_board(board))

        if debug:
            print("---------------------------------------")
            print(">>> NEW MODEL EVALUATION ({}).".format(episode))
            print(">>> Initial position ({}, {})".format(x, y))

        done = False
        while not done:
            # ----------------------------------------------------------------
            # STUMBLER
            if (x < o) and (y < p):
                # Inside the stumbler's own board size: use its table
                s_board = tuple(flatten_board(create_board(x, y, o, p)))
                s_available = create_moves(x, y)
                try:
                    values = stumbler[s_board]
                    move_i = epsilon_greedy(values, epsilon=0.0, mode='numpy')
                    move = s_available[move_i]
                except KeyError:
                    move_i = np.random.randint(0, len(s_available))
                    move = s_available[move_i]
            else:
                # Outside it: the stumbler has no knowledge, move at random
                s_available = available
                move_i = np.random.randint(0, len(s_available))
                move = s_available[move_i]

            # ----------------------------------------------------------------
            # RANDOM PLAYER
            if random_stumbler:
                move_i = np.random.randint(0, len(available))
                move = available[move_i]

            # Analyze the choice
            best = 0.0
            if cold_move_available(x, y, s_available):
                if move in locate_cold_moves(x, y, s_available):
                    best = 1.0
                stumbler_score += (best - stumbler_score) / (episode + 1)

            # Move
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            if debug:
                print(">>> STUMBLER move {}".format(move))

            if done:
                break

            # ----------------------------------------------------------------
            # STRATEGIST
            # Choose.
            hot_cold_move_values = [hot_cold_table[i, j] for i, j in available]
            move_i = epsilon_greedy(
                np.asarray(hot_cold_move_values), epsilon=0.0, mode='numpy')
            move = available[move_i]

            if debug:
                print(">>> STRATEGIST move {}".format(move))

            # Analyze the choice
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                strategist_score += (best - strategist_score) / (episode + 1)

            # Make a move
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            if done:
                wins += 1.0
                break

    if debug:
        print("Wins {}, Scores ({}, {})".format(wins, stumbler_score,
                                                strategist_score))

    if save is not None:
        # Note the file stores the raw win count, not the fraction
        np.savetxt(
            save,
            np.asarray([wins, stumbler_score, strategist_score]).reshape(1, 3),
            fmt='%.1f,%.4f,%.4f',
            comments="",
            header="wins,stumbler_score,strategist_score")

    result = (wins / num_episodes), stumbler_score, strategist_score
    if return_none:
        result = None

    return result
def wythoff_mcts(num_episodes=10,
                 num_simulations=10,
                 c=1.41,
                 game='Wythoff10x10',
                 device="cpu",
                 tensorboard=None,
                 update_every=5,
                 monitor=None,
                 use_history=False,
                 save=None,
                 debug=False,
                 seed=None):
    """Learn to play Wythoff's, using MCTS.

    Parameters
    ----------
    num_episodes, num_simulations : int-like
        Number of games, and simulations per ``run_mcts`` call (both cast
        with ``int()``).
    c : float
        Forwarded to ``run_mcts``.
    game : str
        Environment name given to ``create_env``.
    device : str
        Only reported in debug output here.
    tensorboard : str or None
        Not supported; anything but None raises.
    update_every : int
        Monitoring interval in episodes.
    monitor : iterable of str or None
        Names of local variables recorded (as floats, via ``locals()``)
        every ``update_every`` episodes and written with ``save_monitored``.
    use_history : bool
        Start each search from the tree stored for the current position in
        the history, when there is one.
    save : str or None
        Path prefix; the result is written to ``save + ".pkl"``.
    debug : bool
        Print diagnostics.
    seed : int or None
        Seeds the env and NumPy's global RNG.

    Returns
    -------
    dict or None
        ``dict(mcts=history, score=score)`` when ``save`` is None; nothing
        is returned when the result is saved instead.

    Raises
    ------
    NotImplementedError
        If ``tensorboard`` is not None.
    """
    # ------------------------------------------------------------------------
    # Logs...
    if tensorboard is not None:
        raise NotImplementedError()

    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env... (tensorboard is always None past the guard above, so the env
    # is never created with monitoring on)
    env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Init
    num_episodes = int(num_episodes)
    num_simulations = int(num_simulations)

    score = 0
    m, n = env.m, env.n
    moves = MoveCount(m, n)
    opts = OptimalCount(0)
    history = HistoryMCTS()
    mcts = None

    if debug:
        print(">>> Setting up....")
        print(f">>> Device: {device}")

    # ------------------------------------------------------------------------
    # Train!
    for episode in range(num_episodes):
        # Choose player 0 or 1 to start
        player = int(np.random.binomial(1, 0.5))

        # Restart the world
        state = env.reset()
        x, y, board, available = state
        moves.update((x, y))

        # Restart vars
        winner = None
        done = False

        # The root should eventually be linked to all possible starting
        # configurations.
        if debug:
            print("---")
            print(f">>> New game {episode} - ({env.x},{env.y})")

        # --------------------------------------------------------------------
        # Play a game.
        step = 0
        while not done:
            # Use MCTS to choose a move
            mcts = None
            if use_history and ((x, y) in history):
                mcts = history.get((x, y))
                if debug:
                    print(f">>> {step}. using mcts history")

            move, mcts = run_mcts(player,
                                  env,
                                  num_simulations=num_simulations,
                                  c=c,
                                  default_policy=random_policy,
                                  mcts=mcts)

            # Play it.
            state, reward, done, info = env.step(move)

            # Analyze it. (x, y, available) still describe the position the
            # move was chosen from; they are shifted at the end of the loop.
            colds = locate_cold_moves(x, y, available)
            if len(colds) > 0:
                if move in colds:
                    opts.increase()
                else:
                    opts.decrease()
                score = opts.score()

            # -
            if debug:
                print(f">>> {step}. player: {player}")
                print(f">>> {step}. moves: {available}")
                print(f">>> {step}. cold moves: {colds}")
                print(f">>> {step}. move: ({move})")
                print(f">>> {step}. score: {score}")

            # Log history (only improved models get stored in the history)
            history.add((x, y), score, mcts)
            # NOTE(review): this counts the pre-move position, whereas the
            # DQN learners in this file count `move` — confirm intent.
            moves.update((x, y))
            step += 1

            # Shift state for next iterations
            x, y, board, available = state
            player = shift_player(player)

        # --------------------------------------------------------------------
        # Log results
        if monitor and (int(episode) % update_every) == 0:
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # ------------------------------------------------------------------------
    if monitor:
        save_monitored(save, monitored)

    result = dict(mcts=history, score=score)
    if save is not None:
        save_checkpoint(result, filename=save + ".pkl")
    else:
        return result
def wythoff_alphazero(num_episodes=10,
                      batch_size=100,
                      c=1.41,
                      game='Wythoff15x15',
                      learning_rate=1e-3,
                      device="cpu",
                      network_type='ResNet',
                      max_size=15,
                      tensorboard=None,
                      update_every=5,
                      monitor=None,
                      use_history=False,
                      save=None,
                      debug=False,
                      seed=None):
    """Learn to play Wythoff's, using a network-guided (AlphaZero-style)
    tree search.

    Moves are chosen with ``run_alphazero``; after every game the network
    is updated with ``train_resnet`` on transitions kept in a replay memory.

    Params
    ------
    num_episodes : int
        Number of games to play (cast with ``int``).
    batch_size : int
        Forwarded to ``train_resnet``.
    c : float
        Forwarded to ``run_alphazero`` as ``c`` (presumably the exploration
        constant -- confirm against ``run_alphazero``).
    game : str
        Environment name handed to ``create_env``.
    learning_rate : float
        Learning rate of the Adam optimizer.
    device : str
        Device the network and the stored tensors are placed on.
    network_type : {'ResNet', 'MLP'}
        Which network class to build.
    max_size : int
        Passed to the network constructor as ``board_size``.
    tensorboard : None
        Must be None; any other value raises ``NotImplementedError``.
    update_every : int
        Monitored values are recorded on episodes where
        ``episode % update_every == 0``.
    monitor : list of str, optional
        Names of *local variables of this function* (e.g. ``'score'``,
        ``'loss'``) to record. Each value is converted with ``float``.
    use_history : bool
        Accepted for signature parity with ``wythoff_mcts``; not used here.
    save : str, optional
        Path prefix. When given, the result is written with
        ``save_checkpoint`` to ``save + ".pkl"`` and nothing is returned.
    debug : bool
        Print progress information.
    seed : int, optional
        Passed to ``env.seed`` and ``np.random.seed``.

    Returns
    -------
    dict or None
        ``dict(mcts=history, network=network, loss=loss, score=score)``
        when ``save`` is None, otherwise None. ``loss`` is None if no
        episode was played.

    Raises
    ------
    NotImplementedError
        If ``tensorboard`` is not None.
    ValueError
        If ``network_type`` is neither 'ResNet' nor 'MLP'.
    """

    # ------------------------------------------------------------------------
    # Logs...
    if tensorboard is not None:
        raise NotImplementedError()

    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env...
    # tensorboard is always None past the guard above, so the env is never
    # created with monitor=True.
    env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Init
    num_episodes = int(num_episodes)
    score = 0
    # Bound up front so the result can be built even when no episode runs.
    loss = None

    # Play logs
    m, n = env.m, env.n
    moves = MoveCount(m, n)
    opts = OptimalCount(0)
    history = HistoryMCTS()

    # Network learning
    # Integer capacity, matching how the DQN learner sizes its memory.
    memory = ReplayMemory(1000)
    if network_type == 'ResNet':
        network = ResNet(board_size=max_size).to(device)
    elif network_type == 'MLP':
        network = MLP(board_size=max_size).to(device)
    else:
        raise ValueError("Argument network_type must be ResNet or MLP")
    optimizer = optim.Adam(network.parameters(), lr=learning_rate)

    if debug:
        print(f">>> Setting up....")
        print(f">>> Device: {device}")

    # ------------------------------------------------------------------------
    # Train!
    for episode in range(num_episodes):
        # Choose player 0 or 1 to start
        player = int(np.random.binomial(1, 0.5))

        # Restart the world
        state = env.reset()
        x, y, board, available = state
        moves.update((x, y))

        if debug:
            print("---")
            print(f">>> Game {episode} - ({env.x},{env.y})")

        # --------------------------------------------------------------------
        # Play a game.
        done = False
        step = 0
        while not done:
            move, mcts = run_alphazero(player,
                                       env,
                                       network,
                                       c=c,
                                       default_policy=random_policy,
                                       device=device)

            # Play it.
            state, reward, done, info = env.step(move)

            # Analyze it. (x, y, available) still describe the position
            # the move was chosen from; they are shifted further below.
            colds = locate_cold_moves(x, y, available)
            if len(colds) > 0:
                if move in colds:
                    opts.increase()
                else:
                    opts.decrease()
                score = opts.score()

            # -
            if debug:
                print(f">>> {step}. player: {player}")
                print(f">>> {step}. moves: {available}")
                print(f">>> {step}. cold moves: {colds}")
                print(f">>> {step}. move: ({move})")
                print(f">>> {step}. score: {score}")

            # Log history (only improved models get stored in the history)
            history.add((x, y), score, mcts)
            moves.update((x, y))
            step += 1

            # Update the memory with, in order: the index of the chosen
            # move in network.all_moves, the board built for (x, y), the
            # chosen child's share of the root's visit counts, and the
            # chosen child's value per visit.
            N = sum(node.count for node in mcts.root.children)
            loc = mcts.root.child_names.index(move)
            node = mcts.root.children[loc]
            memory.push(
                torch.tensor([network.all_moves.index(move)],
                             device=device).unsqueeze(0).int(),
                torch.tensor(create_board(x, y, m, n), device=device).float(),
                torch.tensor([node.count / N], device=device).unsqueeze(0),
                torch.tensor([node.value / node.count],
                             device=device).unsqueeze(0),
            )

            # Shift state for next iterations
            x, y, board, available = state
            player = shift_player(player)

        # --------------------------------------------------------------------
        # train the value resnet
        network, loss = train_resnet(network,
                                     memory,
                                     optimizer,
                                     batch_size=batch_size,
                                     clip_grad=False)
        if debug:
            print(f">>> Traning the resnet. Loss: {loss}")

        # --------------------------------------------------------------------
        # Log results
        if monitor and (int(episode) % update_every) == 0:
            # Monitored names are looked up among this function's locals,
            # so local variable names here are part of the interface.
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # ------------------------------------------------------------------------
    if monitor:
        save_monitored(save, monitored)

    result = dict(mcts=history, network=network, loss=loss, score=score)
    if save is None:
        return result

    save_checkpoint(result, filename=save + ".pkl")
    return None