def __init__(self, m, n, hot_value=-1, cold_value=1):
    self.m = int(m)
    self.n = int(n)
    self.hot_value = float(hot_value)
    self.cold_value = float(cold_value)
    self.board = create_cold_board(self.m, self.n,
                                   cold_value=cold_value,
                                   default=hot_value)
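
# Usage sketch (illustrative; the class owning the __init__ above is defined
# outside this excerpt): build the same reference board directly, using the
# constructor's default hot/cold values.
def _example_cold_board(m=10, n=10):
    return create_cold_board(m, n, cold_value=1, default=-1)
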
def wythoff_dqn1(epsilon=0.1,
                 gamma=0.8,
                 learning_rate=0.1,
                 num_episodes=10,
                 batch_size=100,
                 memory_capacity=10000,
                 game='Wythoff10x10',
                 network='DQN',
                 anneal=False,
                 tensorboard=None,
                 update_every=5,
                 self_play=False,
                 save=False,
                 save_model=False,
                 monitor=None,
                 return_none=False,
                 debug=False,
                 progress=False,
                 seed=None):
    """Learn to play Wythoff's w/ e-greedy random exploration.

    Note: Learning is based on a player-opponent joint action formalism
    and deep Q-learning (DQN).
    """
    # ------------------------------------------------------------------------
    # Init
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Logs...
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)
    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env...
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Scores
    score = 0
    total_reward = 0

    # Agents, etc
    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)
    if network == 'DQN':
        player = DQN(m, n, num_actions=len(all_possible_moves))
        opponent = DQN(m, n, num_actions=len(all_possible_moves))
    elif network == 'DQN_mlp':
        player = DQN_mlp(m, n, num_actions=len(all_possible_moves))
        opponent = DQN_mlp(m, n, num_actions=len(all_possible_moves))
    else:
        raise ValueError("network must be DQN or DQN_mlp")

    if debug:
        print("---------------------------------------")
        print("Setting up....")
        print(f">>> Network is {player}")
        print(f">>> Memory capacity {memory_capacity} ({batch_size})")

    player_memory = ReplayMemory(memory_capacity)
    opponent_memory = ReplayMemory(memory_capacity)
    if self_play:
        player_memory = opponent_memory

    player_optimizer = optim.Adam(player.parameters(), learning_rate)
    opponent_optimizer = optim.Adam(opponent.parameters(), learning_rate)
    moves = MoveCount(m, n)

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # Re-init
        steps = 1
        done = False
        mover = 'opponent'  # This will shift to player on the first move.
        transitions = []

        # Worlds
        state = env.reset()
        x, y, board, available = state
        board = tuple(flatten_board(board))
        moves.update((x, y))
        if debug:
            print("---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            epsilon_e = epsilon

        # ---------------------------------------------------------------------
        # Play a game
        while not done:
            # Choose a mover
            mover = shift_mover(mover)
            memory = shift_memory(mover, player_memory, opponent_memory)
            model = shift_model(mover, player, opponent)

            # Convert board to a model(state)
            state_hat = torch.from_numpy(np.array(board).reshape(m, n))
            state_hat = state_hat.unsqueeze(0).unsqueeze(1).float()

            # Get and filter Qs
            Qs = model(state_hat).float().detach()  # torch
            Qs = Qs.numpy().squeeze()
            mask = build_mask(available, m, n).flatten()
            Qs *= mask

            # Choose a move
            index = np.nonzero(mask)[0].tolist()
            move_i = e_greedy(Qs, epsilon=epsilon_e, index=index, mode='numpy')

            # Re-index move_i to match the 'available' index
            move_a = index.index(move_i)
            move = available[move_a]

            # Analyze it...
            if move in locate_cold_moves(x, y, available):
                score += (1 - score) / episode

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next
            total_reward += reward

            # Save transitions, as tensors to be used at training time
            moves.update(move)
            state_hat_next = torch.from_numpy(np.array(board_next).reshape(m, n))
            state_hat_next = state_hat_next.unsqueeze(0).unsqueeze(1).float()
            transitions.append([
                state_hat.float(),
                torch.from_numpy(mask),
                torch.tensor(move_i),
                state_hat_next.float(),
                torch.tensor([reward]).unsqueeze(0).float()
            ])

            # Shift states
            state = deepcopy(state_next)
            board = deepcopy(board_next)
            available = deepcopy(available_next)
            x = deepcopy(x_next)
            y = deepcopy(y_next)
            steps += 1

            if debug:
                print(f">>> {mover}: {move}")
                print(f">>> new position: ({x_next}, {y_next})")

        # ---------------------------------------------------------------------
        # Learn from the game
        #
        # Find the loser's transition and update its reward w/ -reward
        if steps > 2:
            transitions[-2][4] = transitions[-1][4] * -1

        # Update the memories using the transitions from this game
        for i in range(0, len(transitions), 2):
            s, x, a, sn, r = transitions[i]
            player_memory.push(s.to(device), x.to(device), a.to(device),
                               sn.to(device), r.to(device))
        for i in range(1, len(transitions), 2):
            s, x, a, sn, r = transitions[i]
            opponent_memory.push(s.to(device), x.to(device), a.to(device),
                                 sn.to(device), r.to(device))

        # Bypass if we don't have enough in memory to learn
        if episode < batch_size:
            continue

        # Learn, sampling batches of transitions from memory
        player, player_loss = train_dqn(batch_size, player, player_memory,
                                        player_optimizer, device, gamma=gamma)
        opponent, opponent_loss = train_dqn(batch_size, opponent,
                                            opponent_memory,
                                            opponent_optimizer, device,
                                            gamma=gamma)

        # ---------------------------------------------------------------------
        # Logs...
        if progress:
            print("---")
        if progress or debug:
            print(f">>> episode: {episode}")
            print(f">>> winner: {mover}")
        if debug or progress:
            print(f">>> Q: {Qs}")
            print(f">>> max(Q): {Qs.max()}")
            print(f">>> min(Q): {Qs.min()}")
            print(f">>> stdev(Q): {Qs.std()}")
            print(f">>> loss (player: {player_loss}, opponent: {opponent_loss})")
            print(f">>> player score: {score}")
            print(f">>> epsilon: {epsilon_e}")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', reward, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('player_loss', player_loss, episode)
            writer.add_scalar('opponent_loss', opponent_loss, episode)
            writer.add_scalar('steps', steps, episode)
            writer.add_scalar('score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(cold, vmin=0, vmax=1, path=tensorboard,
                               name='cold_board.png')
            writer.add_image(
                'cold_positions',
                torch.from_numpy(
                    skimage.io.imread(
                        os.path.join(tensorboard, 'cold_board.png'))),
                0,
                dataformats='HWC')

            # Extract all value boards, and find extrema
            values = torch.zeros((len(all_possible_moves), m, n))
            for i, a in enumerate(all_possible_moves):
                example = np.asarray(create_board(a[0], a[1], m, n))
                example_hat = torch.from_numpy(example)
                example_hat = example_hat.unsqueeze(0).unsqueeze(1).float()
                values[i, :, :] = player(example_hat).detach().float().reshape(
                    m, n)
            mean_values = torch.mean(values, 0)
            # max_values, _ = torch.max(values, 0)
            # min_values, _ = torch.min(values, 0)

            # Log
            writer.add_scalar('Q_mean', torch.mean(mean_values), episode)

            # Plot mean
            plot_wythoff_board(mean_values.numpy(),
                               vmin=mean_values.numpy().min(),
                               vmax=mean_values.numpy().max(),
                               path=tensorboard,
                               name='player_mean_values.png')
            writer.add_image(
                'mean player',
                torch.from_numpy(
                    skimage.io.imread(
                        os.path.join(tensorboard, 'player_mean_values.png'))),
                0,
                dataformats='HWC')

            # Plot move count
            plot_wythoff_board(moves.count, vmax=moves.count.max() / 10,
                               vmin=0, path=tensorboard, name='moves.png')
            writer.add_image(
                'moves',
                torch.from_numpy(
                    skimage.io.imread(os.path.join(tensorboard, 'moves.png'))),
                0,
                dataformats='HWC')

        if monitor and (int(episode) % update_every) == 0:
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if save_model:
        state = {
            'stumbler_player_dict': player,
            'stumbler_opponent_dict': opponent
        }
        torch.save(state, save + ".pytorch")
    if monitor:
        save_monitored(save, monitored)
    if tensorboard:
        writer.close()

    result = (player, opponent), (score / episode, total_reward)
    if return_none:
        result = None
    return result
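
# Usage sketch (illustrative, not part of the original module): a short
# wythoff_dqn1 run on the 10x10 board with the MLP variant. The episode,
# batch, and seed values below are assumptions chosen only to keep the
# example quick; the unpacking matches the function's return value.
def _example_wythoff_dqn1(seed=42):
    (player, opponent), (score, total_reward) = wythoff_dqn1(
        num_episodes=200,
        batch_size=20,
        memory_capacity=1000,
        game='Wythoff10x10',
        network='DQN_mlp',
        seed=seed)
    return score, total_reward
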
def wythoff_dqn2(epsilon=0.1,
                 gamma=0.5,
                 learning_rate=1e-6,
                 num_episodes=100,
                 batch_size=20,
                 memory_capacity=100,
                 game='Wythoff10x10',
                 network='DQN_xy1',
                 anneal=False,
                 tensorboard=None,
                 update_every=5,
                 double=False,
                 double_update=10,
                 save=False,
                 save_model=False,
                 monitor=None,
                 return_none=False,
                 debug=False,
                 device='cpu',
                 clip_grad=False,
                 progress=False,
                 zero=False,
                 seed=None):
    """Learn Wythoff's, with a DQN."""
    # ------------------------------------------------------------------------
    # Init
    num_episodes = int(num_episodes)
    batch_size = int(batch_size)
    memory_capacity = int(memory_capacity)
    update_every = int(update_every)

    # Logs...
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)
    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env...
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Scores
    score = 0
    total_reward = 0

    # Agents, etc
    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)

    # Is network an nn.Module, or the name of an azad model?
    if hasattr(network, "forward"):
        Model = network
    else:
        Model = getattr(azad.models, network)
    player = Model().to(device)
    target = Model().to(device)
    if double:
        target.load_state_dict(player.state_dict())
        target.eval()
    else:
        target = None

    if debug:
        print("---------------------------------------")
        print("Setting up....")
        print(f">>> Device: {device}")
        print(f">>> Network is {player}")
        print(f">>> Memory capacity {memory_capacity} ({batch_size})")

    memory = ReplayMemory(memory_capacity)
    # optimizer = optim.Adam(player.parameters(), learning_rate)
    optimizer = optim.SGD(player.parameters(), learning_rate)
    moves = MoveCount(m, n)
    opts = OptimalCount(0)

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # Re-init
        transitions = []
        state = env.reset()
        x, y, board, available = state
        moves.update((x, y))
        if debug:
            print("---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            epsilon_e = epsilon

        # ---------------------------------------------------------------------
        # Play a game
        steps = 1
        done = False
        while not done:
            # Choose a move
            Qs = build_Qs(player, state, available, device=device, mode="numpy")
            move_i = e_greedy(Qs, epsilon=epsilon_e, mode='numpy')
            move = available[move_i]
            moves.update(move)

            # Analyze it...
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                score += (best - score) / (episode + 1)

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next

            # Track value statistics
            total_reward += reward
            Q = Qs[move_i]
            prediction_error = Qs.max() - Q
            advantage = Q - Qs[np.nonzero(Qs)].mean()

            # Save transitions, as tensors to be used at training time
            # (onto the GPU)
            transitions.append([
                # S
                torch.tensor((x, y)).unsqueeze(0).unsqueeze(1).float(),
                # A
                torch.tensor(move).unsqueeze(0),
                # S'
                torch.tensor((x_next, y_next)).unsqueeze(0).unsqueeze(1).float(),
                # R
                torch.tensor([reward]).unsqueeze(0).float(),
            ])

            if debug:
                print(f">>> position: {(x, y)}")
                print(f">>> num available: {len(available)}")
                print(f">>> available: {available}")
                print(f">>> Qs (filtered): {Qs}")
                print(f">>> new position: ({x_next}, {y_next})")

            # Shift states
            state = deepcopy(state_next)
            board = deepcopy(board_next)
            available = deepcopy(available_next)
            x = deepcopy(x_next)
            y = deepcopy(y_next)
            steps += 1

        # ---------------------------------------------------------------------
        # Learn from the game
        #
        # Find the loser's transition and update its reward w/ -reward
        if steps > 2:
            transitions[-2][3] = transitions[-1][3] * -1

        # Update the memory using the transitions from this game
        for i in range(0, len(transitions)):
            memory.push(*transitions[i])
        if debug:
            print(f">>> final transitions: {transitions[-2:]}")

        # Bypass if we don't have enough in memory to learn
        if episode < batch_size:
            continue

        # Learn, sampling a batch of transitions from memory
        player, loss = train_dqn(batch_size, player, memory, optimizer, device,
                                 target=target, gamma=gamma,
                                 clip_grad=clip_grad)

        # Update the target net, if in double mode and the time is right.
        if double and (episode % double_update == 0):
            target.load_state_dict(player.state_dict())

        # ---------------------------------------------------------------------
        # Logs...
        if progress:
            print("---")
        if progress or debug:
            print(f">>> episode: {episode}")
        if debug or progress:
            print(f">>> loss {loss}")
            print(f">>> Q(last,a): {Q}")
            print(f">>> epsilon: {epsilon_e}")
            print(f">>> score: {score}")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', reward, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('loss', loss, episode)
            writer.add_scalar('steps', steps, episode)
            writer.add_scalar('score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(cold, vmin=0, vmax=1, path=tensorboard,
                               name='cold_board.png')
            writer.add_image(
                'cold_positions',
                torch.from_numpy(
                    skimage.io.imread(
                        os.path.join(tensorboard, 'cold_board.png'))),
                0,
                dataformats='HWC')

            # Extract all value boards, and find extrema
            values = torch.zeros((len(all_possible_moves), m, n))
            for i, a in enumerate(all_possible_moves):
                sample_hat = np.asarray(create_board(a[0], a[1], m, n))
                sample_hat = torch.from_numpy(sample_hat)
                sample_hat = sample_hat.unsqueeze(0).unsqueeze(1).float()
                values[i, :, :] = player(sample_hat).detach().float().reshape(
                    m, n)
            mean_values = torch.mean(values, 0)
            max_values, _ = torch.max(values, 0)
            min_values, _ = torch.min(values, 0)

            # Log
            writer.add_scalar('Q_mean', torch.mean(mean_values), episode)
            writer.add_scalar('Q_min', torch.mean(min_values), episode)
            writer.add_scalar('Q_max', torch.mean(max_values), episode)

            # Plot mean
            plot_wythoff_board(mean_values.numpy(),
                               vmin=mean_values.numpy().min(),
                               vmax=mean_values.numpy().max(),
                               path=tensorboard,
                               name='player_mean_values.png')
            writer.add_image(
                'mean player',
                torch.from_numpy(
                    skimage.io.imread(
                        os.path.join(tensorboard, 'player_mean_values.png'))),
                0,
                dataformats='HWC')

            # Plot max
            plot_wythoff_board(max_values.numpy(),
                               vmin=max_values.numpy().min(),
                               vmax=max_values.numpy().max(),
                               path=tensorboard,
                               name='player_max_values.png')
            writer.add_image(
                'max player',
                torch.from_numpy(
                    skimage.io.imread(
                        os.path.join(tensorboard, 'player_max_values.png'))),
                0,
                dataformats='HWC')

            # Plot min
            plot_wythoff_board(min_values.numpy(),
                               vmin=min_values.numpy().min(),
                               vmax=min_values.numpy().max(),
                               path=tensorboard,
                               name='player_min_values.png')
            writer.add_image(
                'min player',
                torch.from_numpy(
                    skimage.io.imread(
                        os.path.join(tensorboard, 'player_min_values.png'))),
                0,
                dataformats='HWC')

            # Plot move count
            plot_wythoff_board(moves.count, vmax=moves.count.max() / 10,
                               vmin=0, path=tensorboard, name='moves.png')
            writer.add_image(
                'moves',
                torch.from_numpy(
                    skimage.io.imread(os.path.join(tensorboard, 'moves.png'))),
                0,
                dataformats='HWC')

        if monitor and (int(episode) % update_every) == 0:
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if monitor and save:
        save_monitored(save, monitored)
    if tensorboard:
        writer.close()

    result = {"player": player.state_dict(), "score": score}
    if target is not None:
        result['target'] = target.state_dict()
    if save:
        torch.save(result, save + ".pytorch")
    if monitor and not save:
        result["monitored"] = monitored
    if return_none:
        result = None
    return result
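
# Usage sketch (illustrative, not part of the original module): a double-DQN
# run of wythoff_dqn2 with the default 'DQN_xy1' network. Hyperparameter
# values here are assumptions for a quick demonstration; the result dict keys
# ('player', 'score', 'target') follow the return code above.
def _example_wythoff_dqn2(seed=42):
    result = wythoff_dqn2(
        num_episodes=500,
        batch_size=20,
        memory_capacity=100,
        game='Wythoff10x10',
        network='DQN_xy1',
        double=True,
        double_update=10,
        device='cpu',
        seed=seed)
    return result["player"], result["score"]
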
def wythoff_oracular_strategy(num_episodes=1000,
                              learning_rate=0.025,
                              num_hidden1=100,
                              num_hidden2=25,
                              stumbler_game='Wythoff10x10',
                              strategist_game='Wythoff50x50',
                              tensorboard=None,
                              update_every=50,
                              save=None,
                              return_none=False,
                              debug=False,
                              seed=None):
    """Train a strategist layer on perfect data."""
    # ------------------------------------------------------------------------
    # Setup
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)

    # Boards, etc
    m, n, board, _ = peek(create_env(strategist_game))
    o, p, _, _ = peek(create_env(stumbler_game))
    if debug:
        print(">>> TRAINING AN OPTIMAL STRATEGIST.")
        print(">>> Train board {}x{}".format(o, p))
        print(">>> Test board {}x{}".format(m, n))

    # Seeding...
    np.random.seed(seed)

    # Train params
    strategic_default_value = 0.0
    batch_size = 64

    # ------------------------------------------------------------------------
    # Build a Strategist, its memory, and its optimizer.
    # Create a model of the right size.
    # model = HotCold2(2, num_hidden1=num_hidden1)
    model = HotCold3(2, num_hidden1=num_hidden1, num_hidden2=num_hidden2)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    memory = ReplayMemory(10000)

    # Run learning episodes. The 'stumbler' here is just the optimal
    # cold board.
    for episode in range(num_episodes):
        # The cold spots are '1'; everything else is '0'
        strategic_value = create_cold_board(o, p)

        # ...Into tuples
        s_data = convert_ijv(strategic_value)
        s_data = balance_ijv(s_data, strategic_default_value)
        for d in s_data:
            memory.push(*d)

        loss = 0.0
        if len(memory) > batch_size:
            # Sample data....
            coords = []
            values = []
            samples = memory.sample(batch_size)
            for c, v in samples:
                coords.append(c)
                values.append(v)
            coords = torch.tensor(np.vstack(coords),
                                  requires_grad=True,
                                  dtype=torch.float)
            values = torch.tensor(values,
                                  requires_grad=False,
                                  dtype=torch.float)

            # Make some predictions,
            predicted_values = model(coords).squeeze()

            # and find their loss.
            loss = F.mse_loss(predicted_values, values)

            # Walk down the hill of righteousness!
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if debug:
                print(">>> Coords: {}".format(coords))
                print(">>> Values: {}".format(values))
                print(">>> Predicted values: {}".format(predicted_values))
                print(">>> Loss {}".format(loss))

        # Use the trained strategist to generate a bias_board,
        bias_board = create_bias_board(m, n, model)

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar(
                os.path.join(tensorboard, 'error'), loss, episode)

            plot_wythoff_board(strategic_value,
                               vmin=0,
                               vmax=1,
                               path=tensorboard,
                               name='strategy_board_{}.png'.format(episode))
            writer.add_image(
                'Training board',
                skimage.io.imread(
                    os.path.join(tensorboard,
                                 'strategy_board_{}.png'.format(episode))))

            plot_wythoff_board(bias_board,
                               vmin=0,
                               vmax=1,
                               path=tensorboard,
                               name='bias_board_{}.png'.format(episode))
            writer.add_image(
                'Testing board',
                skimage.io.imread(
                    os.path.join(tensorboard,
                                 'bias_board_{}.png'.format(episode))))

    # The end
    if tensorboard:
        writer.close()

    # Suppress return for parallel runs?
    result = (model), (loss)
    if return_none:
        result = None
    return result
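
# Usage sketch (illustrative): fit a strategist on the exact cold board, as
# wythoff_oracular_strategy does internally, and return the trained model
# and its final loss. The episode count is an assumption for a quick run.
def _example_oracular_strategy(seed=None):
    model, loss = wythoff_oracular_strategy(
        num_episodes=200,
        stumbler_game='Wythoff10x10',
        strategist_game='Wythoff50x50',
        seed=seed)
    return model, loss
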
def wythoff_strategist(stumbler_model,
                       stumbler_game,
                       num_episodes=1000,
                       cold_threshold=0.0,
                       hot_threshold=0.5,
                       hot_value=1,
                       cold_value=-1,
                       learning_rate=0.01,
                       game='Wythoff50x50',
                       model=None,
                       num_hidden1=100,
                       num_hidden2=25,
                       initial=0,
                       score=0.0,
                       tensorboard=None,
                       stumbler_mode='numpy',
                       balance_cold=False,
                       reflect_cold=True,
                       update_every=50,
                       save=None,
                       load_model=None,
                       save_model=False,
                       monitor=None,
                       return_none=False,
                       debug=False,
                       heuristic=True,
                       seed=None):
    """Learn a generalizable strategy for Wythoff's game."""
    # ------------------------------------------------------------------------
    # Setup
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)

    # Create the env and find all moves in it
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    o, p, _, _ = peek(create_env(stumbler_game, monitor=False))
    m, n, board, _ = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)

    # Watch vars?
    if monitor:
        monitored = create_monitored(monitor)

    # Init the strategist
    if model is None:
        model = init_strategist(num_hidden1, num_hidden2)

    # Add old weights from file?
    if load_model is not None:
        if debug:
            print(">>> Loading model from {}".format(load_model))
        model = load_strategist(model, load_model)

    # Init SGD.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # ------------------------------------------------------------------------
    # Extract strategic data from the stumbler
    strategic_default_value = 0.0
    if heuristic:
        if hot_threshold is None:
            strategic_value = estimate_cold(
                m, n, stumbler_model,
                threshold=cold_threshold,
                value=cold_value,
                reflect=reflect_cold,
                default_value=strategic_default_value)
        elif cold_threshold is None:
            strategic_value = estimate_hot(
                m, n, stumbler_model,
                threshold=hot_threshold,
                value=hot_value,
                default_value=strategic_default_value)
        else:
            strategic_value = estimate_hot_cold(
                o, p, stumbler_model,
                hot_threshold=hot_threshold,
                cold_threshold=cold_threshold,
                hot_value=hot_value,
                cold_value=cold_value,
                reflect_cold=reflect_cold,
                default_value=strategic_default_value)
    else:
        strategic_value = expected_value(
            o, p, stumbler_model, default_value=strategic_default_value)

    # Convert format.
    s_data = convert_ijv(strategic_value)
    if balance_cold:
        s_data = balance_ijv(s_data, cold_value)

    # Sanity?
    if s_data is None:
        return model, None

    # Define a memory to sample.
    memory = ReplayMemory(len(s_data))
    batch_size = len(s_data)
    for d in s_data:
        memory.push(*d)

    # ------------------------------------------------------------------------
    # Sample the memory to teach the strategist
    bias_board = None
    for episode in range(initial, initial + num_episodes):
        loss = 0.0
        if debug:
            print("---------------------------------------")
            print(">>> STRATEGIST ({}).".format(episode))

        coords = []
        values = []
        for c, v in memory.sample(batch_size):
            coords.append(c)
            values.append(v)
        coords = torch.tensor(np.vstack(coords),
                              requires_grad=True,
                              dtype=torch.float)
        values = torch.tensor(values, requires_grad=False, dtype=torch.float)

        # Make some predictions, ...
        predicted_values = model(coords).squeeze()

        # ... and learn from them
        loss = F.mse_loss(predicted_values, values)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # --------------------------------------------------------------------
        if debug:
            print(">>> Coords: {}".format(coords))
            print(">>> Values: {}".format(values))
            print(">>> Predicted values: {}".format(predicted_values))
            print(">>> Loss {}".format(loss))

        if tensorboard and (int(episode) % update_every) == 0:
            # Timecourse
            writer.add_scalar('stategist_error', loss, episode)

            bias_board = create_bias_board(m, n, model)
            plot_wythoff_board(bias_board,
                               vmin=-1.5,
                               vmax=1.5,
                               path=tensorboard,
                               height=10,
                               width=15,
                               name='bias_board.png')
            writer.add_image(
                'strategist',
                skimage.io.imread(os.path.join(tensorboard, 'bias_board.png')))

        if monitor and (int(episode) % update_every) == 0:
            # Score the model:
            with th.no_grad():
                pred = create_bias_board(m, n, model, default=0.0).numpy()
                cold = create_cold_board(m, n, default=hot_value)
                mae = np.median(np.abs(pred - cold))
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # Final score for the model:
    with th.no_grad():
        pred = create_bias_board(m, n, model, default=0.0).numpy()
        cold = create_cold_board(m, n, default=hot_value)
        mae = np.median(np.abs(pred - cold))

    # Save?
    if save_model:
        state = {
            'strategist_model_dict': model.state_dict(),
            "num_hidden1": num_hidden1,
            "num_hidden2": num_hidden2
        }
        th.save(state, save + ".pytorch")
    if monitor:
        save_monitored(save, monitored)

    # Suppress return for parallel runs?
    result = (model), (mae)
    if return_none:
        result = None
    return result
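
# Usage sketch (illustrative): chain a tabular stumbler into the strategist.
# This assumes the Q-table dict returned by wythoff_stumbler (defined below)
# is the stumbler_model format the estimate_* helpers expect; the episode
# counts are arbitrary small values for a quick run.
def _example_strategist(seed=42):
    (stumbler, _), _ = wythoff_stumbler(
        num_episodes=1000, game='Wythoff10x10', seed=seed)
    strategist, mae = wythoff_strategist(
        stumbler, 'Wythoff10x10', num_episodes=500, game='Wythoff50x50',
        seed=seed)
    return strategist, mae
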
def wythoff_stumbler(num_episodes=10,
                     epsilon=0.1,
                     gamma=0.8,
                     learning_rate=0.1,
                     game='Wythoff10x10',
                     model=None,
                     opponent=None,
                     anneal=False,
                     bias_board=None,
                     influence=0.0,
                     score=0.0,
                     total_reward=0.0,
                     tensorboard=None,
                     update_every=5,
                     initial=0,
                     self_play=False,
                     save=False,
                     load_model=None,
                     save_model=False,
                     monitor=None,
                     return_none=False,
                     debug=False,
                     seed=None):
    """Learn to play Wythoff's w/ e-greedy random exploration.

    Note: Learning is based on a player-opponent joint action formalism
    and tabular Q-learning.
    """
    # ------------------------------------------------------------------------
    # Init env
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)

    # Create env
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # ------------------------------------------------------------------------
    # Init Agents
    default_Q = 0.0
    m, n, board, available = peek(env)
    if model is None:
        model = {}
    if opponent is None:
        opponent = {}

    # Override from file?
    if load_model is not None:
        if debug:
            print(">>> Loading model/opponent from {}".format(load_model))
        model, opponent = load_stumbler(model, opponent, load_model)

    # ------------------------------------------------------------------------
    for episode in range(initial, initial + num_episodes):
        # Re-init
        steps = 1
        x, y, board, available = env.reset()
        board = tuple(flatten_board(board))
        if debug:
            print("---------------------------------------")
            print(">>> NEW GAME ({}).".format(episode))
            print(">>> Initial position ({}, {})".format(x, y))
            print(">>> Initial moves {}".format(available))
            print("---------------------------------------")

        t_state = [board]
        t_available = [available]
        t_move = []
        t_move_i = []
        t_reward = []

        # ---------------------------------------------------------------------
        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            epsilon_e = epsilon

        # ---------------------------------------------------------------------
        # Play!
        done = False
        player_win = False
        while not done:
            # PLAYER CHOOSES A MOVE
            try:
                Qs_episode = add_bias_board(model[board], available,
                                            bias_board, influence)
                move_i = epsilon_greedy(Qs_episode, epsilon=epsilon_e,
                                        mode='numpy')
            except KeyError:
                model[board] = np.ones(len(available)) * default_Q
                move_i = np.random.randint(0, len(available))
            move = available[move_i]

            # Analyze it...
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                score += (best - score) / (episode + 1)

            # PLAY THE MOVE
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            steps += 1

            # Log....
            if debug:
                print(">>> PLAYER move {}".format(move))

            t_state.append(board)
            t_move.append(move)
            t_available.append(available)
            t_move_i.append(move_i)
            t_reward.append(reward)

            if done:
                player_win = True
                t_state.append(board)
                t_move.append(move)
                t_available.append(available)
                t_move_i.append(move_i)
                t_reward.append(reward)

            # -----------------------------------------------------------------
            if not done:
                # OPPONENT CHOOSES A MOVE
                try:
                    Qs_episode = add_bias_board(opponent[board], available,
                                                bias_board, influence)
                    move_i = epsilon_greedy(Qs_episode, epsilon=epsilon_e,
                                            mode='numpy')
                except KeyError:
                    opponent[board] = np.ones(len(available)) * default_Q
                    move_i = np.random.randint(0, len(available))
                move = available[move_i]

                # PLAY THE MOVE
                (x, y, board, available), reward, done, _ = env.step(move)
                board = tuple(flatten_board(board))
                steps += 1

                # Log....
                if debug:
                    print(">>> OPPONENT move {}".format(move))

                t_state.append(board)
                t_move.append(move)
                t_available.append(available)
                t_move_i.append(move_i)
                t_reward.append(reward)

                if done:
                    t_state.append(board)
                    t_move.append(move)
                    t_available.append(available)
                    t_move_i.append(move_i)
                    t_reward.append(reward)

        # ---------------------------------------------------------------------
        # Learn by unrolling the last game...
        #
        # PLAYER (model)
        s_idx = np.arange(0, steps - 1, 2)
        for i in s_idx:
            # States and actions
            s = t_state[i]
            next_s = t_state[i + 2]
            m_i = t_move_i[i]

            # Value and reward
            Q = model[s][m_i]
            try:
                max_Q = model[next_s].max()
            except KeyError:
                model[next_s] = np.ones(len(t_available[i])) * default_Q
                max_Q = model[next_s].max()
            if player_win:
                r = t_reward[i]
            else:
                r = -1 * t_reward[i + 1]

            # Update the running reward total for the player
            total_reward += r

            # Loss and learn
            next_Q = r + (gamma * max_Q)
            loss = next_Q - Q
            model[s][m_i] = Q + (learning_rate * loss)

        # OPPONENT
        s_idx = np.arange(1, steps - 1, 2)
        for i in s_idx:
            # States and actions
            s = t_state[i]
            next_s = t_state[i + 2]
            m_i = t_move_i[i]

            # Value and reward
            Q = opponent[s][m_i]
            try:
                max_Q = opponent[next_s].max()
            except KeyError:
                opponent[next_s] = np.ones(len(t_available[i])) * default_Q
                max_Q = opponent[next_s].max()
            if not player_win:
                r = t_reward[i]
            else:
                r = -1 * t_reward[i + 1]

            # Loss and learn
            next_Q = r + (gamma * max_Q)
            loss = next_Q - Q
            opponent[s][m_i] = Q + (learning_rate * loss)

        # ---------------------------------------------------------------------
        # Update the log
        if debug:
            print(">>> Reward {}; Loss(Q {}, next_Q {}) -> {}".format(
                r, Q, next_Q, loss))
            if done and (r > 0):
                print("*** WIN ***")
            if done and (r < 0):
                print("*** OPPONENT WIN ***")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', r, episode)
            writer.add_scalar('Q', Q, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('stumber_error', loss, episode)
            writer.add_scalar('stumber_steps', steps, episode)
            writer.add_scalar('stumbler_score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(cold, vmin=0, vmax=1, path=tensorboard,
                               name='cold_board.png')
            writer.add_image(
                'cold_positions',
                skimage.io.imread(os.path.join(tensorboard, 'cold_board.png')))

            # Agent max(Q) boards
            values = expected_value(m, n, model)
            plot_wythoff_board(values, path=tensorboard,
                               name='player_max_values.png')
            writer.add_image(
                'player',
                skimage.io.imread(
                    os.path.join(tensorboard, 'player_max_values.png')))

            values = expected_value(m, n, opponent)
            plot_wythoff_board(values, path=tensorboard,
                               name='opponent_max_values.png')
            writer.add_image(
                'opponent',
                skimage.io.imread(
                    os.path.join(tensorboard, 'opponent_max_values.png')))

        if monitor and (int(episode) % update_every) == 0:
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if save_model:
        state = {
            'stumbler_player_dict': model,
            'stumbler_opponent_dict': opponent
        }
        torch.save(state, save + ".pytorch")
    if monitor:
        save_monitored(save, monitored)
    if tensorboard:
        writer.close()

    result = (model, opponent), (score, total_reward)
    if return_none:
        result = None
    return result
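
# Usage sketch (illustrative): a short tabular Q-learning run. The unpacking
# matches wythoff_stumbler's return value; the episode count and seed are
# assumptions for a quick demonstration.
def _example_stumbler(seed=42):
    (model, opponent), (score, total_reward) = wythoff_stumbler(
        num_episodes=5000,
        epsilon=0.1,
        gamma=0.8,
        learning_rate=0.1,
        game='Wythoff10x10',
        seed=seed)
    return score, total_reward
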