def muzero(config: MuZeroConfig):
    """
    MuZero training is split into two independent parts: network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from the training to the self-play, and the finished games from the self-play
    to the training.
    In contrast to the original MuZero algorithm, this version doesn't work with
    multiple threads, so training and self-play are done alternately.
    """
    storage = SharedStorage(config.new_network(), config.uniform_network(), config.new_optimizer())
    replay_buffer = ReplayBuffer(config)

    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer, config.nb_episodes)
        train_network(config, storage, replay_buffer, config.nb_epochs)

        print("Train score:", score_train)
        print("Eval score:", run_eval(config, storage, NUM_EVAL_EPISODES))
        print(
            f"MuZero played {config.nb_episodes * (loop + 1)} "
            f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n"
        )

    return storage.latest_network()
def multiprocess_play_game_helper(config: MuZeroConfig, initial: bool, train: bool,
                                  result_queue: Queue = None, sema=None):
    sema.acquire()
    # Prevent child processes from overallocating GPU
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    pretrained = True
    if initial:
        if config.load_directory is not None:
            # User specified directory to load network from
            network = config.old_network(config.load_directory)
        else:
            network = config.old_network('blank_network')
            pretrained = False
    else:
        network = config.old_network('checkpoint')

    storage = SharedStorage(network=network,
                            uniform_network=config.uniform_network(),
                            optimizer=config.new_optimizer(),
                            save_directory=config.save_directory,
                            config=config,
                            pretrained=pretrained)

    play_game(config=config, storage=storage, train=train, visual=False, queue=result_queue)
    sema.release()
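# The multiprocess_play_game entry point that drives this helper is not shown in this
# excerpt. A minimal sketch of one plausible implementation, assuming one worker process
# per episode, a shared result queue, and a semaphore capping concurrency; the concurrency
# limit, replay_buffer.save_game, and the mean-score return value are all assumptions:
from multiprocessing import Process, Queue, Semaphore


def multiprocess_play_game(config: MuZeroConfig, initial: bool, episodes: int,
                           train: bool, replay_buffer: ReplayBuffer):
    result_queue = Queue()
    sema = Semaphore(4)  # cap on concurrent workers; the actual limit is an assumption
    workers = [Process(target=multiprocess_play_game_helper,
                       args=(config, initial, train, result_queue, sema))
               for _ in range(episodes)]
    for w in workers:
        w.start()

    scores = []
    for _ in range(episodes):
        game = result_queue.get()  # blocks until a worker finishes a game
        replay_buffer.save_game(game)
        scores.append(sum(game.rewards))

    for w in workers:
        w.join()
    return sum(scores) / len(scores)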
def muzero(config: MuZeroConfig):
    """
    MuZero training is split into two independent parts: network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from the training to the self-play, and the finished games from the self-play
    to the training.
    In contrast to the original MuZero algorithm, this version doesn't work with
    multiple threads, so training and self-play are done alternately.
    """
    network = config.new_network()
    storage = SharedStorage(network, config.uniform_network(), config.new_optimizer(network))
    replay_buffer = ReplayBuffer(config)

    train_scores = []
    eval_scores = []
    train_losses = []

    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer, config.nb_episodes)
        train_losses += train_network(config, storage, replay_buffer, config.nb_epochs)

        print("Train score:", score_train)
        score_eval = run_eval(config, storage, 50)
        print("Eval score:", score_eval)
        print(
            f"MuZero played {config.nb_episodes * (loop + 1)} "
            f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n"
        )

        train_scores.append(score_train)
        eval_scores.append(score_eval)

    plt.figure(1)
    plt.plot(train_scores)
    plt.plot(eval_scores)
    plt.title('MuZero Average Rewards')
    plt.xlabel('MuZero Iterations (Train/Eval)')
    plt.ylabel('Reward Score')
    plt.legend(['Train score', 'Eval score'])

    plt.figure(2)
    plt.plot(train_losses, color='green')
    plt.title('MuZero Training Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')

    plt.show()

    return storage.latest_network()
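# run_eval is called by the training loops above but not shown. A minimal sketch under
# the assumption that it plays greedy (non-training) games with the latest network and
# returns the mean episode return; storage.latest_network, play_game, and game.rewards
# follow the snippets in this listing, everything else is an assumption:
def run_eval(config: MuZeroConfig, storage: SharedStorage, eval_episodes: int):
    network = storage.latest_network()
    returns = []
    for _ in range(eval_episodes):
        # train=False selects actions greedily ('max' mode) and adds no exploration noise
        game = play_game(config, network, train=False)
        returns.append(sum(game.rewards))
    return sum(returns) / eval_episodes if eval_episodes else 0.0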
def train_network_helper(config: MuZeroConfig, replay_buffer: ReplayBuffer, epochs: int):
    try:
        network = config.old_network('checkpoint')
        optimizer = config.old_optimizer('checkpoint')
        print('Loaded optimizer')
    except FileNotFoundError:
        print('No checkpoint. Loading blank')
        network = config.old_network('blank_network')
        optimizer = config.new_optimizer()

    for _ in range(epochs):
        print('Epoch {}'.format(_), end='\r')
        batch = replay_buffer.sample_batch(config.num_unroll_steps, config.td_steps)
        update_weights(optimizer, network, batch)

    SharedStorage.save_network_to_disk(network, config, optimizer)
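# train_network itself (called from the training loops) is not shown in this excerpt.
# One plausible sketch, assuming the helper above is run in a child process so that
# framework/GPU state is released once the epochs finish; the design and the argument
# passing are assumptions, not confirmed by the source:
from multiprocessing import Process


def train_network(config: MuZeroConfig, replay_buffer: ReplayBuffer, epochs: int):
    p = Process(target=train_network_helper, args=(config, replay_buffer, epochs))
    p.start()
    p.join()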
def make_cartpole_config():
    def visit_softmax_temperature(num_moves, training_steps):
        return 1.0

    return MuZeroConfig(game=CartPole,
                        action_space_size=2,
                        max_moves=500,
                        discount=0.997,
                        dirichlet_alpha=0.25,
                        num_simulations=50,
                        num_training_loop=50,
                        num_epochs=5000,
                        batch_size=128,
                        td_steps=50,
                        num_train_episodes=20,
                        num_eval_episodes=1,
                        lr_init=0.05,
                        lr_decay_steps=1000,
                        max_priority=False,
                        visit_softmax_temperature_fn=visit_softmax_temperature,
                        network_args={'support_size': 10,
                                      'encoding_size': 8,
                                      'rep_hidden': [],
                                      'dyn_hidden': [16],
                                      'rew_hidden': [16],
                                      'val_hidden': [],
                                      'pol_hidden': [],
                                      'observation_shape': (1, 1, 4)},
                        result_path="cartpole.weights")
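# A brief usage sketch: build the CartPole config and hand it to a muzero() training
# entry point like the ones shown above, assuming both functions are in scope here:
if __name__ == '__main__':
    config = make_cartpole_config()
    trained_network = muzero(config)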
def play_game(config: MuZeroConfig, network: AbstractNetwork, train: bool = True) -> AbstractGame:
    """
    Each game is produced by starting at the initial board position, then repeatedly
    executing a Monte Carlo Tree Search to generate moves until the end of the game
    is reached.
    """
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'

    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        current_observation = game.make_image(-1)
        expand_node(root, game.to_play(), game.legal_actions(),
                    network.initial_inference(current_observation))
        if train:
            add_exploration_noise(config, root)

        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the networks.
        run_mcts(config, root, game.action_history(), network)
        action = select_action(config, len(game.history), root, network, mode=mode_action_select)
        game.apply(action)
        game.store_search_statistics(root)
    return game
def muzero(config: MuZeroConfig):
    storage = SharedStorage(config.new_network(), config.uniform_network(), config.new_optimizer())
    replay_buffer = ReplayBuffer(config)

    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer, config.nb_episodes)
        train_network(config, storage, replay_buffer, config.nb_epochs)

        print("Train score:", score_train)
        print("Eval score:", run_eval(config, storage, 50))
        print(f"MuZero played {config.nb_episodes * (loop + 1)} "
              f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n")

    storage.save_network_dir(config.nb_training_loop)
    return storage.latest_network()
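# run_selfplay is referenced by every muzero() variant above but not shown. A minimal
# sketch under the assumption that it plays nb_episodes games with the latest network,
# saves them to the replay buffer, and returns the mean score; replay_buffer.save_game
# is an assumed method name:
def run_selfplay(config: MuZeroConfig, storage: SharedStorage,
                 replay_buffer: ReplayBuffer, nb_episodes: int):
    network = storage.latest_network()
    returns = []
    for _ in range(nb_episodes):
        game = play_game(config, network, train=True)
        replay_buffer.save_game(game)
        returns.append(sum(game.rewards))
    return sum(returns) / nb_episodes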
def make_config() -> MuZeroConfig:
    game_config = GameConfig(name='TicTacToe',
                             environment_class=TicTacToeEnvironment,
                             environment_parameters={},
                             action_space_size=9,
                             num_players=2,
                             discount=1.0)

    replay_buffer_config = ReplayBufferConfig(window_size=int(1e4),
                                              prefetch_buffer_size=10)

    mcts_config = MCTSConfig(max_moves=9,
                             root_dirichlet_alpha=1.0,
                             root_exploration_fraction=0.25,
                             num_simulations=20,
                             temperature=1.0,
                             freezing_moves=9,
                             default_value=Value(0.0))

    network_config = NetworkConfig(network_class=TicTacToeNetwork,
                                   regularizer=tf.keras.regularizers.l2(l=1e-4),
                                   hidden_state_size=128,
                                   hidden_size=128)

    training_config = TrainingConfig(optimizer=tf.keras.optimizers.Adam(),
                                     batch_size=128,
                                     training_steps=int(5e4),
                                     checkpoint_interval=int(5e2),
                                     replay_buffer_loginterval=50,
                                     num_unroll_steps=2,
                                     td_steps=9,
                                     steps_per_execution=1)

    reward_config = ScalarConfig(known_bounds=KnownBounds(minv=Value(0.0), maxv=Value(1.0)),
                                 support_size=None,
                                 loss_decay=0.0)

    value_config = ScalarConfig(known_bounds=KnownBounds(minv=None, maxv=Value(1.0)),
                                support_size=None,
                                loss_decay=4.0)

    return MuZeroConfig(game_config=game_config,
                        replay_buffer_config=replay_buffer_config,
                        mcts_config=mcts_config,
                        training_config=training_config,
                        network_config=network_config,
                        value_config=value_config,
                        reward_config=reward_config)
def play_game(config: MuZeroConfig, storage: SharedStorage, train: bool = True,
              visual: bool = False, queue: Queue = None) -> AbstractGame:
    """
    Each game is produced by starting at the initial board position, then repeatedly
    executing a Monte Carlo Tree Search to generate moves until the end of the game
    is reached.
    """
    if queue:
        network = storage.latest_network_for_process()
    else:
        network = storage.current_network

    start = time()
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'

    while not game.terminal() and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        current_observation = game.make_image(-1)
        expand_node(root, game.to_play(), game.legal_actions(),
                    network.initial_inference(current_observation))
        if train:
            add_exploration_noise(config, root)

        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the networks.
        run_mcts(config, root, game.action_history(), network)
        action = select_action(config, len(game.history), root, network, mode=mode_action_select)
        game.apply(action)
        game.store_search_statistics(root)
        if visual:
            game.env.render()

    if visual:
        if game.terminal():
            print('Model lost game')
        else:
            print('Exceeded max moves')
        game.env.close()

    if queue:
        queue.put(game)

    print("Finished game episode after " + str(time() - start) + " seconds. "
          "Exceeded max moves? " + str(not game.terminal()))
    print("Score: ", sum(game.rewards))

    return game
def select_action(config: MuZeroConfig, num_moves: int, node: Node):
    visit_counts = [(child.visit_count, action) for action, child in node.children.items()]
    temperature = config.visit_softmax_temperature_fn(num_moves=num_moves)
    if temperature == 0:
        action_pos = np.argmax([v for v, _ in visit_counts])
    else:
        action_probs = [visit_count_i ** (1 / temperature)
                        for visit_count_i, _ in visit_counts]
        total_count = sum(action_probs)
        action_probs = [x / total_count for x in action_probs]
        action_pos = np.random.choice(len(visit_counts), p=action_probs)
    return visit_counts[action_pos][1]
def select_action(config: MuZeroConfig, num_moves: int, node: Node, network: BaseNetwork,
                  mode: str = 'softmax'):
    """
    After running the MCTS simulations, we select an action based on the root's
    children visit counts. During training we use a softmax sample for exploration.
    During evaluation we select the most visited child.
    """
    visit_counts = [child.visit_count for child in node.children.values()]
    actions = [action for action in node.children.keys()]
    action = None
    if mode == 'softmax':
        t = config.visit_softmax_temperature_fn(
            num_moves=num_moves, training_steps=network.training_steps)
        action = softmax_sample(visit_counts, actions, t)
    elif mode == 'max':
        action, _ = max(node.children.items(), key=lambda item: item[1].visit_count)
    return action
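# softmax_sample is called above but not defined in this excerpt. A minimal sketch of
# one plausible implementation, sampling actions in proportion to
# visit_count ** (1 / temperature) and falling back to arg-max at temperature zero;
# the exact behaviour of the original helper is an assumption:
import numpy as np


def softmax_sample(visit_counts, actions, temperature):
    counts = np.array(visit_counts, dtype=np.float64)
    if temperature == 0:
        return actions[int(np.argmax(counts))]
    probs = counts ** (1.0 / temperature)
    probs /= probs.sum()
    return actions[np.random.choice(len(actions), p=probs)]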
def play_game(config: MuZeroConfig, network: AbstractNetwork, train: bool = True) -> AbstractGame:
    game = config.new_game()
    mode_action_select = 'softmax' if train else 'max'

    while not game.terminal() and len(game.history) < config.max_moves:
        root = Node(0)
        current_observation = game.make_image(-1)
        expand_node(root, game.to_play(), game.legal_actions(),
                    network.initial_inference(current_observation))
        if train:
            add_exploration_noise(config, root)

        run_mcts(config, root, game.action_history(), network)
        action = select_action(config, len(game.history), root, network, mode=mode_action_select)
        game.apply(action)
        game.store_search_statistics(root)
    return game
def play_game(config: MuZeroConfig, network: Network) -> Game:
    game = config.new_game()

    while not game.terminal and len(game.history) < config.max_moves:
        # At the root of the search tree we use the representation function to
        # obtain a hidden state given the current observation.
        root = Node(0)
        current_observation = game.make_image(-1, network.device)
        expand_node(root, game.to_play(), game.legal_actions(),
                    network.initial_inference(current_observation))
        add_exploration_noise(config, root)

        # We then run a Monte Carlo Tree Search using only action sequences and the
        # model learned by the network.
        run_mcts(config, root, game.action_history(), network)
        action = select_action(config, len(game.history), root)
        game.apply(action)
        game.store_search_statistics(root)
    return game
def make_config(environment: Env) -> MuZeroConfig:
    return MuZeroConfig(
        env=environment,
        state_space_size=int(np.prod(environment.observation_space.shape)),
        action_space_size=environment.action_space.n,
        max_moves=500,  # Half an hour at action repeat 4.
        discount=0.997,
        dirichlet_alpha=0.25,
        num_simulations=15,  # Number of future moves self-simulated
        batch_size=64,
        td_steps=10,  # Number of future steps taken into account when calculating the target value
        num_actors=4,
        training_steps=int(1e8),  # Max number of training steps
        checkpoint_interval=100,
        save_interval=10000,
        lr_init=1e-4,
        lr_decay_steps=1000,
        lr_decay_rate=0.9)
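# A brief usage sketch, assuming Env is an OpenAI Gym environment handle
# (the environment id below is only an example):
import gym

env = gym.make('CartPole-v1')
config = make_config(env)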
def make_lunarlander_config():
    def visit_softmax_temperature(num_moves, training_steps):
        if num_moves < 0.5 * training_steps:
            return 1.0
        elif num_moves < 0.75 * training_steps:
            return 0.5
        else:
            return 0.25

    return MuZeroConfig(game=LunarLander,
                        action_space_size=4,
                        max_moves=500,
                        discount=0.997,
                        dirichlet_alpha=0.25,
                        num_simulations=50,
                        num_training_loop=50,
                        num_epochs=200000,
                        batch_size=32,
                        td_steps=50,
                        num_train_episodes=30,
                        num_eval_episodes=10,
                        lr_init=0.05,
                        lr_decay_steps=1000,
                        max_priority=False,
                        visit_softmax_temperature_fn=visit_softmax_temperature,
                        network_args={'support_size': 10,
                                      'encoding_size': 10,
                                      'rep_hidden': [],
                                      'dyn_hidden': [64],
                                      'rew_hidden': [64],
                                      'val_hidden': [64],
                                      'pol_hidden': [],
                                      'observation_shape': (1, 1, 8)},
                        result_path="lunarlander.weights")
def muzero(config: MuZeroConfig, save_directory: str, load_directory: str, test: bool,
           visual: bool, new_config: bool):
    """
    MuZero training is split into two independent parts: network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from the training to the self-play, and the finished games from the self-play
    to the training.
    In contrast to the original MuZero algorithm, this version doesn't work with
    multiple threads, so training and self-play are done alternately.
    """
    config.load_directory = load_directory
    config.save_directory = save_directory
    replay_buffer = ReplayBuffer(config)

    # Remove old checkpoint network
    base_dir = os.path.dirname(os.path.realpath(__file__))
    d = base_dir + '/checkpoint'
    to_remove = [os.path.join(d, f) for f in os.listdir(d)]
    for f in to_remove:
        if f.split('/')[-1] != '.gitignore':
            os.remove(f)

    if load_directory:
        # Copy load directory to checkpoint directory
        copy_tree(src=load_directory, dst=d)

    if new_config:
        network = config.new_network()
        SharedStorage.save_network_to_disk(network, config, None, 'blank_network')
        exit(0)

    if test:
        if load_directory is not None:
            # User specified directory to load network from
            network = config.old_network(load_directory)
        else:
            network = config.new_network()
        storage = SharedStorage(network, config.uniform_network(), config.new_optimizer(),
                                save_directory, config, load_directory is not None)
        # Single process for simple testing, can refactor later
        print("Eval score:", run_eval(config, storage, 5, visual=visual))
        print(f"MuZero played {5} episodes.\n")
        return storage.latest_network()

    for loop in range(config.nb_training_loop):
        initial = loop == 0
        start = time()
        o_start = time()
        print("Training loop", loop)
        episodes = config.nb_episodes
        score_train = multiprocess_play_game(config, initial=initial, episodes=episodes,
                                             train=True, replay_buffer=replay_buffer)
        print("Self play took " + str(time() - start) + " seconds")
        print("Train score: " + str(score_train) + " after " + str(time() - start) + " seconds")

        start = time()
        print("Training network...")
        train_network(config, replay_buffer, config.nb_epochs)
        print("Network weights updated after " + str(time() - start) + " seconds")
    """