def muzero(config: MuZeroConfig): """ MuZero training is split into two independent parts: Network training and self-play data generation. These two parts only communicate by transferring the latest networks checkpoint from the training to the self-play, and the finished games from the self-play to the training. In contrast to the original MuZero algorithm this version doesn't works with multiple threads, therefore the training and self-play is done alternately. """ storage = SharedStorage(config.new_network(), config.uniform_network(), config.new_optimizer()) replay_buffer = ReplayBuffer(config) for loop in range(config.nb_training_loop): print("Training loop", loop) score_train = run_selfplay(config, storage, replay_buffer, config.nb_episodes) train_network(config, storage, replay_buffer, config.nb_epochs) print("Train score:", score_train) print("Eval score:", run_eval(config, storage, NUM_EVAL_EPISODES)) print( f"MuZero played {config.nb_episodes * (loop + 1)} " f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n" ) return storage.latest_network()
def muzero(config: MuZeroConfig): """ MuZero training is split into two independent parts: Network training and self-play data generation. These two parts only communicate by transferring the latest networks checkpoint from the training to the self-play, and the finished games from the self-play to the training. In contrast to the original MuZero algorithm this version doesn't works with multiple threads, therefore the training and self-play is done alternately. """ network = config.new_network() storage = SharedStorage(network, config.uniform_network(), config.new_optimizer(network)) replay_buffer = ReplayBuffer(config) train_scores = [] eval_scores = [] train_losses = [] for loop in range(config.nb_training_loop): print("Training loop", loop) score_train = run_selfplay(config, storage, replay_buffer, config.nb_episodes) train_losses += train_network(config, storage, replay_buffer, config.nb_epochs) print("Train score:", score_train) score_eval = run_eval(config, storage, 50) print("Eval score:", score_eval) print( f"MuZero played {config.nb_episodes * (loop + 1)} " f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n" ) train_scores.append(score_train) eval_scores.append(score_eval) plt.figure(1) plt.plot(train_scores) plt.plot(eval_scores) plt.title('MuZero Average Rewards') plt.xlabel('MuZero Iterations (Train/Eval)') plt.ylabel('Reward Score') plt.legend(['Train score', 'Eval score']) plt.figure(2) plt.plot(train_losses, color='green') plt.title('MuZero Training Loss') plt.xlabel('Epochs') plt.ylabel('Loss') plt.show() return storage.latest_network()
def muzero(config: MuZeroConfig):
    storage = SharedStorage(config.new_network(), config.uniform_network(), config.new_optimizer())
    replay_buffer = ReplayBuffer(config)

    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer, config.nb_episodes)
        train_network(config, storage, replay_buffer, config.nb_epochs)

        print("Train score:", score_train)
        print("Eval score:", run_eval(config, storage, 50))
        print(f"MuZero played {config.nb_episodes * (loop + 1)} "
              f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n")

    storage.save_network_dir(config.nb_training_loop)
    return storage.latest_network()
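# A minimal sketch of what save_network_dir(step) might do in the variant
# above: persist the latest network under a step-stamped file so a later run
# can reload it. The directory layout, pickle format, and get_weights()
# accessor are assumptions, not the actual implementation.
import os
import pickle

def save_network_dir_sketch(storage, step, directory='checkpoint'):
    os.makedirs(directory, exist_ok=True)
    network = storage.latest_network()
    with open(os.path.join(directory, f'network_{step}.pkl'), 'wb') as fh:
        pickle.dump(network.get_weights(), fh)  # assumes a get_weights() accessor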
def muzero(config: MuZeroConfig, save_directory: str, load_directory: str, test: bool, visual: bool,
           new_config: bool):
    """
    MuZero training is split into two independent parts: network training and
    self-play data generation. These two parts communicate only by transferring
    the latest network checkpoint from training to self-play, and the finished
    games from self-play back to training.

    In contrast to the original MuZero algorithm, this version does not use
    multiple threads; training and self-play therefore run alternately.
    """
    config.load_directory = load_directory
    config.save_directory = save_directory
    replay_buffer = ReplayBuffer(config)

    # Remove old checkpoint network
    base_dir = os.path.dirname(os.path.realpath(__file__))
    d = base_dir + '/checkpoint'
    to_remove = [os.path.join(d, f) for f in os.listdir(d)]
    for f in to_remove:
        if f.split('/')[-1] != '.gitignore':
            os.remove(f)

    if load_directory:
        # Copy load directory to checkpoint directory
        copy_tree(src=load_directory, dst=d)

    if new_config:
        network = config.new_network()
        SharedStorage.save_network_to_disk(network, config, None, 'blank_network')
        exit(0)

    if test:
        if load_directory is not None:
            # User specified directory to load network from
            network = config.old_network(load_directory)
        else:
            network = config.new_network()
        storage = SharedStorage(network, config.uniform_network(), config.new_optimizer(),
                                save_directory, config, load_directory is not None)
        # Single process for simple testing, can refactor later
        print("Eval score:", run_eval(config, storage, 5, visual=visual))
        print(f"MuZero played {5} episodes.\n")
        return storage.latest_network()

    for loop in range(config.nb_training_loop):
        initial = True if loop == 0 else False
        start = time()
        o_start = time()

        print("Training loop", loop)
        episodes = config.nb_episodes
        score_train = multiprocess_play_game(config, initial=initial, episodes=episodes, train=True,
                                             replay_buffer=replay_buffer)
        print("Self play took " + str(time() - start) + " seconds")
        print("Train score: " + str(score_train) + " after " + str(time() - start) + " seconds")

        start = time()
        print("Training network...")
        train_network(config, replay_buffer, config.nb_epochs)
        print("Network weights updated after " + str(time() - start) + " seconds")
        """