Example #1
def muzero(config: MuZeroConfig):
    """
    MuZero training is split into two independent parts: Network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from the training to the self-play, and the finished games from the self-play
    to the training.
    In contrast to the original MuZero algorithm, this version doesn't work with
    multiple threads; therefore, training and self-play are done alternately.
    """
    storage = SharedStorage(config.new_network(), config.uniform_network(),
                            config.new_optimizer())
    replay_buffer = ReplayBuffer(config)

    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer,
                                   config.nb_episodes)
        train_network(config, storage, replay_buffer, config.nb_epochs)

        print("Train score:", score_train)
        print("Eval score:", run_eval(config, storage, NUM_EVAL_EPISODES))
        print(
            f"MuZero played {config.nb_episodes * (loop + 1)} "
            f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n"
        )

    return storage.latest_network()
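In all of these examples, the training and self-play phases exchange data only through the two objects created before the loop: SharedStorage carries network checkpoints from training to self-play, and ReplayBuffer carries finished games from self-play back to training. The sketch below is a minimal illustration of that hand-off; the method names save_network/save_game and the window_size field are assumptions based on the standard MuZero pseudocode, not necessarily the exact classes used by these examples.

class SharedStorage:
    """Checkpoint hand-off: training writes networks, self-play reads the latest one."""

    def __init__(self, network, uniform_network, optimizer):
        self._networks = {0: network}
        self.uniform_network = uniform_network  # untrained baseline, used before any training
        self.optimizer = optimizer

    def latest_network(self):
        # Self-play always acts with the most recently saved checkpoint.
        return self._networks[max(self._networks)]

    def save_network(self, step, network):
        # Training publishes a new checkpoint after each block of epochs.
        self._networks[step] = network


class ReplayBuffer:
    """Game hand-off: self-play appends finished games, training samples from them."""

    def __init__(self, config):
        self.window_size = config.window_size  # assumed config field
        self.buffer = []

    def save_game(self, game):
        if len(self.buffer) >= self.window_size:
            self.buffer.pop(0)  # keep only the most recent games
        self.buffer.append(game)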
Example #2

def muzero(config: MuZeroConfig):
    """
    MuZero training is split into two independent parts: Network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from the training to the self-play, and the finished games from the self-play
    to the training.
    In contrast to the original MuZero algorithm, this version doesn't work with
    multiple threads; therefore, training and self-play are done alternately.
    """
    network = config.new_network()
    storage = SharedStorage(network, config.uniform_network(),
                            config.new_optimizer(network))
    replay_buffer = ReplayBuffer(config)

    train_scores = []
    eval_scores = []
    train_losses = []
    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer,
                                   config.nb_episodes)
        train_losses += train_network(config, storage, replay_buffer,
                                      config.nb_epochs)
        print("Train score:", score_train)
        score_eval = run_eval(config, storage, 50)
        print("Eval score:", score_eval)
        print(
            f"MuZero played {config.nb_episodes * (loop + 1)} "
            f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n"
        )
        train_scores.append(score_train)
        eval_scores.append(score_eval)

    plt.figure(1)
    plt.plot(train_scores)
    plt.plot(eval_scores)
    plt.title('MuZero Average Rewards')
    plt.xlabel('MuZero Iterations (Train/Eval)')
    plt.ylabel('Reward Score')
    plt.legend(['Train score', 'Eval score'])

    plt.figure(2)
    plt.plot(train_losses, color='green')
    plt.title('MuZero Training Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.show()

    return storage.latest_network()
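Example #2 extends Example #1 by recording the per-loop train/eval scores and the per-epoch training losses and plotting them with matplotlib (an import of matplotlib.pyplot as plt at module level is implied). The run_eval helper it calls is not part of this listing; the sketch below shows one plausible shape for it, assuming a hypothetical play_game helper that runs a single episode with the given network and returns an object exposing the episode's rewards.

def run_eval(config, storage, eval_episodes):
    """Sketch: play eval_episodes with the latest checkpoint and return the mean return."""
    network = storage.latest_network()
    returns = []
    for _ in range(eval_episodes):
        # play_game is a hypothetical helper: one episode, no exploration noise.
        game = play_game(config, network, train=False)
        returns.append(sum(game.rewards))
    return sum(returns) / eval_episodes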
Example #3
def muzero(config: MuZeroConfig):
    storage = SharedStorage(config.new_network(), config.uniform_network(), config.new_optimizer())
    replay_buffer = ReplayBuffer(config)

    for loop in range(config.nb_training_loop):
        print("Training loop", loop)
        score_train = run_selfplay(config, storage, replay_buffer, config.nb_episodes)
        train_network(config, storage, replay_buffer, config.nb_epochs)

        print("Train score:", score_train)
        print("Eval score:", run_eval(config, storage, 50))
        print(f"MuZero played {config.nb_episodes * (loop + 1)} "
              f"episodes and trained for {config.nb_epochs * (loop + 1)} epochs.\n")

    storage.save_network_dir(config.nb_training_loop)

    return storage.latest_network()
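Example #3 runs the same loop as Example #1 except that, once all training loops are done, the final network is persisted with storage.save_network_dir. That method is not shown in this listing; the sketch below is one plausible implementation, written as a free function over the storage since the real class internals aren't visible here. It assumes a save directory and a picklable network object; a framework-specific call such as Keras' save_weights would work just as well.

import os
import pickle

def save_network_dir(storage, directory, step):
    """Sketch: persist the latest checkpoint to <directory>/network_<step>.pkl."""
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, f'network_{step}.pkl')
    with open(path, 'wb') as f:
        pickle.dump(storage.latest_network(), f)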
Example #4
def muzero(config: MuZeroConfig, save_directory: str, load_directory: str,
           test: bool, visual: bool, new_config: bool):
    """
    MuZero training is split into two independent parts: Network training and
    self-play data generation.
    These two parts only communicate by transferring the latest network checkpoint
    from the training to the self-play, and the finished games from the self-play
    to the training.
    In contrast to the original MuZero algorithm, this version doesn't work with
    multiple threads; therefore, training and self-play are done alternately.
    """
    config.load_directory = load_directory
    config.save_directory = save_directory
    replay_buffer = ReplayBuffer(config)
    # Remove old checkpoint network
    base_dir = os.path.dirname(os.path.realpath(__file__))

    d = base_dir + '/checkpoint'
    to_remove = [os.path.join(d, f) for f in os.listdir(d)]
    for f in to_remove:
        if f.split('/')[-1] != '.gitignore':
            os.remove(f)

    if load_directory:
        # Copy load directory to checkpoint directory
        copy_tree(src=load_directory, dst=d)

    if new_config:
        network = config.new_network()
        SharedStorage.save_network_to_disk(network, config, None,
                                           'blank_network')
        exit(0)

    if test:
        if load_directory is not None:
            # User specified directory to load network from
            network = config.old_network(load_directory)
        else:
            network = config.new_network()
        storage = SharedStorage(network, config.uniform_network(),
                                config.new_optimizer(), save_directory, config,
                                load_directory is not None)
        # Single process for simple testing, can refactor later
        print("Eval score:", run_eval(config, storage, 5, visual=visual))
        print(f"MuZero played {5} " f"episodes.\n")
        return storage.latest_network()

    for loop in range(config.nb_training_loop):
        initial = (loop == 0)
        start = time()
        o_start = time()
        print("Training loop", loop)
        episodes = config.nb_episodes

        score_train = multiprocess_play_game(config,
                                             initial=initial,
                                             episodes=episodes,
                                             train=True,
                                             replay_buffer=replay_buffer)
        print("Self play took " + str(time() - start) + " seconds")
        print("Train score: " + str(score_train) + " after " +
              str(time() - start) + " seconds")

        start = time()
        print("Training network...")
        train_network(config, replay_buffer, config.nb_epochs)
        print("Network weights updated after " + str(time() - start) +
              " seconds")
        """