Example #1
File: coach.py Project: Matioz/AlphaZero
    def evaluate(self, desc="Evaluation", tournament_mode=False, render_mode=False, n_games=None):
        """Evaluation phase, check how good current mind is.

        Args:
            desc (str): Progress bar description.
            tournament_mode (bool): Whether the current agent should also be compared
                against the best one, or only evaluated.
            render_mode (bool): Enable game rendering. (Default: False)
            n_games (int): How many games to play. If None, the value is taken from
                the config. (Default: None)

        Note:
            `self.scoreboard` should measure and keep the performance of the mind
            from the last call to `hrl.loop`.
        """
        # Clear current agent tree and evaluate it
        self.current_mind.clear_tree()
        hrl.loop(self.env, self.current_mind, self.interpreter, policy='deterministic', train_mode=False,
                 debug_mode=self.cfg.debug, render_mode=render_mode,
                 n_episodes=n_games if n_games else self.cfg.self_play['n_tournaments'], name=desc,
                 verbose=2, callbacks=[self.scoreboard, *self.eval_callbacks])

        if tournament_mode:
            current_score, is_better = self.scoreboard.unwrapped.compare(self.best_score)
            if is_better:
                self._update_best(current_score)
Example #2
    def test_loop(self, env, mind, callback):
        train_mode = False

        loop(env, mind, train_mode=train_mode, verbose=0, callbacks=[callback])

        env.reset.assert_called_once_with(train_mode)
        callback.on_loop_start.assert_called_once_with()
        callback.on_episode_start.assert_called_once_with(0, train_mode)
        callback.on_episode_end.assert_called_once_with(0, train_mode)
        callback.on_loop_end.assert_called_once_with(False)
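
The `env`, `mind` and `callback` arguments above are pytest fixtures that are not part of this listing. A minimal sketch of how such fixtures could be built with `unittest.mock.MagicMock` follows (an assumption; the project's real conftest.py may differ, and the mocks would still need return values matching humblerl's Environment and Mind interfaces so that `loop` runs one terminating episode):

from unittest.mock import MagicMock

import pytest


@pytest.fixture
def env():
    # Bare mock; reset/step return values must be configured to match the
    # Environment interface expected by `loop` (not shown in this listing).
    return MagicMock()


@pytest.fixture
def mind():
    return MagicMock()


@pytest.fixture
def callback():
    # MagicMock records every call, which is what the assert_called_once_with
    # checks in the test above rely on.
    return MagicMock()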
Example #3
    def test_loop_with_exception_during_execution(self, env, mind,
                                                  invalid_callback):
        train_mode = False
        handled_exception = False
        try:
            loop(env,
                 mind,
                 train_mode=train_mode,
                 verbose=0,
                 callbacks=[invalid_callback])
        except BaseException:
            handled_exception = True
        invalid_callback.on_loop_end.assert_called_once_with(True)
        assert handled_exception
Example #4
File: coach.py Project: Matioz/AlphaZero
    def play(self, desc="Play"):
        """Self-play phase, gather data using best nn and save to storage.

        Args:
            desc (str): Progress bar description.
        """

        hrl.loop(self.env, self.best_mind, self.interpreter, policy='proportional', train_mode=True,
                 warmup=self.cfg.self_play['policy_warmup'],
                 debug_mode=self.cfg.debug, n_episodes=self.cfg.self_play['n_self_plays'],
                 name=desc, verbose=1,
                 callbacks=[self.best_mind, self.storage, *self.play_callbacks])

        # Store gathered data
        self.storage.store()
Example #5
def record_data(ctx, path, n_games, chunk_size, state_dtype):
    """Plays chosen game randomly and records transitions to hdf5 file in `PATH`."""

    config = obtain_config(ctx)

    # Create Gym environment, random agent and a store-to-HDF5 callback
    env = hrl.create_gym(config.general['game_name'])
    mind, agent_callbacks = create_generating_agent(
        config.general['generating_agent'], env)
    store_callback = StoreTransitions(path,
                                      config.general['state_shape'],
                                      env.action_space,
                                      chunk_size=chunk_size,
                                      state_dtype=state_dtype,
                                      reward_dtype=np.float32)
    callbacks = agent_callbacks + [store_callback]

    if store_callback.game_count >= n_games:
        log.warning(
            "Data is already fully present in dataset you specified! If you wish to create"
            " a new dataset, please remove the one under this path or specify a different"
            " path. If you wish to gather more data, increase the number of games to "
            " record with --n-games parameter.")
        return
    elif 0 < store_callback.game_count < n_games:
        diff = n_games - store_callback.game_count
        log.info(
            "{}/{} games were already recorded in specified dataset. {} more game will be"
            " added!".format(store_callback.game_count, n_games, diff))
        n_games = diff

    # Resizes states to `state_shape` with cropping
    interpreter = BasicInterpreter(state_shape=config.general['state_shape'],
                                   crop_range=config.general['crop_range'],
                                   scale=255)

    # Play `n_games` random games and gather data along the way
    hrl.loop(env,
             mind,
             interpreter,
             n_episodes=n_games,
             verbose=1,
             callbacks=callbacks,
             render_mode=config.allow_render)
Example #6
def eval(ctx, controller_path, vae_path, mdn_path, n_games):
    """Plays chosen game testing whole pipeline: VAE -> MDN-RNN -> Controller
    (loaded from `vae_path`, `mdn_path` and `controller_path`)."""

    config = obtain_config(ctx)

    # Get action space size
    env = hrl.create_gym(config.general['game_name'])

    # Create VAE + MDN-RNN interpreter
    _, encoder, _ = build_vae_model(config.vae, config.general['state_shape'],
                                    vae_path)

    rnn = build_rnn_model(config.rnn, config.vae['latent_space_dim'],
                          env.action_space, mdn_path)

    basic_interpreter = BasicInterpreter(
        state_shape=config.general['state_shape'],
        crop_range=config.general['crop_range'])
    mdn_interpreter = MDNInterpreter(encoder, rnn.model,
                                     config.vae['latent_space_dim'])

    # Build CMA-ES solver and linear model
    mind = build_mind(
        config.es, config.vae['latent_space_dim'] + config.rnn['hidden_units'],
        env.action_space, controller_path)

    hist = hrl.loop(env,
                    mind,
                    ChainInterpreter(basic_interpreter, mdn_interpreter),
                    n_episodes=n_games,
                    render_mode=config.allow_render,
                    verbose=1,
                    callbacks=[ReturnTracker(), mdn_interpreter])

    print("Returns:", *hist['return'])
    print("Avg. return:", np.mean(hist['return']))
Example #7
    @property
    def metrics(self):
        return {"avg. return": self.running_avg}

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='HumbleRL tabular Q-Learning sample')
    parser.add_argument('--episodes', type=int, default=865, metavar='N',
                        help='number of episodes to train (default: 865)')
    parser.add_argument('--lr', type=float, default=0.75, metavar='LR',
                        help='learning rate (default: 0.75)')
    parser.add_argument('--decay', type=int, default=400, metavar='N',
                        help='exploration decay steps (default: 400)')
    parser.add_argument('--gamma', type=float, default=0.95, metavar='G',
                        help='discount factor (default: 0.95)')
    args = parser.parse_args()

    # Create environment and q-learning agent
    env = hrl.create_gym("FrozenLake-v0")
    mind = TabularQLearning(env.state_space, env.action_space.num,
                            learning_rate=args.lr,
                            decay_steps=args.decay,
                            discount_factor=args.gamma)

    # Seed env and numpy
    np.random.seed(7)
    env.env.seed(7)

    # Run training
    hrl.loop(env, mind, n_episodes=args.episodes, callbacks=[mind])
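
Once training finishes, the same `hrl.loop` can be reused for evaluation by turning training off, as examples #1 and #6 do. A minimal sketch continuing from the sample above (assuming `ReturnTracker` from examples #6 and #9 is importable in this sample, which is an assumption):

    # Hypothetical evaluation pass: rerun the trained agent with training
    # disabled and average the per-episode returns, mirroring examples #1 and #6.
    hist = hrl.loop(env, mind, train_mode=False, n_episodes=100,
                    verbose=0, callbacks=[ReturnTracker()])
    print("Avg. return:", np.mean(hist['return']))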
Example #8
def cross_play(ctx, checkpoints_dir, gap, second_config):
    """Validate trained models. Best networks play with each other."""

    cfg = ctx.obj
    second_cfg = Config(second_config) if second_config is not None else cfg

    # Create board games interpreter
    interpreter = BoardInterpreter(cfg.game)

    # Set checkpoints_dir if not passed
    if checkpoints_dir is None:
        checkpoints_dir = cfg.logging['save_checkpoint_folder']

    # Create players and their minds
    first_player_trainer = build_keras_trainer(cfg.game, cfg)
    second_player_trainer = build_keras_trainer(second_cfg.game, second_cfg)
    first_player = Planner(cfg.mdp, first_player_trainer.model, cfg.planner)
    second_player = Planner(second_cfg.mdp, second_player_trainer.model,
                            second_cfg.planner)
    players = AdversarialMinds(first_player, second_player)

    # Create callbacks
    tournament = Tournament()

    # Get checkpoint paths
    all_checkpoints_paths = utils.get_checkpoints_for_game(
        checkpoints_dir, cfg.self_play["game"])

    # Reduce gap to play at least one game when there is more than one checkpoint
    if gap >= len(all_checkpoints_paths):
        gap = len(all_checkpoints_paths) - 1
        log.info("Gap is too big. Reduced to %d", gap)

    # Gather player ids and checkpoint paths for cross-play
    players_ids = []
    checkpoints_paths = []
    for idx in range(0, len(all_checkpoints_paths), gap):
        players_ids.append(idx)
        checkpoints_paths.append(all_checkpoints_paths[idx])

    # Create table for pairwise results (the player id column is added later for display)
    results = np.zeros((len(checkpoints_paths), len(checkpoints_paths)),
                       dtype=int)

    # Create ELO scoreboard
    elo = ELOScoreboard(players_ids)

    for i, (first_player_id, first_checkpoint_path) in enumerate(
            zip(players_ids, checkpoints_paths)):
        first_player_trainer.load_checkpoint(first_checkpoint_path)

        tournament_wins = tournament_draws = 0
        opponents_elo = []
        for j in range(i + 1, len(players_ids)):
            second_player_id, second_checkpoint_path = players_ids[
                j], checkpoints_paths[j]
            second_player_trainer.load_checkpoint(second_checkpoint_path)

            # Clear players' trees
            first_player.clear_tree()
            second_player.clear_tree()

            hrl.loop(cfg.env,
                     players,
                     interpreter,
                     policy='deterministic',
                     n_episodes=2,
                     train_mode=False,
                     name="{} vs {}".format(first_player_id, second_player_id),
                     callbacks=[tournament, cfg.env])

            wins, losses, draws = tournament.results

            # Bookkeeping
            tournament_wins += wins
            tournament_draws += draws

            results[i][j] = wins - losses
            results[j][i] = losses - wins

            opponents_elo.append(elo.scores.loc[second_player_id, 'elo'])

            # Update ELO rating of second player
            elo.update_player(second_player_id, elo.scores.loc[first_player_id,
                                                               'elo'], losses,
                              draws)

        # Update ELO rating of first player
        elo.update_player(first_player_id, opponents_elo, tournament_wins,
                          tournament_draws)

    # Save elo to csv
    elo.save_csv(cfg.logging['save_elo_scoreboard_path'])

    scoreboard = np.concatenate(
        (np.array(players_ids).reshape(-1, 1),
         results,
         np.sum(results, axis=1).reshape(-1, 1),
         elo.scores.elo.values.reshape(-1, 1).astype(int)),
        axis=1)

    tab = tabulate(scoreboard,
                   headers=players_ids + ["sum", "elo"],
                   tablefmt="fancy_grid")
    log.info("Results:\n%s", tab)
    for player_id, player_elo, checkpoint_path in zip(players_ids,
                                                      elo.scores['elo'],
                                                      checkpoints_paths):
        log.info("ITER: %3d, ELO: %4d, PATH: %s", player_id, int(player_elo),
                 checkpoint_path)
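
`ELOScoreboard.update_player` is the project's own rating bookkeeping. For reference, the standard Elo update that such a scoreboard typically implements looks like the sketch below (an illustration of the usual formula, not necessarily the exact implementation behind `elo.update_player`):

def elo_update(rating, opponent_rating, score, k=32):
    # `score` is 1.0 for a win, 0.5 for a draw, 0.0 for a loss; K controls
    # how strongly a single result moves the rating.
    expected = 1.0 / (1.0 + 10 ** ((opponent_rating - rating) / 400.0))
    return rating + k * (score - expected)

With equal ratings and K=32, a single win moves the winner's rating up by 16 points.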
Example #9
            jobs=population,
            processes=args.processes,
            n_episodes=5,
            verbose=0
        )
        returns = [np.mean(hist['return']) for hist in hists]

        # Print logs and update best return
        pbar.set_postfix(best=best_return, current=max(returns))
        best_return = max(best_return, max(returns))

        # Update solver
        solver.tell(returns)

        if args.ckpt:
            # Save solver in given path
            solver.save_ckpt(args.ckpt)
            log.debug("Saved checkpoint in path: %s", args.ckpt)

        if args.render:
            # Evaluate current parameters with render
            mind.set_weights(solver.current_param())
            history = hrl.loop(env, mind, render_mode=True, verbose=0, callbacks=[ReturnTracker()])
            log.info("Current parameters (weights) return: %f.", history['return'][0])

    # Yea, wrapper on wrapper :| Please see this: https://github.com/openai/gym/issues/893
    env.env.env.close()

    # If environment wasn't solved then exit with error
    assert best_return == 200, "Environment wasn't solved!"