def evaluate(self, desc="Evaluation", tournament_mode=False, render_mode=False, n_games=None): """Evaluation phase, check how good current mind is. Args: desc (str): Progress bar description. tournament_mode (bool): If current agent should be compared to best too or only evaluated. render_mode (bool): Enable rendering game. (Default: False) n_games (int): How many games to play. If None, then value is taken from config. (Default: None) Note: `self.scoreboard` should measure and keep performance of mind from last call to `hrl.loop`. """ # Clear current agent tree and evaluate it self.current_mind.clear_tree() hrl.loop(self.env, self.current_mind, self.interpreter, policy='deterministic', train_mode=False, debug_mode=self.cfg.debug, render_mode=render_mode, n_episodes=n_games if n_games else self.cfg.self_play['n_tournaments'], name=desc, verbose=2, callbacks=[self.scoreboard, *self.eval_callbacks]) if tournament_mode: current_score, is_better = self.scoreboard.unwrapped.compare(self.best_score) if is_better: self._update_best(current_score)
def test_loop(self, env, mind, callback):
    train_mode = False
    loop(env, mind, train_mode=train_mode, verbose=0, callbacks=[callback])

    env.reset.assert_called_once_with(train_mode)
    callback.on_loop_start.assert_called_once_with()
    callback.on_episode_start.assert_called_once_with(0, train_mode)
    callback.on_episode_end.assert_called_once_with(0, train_mode)
    callback.on_loop_end.assert_called_once_with(False)
def test_loop_with_exception_during_execution(self, env, mind, invalid_callback):
    train_mode = False
    handled_exception = False
    try:
        loop(env, mind, train_mode=train_mode, verbose=0, callbacks=[invalid_callback])
    except BaseException:
        handled_exception = True

    invalid_callback.on_loop_end.assert_called_once_with(True)
    assert handled_exception
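# A minimal sketch of fixtures the two tests above could use, assuming the environment
# follows a gym-like (state, reward, done, info) step contract and the callbacks are plain
# MagicMocks; the project's real fixtures may stub more of the loop's interface.
import pytest
from unittest.mock import MagicMock


@pytest.fixture
def env():
    env = MagicMock()
    env.reset.return_value = 0                    # initial state
    env.step.return_value = (0, 0.0, True, None)  # episode ends after one transition
    return env


@pytest.fixture
def mind():
    return MagicMock()


@pytest.fixture
def callback():
    return MagicMock()


@pytest.fixture
def invalid_callback():
    callback = MagicMock()
    # Raise as soon as an episode starts, to exercise the loop's error-handling path
    callback.on_episode_start.side_effect = RuntimeError("boom")
    return callback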
def play(self, desc="Play"): """Self-play phase, gather data using best nn and save to storage. Args: desc (str): Progress bar description. """ hrl.loop(self.env, self.best_mind, self.interpreter, policy='proportional', trian_mode=True, warmup=self.cfg.self_play['policy_warmup'], debug_mode=self.cfg.debug, n_episodes=self.cfg.self_play['n_self_plays'], name=desc, verbose=1, callbacks=[self.best_mind, self.storage, *self.play_callbacks]) # Store gathered data self.storage.store()
def record_data(ctx, path, n_games, chunk_size, state_dtype):
    """Plays chosen game randomly and records transitions to an HDF5 file in `PATH`."""

    config = obtain_config(ctx)

    # Create Gym environment, random agent and store-to-HDF5 callback
    env = hrl.create_gym(config.general['game_name'])
    mind, agent_callbacks = create_generating_agent(config.general['generating_agent'], env)
    store_callback = StoreTransitions(path, config.general['state_shape'], env.action_space,
                                      chunk_size=chunk_size, state_dtype=state_dtype,
                                      reward_dtype=np.float32)
    callbacks = agent_callbacks + [store_callback]

    if store_callback.game_count >= n_games:
        log.warning(
            "Data is already fully present in the dataset you specified! If you wish to create"
            " a new dataset, please remove the one under this path or specify a different"
            " path. If you wish to gather more data, increase the number of games to record"
            " with the --n-games parameter.")
        return
    elif 0 < store_callback.game_count < n_games:
        diff = n_games - store_callback.game_count
        log.info("{}/{} games were already recorded in the specified dataset. {} more games"
                 " will be added!".format(store_callback.game_count, n_games, diff))
        n_games = diff

    # Resizes states to `state_shape` with cropping
    interpreter = BasicInterpreter(state_shape=config.general['state_shape'],
                                   crop_range=config.general['crop_range'],
                                   scale=255)

    # Play `N` random games and gather data as it goes
    hrl.loop(env, mind, interpreter, n_episodes=n_games, verbose=1, callbacks=callbacks,
             render_mode=config.allow_render)
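# A quick, hedged way to sanity-check a recorded file. The layout inside the HDF5 file is
# whatever StoreTransitions chose, so this only lists dataset names and shapes instead of
# assuming them; the path below is hypothetical.
import h5py

with h5py.File("transitions.hdf5", "r") as h5_file:
    h5_file.visititems(lambda name, obj: print(name, getattr(obj, "shape", "")))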
def eval(ctx, controller_path, vae_path, mdn_path, n_games):
    """Plays chosen game testing the whole pipeline: VAE -> MDN-RNN -> Controller
    (loaded from `vae_path`, `mdn_path` and `controller_path`)."""

    config = obtain_config(ctx)

    # Get action space size
    env = hrl.create_gym(config.general['game_name'])

    # Create VAE + MDN-RNN interpreter
    _, encoder, _ = build_vae_model(config.vae, config.general['state_shape'], vae_path)
    rnn = build_rnn_model(config.rnn, config.vae['latent_space_dim'], env.action_space,
                          mdn_path)
    basic_interpreter = BasicInterpreter(state_shape=config.general['state_shape'],
                                         crop_range=config.general['crop_range'])
    mdn_interpreter = MDNInterpreter(encoder, rnn.model, config.vae['latent_space_dim'])

    # Build CMA-ES solver and linear model
    mind = build_mind(config.es,
                      config.vae['latent_space_dim'] + config.rnn['hidden_units'],
                      env.action_space, controller_path)

    hist = hrl.loop(env, mind, ChainInterpreter(basic_interpreter, mdn_interpreter),
                    n_episodes=n_games, render_mode=config.allow_render, verbose=1,
                    callbacks=[ReturnTracker(), mdn_interpreter])

    print("Returns:", *hist['return'])
    print("Avg. return:", np.mean(hist['return']))
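# A conceptual sketch of what chaining interpreters means here, under the assumption that
# an interpreter maps (state, reward) to a transformed (state, reward); the real humblerl
# ChainInterpreter interface may differ in details.
def chain(*interpreters):
    """Compose interpreters left to right: the output of one feeds the next."""
    def interpret(state, reward):
        for interpreter in interpreters:
            state, reward = interpreter(state, reward)
        return state, reward
    return interpret


# Usage sketch: chain(basic_interpreter, mdn_interpreter)(raw_state, reward) would first
# crop/resize the frame, then encode it with the VAE and the MDN-RNN hidden state.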
@property
def metrics(self):
    return {"avg. return": self.running_avg}


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='HumbleRL tabular Q-Learning sample')
    parser.add_argument('--episodes', type=int, default=865, metavar='N',
                        help='number of episodes to train (default: 865)')
    parser.add_argument('--lr', type=float, default=0.75, metavar='LR',
                        help='learning rate (default: 0.75)')
    parser.add_argument('--decay', type=int, default=400, metavar='N',
                        help='exploration decay steps (default: 400)')
    parser.add_argument('--gamma', type=float, default=0.95, metavar='G',
                        help='discount factor (default: 0.95)')
    args = parser.parse_args()

    # Create environment and Q-Learning agent
    env = hrl.create_gym("FrozenLake-v0")
    mind = TabularQLearning(env.state_space, env.action_space.num,
                            learning_rate=args.lr,
                            decay_steps=args.decay,
                            discount_factor=args.gamma)

    # Seed env and numpy
    np.random.seed(7)
    env.env.seed(7)

    # Run training
    hrl.loop(env, mind, n_episodes=args.episodes, callbacks=[mind])
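    # A hedged follow-up sketch: a greedy evaluation pass after training. The
    # `train_mode=False` flag and callback usage mirror the other examples in this
    # repository; whether TabularQLearning keeps updating its metrics in this mode is an
    # assumption here.
    hrl.loop(env, mind, train_mode=False, n_episodes=100, verbose=1, callbacks=[mind])
    print("Evaluation metrics:", mind.metrics)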
def cross_play(ctx, checkpoints_dir, gap, second_config):
    """Validate trained models. Best networks play with each other."""

    cfg = ctx.obj
    second_cfg = Config(second_config) if second_config is not None else cfg

    # Create board games interpreter
    interpreter = BoardInterpreter(cfg.game)

    # Set checkpoints_dir if not passed
    if checkpoints_dir is None:
        checkpoints_dir = cfg.logging['save_checkpoint_folder']

    # Create players and their minds
    first_player_trainer = build_keras_trainer(cfg.game, cfg)
    second_player_trainer = build_keras_trainer(second_cfg.game, second_cfg)
    first_player = Planner(cfg.mdp, first_player_trainer.model, cfg.planner)
    second_player = Planner(second_cfg.mdp, second_player_trainer.model, second_cfg.planner)
    players = AdversarialMinds(first_player, second_player)

    # Create callbacks
    tournament = Tournament()

    # Get checkpoints paths
    all_checkpoints_paths = utils.get_checkpoints_for_game(checkpoints_dir,
                                                           cfg.self_play["game"])

    # Reduce gap to play at least one game when there is more than one checkpoint
    if gap >= len(all_checkpoints_paths):
        gap = len(all_checkpoints_paths) - 1
        log.info("Gap is too big. Reduced to %d", gap)

    # Gather players ids and checkpoints paths for cross-play
    players_ids = []
    checkpoints_paths = []
    for idx in range(0, len(all_checkpoints_paths), gap):
        players_ids.append(idx)
        checkpoints_paths.append(all_checkpoints_paths[idx])

    # Create table for results, extra column for player id
    results = np.zeros((len(checkpoints_paths), len(checkpoints_paths)), dtype=int)

    # Create ELO scoreboard
    elo = ELOScoreboard(players_ids)

    for i, (first_player_id, first_checkpoint_path) in enumerate(
            zip(players_ids, checkpoints_paths)):
        first_player_trainer.load_checkpoint(first_checkpoint_path)

        tournament_wins = tournament_draws = 0
        opponents_elo = []
        for j in range(i + 1, len(players_ids)):
            second_player_id, second_checkpoint_path = players_ids[j], checkpoints_paths[j]
            second_player_trainer.load_checkpoint(second_checkpoint_path)

            # Clear players tree
            first_player.clear_tree()
            second_player.clear_tree()

            hrl.loop(cfg.env, players, interpreter, policy='deterministic', n_episodes=2,
                     train_mode=False,
                     name="{} vs {}".format(first_player_id, second_player_id),
                     callbacks=[tournament, cfg.env])

            wins, losses, draws = tournament.results

            # Book keeping
            tournament_wins += wins
            tournament_draws += draws
            results[i][j] = wins - losses
            results[j][i] = losses - wins

            opponents_elo.append(elo.scores.loc[second_player_id, 'elo'])

            # Update ELO rating of second player
            elo.update_player(second_player_id, elo.scores.loc[first_player_id, 'elo'],
                              losses, draws)

        # Update ELO rating of first player
        elo.update_player(first_player_id, opponents_elo, tournament_wins, tournament_draws)

    # Save ELO scores to csv
    elo.save_csv(cfg.logging['save_elo_scoreboard_path'])

    scoreboard = np.concatenate((np.array(players_ids).reshape(-1, 1),
                                 results,
                                 np.sum(results, axis=1).reshape(-1, 1),
                                 elo.scores.elo.values.reshape(-1, 1).astype(int)),
                                axis=1)
    tab = tabulate(scoreboard, headers=players_ids + ["sum", "elo"], tablefmt="fancy_grid")
    log.info("Results:\n%s", tab)

    for player_id, player_elo, checkpoint_path in zip(players_ids, elo.scores['elo'],
                                                      checkpoints_paths):
        log.info("ITER: %3d, ELO: %4d, PATH: %s", player_id, int(player_elo), checkpoint_path)
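# A back-of-the-envelope sketch of a textbook Elo update, for intuition about what
# `elo.update_player` does above; the constants and aggregation inside the project's
# ELOScoreboard may differ.
def elo_update(player_elo, opponent_elos, wins, draws, k=32):
    """Return the new rating after playing one game against each listed opponent."""
    score = wins + 0.5 * draws  # actual score across all games
    expected = sum(1.0 / (1.0 + 10.0 ** ((opponent - player_elo) / 400.0))
                   for opponent in opponent_elos)  # summed expected score
    return player_elo + k * (score - expected)


# Example: one win and one draw against a 1200 and a 1400 rated opponent raises a
# 1200 rated player to roughly 1224.
print(elo_update(1200, [1200, 1400], wins=1, draws=1))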
        jobs=population, processes=args.processes, n_episodes=5, verbose=0)
    returns = [np.mean(hist['return']) for hist in hists]

    # Print logs and update best return
    pbar.set_postfix(best=best_return, current=max(returns))
    best_return = max(best_return, max(returns))

    # Update solver
    solver.tell(returns)

    if args.ckpt:
        # Save solver in given path
        solver.save_ckpt(args.ckpt)
        log.debug("Saved checkpoint in path: %s", args.ckpt)

    if args.render:
        # Evaluate current parameters with render
        mind.set_weights(solver.current_param())
        history = hrl.loop(env, mind, render_mode=True, verbose=0,
                           callbacks=[ReturnTracker()])
        log.info("Current parameters (weights) return: %f.", history['return'][0])

# Yea, wrapper on wrapper :| Please see this: https://github.com/openai/gym/issues/893
env.env.env.close()

# If environment wasn't solved then exit with error
assert best_return == 200, "Environment wasn't solved!"
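# For reference, a self-contained ask/tell loop with the pycma package; the `solver` above
# looks like a project-specific wrapper, so treat this only as a sketch of the underlying
# pattern. Note that pycma minimizes, so episode returns would be negated before `tell`.
import cma
import numpy as np

es = cma.CMAEvolutionStrategy(np.zeros(4), 0.5)
for _ in range(10):
    candidates = es.ask()                                 # sample a population of solutions
    fitness = [np.sum(np.square(x)) for x in candidates]  # toy objective to minimize
    es.tell(candidates, fitness)                          # update the search distribution
print("Best candidate:", es.result.xbest)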