def evaluate(environment_config, agent_config, options):
    """
    Run a training or testing session of an agent in an environment.

    :param environment_config: the path of the environment configuration file
    :param agent_config: the path of the agent configuration file
    :param options: the evaluation options; its '--seed' entry is converted to an int in place
    :return: the evaluation monitor directory, relative to the current working directory
    """
    logger.configure(LOGGING_CONFIG)
    if options['--verbose']:
        logger.configure(VERBOSE_CONFIG)

    env = load_environment(environment_config)
    agent = load_agent(agent_config, env)

    # Optionally name the run after the agent configuration file, extension stripped
    run_directory = None
    if options['--name-from-config']:
        run_directory = Path(agent_config).with_suffix('').name

    # A missing seed stays None, anything else becomes an int
    if options['--seed'] is not None:
        options['--seed'] = int(options['--seed'])

    num_episodes = int(options['--episodes'])
    recover = options['--recover'] or options['--recover-from']
    # A single flag drives the three display switches
    display = not options['--no-display']
    evaluation = Evaluation(env,
                            agent,
                            run_directory=run_directory,
                            num_episodes=num_episodes,
                            sim_seed=options['--seed'],
                            recover=recover,
                            display_env=display,
                            display_agent=display,
                            display_rewards=display)

    if options['--train']:
        evaluation.train()
    elif options['--test']:
        evaluation.test()
    else:
        # Neither mode requested: just release the evaluation
        evaluation.close()
    return os.path.relpath(evaluation.monitor.directory)
def evaluate(environment_config, agent_config, options):
    """
    Run a training or testing session of an agent in an environment, then optionally analyze it.

    :param environment_config: the path of the environment configuration file
    :param agent_config: the path of the agent configuration file
    :param options: the evaluation options; its '--seed' entry is converted to an int in place
    :return: the evaluation monitor directory, relative to the current working directory
    """
    log_level = gym.logger.DEBUG if options['--verbose'] else gym.logger.INFO
    gym.logger.set_level(log_level)

    env = load_environment(environment_config)
    agent = load_agent(agent_config, env)

    # Optionally name the run after the agent configuration file, extension stripped
    run_directory = None
    if options['--name-from-config']:
        run_directory = Path(agent_config).with_suffix('').name

    # A missing seed stays None, anything else becomes an int
    if options['--seed'] is not None:
        options['--seed'] = int(options['--seed'])

    num_episodes = int(options['--episodes'])
    # A single flag drives the three display switches
    display = not options['--no-display']
    evaluation = Evaluation(env,
                            agent,
                            run_directory=run_directory,
                            num_episodes=num_episodes,
                            sim_seed=options['--seed'],
                            recover=options['--recover'],
                            display_env=display,
                            display_agent=display,
                            display_rewards=display)

    if options['--train']:
        evaluation.train()
    elif options['--test']:
        evaluation.test()
    else:
        # Neither mode requested: just release the evaluation
        evaluation.close()

    # Analyze this single run unless a benchmark was requested
    if options['--analyze'] and not options['<benchmark>']:
        RunAnalyzer([evaluation.monitor.directory])
    return os.path.relpath(evaluation.monitor.directory)
def make_configure_env(**kwargs):
    """
    Load the environment described by "exp_merge_complex_sa.json", configure it and reset it.

    :param kwargs: accepted but not read by this function
    :return: the configured environment, after one reset
    """
    # Alternatives left by the author:
    #   env = gym.make(kwargs["id"])
    #   environment_config = "exp_merge_complex_base_ma.json"
    # NOTE(review): the configuration is taken from `env_kwargs`, which is not defined in this
    # function (presumably a module-level dict) rather than from `kwargs` — confirm this is intended.
    configured_env = load_environment("exp_merge_complex_sa.json")
    configured_env.configure(env_kwargs["config"])
    configured_env.reset()
    return configured_env
def evaluate(experiment):
    """
    Run a single test episode of a planner under a given budget and append its scores to a CSV file.

    :param experiment: a tuple (seed, budget, (agent_name, agent_config), (env_name, env_config), path)
                       where path is the CSV file the result row is appended to
    """
    # Workspace: unpack the experiment and make sure the output folder exists
    seed, budget, agent_config, env_config, path = experiment
    gym.logger.set_level(gym.logger.DISABLED)
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Environment
    env_name, env_config = env_config
    env = load_environment(env_config)

    # Agent, with the requested budget written into its configuration
    agent_name, agent_config = agent_config
    agent_config["budget"] = int(budget)
    agent = agent_factory(env, agent_config)

    # One undisplayed test episode
    print("Evaluating agent {} with budget {} on seed {}".format(
        agent_name, budget, seed))
    evaluation = Evaluation(env,
                            agent,
                            directory=Path("out") / "planners" / agent_name,
                            num_episodes=1,
                            sim_seed=seed,
                            display_env=False,
                            display_agent=False,
                            display_rewards=False)
    evaluation.test()

    recorder = evaluation.monitor.stats_recorder
    rewards = recorder.episode_rewards_[0]
    length = recorder.episode_lengths[0]
    total_reward = np.sum(rewards)
    # Discounted return; `gamma` is not defined in this function (read from the enclosing scope)
    return_ = np.sum([gamma**t * reward for t, reward in enumerate(rewards)])

    # Result row
    result = {
        "env": env_name,
        "agent": agent_name,
        "budget": budget,
        "seed": seed,
        "total_reward": total_reward,
        "return": return_,
        "length": length
    }
    # `race_strategy` is likewise read from the enclosing scope
    if race_strategy:
        result["pit_count"] = evaluation.pits

    df = pd.DataFrame.from_records([result])
    with open(path, 'a') as f:
        # Only write the header when the file is still empty
        df.to_csv(f, sep=',', encoding='utf-8', header=f.tell() == 0, index=False)
def collect_samples(environment_config, agent_config, count, start_time, seed, model_path, batch):
    """
    Collect interaction samples of an agent / environment pair.

    Note that the last episode may not terminate, when enough samples have been collected.

    The agent configuration passed by the caller is not modified: the overrides applied here
    (CPU device, pure exploration during the first batch) are written to a copy. The environment
    is closed even if sample collection raises.

    :param dict environment_config: the environment configuration
    :param dict agent_config: the agent configuration
    :param int count: number of samples to collect
    :param start_time: the initial local time of the agent
    :param seed: the env/agent seed
    :param model_path: the path to load the agent model from
    :param batch: index of the current batch
    :return: a list of trajectories, i.e. lists of Transitions
    """
    env = load_environment(environment_config)
    try:
        env.seed(seed)

        # Work on a copy so that the overrides below do not leak into the caller's configuration
        agent_config = dict(agent_config)
        if batch == 0:
            # Force pure exploration during first batch
            agent_config["exploration"] = dict(agent_config["exploration"], final_temperature=1)
        agent_config["device"] = "cpu"

        agent = load_agent(agent_config, env)
        agent.load(model_path)
        agent.seed(seed)
        agent.set_time(start_time)

        state = env.reset()
        episodes = []
        trajectory = []
        for _ in range(count):
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            trajectory.append(Transition(state, action, reward, next_state, done, info))
            if done:
                # Episode finished: store it and start a new one
                state = env.reset()
                episodes.append(trajectory)
                trajectory = []
            else:
                state = next_state
        if trajectory:
            # Unfinished episode
            episodes.append(trajectory)
    finally:
        # Release the environment whether or not collection succeeded
        env.close()
    return episodes
def evaluate(experiment):
    """
    Measure the simple regret of a planner's first action for a target accuracy, and append
    the result to a CSV file.

    The recommended action is compared with the one chosen by a "value_iteration" agent,
    using the latter's state-action values.

    :param experiment: a tuple (seed, accuracy, (agent_name, agent_config), env_config, path)
                       where path is the CSV file the result row is appended to
    """
    # Prepare workspace
    seed, accuracy, agent_config, env_config, path = experiment
    gym.logger.set_level(gym.logger.DISABLED)
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Make environment
    env = load_environment(env_config)

    # Make agent
    agent_name, agent_config = agent_config
    agent_config["accuracy"] = float(accuracy)
    agent_config["budget"] = 10**9
    # `budget` must be bound before it is logged: it is assigned further down, which makes it a
    # local name, so reading it first would raise UnboundLocalError. Start from the configured
    # budget; it is replaced below by the budget actually used when the planner reports one.
    budget = agent_config["budget"]
    agent = agent_factory(env, agent_config)

    logger.debug("Evaluating agent {} with budget {} on seed {}".format(agent_name, budget, seed))

    # Compute true value
    env.seed(seed)
    observation = env.reset()
    vi = agent_factory(env, agent_configs()["value_iteration"])
    best_action = vi.act(observation)
    action = agent.act(observation)
    q = vi.state_action_value
    state_values = q[vi.mdp.state, :]
    simple_regret = q[vi.mdp.state, best_action] - q[vi.mdp.state, action]
    # Gap between the best action and the second largest state-action value
    gap = q[vi.mdp.state, best_action] - np.sort(state_values)[-2]
    if hasattr(agent.planner, "budget_used"):
        budget = agent.planner.budget_used

    # Save results
    result = {
        "agent": agent_name,
        "budget": budget,
        "accuracy": agent.planner.config["accuracy"],
        "horizon": agent.planner.config["horizon"],
        "seed": seed,
        "simple_regret": simple_regret,
        "gap": gap
    }

    df = pd.DataFrame.from_records([result])
    with open(path, 'a') as f:
        # Only write the header when the file is still empty
        df.to_csv(f, sep=',', encoding='utf-8', header=f.tell() == 0, index=False)
max_depth=100).plot(out / "{}.svg".format(agent_name), title=agent_name) plt.show() def compare_trajs(env, seed=0): trajs = {} for agent_name in agents.keys(): env.seed(seed) env.reset() agent = evaluate(env, agent_name, seed=seed) trajs[agent_name] = get_trajs(agent.planner.root, env) palette = itertools.cycle(sns.color_palette()) for agent, agent_trajs in trajs.items(): color = next(palette) for traj in agent_trajs: x, y = zip(*traj) plt.plot(x, y, color=color, linestyle='dotted', linewidth=0.5) plt.savefig(out / "trajectories.png") plt.show() if __name__ == "__main__": gym.logger.set_level(gym.logger.DEBUG) # env = DynamicsEnv() # env = gym.make("highway-v0") env = load_environment(env_zero_one) compare_trees(env, seed=5)
Prepare environment, agent, and evaluation process. We use a policy architecture based on social attention, see [[Leurent and Mercat, 2019]](https://arxiv.org/abs/1911.12250). """ # Commented out IPython magic to ensure Python compatibility. from rl_agents.trainer.evaluation import Evaluation from rl_agents.agents.common.factory import load_agent, load_environment # Get the environment and agent configurations from the rl-agents repository # %cd /content/rl-agents/scripts/ env_config = 'configs/IntersectionEnv/env.json' agent_config = 'configs/IntersectionEnv/agents/DQNAgent/ego_attention_2h.json' env = load_environment(env_config) agent = load_agent(agent_config, env) evaluation = Evaluation(env, agent, num_episodes=3000, display_env=False) print(f"Ready to train {agent} on {env}") """Run tensorboard locally to visualize training.""" # Commented out IPython magic to ensure Python compatibility. # %tensorboard --logdir "{evaluation.directory}" """Start training. This should take about an hour.""" evaluation.train() """Progress can be visualised in the tensorboard cell above, which should update every 30s (or manually). You may need to click the *Fit domain to data* buttons below each graph. ## Testing Run the learned policy for a few episodes.
if not axes: fig, axes = plt.subplots() for trajectory in trajectories: x, y = zip(*trajectory) plt.plot(x, y, linestyle='dotted', linewidth=0.5, label=agent_name, color=color) return axes if __name__ == "__main__": configure("configs/verbose.json", gym_level=gym.logger.DEBUG) selected_env = load_environment(envs["gridenv"]) selected_agents = [ # "deterministic", "state_aware", # "kl-olop" ] selected_agents = {k: v for k, v in agents.items() if k in selected_agents} budget = 4 * (4**6 - 1) / (4 - 1) # budget = 200 compare_agents(selected_env, selected_agents, budget=budget, show_tree=True, show_states=True, show_trajs=False)
def evaluate(experiment):
    """
    Score a planner under a given budget and append the result to a CSV file.

    Two measurements are available, selected by the flags hard-coded in the body: the simple
    regret of the first action against a "value_iteration" agent, and the rewards of one test
    episode. Metrics of a disabled measurement are reported as 0.

    :param experiment: a tuple (seed, budget, (agent_name, agent_config), env_config, path)
                       where path is the CSV file the result row is appended to
    """
    # Workspace: unpack the experiment and make sure the output folder exists
    seed, budget, agent_config, env_config, path = experiment
    gym.logger.set_level(gym.logger.DISABLED)
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Environment
    env = load_environment(env_config)

    # Agent, with the requested budget written into its configuration
    agent_name, agent_config = agent_config
    agent_config["budget"] = int(budget)
    agent = agent_factory(env, agent_config)

    logger.debug("Evaluating agent {} with budget {} on seed {}".format(
        agent_name, budget, seed))

    compute_regret = True
    compute_return = False

    # Regret of the first recommended action
    simple_regret, gap = 0, 0
    if compute_regret:
        env.seed(seed)
        observation = env.reset()
        vi = agent_factory(env, agent_configs()["value_iteration"])
        best_action = vi.act(observation)
        action = agent.act(observation)
        q = vi.state_action_value
        simple_regret = q[vi.mdp.state, best_action] - q[vi.mdp.state, action]
        # Gap between the best action and the second largest state-action value
        gap = q[vi.mdp.state, best_action] - np.sort(q[vi.mdp.state, :])[-2]

    # Rewards of one undisplayed test episode
    length, total_reward, return_, mean_return = 0, 0, 0, 0
    if compute_return:
        evaluation = Evaluation(env,
                                agent,
                                directory=Path("out") / "planners" / agent_name,
                                num_episodes=1,
                                sim_seed=seed,
                                display_env=False,
                                display_agent=False,
                                display_rewards=False)
        evaluation.test()
        recorder = evaluation.monitor.stats_recorder
        rewards = recorder.episode_rewards_[0]
        length = recorder.episode_lengths[0]
        total_reward = np.sum(rewards)

        def discounted_sum(signal):
            # `gamma` is not defined in this function (read from the enclosing scope)
            return np.sum([gamma**t * value for t, value in enumerate(signal)])

        return_ = discounted_sum(rewards)
        # Average of the discounted returns computed from every time step onwards
        mean_return = np.mean(
            [discounted_sum(rewards[t:]) for t in range(len(rewards))])

    # Result row
    result = {
        "agent": agent_name,
        "budget": budget,
        "seed": seed,
        "total_reward": total_reward,
        "return": return_,
        "mean_return": mean_return,
        "length": length,
        "simple_regret": simple_regret,
        "gap": gap
    }

    df = pd.DataFrame.from_records([result])
    with open(path, 'a') as f:
        # Only write the header when the file is still empty
        df.to_csv(f, sep=',', encoding='utf-8', header=f.tell() == 0, index=False)
def evaluate(experiment):
    """
    Run one displayed episode of an agent and append the outcome to a CSV file.

    With the flag hard-coded in the body, the episode is run through `evaluation.test()` and a
    single summary row is written. The alternative branch steps the episode manually and, at each
    step, rolls out an "oracle" agent on a copy of the agent's environment to estimate the state
    value; it then writes one row per time step.

    :param experiment: a tuple (seed, (agent_name, agent_config), env_config, path)
                       where path is the CSV file the rows are appended to
    """
    def discounted_sum(signal, discount):
        return np.sum([discount**t * value for t, value in enumerate(signal)])

    # Workspace: unpack the experiment and make sure the output folder exists
    seed, agent_config, env_config, path = experiment
    gym.logger.set_level(gym.logger.DISABLED)
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Environment
    env = load_environment(env_config)

    # Agent
    agent_name, agent_config = agent_config
    agent = load_agent(agent_config, env)

    # Evaluation of a single episode
    print("Evaluating agent {} on seed {}".format(agent_name, seed))
    evaluation = Evaluation(env,
                            agent,
                            directory=path.parent / agent_name,
                            num_episodes=1,
                            sim_seed=seed,
                            display_env=True,
                            display_agent=True,
                            display_rewards=False)

    estimate_value = False
    if estimate_value:
        rewards, values, terminal = [], [], False
        evaluation.seed(episode=0)
        evaluation.reset()
        evaluation.training = False
        # NOTE(review): `0.99 or ...` always evaluates to 0.99, the agent's own gamma is never
        # read — confirm whether this override is intended.
        gamma = 0.99 or agent.config["gamma"]
        while not terminal:
            # Estimate the state value by rolling out an oracle on a copy of the environment
            oracle_env = safe_deepcopy_env(agent.env)
            oracle = load_agent(agent_configs()["oracle"], oracle_env)
            oracle_done, oracle_rewards = False, []
            while not oracle_done:
                action = oracle.act(None)
                _, oracle_reward, oracle_done, _ = oracle_env.step(action)
                oracle_rewards.append(oracle_reward)
            values.append(discounted_sum(oracle_rewards, gamma))

            reward, terminal = evaluation.step()
            rewards.append(reward)
        evaluation.close()

        # Discounted return from every time step onwards
        returns = [discounted_sum(rewards[k:], gamma) for k in range(len(rewards))]

        # Intermediate results, one row per time step
        df = pd.DataFrame({
            "agent": agent_name,
            "time": range(len(rewards)),
            "seed": [seed] * len(rewards),
            "reward": rewards,
            "return": returns,
            "value": values
        })
    else:
        evaluation.test()
        recorder = evaluation.monitor.stats_recorder
        rewards = recorder.episode_rewards_[0]
        length = recorder.episode_lengths[0]
        total_reward = np.sum(rewards)

        return_ = discounted_sum(rewards, 0.9)
        # Despite its name, this column uses a 0.99 discount
        return_undisc = discounted_sum(rewards, 0.99)
        result = {
            "agent": agent_name,
            "seed": seed,
            "total_reward": total_reward,
            "return": return_,
            "return_undisc": return_undisc,
            "length": length,
        }
        df = pd.DataFrame.from_records([result])

    with open(path, 'a') as f:
        # Only write the header when the file is still empty
        df.to_csv(f, sep=',', encoding='utf-8', header=f.tell() == 0, index=False)
def evaluate(environment_config, agent_config, options):
    """
    Train and/or test an agent interacting with an environment.

    When '--train' is set the agent is trained first; when '--test' is set a test evaluation is
    then run, reusing the trained agent if training took place and a freshly loaded one otherwise.

    :param environment_config: the path of the environment configuration file
    :param agent_config: the path of the agent configuration file, or the string "None" to use
                         the "agent_config" entry of the environment configuration
    :param options: the evaluation options; its '--seed' entry is converted to an int in place
    """
    logger.configure(LOGGING_CONFIG)
    if options['--verbose']:
        logger.configure(VERBOSE_CONFIG)

    # Optionally name the run after the agent configuration file, a timestamp and the process id
    run_directory = None
    if options['--name-from-config']:
        run_directory = "{}_{}_{}".format(
            Path(agent_config).with_suffix('').name,
            datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
            os.getpid())
    options['--seed'] = int(options['--seed']) if options['--seed'] is not None else None

    num_episodes = int(options['--episodes'])
    recover = options['--recover'] or options['--recover-from']
    # A single flag drives the three display switches
    display = not options['--no-display']

    env = load_environment(environment_config)
    if agent_config == "None":
        agent_config = env.config["agent_config"]

    # Automatic exploration time constant. This only applies to an in-memory configuration:
    # a path string cannot be indexed by key.
    if isinstance(agent_config, dict):
        exploration = agent_config.get("exploration") or {}
        if exploration.get("auto_tau"):
            # The episode count is converted to an int *before* the multiplication: option
            # values may be strings, and str * int repeats the string ('100' * 2 == '100100').
            exploration["tau"] = (env.config["policy_frequency"]
                                  * env.config["duration"]
                                  * num_episodes
                                  * env.config["controlled_vehicles"] / 50)

    agent = load_agent(agent_config, env)

    # TODO: different display options for agent, env, rewards
    if options['--offscreen_rendering']:
        env.config['offscreen_rendering'] = True

    evaluation_train = Evaluation(env,
                                  agent,
                                  run_directory=run_directory,
                                  num_episodes=num_episodes,
                                  sim_seed=options['--seed'],
                                  recover=recover,
                                  display_env=display,
                                  display_agent=display,
                                  display_rewards=display,
                                  training=options['--train'],
                                  options=options)

    if options['--train']:
        evaluation_train.train()
    else:
        evaluation_train.close()

    if options['--test']:
        agent_test = load_agent(agent_config, env)
        if options['--train']:
            # Test the agent that has just been trained
            agent_test = evaluation_train.agent
        evaluation_test = Evaluation(env,
                                     agent_test,
                                     run_directory=run_directory,
                                     num_episodes=int(options['--episodes_test']),
                                     sim_seed=options['--seed'],
                                     recover=recover,
                                     display_env=display,
                                     display_agent=display,
                                     display_rewards=display,
                                     training=False,
                                     options=options)
        evaluation_test.test()