import numpy as np
from time import sleep

# The SARSA agent class is assumed to be imported from the accompanying agents
# module; it is not defined in this file.


def evaluate(env, config, q_table, episode, render=False, output=True):
    """
    Evaluate configuration of SARSA on given environment initialised with given Q-table

    :param env (gym.Env): environment to execute evaluation on
    :param config (Dict[str, float]): configuration dictionary containing hyperparameters
    :param q_table (Dict[(Obs, Act), float]): Q-table mapping observation-action to Q-values
    :param episode (int): episodes of training completed
    :param render (bool): flag whether evaluation runs should be rendered
    :param output (bool): flag whether mean evaluation performance should be printed
    :return (float, float): mean and standard deviation of reward received over episodes
    """
    # greedy evaluation agent: epsilon=0.0 disables exploration
    eval_agent = SARSA(
        num_acts=env.action_space.n,
        gamma=config["gamma"],
        epsilon=0.0,
        alpha=config["alpha"],
    )
    eval_agent.q_table = q_table

    episodic_rewards = []
    for eps_num in range(config["eval_episodes"]):
        obs = env.reset()
        if render:
            env.render()
            sleep(1)
        episodic_reward = 0
        done = False
        steps = 0
        while not done and steps <= config["max_episode_steps"]:
            steps += 1
            act = eval_agent.act(obs)
            n_obs, reward, done, info = env.step(act)
            if render:
                env.render()
                sleep(1)
            episodic_reward += reward
            obs = n_obs
        episodic_rewards.append(episodic_reward)

    mean_reward = np.mean(episodic_rewards)
    std_reward = np.std(episodic_rewards)

    if output:
        print(
            f"EVALUATION ({episode}/{config['total_eps']}): MEAN REWARD OF {mean_reward}"
        )
        if mean_reward >= 0.9:
            print("EVALUATION: SOLVED")
        else:
            print("EVALUATION: NOT SOLVED!")
    return mean_reward, std_reward
def evaluate(env, config, q_table, render=False):
    """
    Evaluate configuration of SARSA on given environment initialised with given Q-table

    :param env (gym.Env): environment to execute evaluation on
    :param config (Dict[str, float]): configuration dictionary containing hyperparameters
    :param q_table (Dict[(Obs, Act), float]): Q-table mapping observation-action to Q-values
    :param render (bool): flag whether evaluation runs should be rendered
    :return (float, float, int): mean and standard deviation of return received over
        episodes, number of negative returns
    """
    # greedy evaluation agent: epsilon=0.0 disables exploration
    eval_agent = SARSA(
        num_acts=env.action_space.n,
        gamma=config["gamma"],
        epsilon=0.0,
        alpha=config["alpha"],
    )
    eval_agent.q_table = q_table

    episodic_returns = []
    for eps_num in range(config["eval_episodes"]):
        obs = env.reset()
        if render:
            env.render()
            sleep(1)
        episodic_return = 0
        done = False
        steps = 0
        while not done and steps <= config["max_episode_steps"]:
            steps += 1
            act = eval_agent.act(obs)
            n_obs, reward, done, info = env.step(act)
            if render:
                env.render()
                sleep(1)
            episodic_return += reward
            obs = n_obs
        episodic_returns.append(episodic_return)

    mean_return = np.mean(episodic_returns)
    std_return = np.std(episodic_returns)
    negative_returns = sum(ret < 0 for ret in episodic_returns)
    return mean_return, std_return, negative_returns
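# Illustrative only: a minimal sketch of the CONFIG dictionary and RENDER flag
# that the functions in this file reference. The keys are taken from the code
# above and below; the concrete values here are assumptions, not the original
# hyperparameter settings.
CONFIG = {
    "total_eps": 10000,        # number of training episodes (assumed value)
    "eval_freq": 1000,         # evaluate every eval_freq training episodes (assumed value)
    "eval_episodes": 100,      # episodes per evaluation run (assumed value)
    "max_episode_steps": 100,  # per-episode step cap (assumed value)
    "gamma": 0.99,             # discount factor (assumed value)
    "alpha": 0.05,             # learning rate (assumed value)
    "epsilon": 0.9,            # initial exploration rate (assumed value)
}
RENDER = False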
def train(env, config, output=True):
    """
    Train and evaluate SARSA on given environment with provided hyperparameters

    :param env (gym.Env): environment to execute evaluation on
    :param config (Dict[str, float]): configuration dictionary containing hyperparameters
    :param output (bool): flag whether mean evaluation performance should be printed
    :return (List[float], List[float], List[float], Dict[(Obs, Act), float]): list of means
        and standard deviations of evaluation returns, list of epsilons, final Q-table
    """
    agent = SARSA(
        num_acts=env.action_space.n,
        gamma=config["gamma"],
        epsilon=config["epsilon"],
        alpha=config["alpha"],
    )

    step_counter = 0
    # upper bound on total environment steps, used for epsilon scheduling
    max_steps = config["total_eps"] * config["max_episode_steps"]

    evaluation_return_means = []
    evaluation_return_stds = []
    evaluation_epsilons = []

    for eps_num in range(config["total_eps"]):
        obs = env.reset()
        episodic_return = 0
        steps = 0
        done = False

        # take first action
        act = agent.act(obs)
        while not done and steps < config["max_episode_steps"]:
            n_obs, reward, done, info = env.step(act)
            steps += 1
            step_counter += 1
            episodic_return += reward

            agent.schedule_hyperparameters(step_counter, max_steps)
            n_act = agent.act(n_obs)
            agent.learn(obs, act, reward, n_obs, n_act, done)

            obs = n_obs
            act = n_act

        if eps_num % config["eval_freq"] == 0:
            mean_return, std_return, negative_returns = evaluate(
                env,
                config,
                agent.q_table,
                render=RENDER,
            )
            if output:
                print(
                    f"EVALUATION: EP {eps_num} - MEAN RETURN {mean_return} +/- {std_return} "
                    f"({negative_returns}/{config['eval_episodes']} failed episodes)"
                )
            evaluation_return_means.append(mean_return)
            evaluation_return_stds.append(std_return)
            evaluation_epsilons.append(agent.epsilon)

    return evaluation_return_means, evaluation_return_stds, evaluation_epsilons, agent.q_table
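# A small usage sketch for the outputs of train() above: plotting the mean
# evaluation return (with a one-std band) against training progress, with the
# epsilon schedule on a second axis. matplotlib, the function name, and the
# variable names here are illustrative additions, not part of the original code.
import matplotlib.pyplot as plt


def plot_evaluation(means, stds, epsilons, eval_freq):
    """Plot mean evaluation return with a one-std band and the epsilon schedule."""
    xs = [i * eval_freq for i in range(len(means))]
    means = np.asarray(means)
    stds = np.asarray(stds)

    fig, ax1 = plt.subplots()
    ax1.plot(xs, means, label="mean evaluation return")
    ax1.fill_between(xs, means - stds, means + stds, alpha=0.3)
    ax1.set_xlabel("training episodes")
    ax1.set_ylabel("return")

    ax2 = ax1.twinx()
    ax2.plot(xs, epsilons, color="gray", linestyle="--", label="epsilon")
    ax2.set_ylabel("epsilon")

    fig.legend()
    plt.show()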
def train(env, config, output=True):
    """
    Train and evaluate SARSA on given environment with provided hyperparameters

    :param env (gym.Env): environment to execute evaluation on
    :param config (Dict[str, float]): configuration dictionary containing hyperparameters
    :param output (bool): flag whether mean evaluation results should be printed
    :return (float, List[float], List[float], List[float], Dict[(Obs, Act), float]):
        total reward over all episodes, list of means and standard deviations of
        evaluation rewards, list of epsilons, final Q-table
    """
    agent = SARSA(
        num_acts=env.action_space.n,
        gamma=config["gamma"],
        epsilon=config["epsilon"],
        alpha=config["alpha"],
    )

    step_counter = 0
    # upper bound on total environment steps, used for epsilon scheduling
    max_steps = config["total_eps"] * config["max_episode_steps"]

    total_reward = 0
    evaluation_reward_means = []
    evaluation_reward_stds = []
    evaluation_epsilons = []

    for eps_num in range(config["total_eps"]):
        obs = env.reset()
        episodic_reward = 0
        steps = 0
        done = False

        # take first action
        act = agent.act(obs)
        while not done and steps < config["max_episode_steps"]:
            n_obs, reward, done, info = env.step(act)
            steps += 1
            step_counter += 1
            episodic_reward += reward

            agent.schedule_hyperparameters(step_counter, max_steps)
            n_act = agent.act(n_obs)
            agent.learn(obs, act, reward, n_obs, n_act, done)

            obs = n_obs
            act = n_act

        total_reward += episodic_reward

        if eps_num > 0 and eps_num % config["eval_freq"] == 0:
            mean_reward, std_reward = evaluate(
                env, config, agent.q_table, eps_num, render=RENDER, output=output
            )
            evaluation_reward_means.append(mean_reward)
            evaluation_reward_stds.append(std_reward)
            evaluation_epsilons.append(agent.epsilon)

    return total_reward, evaluation_reward_means, evaluation_reward_stds, evaluation_epsilons, agent.q_table
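# A minimal driver sketch. Note that the two evaluate()/train() pairs above look
# like variants from separate scripts: the first evaluate() (reward / "SOLVED"
# output) matches the second train(), while the second evaluate() (negative-return
# counts) matches the first train(). The environment name and __main__ wiring
# below are assumptions for illustration, not part of the original code.
if __name__ == "__main__":
    import gym

    env = gym.make("FrozenLake8x8-v0")  # assumed environment
    total_reward, means, stds, epsilons, q_table = train(env, CONFIG)
    print(f"Total training reward: {total_reward}")
    env.close()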