def train(env, config, output=True):
    """
    Train and evaluate SARSA on given environment with provided hyperparameters

    :param env (gym.Env): environment to execute evaluation on
    :param config (Dict[str, float]): configuration dictionary containing hyperparameters
    :param output (bool): flag whether mean evaluation performance should be printed
    :return (List[float], List[float], List[float], Dict[(Obs, Act), float]):
        list of means and standard deviations of evaluation returns, list of
        epsilons, final Q-table
    """
    agent = SARSA(
        num_acts=env.action_space.n,
        gamma=config["gamma"],
        epsilon=config["epsilon"],
        alpha=config["alpha"],
    )

    step_counter = 0
    # upper bound on total environment steps, used for hyperparameter scheduling
    max_steps = config["total_eps"] * config["max_episode_steps"]

    evaluation_return_means = []
    evaluation_return_stds = []
    evaluation_epsilons = []

    for eps_num in range(config["total_eps"]):
        obs = env.reset()
        episodic_return = 0
        steps = 0
        done = False

        # take first action
        act = agent.act(obs)

        while not done and steps < config["max_episode_steps"]:
            n_obs, reward, done, info = env.step(act)
            step_counter += 1
            steps += 1
            episodic_return += reward

            agent.schedule_hyperparameters(step_counter, max_steps)
            n_act = agent.act(n_obs)
            agent.learn(obs, act, reward, n_obs, n_act, done)

            obs = n_obs
            act = n_act

        if eps_num % config["eval_freq"] == 0:
            mean_return, std_return, negative_returns = evaluate(
                env,
                config,
                agent.q_table,
                render=RENDER,
            )
            if output:
                print(
                    f"EVALUATION: EP {eps_num} - MEAN RETURN {mean_return} +/- {std_return} "
                    f"({negative_returns}/{config['eval_episodes']} failed episodes)"
                )
            evaluation_return_means.append(mean_return)
            evaluation_return_stds.append(std_return)
            evaluation_epsilons.append(agent.epsilon)

    return evaluation_return_means, evaluation_return_stds, evaluation_epsilons, agent.q_table
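
# NOTE: the SARSA class (act, learn, schedule_hyperparameters) is defined
# elsewhere in this repo. The helper below is a minimal sketch of what
# agent.learn is assumed to compute -- the standard on-policy TD(0) update --
# shown standalone for reference only. It assumes q_table is a
# defaultdict(float) keyed by (obs, act) tuples.
def _sarsa_update_sketch(q_table, obs, act, reward, n_obs, n_act, done, gamma, alpha):
    # Q(s,a) <- Q(s,a) + alpha * [r + gamma * (1 - done) * Q(s',a') - Q(s,a)]
    target = reward + gamma * (1 - done) * q_table[(n_obs, n_act)]
    q_table[(obs, act)] += alpha * (target - q_table[(obs, act)])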
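
# Hypothetical usage sketch: the config keys below match those read by
# train() above, but the environment name and hyperparameter values are
# illustrative assumptions, not values taken from this repo. RENDER and
# evaluate() are assumed to be defined at module level.
if __name__ == "__main__":
    import gym

    example_config = {
        "total_eps": 10000,
        "max_episode_steps": 100,
        "eval_freq": 1000,
        "eval_episodes": 100,
        "gamma": 0.99,
        "epsilon": 1.0,
        "alpha": 0.1,
    }
    # any environment with a discrete action space and the classic gym API
    # (reset() -> obs, step() -> (obs, reward, done, info)) should work
    example_env = gym.make("FrozenLake-v0")
    means, stds, epsilons, q_table = train(example_env, example_config)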