import math

# NOTE: the import block below is an assumption -- Ball, Paddle, QLearning,
# SARSA and nnet_agent are expected to come from sibling modules in this
# project, whose exact names are not shown here.
from ball import Ball
from paddle import Paddle
from qlearning import QLearning
from sarsa import SARSA
import nnet_agent


class Game:
    def __init__(self):
        self.ball = Ball()
        self.paddle = Paddle()
        self.agent = QLearning(10, 0.7, 0.05)
        self.sarsa_agent = SARSA(10, 0.7, 0.05)
        self.state = (self.ball.x, self.ball.y,
                      self.ball.velocity_x, self.ball.velocity_y,
                      self.paddle.y)
        self.score = 0
        self.reward = 0
        self.game_number = 0
        self.scores = []
        self.finished_training = False
        self.finished_testing = False
        self.is_active = True
        self.previous_state = None
        self.previous_action = None
        self.training_stats = []
        self.test_stats = []

    def discretize_state(self):
        """Map the continuous game state onto a 12x12 grid with coarse velocities."""
        if not self.is_active:
            # Sentinel state used for the terminal transition.
            return (-1, -1, -1, -1, -1)
        discrete_velocity_x = 1 if self.ball.velocity_x > 0 else -1
        if self.ball.velocity_y >= 0.015:
            discrete_velocity_y = 1
        elif self.ball.velocity_y <= -0.015:
            discrete_velocity_y = -1
        else:
            discrete_velocity_y = 0
        discrete_paddle = min(11, int(math.floor(
            12 * self.paddle.y / (1 - self.paddle.height))))
        discrete_ball_x = min(11, int(math.floor(12 * self.ball.x)))
        discrete_ball_y = min(11, int(math.floor(12 * self.ball.y)))
        return (discrete_ball_x, discrete_ball_y,
                discrete_velocity_x, discrete_velocity_y, discrete_paddle)

    def end_game(self):
        # Keep a rolling window of the last 1000 scores for progress reporting.
        if len(self.scores) == 1000:
            self.scores = self.scores[1:]
        self.scores.append(self.score)
        self.score = 0
        self.game_number += 1
        self.is_active = False
        if self.game_number % 1000 == 0:
            average = float(sum(self.scores)) / 1000.0
            print(self.game_number, average)
            self.training_stats.append((self.game_number, average))
        if self.game_number == 20000:
            self.finished_training = True

    def end_test_game(self):
        self.test_stats.append((self.game_number, self.score))
        self.game_number += 1
        self.score = 0
        self.is_active = False
        if self.game_number == 200:
            self.finished_testing = True

    def check_terminal_state(self, mode):
        """Return True on a paddle hit; on a miss, end the current game."""
        if self.ball.x > self.paddle.x:
            if self.paddle.y < self.ball.y < self.paddle.y + self.paddle.height:
                self.ball.hit_paddle()
                self.score += 1
                return True
            if mode == 'test':
                self.end_test_game()
            else:
                self.end_game()
        return False

    def update_q(self):
        hit_paddle = self.check_terminal_state('train')
        # If the game just ended, this is the (-1, ..., -1) sentinel state.
        discrete_state = self.discretize_state()
        if not self.is_active:
            # The ball got past the paddle: terminal transition, reward -1.
            self.reward = -1.0
            if self.previous_state is not None:
                self.agent.learn(self.previous_state, self.previous_action,
                                 self.reward, discrete_state)
            self.previous_state = None
            self.ball = Ball()
            self.paddle = Paddle()
            self.is_active = True
            return
        if hit_paddle:
            self.reward = 1.0
        if self.previous_state is not None:
            self.agent.learn(self.previous_state, self.previous_action,
                             self.reward, discrete_state)
        new_action = self.agent.choose_action(discrete_state)
        self.previous_state = discrete_state
        self.previous_action = new_action
        self.paddle.update(new_action)
        self.ball.update()
        self.reward = 0.0

    def update_sarsa(self):
        hit_paddle = self.check_terminal_state('train')
        discrete_state = self.discretize_state()
        action = self.sarsa_agent.choose_action(discrete_state)
        if not self.is_active:
            self.reward = -1.0
            if self.previous_state is not None:
                self.sarsa_agent.learn(self.previous_state, self.previous_action,
                                       self.reward, discrete_state, action)
            self.previous_state = None
            self.ball = Ball()
            self.paddle = Paddle()
            self.is_active = True
            return
        if hit_paddle:
            self.reward = 1.0
        if self.previous_state is not None:
            self.sarsa_agent.learn(self.previous_state, self.previous_action,
                                   self.reward, discrete_state, action)
        # Execute the same action used in the SARSA target so the update
        # stays on-policy.
        self.previous_state = discrete_state
        self.previous_action = action
        self.paddle.update(action)
        self.ball.update()
        self.reward = 0.0

    def update_test_q(self):
        self.check_terminal_state('test')
        if not self.is_active:
            self.ball = Ball()
            self.paddle = Paddle()
            self.is_active = True
            return
        new_action = self.agent.choose_action(self.discretize_state())
        self.paddle.update(new_action)
        self.ball.update()

    def update_test_sarsa(self):
        self.check_terminal_state('test')
        if not self.is_active:
            self.ball = Ball()
            self.paddle = Paddle()
            self.is_active = True
            return
        new_action = self.sarsa_agent.choose_action(self.discretize_state())
        self.paddle.update(new_action)
        self.ball.update()

    def init_nagent(self, W, B, normalize):
        self.nagent = nnet_agent.NAgent(W, B, normalize)

    def update_test_nagent(self):
        self.check_terminal_state('test')
        if not self.is_active:
            self.ball = Ball()
            self.paddle = Paddle()
            self.is_active = True
            return
        # The network agent acts on the raw continuous state.
        new_state = (self.ball.x, self.ball.y,
                     self.ball.velocity_x, self.ball.velocity_y,
                     self.paddle.y)
        new_action = self.nagent.choose_action(new_state)
        self.paddle.update(new_action)
        self.ball.update()
        self.state = (self.ball.x, self.ball.y,
                      self.ball.velocity_x, self.ball.velocity_y,
                      self.paddle.y)
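
# --- Hedged sketch: the QLearning / SARSA classes used by Game are not shown
# in this file. A minimal tabular pair consistent with the call sites above
# might look like the following. The meaning of the constructor arguments
# (10, 0.7, 0.05) is an assumption (a step-size constant C with
# alpha = C / (C + visits), discount gamma, exploration epsilon), as are the
# action ids.
import random
from collections import defaultdict


class TabularSARSA:
    ACTIONS = (0, 1, 2)  # hypothetical action ids: stay / up / down

    def __init__(self, C, gamma, epsilon):
        self.C, self.gamma, self.epsilon = C, gamma, epsilon
        self.q = defaultdict(float)      # (state, action) -> value
        self.visits = defaultdict(int)   # (state, action) -> update count

    def choose_action(self, state):
        if random.random() < self.epsilon:
            return random.choice(self.ACTIONS)
        return max(self.ACTIONS, key=lambda a: self.q[(state, a)])

    def learn(self, s, a, reward, s_next, a_next):
        # On-policy TD(0): bootstrap on the action actually taken next.
        self.visits[(s, a)] += 1
        alpha = self.C / (self.C + self.visits[(s, a)])
        target = reward + self.gamma * self.q[(s_next, a_next)]
        self.q[(s, a)] += alpha * (target - self.q[(s, a)])


class TabularQLearning(TabularSARSA):
    def learn(self, s, a, reward, s_next):
        # Off-policy: bootstrap on the greedy action in the next state.
        self.visits[(s, a)] += 1
        alpha = self.C / (self.C + self.visits[(s, a)])
        best_next = max(self.q[(s_next, b)] for b in self.ACTIONS)
        self.q[(s, a)] += alpha * (reward + self.gamma * best_next - self.q[(s, a)])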
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 17 02:52:37 2019

@author: thoma
"""
from sarsa import SARSA

import gym
import matplotlib.pyplot as plt
import numpy as np

write_path = '../../data/data_long_sarsa.txt'
T = 1000          # per-episode step limit passed to the agent
nb_episodes = 500

env = gym.make('MountainCar-v0')
agent = SARSA(env, T)

# MountainCar-v0 gives -1 reward per step, so negating the returns yields
# episode lengths (lower is better).
lengths = -np.asarray(agent.learn(nb_episodes))
agent.generate_trajectory_file(200, write_path)

# Plot a 5-episode moving average of episode length, one point every 5 episodes.
plt.plot(np.arange(len(lengths))[::5],
         np.convolve(lengths, np.ones(5) / 5, mode='same')[::5])
plt.show()
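
# --- Hedged sketch: the sarsa module imported above is not included here.
# The script relies on SARSA(env, T) exposing learn(nb_episodes), returning
# one return per episode, and generate_trajectory_file(n, path). The grid
# discretization and hyperparameters below are assumptions made for
# illustration; generate_trajectory_file is omitted because its file format
# is not shown.
import numpy as np


class SARSAAgentSketch:
    def __init__(self, env, T, n_bins=40, alpha=0.1, gamma=1.0, epsilon=0.1):
        self.env, self.T = env, T
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        low, high = env.observation_space.low, env.observation_space.high
        self.bins = [np.linspace(l, h, n_bins) for l, h in zip(low, high)]
        self.q = np.zeros((n_bins + 1,) * len(low) + (env.action_space.n,))

    def _discretize(self, obs):
        # Bucket each continuous state dimension onto the grid.
        return tuple(np.digitize(o, b) for o, b in zip(obs, self.bins))

    def _act(self, s):
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()
        return int(np.argmax(self.q[s]))

    def learn(self, nb_episodes):
        returns = []
        for _ in range(nb_episodes):
            s = self._discretize(self.env.reset())
            a, ret, done, t = self._act(s), 0.0, False, 0
            while not done and t < self.T:
                obs, r, done, _ = self.env.step(a)
                s2 = self._discretize(obs)
                a2 = self._act(s2)
                # On-policy TD(0) update.
                target = r + (0.0 if done else self.gamma * self.q[s2][a2])
                self.q[s][a] += self.alpha * (target - self.q[s][a])
                s, a, ret, t = s2, a2, ret + r, t + 1
            returns.append(ret)
        return returns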
def train(env, config, output=True):
    """
    Train and evaluate SARSA on a given environment with the provided hyperparameters

    :param env (gym.Env): environment to execute evaluation on
    :param config (Dict[str, float]): configuration dictionary containing hyperparameters
    :param output (bool): flag whether mean evaluation performance should be printed
    :return (List[float], List[float], List[float], Dict[(Obs, Act), float]):
        lists of means and standard deviations of evaluation returns,
        list of epsilons, and the final Q-table
    """
    agent = SARSA(
        num_acts=env.action_space.n,
        gamma=config["gamma"],
        epsilon=config["epsilon"],
        alpha=config["alpha"],
    )

    step_counter = 0
    # Upper bound on total environment steps, used for hyperparameter scheduling.
    max_steps = config["total_eps"] * config["max_episode_steps"]

    evaluation_return_means = []
    evaluation_return_stds = []
    evaluation_epsilons = []

    for eps_num in range(config["total_eps"]):
        obs = env.reset()
        episodic_return = 0
        steps = 0
        done = False

        # Choose the first action before entering the loop (SARSA is on-policy).
        act = agent.act(obs)

        while not done and steps < config["max_episode_steps"]:
            n_obs, reward, done, info = env.step(act)
            step_counter += 1
            steps += 1
            episodic_return += reward

            agent.schedule_hyperparameters(step_counter, max_steps)
            n_act = agent.act(n_obs)
            agent.learn(obs, act, reward, n_obs, n_act, done)

            obs = n_obs
            act = n_act

        if eps_num % config["eval_freq"] == 0:
            mean_return, std_return, negative_returns = evaluate(
                env,
                config,
                agent.q_table,
                render=RENDER,
            )
            if output:
                print(
                    f"EVALUATION: EP {eps_num} - MEAN RETURN {mean_return} +/- {std_return} "
                    f"({negative_returns}/{config['eval_episodes']} failed episodes)"
                )
            evaluation_return_means.append(mean_return)
            evaluation_return_stds.append(std_return)
            evaluation_epsilons.append(agent.epsilon)

    return evaluation_return_means, evaluation_return_stds, evaluation_epsilons, agent.q_table
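
# --- Hedged sketch: evaluate() is defined elsewhere in this project. train()
# above expects it to roll out the greedy policy and return
# (mean_return, std_return, negative_returns). The episode cap and the
# "failed episode" criterion (negative return) below are assumptions.
import numpy as np


def evaluate(env, config, q_table, render=False):
    returns = []
    for _ in range(config["eval_episodes"]):
        obs = env.reset()
        episodic_return, done, steps = 0.0, False, 0
        while not done and steps < config["max_episode_steps"]:
            # Greedy action with respect to the learned Q-table.
            act = max(range(env.action_space.n),
                      key=lambda a: q_table.get((obs, a), 0.0))
            obs, reward, done, _ = env.step(act)
            episodic_return += reward
            steps += 1
            if render:
                env.render()
        returns.append(episodic_return)
    returns = np.asarray(returns)
    return returns.mean(), returns.std(), int((returns < 0).sum())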
def train(env, config, output=True):
    """
    Train and evaluate SARSA on a given environment with the provided hyperparameters

    :param env (gym.Env): environment to execute evaluation on
    :param config (Dict[str, float]): configuration dictionary containing hyperparameters
    :param output (bool): flag whether mean evaluation results should be printed
    :return (float, List[float], List[float], List[float], Dict[(Obs, Act), float]):
        total reward over all episodes, lists of means and standard deviations of
        evaluation rewards, list of epsilons, and the final Q-table
    """
    agent = SARSA(
        num_acts=env.action_space.n,
        gamma=config["gamma"],
        epsilon=config["epsilon"],
        alpha=config["alpha"],
    )

    step_counter = 0
    # Upper bound on total environment steps, used for hyperparameter scheduling.
    max_steps = config["total_eps"] * config["max_episode_steps"]

    total_reward = 0
    evaluation_reward_means = []
    evaluation_reward_stds = []
    evaluation_epsilons = []

    for eps_num in range(config["total_eps"]):
        obs = env.reset()
        episodic_reward = 0
        steps = 0
        done = False

        # Choose the first action before entering the loop (SARSA is on-policy).
        act = agent.act(obs)

        while not done and steps < config["max_episode_steps"]:
            n_obs, reward, done, info = env.step(act)
            step_counter += 1
            steps += 1
            episodic_reward += reward

            agent.schedule_hyperparameters(step_counter, max_steps)
            n_act = agent.act(n_obs)
            agent.learn(obs, act, reward, n_obs, n_act, done)

            obs = n_obs
            act = n_act

        total_reward += episodic_reward

        if eps_num > 0 and eps_num % config["eval_freq"] == 0:
            mean_reward, std_reward = evaluate(
                env, config, agent.q_table, eps_num, render=RENDER, output=output
            )
            evaluation_reward_means.append(mean_reward)
            evaluation_reward_stds.append(std_reward)
            evaluation_epsilons.append(agent.epsilon)

    return total_reward, evaluation_reward_means, evaluation_reward_stds, evaluation_epsilons, agent.q_table
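
# --- Hedged sketch: the SARSA agent class shared by both train() variants is
# not shown. Both assume act(), learn(obs, act, reward, n_obs, n_act, done),
# a schedule_hyperparameters() hook, and q_table / epsilon attributes. The
# linear epsilon decay below is an assumption made for illustration.
import random
from collections import defaultdict


class SARSAAgent:
    def __init__(self, num_acts, gamma, epsilon, alpha):
        self.num_acts = num_acts
        self.gamma, self.epsilon, self.alpha = gamma, epsilon, alpha
        self.q_table = defaultdict(float)  # (obs, act) -> value

    def act(self, obs):
        # Epsilon-greedy over the current Q-estimates.
        if random.random() < self.epsilon:
            return random.randrange(self.num_acts)
        return max(range(self.num_acts), key=lambda a: self.q_table[(obs, a)])

    def learn(self, obs, act, reward, n_obs, n_act, done):
        # On-policy TD(0) target; do not bootstrap past a terminal transition.
        target = reward + (0.0 if done else self.gamma * self.q_table[(n_obs, n_act)])
        self.q_table[(obs, act)] += self.alpha * (target - self.q_table[(obs, act)])

    def schedule_hyperparameters(self, step, max_steps):
        # Assumed schedule: decay epsilon linearly to 0.05 over the first
        # 80% of training steps.
        decay = min(1.0, step / (0.8 * max_steps))
        self.epsilon = 1.0 - decay * 0.95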