def run_games(game_length, left_arm_mean, left_arm_std, n_players, right_arm_mean, right_arm_std, use_asrn,
              learning_rate = 0.01, gamma=0.95, epsilon=1.0, epsilon_decay=0.99):
    """Play ``n_players`` independent two-armed-bandit games with Q-learning.

    Every player gets a fresh ``BrokenArmedBandit`` and a fresh ``QLearning``
    agent whose two Q-values start at ``arm_mean / (1 - gamma)``.  When
    ``use_asrn`` is truthy, each observed reward is passed through
    ``BinsASRN.noise`` before it is handed to the agent's update.

    :param game_length: number of pulls per player
    :param left_arm_mean: mean reward of the left arm
    :param left_arm_std: reward standard deviation of the left arm
    :param n_players: number of independent games to run
    :param right_arm_mean: mean reward of the right arm
    :param right_arm_std: reward standard deviation of the right arm
    :param use_asrn: whether rewards are transformed by a ``BinsASRN`` instance
    :param learning_rate: forwarded to ``QLearning``
    :param gamma: discount factor; also used to derive the initial Q-values
    :param epsilon: forwarded to ``QLearning``
    :param epsilon_decay: forwarded to ``QLearning``
    :return: ``(q_tables, rewards, goods, losses)`` where the first three are
        lists with one entry per player and ``losses`` is a numpy array built
        from the per-player loss arrays
    """
    discount_scale = 1 - gamma
    trained_agent_q_values = [left_arm_mean / discount_scale, right_arm_mean / discount_scale]
    ## giving the real mean as initialization(!)
    left_start, right_start = trained_agent_q_values

    q_table_history, reward_history, good_history, loss_history = [], [], [], []

    for _ in range(n_players):
        bandit = BrokenArmedBandit(left_arm_mean=left_arm_mean,
                                   right_arm_mean=right_arm_mean,
                                   left_arm_std=left_arm_std,
                                   right_arm_std=right_arm_std)
        learner = QLearning(left_start, right_start, learning_rate, gamma, epsilon, epsilon_decay)
        noise_source = BinsASRN(0, learning_period=game_length/10) if use_asrn else None

        step_rewards = np.zeros((game_length, 1))
        step_goods = np.zeros((game_length, 1))
        step_losses = np.zeros((game_length, 1))
        player_q_table = []

        for step in range(game_length):
            pulled_right, _estimate = learner.choose()

            # "good" means the agent currently ranks the right arm higher.
            step_goods[step] = learner.right_mean > learner.left_mean
            player_q_table.append([learner.right_mean, learner.left_mean])

            reward = bandit.pull(pulled_right)
            step_rewards[step] = reward  # the raw reward is what gets recorded

            if use_asrn:
                # Project the Q-value of the pulled arm one update ahead and
                # let the ASRN rewrite the reward based on that projection.
                current = learner.right_mean if pulled_right else learner.left_mean
                projected = (1 - learner.learning_rate) * current + learner.learning_rate * (reward + learner.gamma * current)
                reward = noise_source.noise(current, projected, reward)

            step_losses[step] = learner.update(pulled_right, reward)

        reward_history.append(step_rewards)
        good_history.append(step_goods)
        loss_history.append(step_losses)
        q_table_history.append(player_q_table)

    return q_table_history, reward_history, good_history, np.asarray(loss_history)
if __name__ == "__main__":
    # Train a Q-learning agent on FrozenLake, plot the return of every
    # training episode, then roll out a single greedy (non-exploring) episode.
    env = gym.make("FrozenLake-v0")
    agent = QLearning(env, epsilon=0.8, gamma=0.5, lr=0.01)
    episode_rew = []

    for episode in range(EPISODES):
        # Deciding first action
        action = env.action_space.sample()
        state = env.reset()
        ep_rew = 0
        while True:
            next_state, reward, done, _ = env.step(action)
            # env.render()
            ep_rew += reward
            agent.update((state, action, reward, next_state))
            state = next_state
            # Keep the agent's choice: previously the return value was
            # discarded, so the random first action was replayed for the
            # whole episode.
            action = agent.get_action(state)
            if done:
                episode_rew.append(ep_rew)
                break

    plt.plot(episode_rew)
    plt.show()

    # Greedy evaluation rollout.  The state has to advance and the loop has
    # to stop when the episode ends, otherwise it spins forever on the
    # initial state.
    state = env.reset()
    while True:
        action = agent.get_action(state, explore=False)
        next_state, reward, done, _ = env.step(action)
        state = next_state
        if done:
            break

    # Release the environment only after its last use.
    env.close()
class Game:
    """
    Snake game that can be played with the keyboard or driven by a
    Q-learning agent, with per-episode statistics written to CSV files.
    """

    def __init__(self, total_episodes: int):
        """
        Set up the window size, counters, game objects and the Q-learner.
        :param total_episodes: number of episodes to play before the collected
        data is written out
        """
        self.window_width = constant.WIDTH * constant.TILE
        self.window_height = constant.HEIGHT * constant.TILE
        self._running = True
        # pygame handles; populated by initialize_pygame()
        self._display = None
        self._snake = None
        self._mouse = None
        self.episode = 1
        self.total_episodes = total_episodes
        self.score = 0
        self.max_score = 0
        self.frames = 0
        # one [frames, score, collision_value] row per finished episode
        self.game_stats = []
        self.specs = []
        self.test_run = False
        self.snake = Snake()
        self.mouse = Mouse(constant.WIDTH, constant.HEIGHT,
                           self.snake.body_position())
        self.q = QLearning()

    def initialize_pygame(self):
        """
        Initialize pygame along with display and image settings
        """
        pygame.init()
        self._display = pygame.display.set_mode(
            (self.window_width, self.window_height), pygame.HWSURFACE)
        pygame.display.set_caption('SNAKE ' + 'Episode ' + str(self.episode))
        self._snake = pygame.image.load("img/snake_body_mini.png").convert()
        # source for mouse: http://pixelartmaker.com/art/3d272b1bf180b60.png
        self._mouse = pygame.image.load("img/mouse_mini.png").convert()

    def game_over(self, collision_type: str):
        """
        Record the finished episode, report it and move on to the next one
        :param collision_type: 'the wall' for a wall collision; any other
        value is recorded as a body collision
        """
        collision_value = -1  # represents body collision
        if collision_type == 'the wall':
            collision_value = 1
        # NOTE(review): the tail is trimmed for every collision type here;
        # confirm this matches the intended behaviour for body collisions.
        self.snake.update_tail()
        self._running = False

        if self.score > self.max_score:
            self.max_score = self.score

        self.game_stats.append([self.frames, self.score, collision_value])
        self.display(collision_type)
        # next_episode() either resets the game (which sets _running back to
        # True) or, on the final episode, writes the collected data.
        self.next_episode()

    def display(self, collision_type: str):
        """
        Displays game over status and scores, and can call display/save data
        functions
        :param collision_type: what type of collision ended the game
        """
        if self.episode % constant.SAVE_EPISODE == 0:
            self.q.save_table(self.episode, clear_dir=constant.DELETE_JSON)
        print(f'GAME OVER! Snake collided with {collision_type}')
        print(f'SCORE: {self.score}')

    def move_snake(self, ai_play: bool):
        """
        Check whether the snake has eaten the mouse or encountered a collision
        :param ai_play: True if ai play, False otherwise
        """
        self.snake.update_head()

        # if snake eats mouse
        if self.snake.eats_mouse(self.mouse.x, self.mouse.y):
            self.mouse.generate_mouse(self.snake.body_position())
            self.score += 1
            if ai_play:
                self.q.update_reward('mouse')
        # if snake collides with itself
        elif self.snake.body_collision():
            if ai_play:
                self.q.update_reward('snake')
            self.game_over('itself')
        # if snake collides with walls
        elif self.snake.wall_collision(0, self.window_width, 0,
                                       self.window_height):
            if ai_play:
                self.q.update_reward('wall')
            self.game_over('the wall')
        else:
            if ai_play:
                self.q.update_reward('empty')
            # plain move: the head advanced, so drop the last tail segment
            self.snake.update_tail()

    def abs_coordinates(self):
        """
        Collect the coordinates used to describe the current state
        :return: tuple (tail_loc, mouse_loc), where tail_loc comes from the
        snake's tail_coordinates() and mouse_loc from the mouse's
        relative_coordinates() called with the snake's head coordinates
        """
        snake_head = self.snake.head_coordinates()
        mouse_loc = self.mouse.relative_coordinates(snake_head)
        tail_loc = self.snake.tail_coordinates()
        return tail_loc, mouse_loc

    def render(self):
        """
        Render the visual components of the game
        """
        self._display.fill((0, 0, 0))
        self.snake.draw(self._display, self._snake)
        self.mouse.draw(self._display, self._mouse)
        pygame.display.flip()

    def human_play(self, delay: int):
        """
        Executes the game play, snake movements, and loops until the game ends.
        Keys can be used to play the game.
        :param delay: defines the frame delay with lower values (e.g. 1)
        resulting in a fast frame, while higher values (e.g. 1000) result in
        very slow frames
        """
        while self._running:
            pygame.event.pump()
            keys = pygame.key.get_pressed()

            if keys[pygame.K_RIGHT]:
                self.snake.set_east()
            elif keys[pygame.K_LEFT]:
                self.snake.set_west()
            elif keys[pygame.K_UP]:
                self.snake.set_north()
            elif keys[pygame.K_DOWN]:
                self.snake.set_south()
            elif keys[pygame.K_ESCAPE]:
                self._running = False

            self.move_snake(False)
            self.render()
            # delay is divided by 1000 before being handed to sleep()
            sleep(float(delay) / 1000)
            self.frames += 1

    def set_direction(self, direction: str):
        """
        Sets the direction for the snake to take
        :param direction: specified direction ('east', 'west' or 'north';
        anything else is treated as south)
        """
        if direction == 'east':
            self.snake.set_east()
        elif direction == 'west':
            self.snake.set_west()
        elif direction == 'north':
            self.snake.set_north()
        else:  # south
            self.snake.set_south()

    def ai_train(self, delay: int, resume_state: bool):
        """
        Executes the AI training, looping until the snake is trained the total
        number of episodes. Movements are implemented by the AI rather than by
        a human pressing keys.
        :param delay: defines the frame delay with lower values (e.g. 1)
        resulting in a fast frame, while higher values (e.g. 1000) result in
        very slow frames
        :param resume_state: if True, start training from externally saved
        table's next episode, if False, initial episode is 1
        """
        # If resuming from a saved state, start from the loaded state's next
        # episode
        if resume_state:
            self.resume_game(self.total_episodes)

        while self._running:
            pygame.event.pump()

            # state before the move
            tail_loc, mouse_loc = self.abs_coordinates()
            snake_direction = self.snake.current_direction()
            state = self.q.define_state(tail_loc, mouse_loc, snake_direction)

            action = self.q.select_action(state)
            self.set_direction(action)
            self.move_snake(True)

            # state after the move, used for the Q update
            tail_loc, mouse_loc = self.abs_coordinates()
            snake_direction = self.snake.current_direction()
            next_state = self.q.define_state(tail_loc, mouse_loc,
                                             snake_direction)

            self.q.update(state, next_state, action)
            self.q.reset_reward()

            self.render()
            sleep(float(delay) / 1000)
            self.frames += 1

    def ai_test(self, delay: int, resume_state: bool):
        """
        Tests the AI on previous training data
        :param delay: defines the frame delay
        :param resume_state: if True, start training from externally saved
        table's next episode, if False, initial episode is 1
        """
        self.test_run = True
        self.episode = 1

        # If resuming from a saved state, start from the loaded state's next
        # episode
        if resume_state:
            self.resume_game(constant.TOTAL_TESTS)

        if constant.PARAM_TEST:
            self.total_episodes = constant.TOTAL_TESTS

        # Run the total number of tests specified
        while self.episode <= self.total_episodes:
            caption = 'SNAKE ' + 'FINAL TEST RUN: EPISODE ' + str(self.episode)
            self.reset_game(caption)
            self.game_stats = []
            self.specs = []

            while self._running:
                pygame.event.pump()
                tail_loc, mouse_loc = self.abs_coordinates()
                snake_direction = self.snake.current_direction()
                state = self.q.define_state(tail_loc, mouse_loc,
                                            snake_direction)
                action = self.q.select_action(state)
                self.set_direction(action)
                self.move_snake(True)
                self.render()
                sleep(float(delay) / 1000)
                self.frames += 1

            print(
                f'(TEST RUN EPISODE {str(self.episode)}) FINAL SCORE: {self.score}, FINAL MAX SCORE: {self.max_score}\n'
            )
            self.episode += 1

    def resume_game(self, total_tests):
        """
        Load a previously saved Q-table and continue the episode count from it
        :param total_tests: number of episodes to run from the loaded episode
        onward
        """
        filename = 'episode' + str(constant.RESUME_EPISODE) + '.json'
        self.episode = self.q.load_table(filename)
        if self.episode < 1:
            print('Table failed to load')
        self.total_episodes = self.episode + total_tests - 1

    def reset_game(self, caption: str):
        """
        Reset the per-episode counters and reposition the snake and mouse
        :param caption: window caption to show for the new episode
        """
        pygame.display.set_caption(caption)
        self.score = 0
        self.frames = 0
        self._running = True
        self.snake.initialize_positions(self.mouse.x, self.mouse.y)
        self.mouse.generate_mouse(self.snake.body_position())

    def next_episode(self):
        """
        Sets-up the next episode or completes the final episode
        """
        if self.episode >= self.total_episodes:
            self.prep_data()
            return

        # set new episode
        self.episode += 1
        print(f'\nNEW GAME, EPISODE {self.episode}')
        caption = 'SNAKE ' + 'Episode ' + str(self.episode)
        self.reset_game(caption)

    def prep_data(self):
        """
        Prepares data formatting with headers, specific test names, etc
        """
        self.specs = []
        filename = ''
        if self.test_run:
            filename = 'testing_' + constant.PARAM + str(constant.PARAM_VAL)
        if constant.PARAM_TEST:
            filename += constant.PARAM + str(constant.PARAM_VAL)

        stats_file = filename + '_data.csv'
        header = ['Steps', 'Scores', 'Collisions']
        self.write_data(stats_file, header, self.game_stats)

        specs_file = filename + '_specs.csv'
        header = ['Parameters', 'Values']
        self.specs.append(['total episodes', self.episode])
        self.specs.append(['height', constant.HEIGHT])
        self.specs.append(['width', constant.WIDTH])
        self.specs.append(['learning rate', constant.ETA])
        self.specs.append(['discount', constant.DISCOUNT])
        self.specs.append(['epsilon', constant.EPSILON])
        self.specs.append(['mouse reward', constant.MOUSE])
        self.specs.append(['wall penalty', constant.WALL])
        self.specs.append(['self-collision penalty', constant.SNAKE])
        self.specs.append(['empty tile penalty', constant.EMPTY])
        self.write_data(specs_file, header, self.specs, True)

    def write_data(self, filename: str, header: list, data: list,
                   add_specs: bool = False):
        """
        Writes the data from the current session to a file.
        :param filename: filename to write data
        :param header: header names for data
        :param data: data to add to file
        :param add_specs: True if writing specs file, False otherwise
        """
        path = constant.DATA_DIR
        # The target is built by plain concatenation, so DATA_DIR is expected
        # to end with a path separator.
        file = path + filename

        # create directory if it doesn't exist (including missing parents;
        # no error if something else created it in the meantime)
        if path:
            os.makedirs(path, exist_ok=True)

        # append data to an existing file when resuming; the specs file is
        # always rewritten from scratch
        if constant.RESUME and os.path.isfile(file) and not add_specs:
            op = 'a'
        else:
            op = 'w'

        # write data to csv file(s); the with-block closes the file
        with open(file, op, newline='') as outfile:
            w = csv.writer(outfile)
            # the header is skipped only when resuming a parameter test
            if not (constant.RESUME and constant.PARAM_TEST):
                w.writerow(header)
            w.writerows(data)
def run_games(game_length,
              left_arm_mean,
              left_arm_std,
              n_players,
              right_arm_mean,
              right_arm_std,
              use_asrn,
              learning_rate=0.01,
              gamma=0.95,
              epsilon=1.0,
              epsilon_decay=0.99,
              debug=False,
              random_init=False):
    """Play ``n_players`` independent two-armed-bandit games with Q-learning.

    Each player gets a fresh ``BrokenArmedBandit`` and ``QLearning`` agent.
    Initial Q-values are ``arm_mean / (1 - gamma)`` unless ``random_init`` is
    set, in which case two normal draws only decide which arm starts at ``-1``
    and which at ``1``.  With ``use_asrn`` the observed reward is passed
    through ``BinsASRN.noise`` before the agent's update.  With ``debug`` the
    two Q-value traces of every player are plotted.

    :param game_length: number of pulls per player
    :param left_arm_mean: mean reward of the left arm
    :param left_arm_std: reward standard deviation of the left arm
    :param n_players: number of independent games to run
    :param right_arm_mean: mean reward of the right arm
    :param right_arm_std: reward standard deviation of the right arm
    :param use_asrn: whether rewards are transformed by a ``BinsASRN`` instance
    :param learning_rate: forwarded to ``QLearning``
    :param gamma: discount factor; also used to derive the initial Q-values
    :param epsilon: forwarded to ``QLearning``
    :param epsilon_decay: forwarded to ``QLearning``
    :param debug: plot the right/left Q-values recorded during each game
    :param random_init: pick the initial ordering of the Q-values at random
    :return: ``(rewards, goods, losses)`` as numpy arrays stacked over players
    """
    trained_agent_q_values = [
        left_arm_mean / (1 - gamma),
        right_arm_mean / (1 - gamma),
    ]
    # Random initial draws are centred on zero and scaled by the gap between
    # the two reference Q-values.
    init_center = 0
    init_spread = np.max(trained_agent_q_values) - np.min(trained_agent_q_values)

    def projected_q(agent, current, observed):
        # Q-value of the pulled arm after one update with the raw reward.
        return (1 - agent.learning_rate) * current + agent.learning_rate * (
            observed + agent.gamma * current)

    per_player_rewards, per_player_goods, per_player_losses = [], [], []

    for _player in range(n_players):
        bandit = BrokenArmedBandit(left_arm_mean=left_arm_mean,
                                   right_arm_mean=right_arm_mean,
                                   left_arm_std=left_arm_std,
                                   right_arm_std=right_arm_std)

        if not random_init:
            ## giving the real mean as initialization(!)
            left_q0, right_q0 = trained_agent_q_values
        else:
            left_draw = np.random.normal(init_center, init_spread)
            right_draw = np.random.normal(init_center, init_spread)
            # Only the ordering of the two draws is kept.
            left_q0, right_q0 = (-1, 1) if left_draw < right_draw else (1, -1)

        learner = QLearning(left_q0, right_q0, learning_rate, gamma, epsilon,
                            epsilon_decay)

        reward_trace = np.zeros((game_length, 1))
        good_trace = np.zeros((game_length, 1))
        loss_trace = np.zeros((game_length, 1))
        q_trace = []

        if use_asrn:
            asrn = BinsASRN(0, learning_period=game_length / 10)

        for step in range(game_length):
            pulled_right, _estimate = learner.choose()

            # "good" means the agent currently ranks the right arm higher.
            good_trace[step] = learner.right_mean > learner.left_mean
            if debug:
                q_trace.append(
                    [pulled_right, learner.right_mean, learner.left_mean])

            reward = bandit.pull(pulled_right)
            reward_trace[step] = reward  # the raw reward is what gets recorded

            if use_asrn:
                if pulled_right:
                    reward = asrn.noise(
                        learner.right_mean,
                        projected_q(learner, learner.right_mean, reward),
                        reward)
                else:
                    reward = asrn.noise(
                        learner.left_mean,
                        projected_q(learner, learner.left_mean, reward),
                        reward)

            loss_trace[step] = learner.update(pulled_right, reward)

        per_player_rewards.append(reward_trace)
        per_player_goods.append(good_trace)
        per_player_losses.append(loss_trace)

        if debug:
            # Drop the action column; what is left is [right Q, left Q].
            q_values = np.asarray(q_trace)[:, 1:]
            plt.plot(q_values[:, 0], '-g')
            plt.plot(q_values[:, 1], '-r')
            plt.legend(['Q r', 'Q l'])
            plt.show()

    return (np.asarray(per_player_rewards), np.asarray(per_player_goods),
            np.asarray(per_player_losses))