class Env:
    """Minimal step/reset wrapper around FlappyBird driven through PLE."""

    def __init__(self):
        # Create the game first, then hand it to PLE, which drives it.
        flappy = FlappyBird(pipe_gap=100)
        ple_env = PLE(flappy, fps=30, display_screen=False)
        # init() has to run before the environment is used.
        ple_env.init()
        # Read the state straight from the game object (maybe not necessary).
        ple_env.getGameState = flappy.getGameState
        self.game = flappy
        self.env = ple_env
        # Callers pass an index; this list turns it into the PLE action value.
        self.action_map = ple_env.getActionSet()

    def step(self, action):
        """Apply the action with index ``action``; return (obs, reward, done)."""
        reward = self.env.act(self.action_map[action])
        done = self.env.game_over()
        return self.get_observation(), reward, done

    def reset(self):
        """Restart the game and return the first observation."""
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        """Return the values of the game-state dictionary as a numpy array."""
        # The state is a dict describing each value; only the values are kept.
        state = self.env.getGameState()
        values = list(state.values())
        return np.array(values)

    def set_display(self, boolean_value):
        """Turn on-screen rendering on or off."""
        self.env.display_screen = boolean_value
def main(argv):
    """Train the network on FlappyBird, optionally recording every epoch.

    Args:
        argv: Command-line arguments without the program name. ``-h`` prints
            the option summary and exits; ``-r`` records each epoch to
            ``Videos/test_<epoch>.mov``.

    Exits with status 2 when the options cannot be parsed.
    """
    try:
        opts, _ = getopt.getopt(argv, "hr")
    except getopt.GetoptError:
        print("birdML.py [-h | -r]")
        sys.exit(2)

    record = False
    for opt, _ in opts:
        if opt == '-h':
            print("-h to help")
            print("-r record")
            # Asking for help must not start a training run.
            sys.exit(0)
        elif opt == '-r':
            record = True

    netb = netBrain()
    netb.summary()
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.init()
    actions = p.getActionSet()

    epochs = 50
    for i in range(epochs):
        lstates = []
        rewards = []
        out = None
        if record:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter('Videos/test_' + str(i) + '.mov', fourcc,
                                  30.0, (288, 512))
        try:
            # Ten games per epoch feed one training call.
            for _ in range(10):
                while not p.game_over():
                    if out is not None:
                        obs = p.getScreenRGB()
                        obs = cv2.transpose(obs)
                        obs = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
                        out.write(obs)

                    st = game.getGameState()
                    gstate = np.array([np.array(list(st.values()))])
                    lstates.append(gstate[0])

                    pred = netb.predict(gstate)[0]
                    a = pred.argmax()
                    p.act(actions[a])

                    # Replace the prediction with a hand-written target when
                    # the bird is outside the gap; otherwise the prediction
                    # itself is kept as the training target.
                    if st['next_pipe_bottom_y'] < st['player_y']:
                        pred[0] = 1.0
                        pred[1] = 0.0
                    elif st['next_pipe_top_y'] > st['player_y']:
                        pred[0] = 0.0
                        pred[1] = 1.0
                    rewards.append(pred)
                p.reset_game()
        finally:
            # Close the video file even if a game step raised.
            if out is not None:
                out.release()

        netb.fit(np.array(lstates), np.array(rewards), batch_size=10,
                 epochs=10)
def main_test():
    """Play FlappyBird on screen with the model stored in ``model.h5``.

    Runs for the module-level ``game_steps`` frames, printing the network
    outputs and chosen action every frame and the score whenever a game ends.
    """
    final_score = 0
    game = FlappyBird(width=288, height=512, pipe_gap=100)
    env = PLE(game, fps=30, display_screen=True,
              state_preprocessor=process_state)
    # Only the saved model is used; building a fresh network first was
    # wasted work because it was overwritten immediately.
    model = load_model("model.h5")
    env.init()
    actions = env.getActionSet()

    for i in range(game_steps):
        if i == game_steps - 1:
            print("Score: {}".format(final_score))
        if env.game_over():
            print("Final Score: {}".format(final_score))
            time.sleep(1)
            final_score = 0
            env.reset_game()

        observation = env.getGameState()
        vector = model.predict(np.matrix(list(observation[0].values())))
        a_star = np.argmax(vector[0])
        print(vector[0][0], vector[0][1], a_star)
        # Slow the loop down so the game can be watched.
        time.sleep(0.05)

        env_reward = env.act(actions[a_star])
        if env_reward == 1:
            final_score += 1
def play(self, n=1, file_path=None):
    """Load the model at ``file_path`` and let it play ``n`` games on screen."""
    # graphics="Fancy" gives the full background with random bird and pipe
    # colors; "Fixed" (default) gives a black background and constant colors.
    flappy = FlappyBird(graphics="fixed")
    # force_fps=False shows the agent acting in real time. Use that for
    # display only, not for learning.
    ple_env = PLE(flappy, fps=30, frame_skip=1, num_steps=1, force_fps=False,
                  display_screen=True)
    # Set up the environment (settings, display...).
    ple_env.init()
    trained = load_model(file_path)

    # Play n games to see whether the model is correctly trained.
    for _ in range(n):
        ple_env.reset_game()
        while not ple_env.game_over():
            features = self.get_game_data(flappy)
            q_values = trained.predict(features, batch_size=1)
            best = np.argmax(q_values[0])
            ple_env.act(self.ACTIONS[best])
def run_a_game(game):
    """Drive ``game`` on screen for NUM_STEPS actions picked by a NaiveAgent."""
    from ple import PLE

    env = PLE(game, display_screen=True)
    agent = NaiveAgent(env.getActionSet())
    env.init()
    for _ in range(NUM_STEPS):
        chosen = agent.pick_action()
        env.act(chosen)
def evaluate(agent):
    """Run five on-screen episodes and return the mean episode reward.

    Uses the module-level ``game``. Every frame is also shown in an OpenCV
    window with the current score drawn on it.
    """
    env = PLE(game, fps=30, display_screen=True)
    action_set = env.getActionSet()
    font = cv2.FONT_HERSHEY_SIMPLEX
    episode_rewards = []
    for _ in range(5):
        env.init()
        env.reset_game()
        obs = list(env.getGameState().values())
        total = 0
        done = False
        while not done:
            action = agent.predict(obs)
            frame = env.getScreenRGB()
            score = env.score()
            frame = cv2.putText(cv2.transpose(frame), str(int(score)),
                                (0, 25), font, 1.2, (255, 255, 255), 2)
            cv2.imshow("ss", frame)
            cv2.waitKey(10)
            # Predict the action: only the best action is chosen.
            reward = env.act(action_set[action])
            obs = list(env.getGameState().values())
            done = env.game_over()
            total += reward
        episode_rewards.append(total)
        cv2.destroyAllWindows()
    return np.mean(episode_rewards)
class Env:
    """Headless FlappyBird (pipe gap 110) behind a small step/reset API."""

    def __init__(self):
        flappy = FlappyBird(pipe_gap=110)
        ple_env = PLE(flappy, fps=30, display_screen=False)
        ple_env.init()
        # Read the state straight from the game object (maybe not necessary).
        ple_env.getGameState = flappy.getGameState
        self.game = flappy
        self.env = ple_env
        # By convention callers use (0, 1) while the game uses (None, 119);
        # indexing this list translates between the two.
        self.action_map = ple_env.getActionSet()  # [None, 119]

    def step(self, action):
        """Play the action with index ``action``; return (obs, reward, done)."""
        game_action = self.action_map[action]
        reward = self.env.act(game_action)
        finished = self.env.game_over()
        return self.get_observation(), reward, finished

    def reset(self):
        """Start a new game and return its first observation."""
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        """Return the game-state values (keys dropped) as a numpy array."""
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def set_display(self, boolean_value):
        """Switch on-screen rendering on or off."""
        self.env.display_screen = boolean_value
def _test_ple():
    """Manual smoke test: drive FlappyBird through PLE with a fixed rhythm.

    Runs until interrupted. After every episode it prints the episode reward,
    the total number of steps so far and the average steps per second.
    """
    from ple.games.flappybird import FlappyBird
    from ple import PLE

    # To run without a window, set os.environ['SDL_VIDEODRIVER'] = 'dummy'
    # before creating the environment.
    game = FlappyBird()
    ple_game = PLE(game, fps=30, display_screen=True)
    ple_game.init()
    ALLOWED_ACTIONS = ple_game.getActionSet()
    print(ALLOWED_ACTIONS)

    start = time()
    t = 0
    while True:
        ep_reward = 0
        ple_game.reset_game()
        while not ple_game.game_over():
            sleep(0.1)
            t += 1
            # Action index 0 once every 15 steps, index 1 otherwise.
            action = 0 if t % 15 == 5 else 1
            reward = ple_game.act(ALLOWED_ACTIONS[action])
            ep_reward += reward
        print(ep_reward, t, t / (time() - start))
class PLEEnv(Env):
    """Env implementation backed by a PLE game instance."""

    def __init__(self, game, _id, render=True, reset_done=True, num_steps=100):
        super().__init__(_id, render, reset_done)
        self.num_steps = num_steps
        self.game = game
        self.start()

    def start(self):
        """Create and initialise the PLE instance unless one already exists."""
        if self.env_instance:
            return
        self.env_instance = PLE(self.game, fps=30, display_screen=self.render)
        self.env_instance.init()

    def step(self, action):
        """Apply ``action``; return (game state, reward, game-over flag)."""
        ple = self.env_instance
        reward = ple.act(action)
        obs = ple.getGameState()
        return obs, reward, ple.game_over()

    def reset(self):
        """Restart the game and return its state."""
        self.env_instance.reset_game()
        return self.env_instance.getGameState()

    def close(self):
        """Nothing to release."""
        pass

    def restart(self):
        """Close, then reset, the environment."""
        self.close()
        self.reset()
class SnakeEnv(object):
    """Snake game exposed through reset/step calls returning RGB frames."""

    def __init__(self):
        self.game = Snake()
        self.p = PLE(self.game, fps=30, display_screen=True)
        self.action_space = self.p.getActionSet()

    @property
    def action_space(self):
        """List of PLE action values, indexed by the action passed to step."""
        return self._action_space

    @action_space.setter
    def action_space(self, action_space):
        self._action_space = action_space

    def reset(self):
        """(Re)initialise PLE, play one no-op and return the RGB screen."""
        ple = self.p
        ple.init()
        ple.act(None)
        return ple.getScreenRGB()

    def step(self, action):
        """Play the action with index ``action``; return (screen, reward, done)."""
        ple = self.p
        reward = ple.act(self.action_space[action])
        frame = ple.getScreenRGB()
        return frame, reward, ple.game_over()
def run_game(nb_episodes, agent):
    """Run ``nb_episodes`` episodes with ``agent`` picking the moves.

    An episode of FlappyBird ends with the bird crashing into a pipe or going
    off screen; here it is also cut short once the score reaches 60.

    Args:
        nb_episodes: Number of episodes to play.
        agent: Object providing ``state_binner(state)`` and ``policy(state)``.

    Returns:
        Percentage of episodes that scored at least 50. Returns 0.0 without
        creating an environment when ``nb_episodes`` is not positive.
    """
    # Nothing to play: avoid dividing by zero in the statistics below.
    if nb_episodes <= 0:
        return 0.0

    reward_values = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }
    # TODO: when training use the following instead:
    # reward_values = agent.reward_values
    env = PLE(FlappyBird(), fps=30, display_screen=True, force_fps=True,
              rng=None, reward_values=reward_values)
    # TODO: to speed up training change parameters of PLE as follows:
    # display_screen=False, force_fps=True
    env.init()

    score = 0
    tot_nb_episodes = nb_episodes
    average = 0
    over_50_count = 0
    while nb_episodes > 0:
        # pick an action
        # TODO: for training using agent.training_policy instead
        state, _ = agent.state_binner(env.game.getGameState())
        action = agent.policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])

        # TODO: for training let the agent observe the current state transition
        score += reward

        # reset the environment if the game is over
        if env.game_over() or score >= 60:
            average += score
            if score >= 50:
                over_50_count += 1
            print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0

    print("Average for {} runs {:.2f}".format(tot_nb_episodes,
                                              average / tot_nb_episodes))
    over_50_p = (over_50_count / tot_nb_episodes) * 100
    print("The percentage of scores over 50 is: %d" % (over_50_p))
    return over_50_p
class PleEnvAdapter(EnvAdapter):
    """Pygame learning env adapter"""

    def __init__(self, *args, **kwargs):
        super(PleEnvAdapter, self).__init__(*args, **kwargs)
        if not self.render:
            # Headless run. Assigning through os.environ also exports the
            # variable, so the earlier conflicting os.putenv('fbcon') call
            # was overridden anyway and has been dropped.
            os.environ["SDL_VIDEODRIVER"] = "dummy"
        Game = envs_lookup_table[self.env_name]
        # Only hold the frame rate back when someone is watching.
        self.env = PLE(Game(), display_screen=self.render,
                       force_fps=not self.render)
        self.env.init()

    def get_input_shape(self):
        """Return the observation shape: one entry per game-state value."""
        return (len(self.env.getGameState()), )

    def reset(self):
        """Restart the game. Returns nothing."""
        self.env.reset_game()

    def step(self, action) -> (object, float, bool):
        """Play the action with index ``action``.

        Returns:
            (observation, reward, done), where the observation is the list of
            game-state values.
        """
        reward = self.env.act(self.env.getActionSet()[action])
        observation = list(self.env.getGameState().values())
        done = self.env.game_over()
        return observation, reward, done

    def get_n_actions(self) -> int:
        """Return the number of available actions."""
        return len(self.env.getActionSet())

    def get_random_action(self):
        """Return a uniformly random valid action index."""
        return random.randrange(self.get_n_actions())
class Env:
    """On-screen FlappyBird (pipe gap 125) behind a small step/reset API."""

    def __init__(self):
        flappy = FlappyBird(pipe_gap=125)
        ple_env = PLE(flappy, fps=30, display_screen=True)
        ple_env.init()
        # Read the state straight from the game object (maybe not necessary).
        ple_env.getGameState = flappy.getGameState
        self.game = flappy
        self.env = ple_env
        # By convention callers use (0, 1) while the game uses (None, 119);
        # indexing this list translates between the two.
        self.action_map = ple_env.getActionSet()  # [None, 119]

    def step(self, action):
        """Play the action with index ``action``.

        Returns (obs, reward, done). Unlike gym, no info dictionary is
        returned.
        """
        game_action = self.action_map[action]
        reward = self.env.act(game_action)
        finished = self.env.game_over()
        observation = self.get_observation()
        return observation, reward, finished

    def reset(self):
        """Start a new game and return its first observation."""
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        """Return the game-state values (keys dropped) as a numpy array."""
        return np.array(list(self.env.getGameState().values()))

    def set_display(self, boolean_value):
        """Switch on-screen rendering on or off."""
        self.env.display_screen = boolean_value
def main():
    """Train a DQN agent on GameEnv, evaluating and saving every 5 episodes."""
    render_bool = True
    if not render_bool:
        os.environ["SDL_VIDEODRIVER"] = "dummy"

    # Create the environment.
    game = GameEnv()
    env = PLE(game, display_screen=render_bool, fps=60, force_fps=False)
    env.init()

    # Build the agent with the PARL framework.
    print(env.getActionSet())
    act_dim = len(env.getActionSet())
    width, height = env.getScreenDims()
    rpm = ReplayMemory(MEMORY_SIZE)  # DQN experience replay pool
    obs_dim = 1, width, height
    model = Model(act_dim=act_dim)
    alg = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    # e_greed is the probability of picking a random action (exploration).
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim, e_greed=0.5,
                  e_greed_decrement=0.00001)

    # Load a previously saved model, if there is one.
    best_ckpt = './model_dqn.ckpt'
    best_eval_reward = -1000
    if os.path.exists(best_ckpt):
        print("loaded model:", best_ckpt)
        agent.restore(best_ckpt)
        best_eval_reward = evaluate(env, agent, render=render_bool)

    # Pre-fill the replay pool so early training batches are varied enough.
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 200000

    # Train for max_episode episodes; evaluation runs are not counted.
    episode = 0
    while episode < max_episode:
        # train part
        for _ in range(5):
            run_episode(env, agent, rpm)
            episode += 1

        # test part (render=True shows the game)
        eval_reward = evaluate(env, agent, render=render_bool)
        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))

        # Save the model; `rate_num` is not defined in this function and is
        # presumably a module-level value.
        agent.save('./model_dqn_%d.ckpt' % rate_num)
        if best_eval_reward < eval_reward:
            best_eval_reward = eval_reward
            agent.save(best_ckpt)
def train(nb_frames, agent):
    """Train ``agent`` on FlappyBird for ``nb_frames`` frames and plot progress.

    Every 100 episodes the average episode score and the frame count are
    recorded; the recorded points are drawn as a line plot at the end.
    """
    rewards_cfg = agent.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=rewards_cfg)
    env.init()

    episode_score = 0
    best_score = -5
    score_sum = 0
    averages = []
    frame_marks = []
    episodes_done = 0
    frames_done = 0
    while frames_done < nb_frames:
        # pick an action
        state = agent.state_binner(env.game.getGameState())
        action = agent.training_policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])

        # let the agent observe the current state transition
        next_state = agent.state_binner(env.game.getGameState())
        agent.observe(state, action, reward, next_state, env.game_over())

        episode_score += reward
        frames_done += 1

        # reset the environment if the game is over
        if not env.game_over():
            continue
        episodes_done += 1
        score_sum += episode_score
        if episode_score > best_score:
            best_score = episode_score
            print(best_score)
            print(episodes_done)
            print(frames_done)
        if episodes_done % 100 == 0:
            block_average = score_sum / 100
            print(block_average)
            averages.append(block_average)
            frame_marks.append(frames_done)
            score_sum = 0
        agent.calculate()
        env.reset_game()
        episode_score = 0

    print(best_score)
    df = pd.DataFrame({"Count": frame_marks, "Avrage": averages})
    sns.relplot(x="Count", y="Avrage", ci=None, kind="line", data=df)
def train(nb_episodes, agent):
    """Train ``agent`` on FlappyBird until a stop condition is reached.

    Training stops when ``nb_episodes`` episodes have been played, when the
    best episode score exceeds 450, when the average over a 100-episode
    window reaches 5, or at the first game over after ``agent.frames``
    reached a multiple of 10000.

    NOTE(review): the source reached review with its indentation collapsed,
    so the nesting of the ``break`` checks below is a reconstruction. Confirm
    it against the original file.

    Returns:
        The best episode score seen.
    """
    reward_values = agent.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True, rng=None, reward_values=reward_values)
    env.init()
    score = 0
    biggest_score = -50000
    avg_score = 0
    episodes = 0  # never read afterwards
    to_break = False
    while nb_episodes > 0:
        # pick an action
        state = env.game.getGameState()
        state = agent.state_binner(state)
        action = agent.training_policy(state)
        # step the environment
        reward = env.act(env.getActionSet()[action])
        #print("reward=%d" % reward)
        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())
        agent.frames += 1
        score += reward
        # Request a stop every 10000 frames; it takes effect at game over.
        if ((agent.frames % 10000) == 0):
            to_break = True
        # reset the environment if the game is over
        if env.game_over():
            avg_score += score
            if score > biggest_score:
                biggest_score = score
                if biggest_score > 450:
                    break
                print(biggest_score)
                print(nb_episodes)
            # nb_episodes counts down, so this fires whenever the number of
            # remaining episodes is a multiple of 100.
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                if avg_score / 100 >= 5:
                    break
                avg_score = 0
            if to_break:
                break
            #print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0
    return biggest_score
def test_agent(policy, file_writer=None, test_games=10, step=0):
    """Play ``test_games`` headless games with ``policy``; return the rewards.

    ``policy(state) == 1`` selects action 119, anything else selects None.
    Each chosen action is applied for two consecutive frames. When
    ``file_writer`` is given, each game's reward is logged under the
    'test_performance' tag at ``step``.
    """
    env = PLE(FlappyBird(), fps=30, display_screen=False)
    env.init()

    rewards = []
    for _ in range(test_games):
        env.reset_game()
        no_op(env)

        total = 0
        while not env.game_over():
            wants_flap = policy(flappy_game_state(env)) == 1
            action = 119 if wants_flap else None
            # Same action on two consecutive frames.
            total += env.act(action)
            total += env.act(action)
        rewards.append(total)

        if file_writer is None:
            continue
        summary = tf.Summary()
        summary.value.add(tag='test_performance', simple_value=total)
        file_writer.add_summary(summary, step)
        file_writer.flush()

    return rewards
def train(nb_frames, agent, a, g, results):
    """Train ``agent`` for ``nb_frames`` frames and log progress to ``results``.

    ``a`` (alpha) and ``g`` (gamma) are only printed and recorded. Every 100
    episodes the average score, frame count, ``a`` and ``g`` are appended to
    ``results[0]`` .. ``results[3]``.

    Returns:
        The same ``results`` object.
    """
    print("alpha %f" % a)
    print("gamma %f" % g)
    rewards_cfg = agent.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=rewards_cfg)
    env.init()

    episode_score = 0
    best_score = -5
    score_sum = 0
    frames_done = 0
    episodes_done = 0
    while frames_done < nb_frames:
        # pick an action
        state = agent.state_binner(env.game.getGameState())
        action = agent.training_policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])

        # let the agent observe the current state transition
        next_state = agent.state_binner(env.game.getGameState())
        agent.observe(state, action, reward, next_state, env.game_over())

        episode_score += reward
        frames_done += 1

        # reset the environment if the game is over
        if not env.game_over():
            continue
        episodes_done += 1
        score_sum += episode_score
        if episode_score > best_score:
            best_score = episode_score
            print(best_score)
            print(episodes_done)
            print(frames_done)
        if episodes_done % 100 == 0:
            block_average = score_sum / 100
            print(block_average)
            results[0].append(block_average)
            results[1].append(frames_done)
            results[2].append(a)
            results[3].append(g)
            score_sum = 0
        env.reset_game()
        episode_score = 0

    print(best_score)
    return results
def score(self, training=True, nb_episodes=10):
    """Play ``nb_episodes`` games with ``self.policy`` and record the scores.

    One point is earned per positive reward; an episode is cut off at 100
    points. With ``training`` set, a summary row is appended to
    ``<self.name>/scores.csv`` (the header is written when the file does not
    exist yet); otherwise one line per game is appended to ``scores.txt``.
    """
    reward_values = {
        'positive': 1.0,
        'negative': 0.0,
        'tick': 0.0,
        'loss': 0.0,
        'win': 0.0
    }
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=reward_values)
    env.init()

    total_episodes = nb_episodes
    running = 0
    scores = []
    while nb_episodes > 0:
        # pick an action
        action = self.policy(env.game.getGameState())

        # step the environment
        running += env.act(env.getActionSet()[action])

        # reset the environment if the game is over
        if env.game_over() or running >= 100:
            scores.append(running)
            env.reset_game()
            nb_episodes -= 1
            running = 0

    avg_score = sum(scores) / float(len(scores))
    print('Games played: {}'.format(total_episodes))
    print('Average score: {}'.format(avg_score))

    if not training:
        with open('scores.txt', 'a') as f:
            for game_score in scores:
                f.write('{},{}\n'.format(self.name, game_score))
        return

    score_file = '{}/scores.csv'.format(self.name)
    # A missing file gets the header line first.
    needs_header = not os.path.isfile(score_file)
    with open(score_file, 'a') as f:
        if needs_header:
            f.write('avg_score,episode_count,num_of_frames,min,max\n')
        f.write('{},{},{},{},{}\n'.format(avg_score, self.num_of_episodes,
                                          self.num_of_frames, min(scores),
                                          max(scores)))
def set_maze_game_setup(self, game):
    '''
    Wrap ``game`` in an on-screen PLE environment and store its action set.

    @game : game instance
    Returns the initialised PLE environment.
    '''
    env = PLE(game, display_screen=True)
    self.actions = env.getActionSet()
    env.init()
    return env
def run_a_game(self, game):
    """Drive ``game`` on screen for NUM_STEPS actions picked by a NaiveAgent."""
    from ple import PLE

    env = PLE(game, display_screen=True)
    agent = NaiveAgent(env.getActionSet())
    env.init()
    # Start with a no-op so there is a reward to hand to the agent.
    reward = env.act(env.NOOP)
    for _ in range(NUM_STEPS):
        frame = env.getScreenRGB()
        chosen = agent.pickAction(reward, frame)
        reward = env.act(chosen)
def run_a_game(self, game):
    """Play ``game`` on screen, letting a NaiveAgent choose NUM_STEPS actions."""
    from ple import PLE

    ple_env = PLE(game, display_screen=True)
    naive = NaiveAgent(ple_env.getActionSet())
    ple_env.init()
    # The first reward comes from a no-op step.
    last_reward = ple_env.act(ple_env.NOOP)
    for _ in range(NUM_STEPS):
        screen = ple_env.getScreenRGB()
        last_reward = ple_env.act(naive.pickAction(last_reward, screen))
def test_movement_up():
    """Check that the "up" action lowers the player's velocity in Pong."""
    pong = Pong()
    env = PLE(pong, display_screen=True, fps=20, force_fps=1)
    env.init()
    time.sleep(.5)

    before = env.getGameState()
    env.act(pong.actions["up"])
    after = env.getGameState()

    assert before["player_velocity"] > after["player_velocity"]
def train(FRAME_TRAIN=1000005):
    """Continue training the double-DQN agent on FlappyBird.

    Loads the 'model95000' weights, plays ``FRAME_TRAIN`` frames, replays a
    batch of 32 transitions per frame once enough are stored, saves the model
    and syncs the target network every 5000 frames, and plots the episode
    rewards every 1000 frames.

    Args:
        FRAME_TRAIN: Number of frames to train for.
    """
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()

    def read_state():
        # Game-state values as a 1x8 row vector.
        values = np.asarray(list(game.getGameState().values()))
        return np.reshape(values, [1, 8])

    state = read_state()
    total_reward = 0
    agent = DDQN_Agent.DeepQAgent()
    agent.load('model95000')
    batch_size = 32
    my_timer = time.time()
    prev_frame = 0
    data = []
    for i in range(FRAME_TRAIN):
        if p.game_over():
            data.append(total_reward)
            p.reset_game()
            # Start the new episode from its own first state. Without this
            # the agent acted on, and stored, the previous episode's terminal
            # state as the first state of the new one.
            state = read_state()
            print(
                "Total reward = {}, Frame = {}, epsilon = {}, frame/second = {}"
                .format(total_reward, i, agent.epsilon,
                        (i - prev_frame) / (time.time() - my_timer)))
            total_reward = 0
            prev_frame = i
            my_timer = time.time()

        # get action from agent
        action = agent.act(state)

        # take action
        reward = p.act(p.getActionSet()[action])

        # making the reward space less sparse
        if reward < 0:
            reward = -1
        total_reward += reward

        next_state = read_state()

        # remember and replay
        agent.remember(state, action, reward, next_state, p.game_over())
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        state = next_state

        # save the model and sync the target network
        if i % 5000 == 0:
            print("Updating weights")
            agent.save('newmodel' + str(i))
            agent.target_model.set_weights(agent.model.get_weights())

        # plot the score
        if i % 1000 == 0:
            plot(data)
class GameEnv(object):
    """FlappyBird environment whose state is a stack of the last four frames."""

    def __init__(self, display_screen):
        self.width = IMAGE_WIDTH
        self.height = IMAGE_HEIGHT
        self.count = 0  # number of frames preprocessed so far
        self.p = PLE(FlappyBird(), fps=30, display_screen=display_screen)
        self.p.init()
        self._update_state()
        self.score = 0

    def pre_process_image(self, image):
        """Turn an RGB screen into a (1, width, height, 1) float array."""
        self.count += 1
        image = color.rgb2gray(image)
        image = transform.resize(image, (self.width, self.height))
        image = exposure.rescale_intensity(image, out_range=(0, 255))
        image = image.astype('float')
        image = image / 255.0
        return image.reshape(1, self.width, self.height, 1)

    def _update_state(self):
        """Push the current screen onto the front of the four-frame stack.

        The first call fills all four channels with the same frame. Later
        calls put the newest frame in channel 0 and keep the three most
        recent older frames behind it. The previous code assigned the new
        frame to channels 0-2 at once, which erased the history and left
        channel 3 stuck on the very first frame.
        """
        image = self.pre_process_image(self.p.getScreenRGB())
        state = getattr(self, 'state', None)
        if state is None:
            self.state = np.concatenate([image] * 4, axis=3)
        else:
            self.state = np.concatenate([image, state[:, :, :, :3]], axis=3)

    def get_state(self):
        """Return the current frame stack."""
        return self.state

    def step(self, action):
        """Play one frame.

        Args:
            action: 1 sends action 119 to the game; anything else sends None.

        Returns:
            (state, reward, done, score): reward is -1 on game over and 0.1
            otherwise; score is the running episode total including this
            reward. The game is reset automatically on game over.
        """
        self.p.act(119 if action == 1 else None)
        self._update_state()

        done = bool(self.p.game_over())
        if done:
            self.p.reset_game()
            reward = -1
        else:
            reward = 0.1

        return_score = self.score + reward
        # The running total restarts once the episode has ended.
        self.score = 0 if done else return_score
        return self.state, reward, done, return_score

    def get_score(self):
        """Return the running score of the current episode."""
        return self.score
class PLEEnv(gym.Env):
    """Gym-style Catcher environment whose observation stacks k grayscale frames."""

    def __init__(self, env_config):
        catcher = Catcher(width=screen_wh, height=screen_wh)
        # force_fps=False runs at the slower, watchable speed.
        self.env = PLE(catcher, fps=30, frame_skip=2, num_steps=2,
                       force_fps=False, display_screen=True)
        self.env.init()
        # Discrete action index -> PLE action value.
        self.action_dict = {0: None, 1: 97, 2: 100}
        # PLE env starts with black screen
        self.env.act(self.env.NOOP)

        self.action_space = Discrete(3)
        self.k = 4
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(screen_wh, screen_wh, 1 * self.k))
        self.frames = deque([], maxlen=self.k)

    def _grab_frame(self):
        # Current grayscale screen with a trailing channel axis.
        return np.reshape(self.env.getScreenGrayscale(),
                          (screen_wh, screen_wh, 1))

    def reset(self):
        """Restart the game and return a stack of k copies of the first frame."""
        self.env.reset_game()
        # PLE env starts with black screen, NOOP step to get initial screen
        self.env.act(self.env.NOOP)
        first = self._grab_frame()
        for _ in range(self.k):
            self.frames.append(first)
        return self._get_ob()

    def step(self, action):
        """Apply ``action``; return (observation, reward, done, info)."""
        reward = self.env.act(self.action_dict[action])
        self.frames.append(self._grab_frame())
        done = self.env.game_over()
        return self._get_ob(), reward, done, {}

    def _get_ob(self):
        """Concatenate the stored frames along the channel axis."""
        assert len(self.frames) == self.k
        return np.concatenate(self.frames, axis=2)
def show_playing(episodes, agent):
    """Run ``episodes`` games on screen at normal speed via ``run_game``.

    Uses the module-level ``reward_values``.
    """
    visible_env = PLE(FlappyBird(), fps=30, display_screen=True,
                      force_fps=False, rng=None, reward_values=reward_values)
    visible_env.init()
    return run_game(episodes, agent, visible_env, False)
def set_maze_game_setup(self, game):
    '''
    Wrap ``game`` in a headless PLE environment and store its usable actions.

    @game : game instance
    Returns the initialised PLE environment.
    '''
    env = PLE(game, display_screen=False)
    # In some games, doing nothing is a valid action; in a maze it is not,
    # so the last entry of the action set is dropped.
    all_actions = env.getActionSet()
    self.actions = all_actions[:-1]
    env.init()
    return env
def train(self):
    """Train on FlappyBird until ``self.frame_count`` exceeds 1,000,000.

    An episode of FlappyBird ends with the bird crashing into a pipe or going
    off screen; during training it is also ended after 100 pipes. Plots are
    drawn by a background daemon thread. Whenever the frame count is a
    multiple of 25000, progress is printed, ``self.score()`` is run and the
    agent is pickled to ``<self.name>/agent.pkl``.
    """
    if not os.path.exists(self.name):
        os.mkdir(self.name)

    t = threading.Thread(target=self.draw_plots)
    t.daemon = True
    t.start()

    reward_values = self.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=reward_values)
    env.init()

    # Count passed pipes from the per-frame reward. The counter was never
    # updated before, so the 100-pipe limit could never trigger.
    # NOTE(review): assumes reward_values is a dict and that a frame's reward
    # is about `tick`, plus `positive` when a pipe is passed (PLE's defaults
    # are 0.0 and 1.0) -- confirm against the PLE version in use.
    tick_reward = reward_values.get("tick", 0.0)
    pipe_reward = reward_values.get("positive", 1.0)
    # Halfway between a plain frame and a pipe frame tolerates float error.
    pipe_threshold = tick_reward + pipe_reward / 2.0

    score = 0
    while self.frame_count <= 1000000:
        # pick an action
        state1 = env.game.getGameState()
        action = self.training_policy(state1)

        # step the environment
        reward = env.act(env.getActionSet()[action])

        state2 = env.game.getGameState()

        if pipe_reward > 0 and reward > pipe_threshold:
            score += 1

        # Stop after reaching 100 pipes
        end = env.game_over() or score >= 100
        self.observe(state1, action, reward, state2, end)

        # reset the environment if the game is over
        if end:
            env.reset_game()
            score = 0

        if self.frame_count % 25000 == 0:
            print("==========================")
            print("episodes done: {}".format(self.episode_count))
            print("frames done: {}".format(self.frame_count))
            self.score()
            with open("{}/agent.pkl".format(self.name), "wb") as f:
                pickle.dump((self), f, pickle.HIGHEST_PROTOCOL)
            print("==========================")
def test():
    """Play 200 Snake games with a saved agent; return the per-game apple totals.

    Command-line arguments: alpha, gamma, model file name and the two
    activation names (``sys.argv[1]`` .. ``sys.argv[5]``).
    """
    snake = Snake(600, 600)
    rewards_cfg = {
        "positive": 100.0,
        "negative": -50.0,
        "tick": -0.1,
        "loss": -70.0,
        "win": 5.0
    }
    p = PLE(snake, fps=60, state_preprocessor=process_state, force_fps=True,
            display_screen=True, frame_skip=2, reward_values=rewards_cfg)
    agent = Agent(alpha=float(sys.argv[1]), gamma=float(sys.argv[2]),
                  n_actions=3, epsilon=0.01, batch_size=100, input_shape=6,
                  epsilon_dec=0.99999, epsilon_end=0.001, memory_size=500000,
                  file_name=sys.argv[3],
                  activations=[str(sys.argv[4]), str(sys.argv[5])])
    p.init()
    agent.load_game()

    scores = []
    for _ in range(200):
        if p.game_over():
            p.reset_game()
        apples = 0
        heading = "Right"
        while not p.game_over():
            features = np.array(vision(list(p.getGameState()[0]), heading))
            action = agent.choose_action(features)
            # Pick the action-th (key, heading) pair valid for this heading.
            choices = list(prepare_corect_directions(heading).items())
            key, heading = choices[action]
            reward = p.act(key)
            # Only rewards above 50 are added to the apple total.
            if reward > 50.0:
                apples += reward
        scores.append(apples)
    return scores
class Game(gym.Env):
    """Gym-style FlappyBird environment with a compact string state."""

    def __init__(self, display_screen=False, force_fps=True):
        # NOTE(review): the dummy video driver is selected even when
        # display_screen is True -- confirm this is intended.
        os.environ["SDL_VIDEODRIVER"] = "dummy"
        # define and initiate the environment
        self.env = PLE(FlappyBird(), fps=30, display_screen=display_screen,
                       force_fps=force_fps)
        self.env.init()
        # list of actions in the environment
        self.actions = self.env.getActionSet()
        # one discrete choice per action
        self.action_space = spaces.Discrete(len(self.actions))

    def step(self, action):
        """Take the chosen action; return (state, reward, terminal, info).

        The reward reported by the game is discarded: the result is -1000 if
        the game ended on this step and +1 if it continues.
        """
        self.env.act(self.actions[action])
        state = self.getGameState()
        terminal = self.env.game_over()
        reward = -1000 if terminal else 1
        return state, reward, terminal, {}

    def getGameState(self):
        '''
        Reduce the PLE state dictionary to the three values that define the
        state: player velocity, horizontal distance to the next pipe, and the
        next pipe's bottom y minus the player's y. They are returned as one
        space-separated string.
        '''
        raw = self.env.getGameState()
        parts = (
            raw['player_vel'],
            raw['next_pipe_dist_to_player'],
            raw['next_pipe_bottom_y'] - raw['player_y'],
        )
        return ' '.join([str(part) for part in parts])

    def reset(self):
        """Reset the game and return the raw PLE state dictionary.

        NOTE(review): step() returns the string form of the state while this
        returns the raw dictionary -- confirm callers expect that.
        """
        self.env.reset_game()
        return self.env.getGameState()

    def seed(self, seed):
        """Install a RandomState seeded with ``seed`` and re-initialise."""
        self.env.rng = np.random.RandomState(seed)
        self.env.game.rng = self.env.rng
        self.env.init()
def main(train=False):
    """Play Pong rounds, agent against CPU, until one side wins NUM_ROUNDS.

    With ``train`` set, the game runs headless at full speed; otherwise it is
    shown on screen at normal speed. A summary is printed at the end.
    """
    # Don't modify anything in this function.
    # See the constants defined at the top of this file if you'd like to
    # change the FPS, screen size, or round length
    game = Pong(width=WIDTH, height=HEIGHT, MAX_SCORE=MAX_SCORE)

    headless = bool(train)
    p = PLE(game, fps=FPS, display_screen=not headless, force_fps=headless)
    p.init()

    agent_rounds = cpu_rounds = 0
    agent_score = cpu_score = 0
    num_frames = 0
    while True:
        if p.game_over():
            agent_won = game.score_counts['agent'] > game.score_counts['cpu']
            if agent_won:
                agent_rounds += 1
                print('AGENT won round')
            else:
                cpu_rounds += 1
                print('CPU won round')
            if NUM_ROUNDS in (agent_rounds, cpu_rounds):
                break
            p.reset_game()

        obs = p.getGameState()
        action = agent(normalize(obs))
        reward = p.act(ACTION_MAP[action])
        if reward > 0:
            agent_score += 1
            print('AGENT scored')
        elif reward < 0:
            cpu_score += 1
            print('CPU scored')
        num_frames += 1

    winner = 'AGENT' if agent_rounds > cpu_rounds else 'CPU'
    print('Winner:', winner)
    print('Num frames :', num_frames)
    print('AGENT rounds won:', agent_rounds)
    print('CPU rounds won:', cpu_rounds)
    print('AGENT total score:', agent_score)
    print('CPU total score:', cpu_score)
def main_naive():
    """Let a NaiveAgent play FlappyBird on screen for 10000 frames."""
    env = PLE(FlappyBird(), fps=30, display_screen=True)
    my_agent = naive.NaiveAgent(allowed_actions=env.getActionSet())
    env.init()

    reward = 0.0
    nb_frames = 10000
    for _ in range(nb_frames):
        if env.game_over():
            env.reset_game()
        screen = env.getScreenRGB()
        reward = env.act(my_agent.pickAction(reward, screen))
def play_with_saved_agent(agent_file_path, agent_file_name, test_rounds=20):
    """Play RunningMinion with a saved agent and return the mean episode reward.

    Args:
        agent_file_path: Directory passed to ``load_agent``.
        agent_file_name: File name passed to ``load_agent``.
        test_rounds: Number of episodes to play.

    Returns:
        Total reward divided by ``test_rounds``.
    """
    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              state_preprocessor=process_state)
    my_agent = load_agent(env, agent_file_path, agent_file_name)
    env.init()

    # print() with one pre-formatted string gives the same output on
    # Python 2 and 3. The old print statements were a syntax error on 3.
    print("Testing model: {}".format(agent_file_name))
    total_reward = 0.0
    for _ in range(test_rounds):
        # NOTE(review): the environment is never reset here; presumably
        # start_episode() does that -- confirm.
        my_agent.start_episode()
        episode_reward = 0.0
        while not env.game_over():
            state = env.getGameState()
            reward, action = my_agent.act(state, epsilon=0.00)
            episode_reward += reward
        print("Agent score {:0.1f} reward for episode.".format(episode_reward))
        total_reward += episode_reward
        my_agent.end_episode()
    return total_reward / test_rounds
class MyEnv(Environment):
    """Environment adapter exposing a PLE game as 48x48 grayscale frames."""

    VALIDATION_MODE = 0

    def __init__(self, rng, game=None, frame_skip=4, ple_options=None):
        """Wrap ``game`` in a PLE instance.

        Args:
            rng: Random state; its ``randint`` picks the number of no-op
                frames played after each reset.
            game: PLE game instance (required).
            frame_skip: Frames each action is repeated for (at least 1).
            ple_options: Keyword arguments for ``PLE``; defaults to
                display_screen=True, force_fps=True, fps=30.

        Raises:
            ValueError: If ``game`` is None.
        """
        # Build the default per call instead of sharing one mutable dict
        # between all instances.
        if ple_options is None:
            ple_options = {"display_screen": True, "force_fps": True,
                           "fps": 30}
        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0
        self._frameSkip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng

        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()

    def reset(self, mode):
        """Start a new episode in ``mode`` and return an all-zero first input."""
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                # Entering validation: start fresh statistics.
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1:  # and thus mode == -1
            self._mode = -1

        self._ple.reset_game()
        # Play a random number (0-14) of no-op frames.
        for _ in range(self._random_state.randint(15)):
            self._ple.act(self._ple.NOOP)
        self._screen = self._ple.getScreenGrayscale()
        # Writes the downscaled frame into self._reducedScreen in place.
        cv2.resize(self._screen, (48, 48), self._reducedScreen,
                   interpolation=cv2.INTER_NEAREST)

        return [4 * [48 * [48 * [0]]]]

    def act(self, action):
        """Repeat the action for up to frame_skip frames; return the reward sign."""
        action = self._actions[action]

        reward = 0
        for _ in range(self._frameSkip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break

        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen,
                   interpolation=cv2.INTER_NEAREST)

        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        """Print the mean score per episode for the current mode."""
        # An episode still in progress counts as one more episode.
        if not self.inTerminalState():
            self._mode_episode_count += 1
        print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count))

    def inputDimensions(self):
        """Return the input shape: a history of 4 frames of 48x48."""
        return [(4, 48, 48)]

    def observationType(self, subject):
        """Return the dtype of observations."""
        return np.uint8

    def nActions(self):
        """Return the number of available actions."""
        return len(self._actions)

    def observe(self):
        """Return a copy of the current 48x48 frame, wrapped in a list."""
        return [np.array(self._reducedScreen)]

    def inTerminalState(self):
        """Return whether the current game is over."""
        return self._ple.game_over()
def launch(args, defaults, description):
    """
    Execute a complete training run.

    ``args``, ``defaults`` and ``description`` are forwarded to
    ``process_args``; ``defaults`` must also provide ``RESIZED_WIDTH`` and
    ``RESIZED_HEIGHT``. The game class named by ``parameters.game`` is
    imported from ``ple.games.<lowercased name>``, wrapped in a ``PLE``
    environment, and handed to a ``PLEExperiment`` together with a
    ``NeuralAgent`` whose network is either freshly built or unpickled from
    ``parameters.nn_file``.

    Raises:
        ValueError: If the game module/class cannot be loaded or
            instantiated.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    # Empty mapping passed to PLE as ``reward_values``.
    rewards = {}

    try:
        module = importlib.import_module("ple.games.%s" % parameters.game.lower())
        game = getattr(module, parameters.game)
        if parameters.game == "FlappyBird":
            game = game()
        elif parameters.game == "WaterWorld":
            game = game(width=84, height=84, num_creeps=6)
        else:
            game = game(width=84, height=84)
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are no longer converted into a ValueError.
        raise ValueError("The game %s could not be found. Try using the classname, it is case sensitive." % parameters.game)

    # A fixed seed makes runs repeatable when requested.
    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    env = PLE(
        game,
        fps=60,
        force_fps=parameters.force_fps,
        display_screen=parameters.display_screen,
        reward_values=rewards,
        rng=rng
    )

    num_actions = len(env.getActionSet())

    if parameters.nn_file is None:
        network = q_network.DeepQLearner(defaults.RESIZED_WIDTH,
                                         defaults.RESIZED_HEIGHT,
                                         num_actions,
                                         parameters.phi_length,
                                         parameters.discount,
                                         parameters.learning_rate,
                                         parameters.rms_decay,
                                         parameters.rms_epsilon,
                                         parameters.momentum,
                                         parameters.clip_delta,
                                         parameters.freeze_interval,
                                         parameters.batch_size,
                                         parameters.network_type,
                                         parameters.update_rule,
                                         parameters.batch_accumulator,
                                         rng)
    else:
        # SECURITY: unpickling executes arbitrary code; only load network
        # files from a trusted source.
        # The context manager closes the file even if loading fails.
        with open(parameters.nn_file, 'r') as handle:
            network = cPickle.load(handle)

    agent = ple_agent.NeuralAgent(network,
                                  parameters.epsilon_start,
                                  parameters.epsilon_min,
                                  parameters.epsilon_decay,
                                  parameters.replay_memory_size,
                                  parameters.experiment_prefix,
                                  parameters.replay_start_size,
                                  parameters.update_frequency,
                                  rng)

    experiment = ple_experiment.PLEExperiment(env, agent,
                                              defaults.RESIZED_WIDTH,
                                              defaults.RESIZED_HEIGHT,
                                              parameters.resize_method,
                                              parameters.epochs,
                                              parameters.steps_per_epoch,
                                              parameters.steps_per_test,
                                              parameters.frame_skip,
                                              parameters.death_ends_episode,
                                              parameters.max_start_nullops,
                                              rng)

    # The environment is initialised only after the experiment is assembled.
    env.init()
    experiment.run()
class NaiveAgent():
    """
    This is our naive agent. It picks actions at random!

    ``actions`` is the sequence of allowed actions to sample from.
    """

    def __init__(self, actions):
        self.actions = actions

    def pickAction(self, reward, obs):
        """Return a uniformly random action; ``reward`` and ``obs`` are ignored."""
        return self.actions[np.random.randint(0, len(self.actions))]


###################################
game = Doom(scenario="take_cover")

env = PLE(game)
agent = NaiveAgent(env.getActionSet())
env.init()

reward = 0.0
for f in range(15000):
    # if the game is over, start a new one
    if env.game_over():
        env.reset_game()

    action = agent.pickAction(reward, env.getScreenRGB())
    reward = env.act(action)

    # After frame 2000, show the screen and stop forcing the frame rate.
    if f > 2000:
        env.display_screen = True
        env.force_fps = False

    if f > 2250:
        # NOTE(review): the body of this branch is missing from the source
        # (the snippet is cut off here), which made the script a syntax
        # error. `pass` keeps it runnable; restore the intended statements
        # from the original example.
        pass
def agent_training(agent_file_path, agent_file_name, fig_path, num_steps_train_total = 5000):
    """Train an agent on RunningMinion, evaluate it each epoch, then save it.

    Runs 5 epochs. Each epoch has a training phase of
    ``num_steps_train_total / 5`` steps with an epsilon-greedy policy and
    replay-memory updates, followed by a test phase with a fixed epsilon of
    0.05. Progress is logged to ``../learning.log``; reward curves are
    plotted with ``plot_figure`` and the agent is written with ``save_agent``.

    Args:
        agent_file_path: Passed to ``save_agent``.
        agent_file_name: Passed to ``save_agent``.
        fig_path: Passed to ``plot_figure`` for both reward plots.
        num_steps_train_total: Total training steps across all epochs.
    """
    # training parameters
    num_epochs = 5
    num_steps_train_epoch = num_steps_train_total/num_epochs  # steps per epoch of training
    num_steps_test = 100
    update_frequency = 10  # step frequency of model training/updates

    epsilon = 0.15  # percentage of time we perform a random action, help exploration.
    epsilon_steps = 1000  # decay steps
    epsilon_min = 0.1
    epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

    # memory settings
    max_memory_size = 10000
    min_memory_size = 60  # number needed before model training starts

    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True, state_preprocessor=process_state)
    my_agent = init_agent(env)

    memory = utils.ReplayMemory(max_memory_size, min_memory_size)
    env.init()

    # Logging configuration and figure plotting
    logging.basicConfig(filename='../learning.log', filemode='w',
                        level=logging.DEBUG, format='%(levelname)s:%(message)s')
    logging.info('========================================================')
    logging.info('Training started for total training steps: '+str(num_steps_train_total)+'.\n')
    learning_rewards = [0]
    testing_rewards = [0]

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train_epoch:
            episode_reward = 0.0
            my_agent.start_episode()

            while not env.game_over() and steps < num_steps_train_epoch:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=epsilon)
                memory.add([state, action, reward, env.game_over()])

                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(my_agent)

                    if loss is not None:
                        losses.append(loss)
                        # Builtin max of two scalars: np.max would treat the
                        # second positional argument as `axis` and raise.
                        epsilon = max(epsilon_min, epsilon - epsilon_rate)

                episode_reward += reward
                steps += 1

            # Only episodes that ended before the epoch's step budget ran
            # out are added to the learning curve.
            if steps < num_steps_train_epoch:
                learning_rewards.append(episode_reward)

            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Train Epoch {:02d}: Epsilon {:0.4f} | Avg. Loss {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, epsilon, np.mean(losses), np.sum(rewards) / num_episodes))

        steps, num_episodes = 0, 0
        losses, rewards = [], []

        # testing loop
        while steps < num_steps_test:
            episode_reward = 0.0
            my_agent.start_episode()

            while not env.game_over() and steps < num_steps_test:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=0.05)

                episode_reward += reward
                # Running cumulative reward, one entry per test step.
                testing_rewards.append(testing_rewards[-1]+reward)
                steps += 1

                # done watching after 500 steps.
                if steps > 500:
                    env.display_screen = False

            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))

            if steps < num_steps_test:
                testing_rewards.append(episode_reward)

            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Test Epoch {:02d}: Best Reward {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, np.max(rewards), np.sum(rewards) / num_episodes))

    logging.info("Training complete.\n\n")
    plot_figure(fig_path, learning_rewards, 'reward', 'reward_in_training', num_steps_train_total)
    plot_figure(fig_path, testing_rewards, 'reward', 'reward_in_testing', num_steps_train_total)

    save_agent(my_agent, agent_file_path, agent_file_name)
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

# graphics="fancy" gives the full background with random bird and pipe
# colors; graphics="fixed" (the default) gives a black background with
# constant bird and pipe colors.
game = FlappyBird(graphics="fixed")

# Note: force_fps=False lets you watch the agent act in real time. Use it for
# display purposes only, not for learning.
p = PLE(
    game,
    fps=30,
    frame_skip=1,
    num_steps=1,
    force_fps=False,
    display_screen=True,
)
p.init()
reward = 0.0

nb_games = 100
# One accumulated reward per game played.
cumulated = np.zeros(nb_games)

for i in range(nb_games):
    p.reset_game()

    while not p.game_over():
        state = game.getGameState()
        screen = p.getScreenRGB()
        # Your job is to define this function.
        action = FlappyPolicy(state, screen)

        reward = p.act(action)
        cumulated[i] += reward

average_score = cumulated.mean()
max_score = cumulated.max()
def trainNetwork(s, readout, h_fc1, sess): # define the cost function a = tf.placeholder("float", [None, ACTIONS]) y = tf.placeholder("float", [None]) readout_action = tf.reduce_sum(tf.mul(readout, a), reduction_indices = 1) cost = tf.reduce_mean(tf.square(y - readout_action)) train_step = tf.train.AdamOptimizer(1e-6).minimize(cost) # open up a game state to communicate with emulator #setupGame() gameClass = FlappyBird(width=288, height=512, pipe_gap=100) fps = 30 frame_skip = 2 num_steps = 1 force_fps = False display_screen = True reward = 0.0 nb_frames = 15000 game = PLE(gameClass, fps=fps, frame_skip=frame_skip, num_steps=num_steps, force_fps=force_fps, display_screen=display_screen) game.init() # store the previous observations in replay memory D = deque() # printing logdir = "logs_" + GAME if not os.path.exists(logdir): os.makedirs(logdir) a_file = open(logdir + "/readout.txt", 'w') h_file = open(logdir + "/hidden.txt", 'w') # get the first state by doing nothing and preprocess the image to 80x80x4 r_0 = game.act(game.NOOP) x_t = game.getScreenGrayscale() terminal = game.game_over() if terminal: print "NOOOO" game.reset_game() x_t = cv2.resize(x_t, (80, 80)) ret, x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY) s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2) # saving and loading networks #saver = tf.train.Saver() sess.run(tf.initialize_all_variables()) ''' checkpoint = tf.train.get_checkpoint_state("saved_networks") if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print "Successfully loaded:", checkpoint.model_checkpoint_path else: print "Could not find old network weights" ''' epsilon = INITIAL_EPSILON t = 0 while True: # choose an action epsilon greedily readout_t = readout.eval(feed_dict = {s : [s_t]})[0] a_t = np.zeros([ACTIONS]) action_index = 0 if random.random() <= epsilon or t <= OBSERVE: action_index = random.randrange(ACTIONS) a_t[random.randrange(ACTIONS)] = 1 else: action_index = 
np.argmax(readout_t) a_t[action_index] = 1 # scale down epsilon if epsilon > FINAL_EPSILON and t > OBSERVE: epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE for i in range(0, K): # run the selected action and observe next state and reward r_t = game.act(np.argmax(a_t)) x_t1 = game.getScreenGrayscale() terminal = game.game_over() if terminal: print "NOOO2" game.reset_game() x_t1 = cv2.resize(x_t1, (80, 80)) ret, x_t1 = cv2.threshold(x_t1,1,255,cv2.THRESH_BINARY) x_t1 = np.reshape(x_t1, (80, 80, 1)) s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2) # store the transition in D D.append((s_t, a_t, r_t, s_t1, terminal)) if len(D) > REPLAY_MEMORY: D.popleft() # only train if done observing if t > OBSERVE: # sample a minibatch to train on minibatch = random.sample(D, BATCH) # get the batch variables s_j_batch = [d[0] for d in minibatch] a_batch = [d[1] for d in minibatch] r_batch = [d[2] for d in minibatch] s_j1_batch = [d[3] for d in minibatch] y_batch = [] readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch}) for i in range(0, len(minibatch)): # if terminal only equals reward if minibatch[i][4]: y_batch.append(r_batch[i]) else: y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i])) # perform gradient step train_step.run(feed_dict = { y : y_batch, a : a_batch, s : s_j_batch}) # update the old values s_t = s_t1 t += 1 # save progress every 10000 iterations if t % 10000 == 0: saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t) # print info state = "" if t <= OBSERVE: state = "observe" elif t > OBSERVE and t <= OBSERVE + EXPLORE: state = "explore" else: state = "train" print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t) # write info to files '''