class WaterWorld:
    def __init__(self, fps=30, display_screen=False):
        game = PyGameWaterWorld()
        self.game = PLE(game, fps=fps, display_screen=display_screen)
        action_set = self.game.getActionSet()
        self.action_map = {i: a for (i, a) in enumerate(action_set)}
        self.action_space = spaces.Discrete(len(self.action_map))
        self.metadata = {'render.modes': ['human', 'rgb_array']}
        box = np.ones((48, 48, 3), dtype='float32')
        self.observation_space = spaces.Box(low=box * 0, high=box * 255)

    def reset(self):
        self.game.reset_game()
        return self.game.getScreenRGB()

    def step(self, action):
        a = self.action_map[action]
        r = self.game.act(a)
        done = self.game.game_over()
        info = {}
        return self.game.getScreenRGB(), r, done, info

    def close(self):
        pass

class Env:
    def __init__(self):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=True)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary

        # by convention we want to use (0, 1),
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]

    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        # don't bother returning an info dictionary like gym
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # the game state is a dictionary which describes
        # the meaning of each value; we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value

def test_play(agent, game, n, accelerated=False):
    p = PLE(game, fps=30, frame_skip=1, num_steps=1,
            force_fps=accelerated, display_screen=DISPLAY)
    cumulated = np.zeros(n, dtype=np.int32)

    for i in range(n):
        p.reset_game()
        while not p.game_over():
            state = game.getGameState()
            qvals = agent.get_qvals(state)
            act = agent.greedy_action(qvals, 0)
            reward = p.act(ACTIONS[act])
            if reward > 0:
                cumulated[i] += 1
        print('Game:', i, ', doors:', cumulated[i])

    average_score = np.mean(cumulated)
    max_score = np.max(cumulated)
    min_score = np.min(cumulated)
    print('\nTest over', n, 'tests:')
    print('average_score', 'max_score', 'min_score\n',
          average_score, max_score, min_score)
    return average_score, max_score, min_score

def train(nb_episodes, agent):
    reward_values = agent.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=reward_values)
    env.init()

    score = 0
    biggest_score = -50000
    avg_score = 0
    episodes = 0
    to_break = False
    while nb_episodes > 0:
        # pick an action
        state = env.game.getGameState()
        state = agent.state_binner(state)
        action = agent.training_policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)

        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())
        agent.frames += 1
        score += reward

        if (agent.frames % 10000) == 0:
            to_break = True

        # reset the environment if the game is over
        if env.game_over():
            avg_score += score
            if score > biggest_score:
                biggest_score = score
            if biggest_score > 450:
                break
            print(biggest_score)
            print(nb_episodes)
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                if avg_score / 100 >= 5:
                    break
                avg_score = 0
            if to_break:
                break
            # print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0

    return biggest_score

class Env:
    def __init__(self):
        self.game = FlappyBird(pipe_gap=110)
        self.env = PLE(self.game, fps=30, display_screen=False)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary

        # by convention we want to use (0, 1),
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]

    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # the game state is a dictionary which describes
        # the meaning of each value; we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value

def _test_ple():
    from ple.games.pong import Pong
    from ple.games.flappybird import FlappyBird
    from ple import PLE

    # os.environ['SDL_VIDEODRIVER'] = 'dummy'
    game = Pong()
    game = FlappyBird()  # overrides Pong above; FlappyBird is what actually gets tested
    ple_game = PLE(game, fps=30, display_screen=True)
    ple_game.init()

    ALLOWED_ACTIONS = ple_game.getActionSet()
    print(ALLOWED_ACTIONS)

    action = 0
    start = time()
    t = 0
    while True:
        ep_reward = 0
        ple_game.reset_game()
        while not ple_game.game_over():
            sleep(0.1)
            t += 1
            if t % 15 == 5:
                action = 0
            else:
                action = 1
            reward = ple_game.act(ALLOWED_ACTIONS[action])
            # print(reward)
            ep_reward += reward
        print(ep_reward, t, t / (time() - start))

class Env:
    def __init__(self):
        # initialize the instance of the FlappyBird class
        self.game = FlappyBird(pipe_gap=100)
        # then pass this object into the PLE constructor and create an instance of that
        self.env = PLE(self.game, fps=30, display_screen=False)
        # init does some necessary things under the hood
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary
        self.action_map = self.env.getActionSet()

    # function which takes an action
    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # the game state is a dictionary which describes
        # the meaning of each value; we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value

class PLEEnv(Env):
    def __init__(self, game, _id, render=True, reset_done=True, num_steps=100):
        super().__init__(_id, render, reset_done)
        self.num_steps = num_steps
        self.game = game
        self.start()

    def start(self):
        if not self.env_instance:
            self.env_instance = PLE(self.game, fps=30, display_screen=self.render)
            self.env_instance.init()

    def step(self, action):
        reward = self.env_instance.act(action)
        obs = self.env_instance.getGameState()
        done = self.env_instance.game_over()
        return obs, reward, done

    def reset(self):
        self.env_instance.reset_game()
        obs = self.env_instance.getGameState()
        return obs

    def close(self):
        pass

    def restart(self):
        self.close()
        self.reset()

def main(argv):
    try:
        opts, _ = getopt.getopt(argv, "hr")
    except getopt.GetoptError:
        print("birdML.py [-h | -r]")
        sys.exit(2)

    record = False
    for opt, arg in opts:
        if opt == '-h':
            print("-h to help")
            print("-r record")
        elif opt == '-r':
            record = True

    netb = netBrain()
    netb.summary()

    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.init()
    actions = p.getActionSet()

    out = 1
    epochs = 50
    for i in range(epochs):
        lstates = []
        rewards = []
        if record:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter('Videos/test_' + str(i) + '.mov', fourcc,
                                  30.0, (288, 512))
        for d in range(10):
            while not p.game_over():
                if record:
                    obs = p.getScreenRGB()
                    obs = cv2.transpose(obs)
                    obs = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
                    out.write(obs)
                st = game.getGameState()
                gstate = list(st.values())
                gstate = np.array([np.array(gstate)])
                lstates.append(gstate[0])
                pred = netb.predict(gstate)[0]
                a = pred.argmax()
                p.act(actions[a])
                if st['next_pipe_bottom_y'] < st['player_y']:
                    pred[0] = 1.0
                    pred[1] = 0.0
                elif st['next_pipe_top_y'] > st['player_y']:
                    pred[0] = 0.0
                    pred[1] = 1.0
                rewards.append(pred)
            p.reset_game()
        netb.fit(np.array(lstates), np.array(rewards), batch_size=10, epochs=10)
        if record:
            out.release()

def play(self, n=1, file_path=None):
    # use "fancy" for full background, random bird color and random pipe color;
    # use "fixed" (default) for black background and constant bird and pipe colors.
    game = FlappyBird(graphics="fixed")

    # Note: if you want to see your agent act in real time, set force_fps to False.
    # But don't use this setting for learning, just for display purposes.
    env = PLE(game, fps=30, frame_skip=1, num_steps=1,
              force_fps=False, display_screen=True)

    # Init the environment (settings, display...)
    env.init()

    # Load the model
    model = load_model(file_path)

    # Let's play n games and see if the model is correctly trained
    for _ in range(n):
        env.reset_game()
        while not env.game_over():
            S = self.get_game_data(game)
            Q = model.predict(S, batch_size=1)
            A = np.argmax(Q[0])
            env.act(self.ACTIONS[A])

def evaluate(agent):
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    eval_reward = []
    for i in range(5):
        env.init()
        env.reset_game()
        obs = list(env.getGameState().values())
        episode_reward = 0
        while True:
            action = agent.predict(obs)

            observation = env.getScreenRGB()
            score = env.score()
            # action = agent.pickAction(reward, observation)
            observation = cv2.transpose(observation)
            font = cv2.FONT_HERSHEY_SIMPLEX
            observation = cv2.putText(observation, str(int(score)), (0, 25),
                                      font, 1.2, (255, 255, 255), 2)
            cv2.imshow("ss", observation)
            cv2.waitKey(10)

            # predicted action: always pick the best (greedy) action
            reward = env.act(actionset[action])
            obs = list(env.getGameState().values())
            done = env.game_over()
            episode_reward += reward
            if done:
                break
        eval_reward.append(episode_reward)
    cv2.destroyAllWindows()
    return np.mean(eval_reward)

def run_game(nb_episodes, agent):
    """ Runs nb_episodes episodes of the game with agent picking the moves.
        An episode of FlappyBird ends with the bird crashing into a pipe or
        going off screen.
    """
    reward_values = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }
    # TODO: when training use the following instead:
    # reward_values = agent.reward_values

    env = PLE(FlappyBird(), fps=30, display_screen=True, force_fps=True,
              rng=None, reward_values=reward_values)
    # TODO: to speed up training change parameters of PLE as follows:
    # display_screen=False, force_fps=True
    env.init()

    score = 0
    tot_nb_episodes = nb_episodes
    average = 0
    highscore = 0
    over_50_count = 0
    while nb_episodes > 0:
        # pick an action
        # TODO: for training use agent.training_policy instead
        state, ignore = agent.state_binner(env.game.getGameState())
        action = agent.policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)

        # TODO: for training let the agent observe the current state transition
        score += reward

        # reset the environment if the game is over
        if env.game_over() or score >= 60:
            average += score
            if score > highscore:
                highscore = score
            if score >= 50:
                over_50_count += 1
            print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0

    print("Average for {} runs {:.2f}".format(tot_nb_episodes,
                                              average / tot_nb_episodes))
    over_50_p = (over_50_count / tot_nb_episodes) * 100
    print("The percentage of scores over 50 is: %d" % over_50_p)
    return over_50_p

def test_agent(policy, file_writer=None, test_games=10, step=0):
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env.init()

    test_rewards = []
    for _ in range(test_games):
        env.reset_game()
        no_op(env)

        game_rew = 0
        while not env.game_over():
            state = flappy_game_state(env)
            action = 119 if policy(state) == 1 else None
            for _ in range(2):
                game_rew += env.act(action)

        test_rewards.append(game_rew)

        if file_writer is not None:
            summary = tf.Summary()
            summary.value.add(tag='test_performance', simple_value=game_rew)
            file_writer.add_summary(summary, step)
            file_writer.flush()

    return test_rewards

class Flappy():
    def __init__(self, display=False):
        self.game = flappybird.FlappyBird(width=288, height=512, pipe_gap=100)
        self._ple = PLE(self.game, fps=30, display_screen=display)
        self.game.rng.seed(np.random.randint(0, 999999))
        self.reset()

    def reset(self):
        self._ple.reset_game()
        self.game.init()
        self.game.backdrop.background_image.fill(0)
        ret = self.step(1)
        return ret[0]

    def render(self, close=False):
        if close:
            pygame.quit()

    def step(self, action):
        if action:
            reward = self._ple.act(119)
        else:
            reward = self._ple.act(0)
        done = self._ple.game_over()
        state = self.game.getScreenRGB()
        info = None
        return state, reward, done, info

def main_test():
    final_score = 0
    previous_action = 1
    model = build_neural_network_model()

    game = FlappyBird(width=288, height=512, pipe_gap=100)
    env = PLE(game, fps=30, display_screen=True, state_preprocessor=process_state)
    model = load_model("model.h5")
    env.init()

    passed = 0
    old_y = 0
    for i in range(game_steps):
        if i == game_steps - 1:
            print("Score: {}".format(final_score))
        if env.game_over():
            print("Final Score: {}".format(final_score))
            time.sleep(1)
            final_score = 0
            env.reset_game()

        observation = env.getGameState()
        vector = model.predict(np.matrix(list(observation[0].values())))
        a_star = np.argmax(vector[0])
        print(vector[0][0], vector[0][1], a_star)
        time.sleep(0.05)
        env_reward = env.act(env.getActionSet()[a_star])
        if env_reward == 1:
            final_score += 1

class PleEnvAdapter(EnvAdapter):
    """PyGame Learning Environment adapter."""

    def __init__(self, *args, **kwargs):
        super(PleEnvAdapter, self).__init__(*args, **kwargs)

        if not self.render:
            os.putenv('SDL_VIDEODRIVER', 'fbcon')
            os.environ["SDL_VIDEODRIVER"] = "dummy"

        Game = envs_lookup_table[self.env_name]
        self.env = PLE(Game(), display_screen=self.render,
                       force_fps=not self.render)
        self.env.init()

    def get_input_shape(self):
        return (len(self.env.getGameState()),)

    def reset(self):
        self.env.reset_game()

    def step(self, action) -> (object, float, bool):
        reward = self.env.act(self.env.getActionSet()[action])
        observation = self.env.getGameState()
        observation = [val for key, val in observation.items()]
        done = self.env.game_over()
        return observation, reward, done

    def get_n_actions(self) -> int:
        return len(self.env.getActionSet())

    def get_random_action(self):
        return random.randint(0, len(self.env.getActionSet()) - 1)

def train(nb_frames, agent):
    reward_values = agent.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=reward_values)
    env.init()

    score = 0
    biggest_score = -5
    avg_score = 0
    average = []
    count = []
    nb_episodes = 0
    number_of_frames = 0
    while number_of_frames < nb_frames:
        # pick an action
        state = env.game.getGameState()
        state = agent.state_binner(state)
        action = agent.training_policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)

        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())
        score += reward
        number_of_frames += 1

        # reset the environment if the game is over
        if env.game_over():
            nb_episodes += 1
            avg_score += score
            if score > biggest_score:
                biggest_score = score
            print(biggest_score)
            print(nb_episodes)
            print(number_of_frames)
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                average.append(avg_score / 100)
                count.append(number_of_frames)
                avg_score = 0
            # print("score for this episode: %d" % score)
            agent.calculate()
            env.reset_game()
            score = 0

    print(biggest_score)
    data = {"Count": count, "Average": average}
    df = pd.DataFrame(data)
    sns.relplot(x="Count", y="Average", ci=None, kind="line", data=df)

def train(nb_frames, agent, a, g, results):
    print("alpha %f" % a)
    print("gamma %f" % g)
    reward_values = agent.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=reward_values)
    env.init()

    score = 0
    biggest_score = -5
    avg_score = 0
    number_of_frames = 0
    nb_episodes = 0
    while number_of_frames < nb_frames:
        # pick an action
        state = env.game.getGameState()
        state = agent.state_binner(state)
        action = agent.training_policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)

        # let the agent observe the current state transition
        newState = env.game.getGameState()
        newState = agent.state_binner(newState)
        agent.observe(state, action, reward, newState, env.game_over())
        score += reward
        number_of_frames += 1

        # reset the environment if the game is over
        if env.game_over():
            nb_episodes += 1
            avg_score += score
            if score > biggest_score:
                biggest_score = score
            print(biggest_score)
            print(nb_episodes)
            print(number_of_frames)
            if nb_episodes % 100 == 0:
                print(avg_score / 100)
                results[0].append(avg_score / 100)
                results[1].append(number_of_frames)
                results[2].append(a)
                results[3].append(g)
                avg_score = 0
            # print("score for this episode: %d" % score)
            env.reset_game()
            score = 0

    print(biggest_score)
    return results

def score(self, training=True, nb_episodes=10):
    reward_values = {
        'positive': 1.0,
        'negative': 0.0,
        'tick': 0.0,
        'loss': 0.0,
        'win': 0.0
    }
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=reward_values)
    env.init()

    total_episodes = nb_episodes
    score = 0
    scores = []
    while nb_episodes > 0:
        # pick an action
        state = env.game.getGameState()
        action = self.policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        score += reward

        # reset the environment if the game is over
        if env.game_over() or score >= 100:
            scores.append(score)
            env.reset_game()
            nb_episodes -= 1
            score = 0

    avg_score = sum(scores) / float(len(scores))
    print('Games played: {}'.format(total_episodes))
    print('Average score: {}'.format(avg_score))

    if training:
        score_file = '{}/scores.csv'.format(self.name)
        # If the file doesn't exist, add the header
        if not os.path.isfile(score_file):
            with open(score_file, 'a') as f:
                f.write('avg_score,episode_count,num_of_frames,min,max\n')
        # Append scores to the file
        with open(score_file, 'a') as f:
            f.write('{},{},{},{},{}\n'.format(avg_score, self.num_of_episodes,
                                              self.num_of_frames, min(scores),
                                              max(scores)))
    else:
        with open('scores.txt', 'a') as f:
            for score in scores:
                f.write('{},{}\n'.format(self.name, score))

class PLEEnv(gym.Env):
    def __init__(self, env_config):
        game = Catcher(width=screen_wh, height=screen_wh)
        fps = 30  # fps we want to run at
        frame_skip = 2
        num_steps = 2
        force_fps = False  # False for slower speed
        display_screen = True

        # make a PLE instance
        self.env = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
                       force_fps=force_fps, display_screen=display_screen)
        self.env.init()
        self.action_dict = {0: None, 1: 97, 2: 100}

        # PLE env starts with a black screen
        self.env.act(self.env.NOOP)

        self.action_space = Discrete(3)
        self.k = 4
        self.observation_space = spaces.Box(low=0, high=255,
                                            shape=(screen_wh, screen_wh, 1 * self.k))
        self.frames = deque([], maxlen=self.k)

    def reset(self):
        self.env.reset_game()
        # PLE env starts with a black screen, NOOP step to get the initial screen
        self.env.act(self.env.NOOP)
        ob = np.reshape(self.env.getScreenGrayscale(), (screen_wh, screen_wh, 1))
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        # traditional gym env step:
        # _obs, _rew, done, _info = env.step(env.action_space.sample())
        action_value = self.action_dict[action]
        _rew = self.env.act(action_value)
        # _obs = self.env.getScreenGrayscale()
        _obs = np.reshape(self.env.getScreenGrayscale(), (screen_wh, screen_wh, 1))
        self.frames.append(_obs)
        _done = self.env.game_over()
        _info = {}
        return self._get_ob(), _rew, _done, _info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return np.concatenate(self.frames, axis=2)

def train(FRAME_TRAIN=1000005):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()

    ob = game.getGameState()
    state = ob
    state = np.reshape(np.asarray(list(state.values())), [1, 8])
    total_reward = 0

    agent = DDQN_Agent.DeepQAgent()
    agent.load('model95000')
    batch_size = 32
    my_timer = time.time()
    prev_frame = 0
    data = []

    for i in range(FRAME_TRAIN):
        if p.game_over():
            data.append(total_reward)
            p.reset_game()
            print("Total reward = {}, Frame = {}, epsilon = {}, frame/second = {}"
                  .format(total_reward, i, agent.epsilon,
                          (i - prev_frame) / (time.time() - my_timer)))
            total_reward = 0
            prev_frame = i
            my_timer = time.time()

        # get action from agent
        action = agent.act(state)

        # take action
        reward = p.act(p.getActionSet()[action])

        # making the reward space less sparse
        if reward < 0:
            reward = -1
        total_reward += reward

        next_state = np.asarray(list(game.getGameState().values()))
        next_state = np.reshape(next_state, [1, 8])

        # remember and replay
        agent.remember(state, action, reward, next_state, p.game_over())
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        state = next_state

        # save model
        if i % 5000 == 0:
            print("Updating weights")
            agent.save('newmodel' + str(i))
            agent.target_model.set_weights(agent.model.get_weights())

        # plot score
        if i % 1000 == 0:
            plot(data)

class GameEnv(object):
    def __init__(self, display_screen):
        self.width = IMAGE_WIDTH
        self.height = IMAGE_HEIGHT
        self.count = 0
        self.p = PLE(FlappyBird(), fps=30, display_screen=display_screen)
        self.p.init()
        self._update_state()
        self.score = 0

    def pre_process_image(self, image):
        self.count += 1
        image = color.rgb2gray(image)
        image = transform.resize(image, (self.width, self.height))
        image = exposure.rescale_intensity(image, out_range=(0, 255))
        image = image.astype('float')
        image = image / 255.0
        return image.reshape(1, self.width, self.height, 1)

    def _update_state(self):
        image = self.p.getScreenRGB()
        image = self.pre_process_image(image)
        state = getattr(self, 'state', None)
        if state is None:
            self.state = np.concatenate([image] * 4, axis=3)
        else:
            # keep a rolling stack of the last four frames:
            # drop the oldest frame and append the newest one
            self.state = np.append(self.state[:, :, :, 1:], image, axis=3)

    def get_state(self):
        return self.state

    def step(self, action):
        if action == 1:
            _ = self.p.act(119)   # flap
        else:
            _ = self.p.act(None)  # do nothing
        self._update_state()

        done = False
        if self.p.game_over():
            done = True
            self.p.reset_game()
            reward = -1
        else:
            reward = 0.1

        return_score = self.score + reward
        self.score = 0 if done else self.score + reward
        return self.state, reward, done, return_score

    def get_score(self):
        return self.score

def run(number_of_episodes):
    game = FlappyBird(pipe_gap=150)
    rewards = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }
    env = PLE(game=game, fps=30, display_screen=True, reward_values=rewards,
              force_fps=False)

    # Reset environment at the beginning
    env.reset_game()

    score = 0
    max_score = 0
    episode_number = 1
    while number_of_episodes > 0:
        # Get current state
        state = BasicQLearningAgent.get_state(env.game.getGameState())

        # Select action in state "state"
        action = basic_q_agent.max_q(state)

        """
        After choosing an action, get the reward.
        The PLE method act() returns the reward that the agent has accumulated
        while performing the action.
        """
        reward = env.act(env.getActionSet()[action])
        score += reward
        max_score = max(score, max_score)

        game_over = env.game_over()
        if game_over:
            print("===========================")
            print("Episode: " + str(episode_number))
            print("Score: " + str(score))
            print("Max. score: " + str(max_score))
            print("===========================\n")
            # f.write("Score: " + str(score) + "|Max. score: " + str(max_score) + "\n")
            episode_number += 1
            number_of_episodes -= 1
            score = 0
            env.reset_game()

def train(self):
    """ Runs episodes of the game with the agent picking the moves.
        An episode of FlappyBird ends with the bird crashing into a pipe or
        going off screen.
    """
    if not os.path.exists(self.name):
        os.mkdir(self.name)

    t = threading.Thread(target=self.draw_plots)
    t.daemon = True
    t.start()

    reward_values = self.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=reward_values)
    env.init()

    score = 0
    while self.frame_count <= 1000000:
        # pick an action
        state1 = env.game.getGameState()
        action = self.training_policy(state1)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)
        score += reward  # accumulate the score so the 100-pipe cutoff below can trigger

        state2 = env.game.getGameState()
        end = env.game_over() or score >= 100  # stop after reaching 100 pipes
        self.observe(state1, action, reward, state2, end)

        # reset the environment if the game is over
        if end:
            env.reset_game()
            score = 0

        if self.frame_count % 25000 == 0:
            print("==========================")
            print("episodes done: {}".format(self.episode_count))
            print("frames done: {}".format(self.frame_count))
            self.score()
            with open("{}/agent.pkl".format(self.name), "wb") as f:
                pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
            print("==========================")

def test():
    game = Snake(600, 600)
    p = PLE(game, fps=60, state_preprocessor=process_state, force_fps=True,
            display_screen=True, frame_skip=2,
            reward_values={
                "positive": 100.0,
                "negative": -50.0,
                "tick": -0.1,
                "loss": -70.0,
                "win": 5.0
            })
    agent = Agent(alpha=float(sys.argv[1]),
                  gamma=float(sys.argv[2]),
                  n_actions=3,
                  epsilon=0.01,
                  batch_size=100,
                  input_shape=6,
                  epsilon_dec=0.99999,
                  epsilon_end=0.001,
                  memory_size=500000,
                  file_name=sys.argv[3],
                  activations=[str(sys.argv[4]), str(sys.argv[5])])
    p.init()
    agent.load_game()

    scores = []
    for _ in range(200):
        if p.game_over():
            p.reset_game()
        apples = 0
        initial_direction = "Right"
        while not p.game_over():
            old_state = np.array(vision(list(p.getGameState()[0]), initial_direction))
            action = agent.choose_action(old_state)
            possible_directions = prepare_corect_directions(initial_direction)
            possible_directions_tuples = list(zip(possible_directions.keys(),
                                                  possible_directions.values()))
            direction = possible_directions_tuples[action]
            initial_direction = direction[1]
            reward = p.act(direction[0])
            if reward > 50.0:
                apples += reward
        scores.append(apples)
    return scores

class Game(gym.Env):
    def __init__(self, display_screen=False, force_fps=True):
        os.environ["SDL_VIDEODRIVER"] = "dummy"
        game = FlappyBird()

        # define and initialize the environment
        self.env = PLE(game, fps=30, display_screen=display_screen,
                       force_fps=force_fps)
        self.env.init()

        # list of actions in the environment
        self.actions = self.env.getActionSet()
        # number of actions
        self.action_space = spaces.Discrete(len(self.actions))

    def step(self, action):
        """Take the chosen action and update the reward."""
        reward = self.env.act(self.actions[action])
        state = self.getGameState()
        terminal = self.env.game_over()
        # If the bird crashed, the game is over and the reward is -1000;
        # if it keeps going, the reward is +1.
        if terminal:
            reward = -1000
        else:
            reward = 1
        return state, reward, terminal, {}

    def getGameState(self):
        '''
        PLE returns the game state as a dictionary.
        This returns a reduced form of that state with only the information
        needed to define our state representation.
        '''
        state = self.env.getGameState()
        h_dist = state['next_pipe_dist_to_player']
        v_dist = state['next_pipe_bottom_y'] - state['player_y']
        vel = state['player_vel']
        return ' '.join([str(vel), str(h_dist), str(v_dist)])

    def reset(self):
        """Reset the game to start a new game."""
        self.env.reset_game()
        state = self.env.getGameState()
        return state

    def seed(self, seed):
        rng = np.random.RandomState(seed)
        self.env.rng = rng
        self.env.game.rng = self.env.rng
        self.env.init()

def main(train=False):
    # Don't modify anything in this function.
    # See the constants defined at the top of this file if you'd like to
    # change the FPS, screen size, or round length.
    game = Pong(width=WIDTH, height=HEIGHT, MAX_SCORE=MAX_SCORE)
    if train:
        p = PLE(game, fps=FPS, display_screen=False, force_fps=True)
    else:
        p = PLE(game, fps=FPS, display_screen=True, force_fps=False)
    p.init()

    agent_rounds = 0
    cpu_rounds = 0
    agent_score = 0
    cpu_score = 0
    num_frames = 0
    while True:
        if p.game_over():
            if game.score_counts['agent'] > game.score_counts['cpu']:
                agent_rounds += 1
                print('AGENT won round')
            else:
                cpu_rounds += 1
                print('CPU won round')
            if agent_rounds == NUM_ROUNDS or cpu_rounds == NUM_ROUNDS:
                break
            p.reset_game()

        obs = p.getGameState()
        action = agent(normalize(obs))
        reward = p.act(ACTION_MAP[action])
        if reward > 0:
            agent_score += 1
            print('AGENT scored')
        elif reward < 0:
            cpu_score += 1
            print('CPU scored')
        num_frames += 1

    winner = 'AGENT' if agent_rounds > cpu_rounds else 'CPU'
    print('Winner:', winner)
    print('Num frames:', num_frames)
    print('AGENT rounds won:', agent_rounds)
    print('CPU rounds won:', cpu_rounds)
    print('AGENT total score:', agent_score)
    print('CPU total score:', cpu_score)

def play(self):
    print("Playing {} agent after training for {} episodes or {} frames".format(
        self.name, self.episode_count, self.frame_count))

    reward_values = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }
    env = PLE(FlappyBird(), fps=30, display_screen=True, force_fps=False,
              rng=None, reward_values=reward_values)
    env.init()

    score = 0
    last_print = 0
    nb_episodes = 50
    while nb_episodes > 0:
        # pick an action
        state = env.game.getGameState()
        action = self.policy(state)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        score += reward
        # if reward == 1:
        #     print(state)
        if score % 100 == 0 and score != last_print:
            print(int(score))
            last_print = score

        # reset the environment if the game is over
        if env.game_over():
            # print("---------------")
            # for s1, s2 in self.last_10:
            #     print(s1)
            #     print(s2)
            # print("-=-=-")
            print("Score: {}".format(score))
            env.reset_game()
            nb_episodes -= 1
            score = 0

def run_game(nb_episodes, agent):
    """ Runs nb_episodes episodes of the game with agent picking the moves.
        An episode of FlappyBird ends with the bird crashing into a pipe or
        going off screen.
    """
    reward_values = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": 0.0,
        "win": 0.0
    }
    # TODO: when training use the following instead:
    # reward_values = agent.reward_values

    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=reward_values)
    # TODO: to speed up training change parameters of PLE as follows:
    # display_screen=False, force_fps=True
    env.init()

    totalscore = 0
    count = nb_episodes
    score = 0
    while nb_episodes > 0:
        # pick an action
        # TODO: for training use agent.training_policy instead
        action = agent.policy(agent.state_binner(env.game.getGameState()))

        # step the environment
        reward = env.act(env.getActionSet()[action])
        # print("reward=%d" % reward)

        # TODO: for training let the agent observe the current state transition
        score += reward

        # reset the environment if the game is over
        if env.game_over():
            totalscore += score
            print(nb_episodes)
            print("score for this episode: %d" % score)
            env.reset_game()
            nb_episodes -= 1
            score = 0

    print("average for this run is: %d" % (totalscore / count))
    return totalscore / count

def train(self):
    """ Runs episodes of the game with the agent picking the moves.
        An episode of FlappyBird ends with the bird crashing into a pipe or
        going off screen.
    """
    # Check if the agent folder exists; if not, create it.
    if not os.path.exists(self.name):
        print(self.name)
        os.mkdir(self.name)

    reward_values = self.reward_values()
    env = PLE(FlappyBird(), fps=30, display_screen=False, force_fps=True,
              rng=None, reward_values=reward_values)
    env.init()

    score = 0
    while self.num_of_frames <= 1000000:
        # pick an action
        state1 = env.game.getGameState()
        action = self.training_policy(state1)

        # step the environment
        reward = env.act(env.getActionSet()[action])
        score += reward  # accumulate the score so the 100-pipe cutoff below can trigger

        state2 = env.game.getGameState()
        end = env.game_over() or score >= 100  # stop after reaching 100 pipes
        self.observe(state1, action, reward, state2, end)

        # reset the environment if the game is over
        if end:
            env.reset_game()
            score = 0

        if self.num_of_frames % 25000 == 0:
            print('++++++++++++++++++++++++++')
            print('Episodes finished: {}'.format(self.num_of_episodes))
            print('Number of frames: {}'.format(self.num_of_frames))
            self.score()
            with open('{}/agent.pkl'.format(self.name), 'wb') as f:
                pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
            print('++++++++++++++++++++++++++\n')

def view_agent(agent):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()

    for i in range(200000):
        if p.game_over():
            p.reset_game()
        time.sleep(0.03)
        action = agent.pick_action(p.getGameState())
        p.act(action)
        if p.game_over():
            break

def main_naive():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    my_agent = naive.NaiveAgent(allowed_actions=env.getActionSet())
    env.init()

    reward = 0.0
    nb_frames = 10000
    for i in range(nb_frames):
        if env.game_over():
            env.reset_game()
        observation = env.getScreenRGB()
        action = my_agent.pickAction(reward, observation)
        reward = env.act(action)

""" def __init__(self, actions): self.actions = actions def pickAction(self, reward, obs): return self.actions[np.random.randint(0, len(self.actions))] ################################### game = Doom(scenario="take_cover") env = PLE(game) agent = NaiveAgent(env.getActionSet()) env.init() reward = 0.0 for f in range(15000): #if the game is over if env.game_over(): env.reset_game() action = agent.pickAction(reward, env.getScreenRGB()) reward = env.act(action) if f > 2000: env.display_screen = True env.force_fps = False if f > 2250: env.display_screen = True env.force_fps = True
class MyEnv(Environment):
    VALIDATION_MODE = 0

    def __init__(self, rng, game=None, frame_skip=4,
                 ple_options={"display_screen": True, "force_fps": True, "fps": 30}):
        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0
        self._frameSkip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng

        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()

    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1:  # and thus mode == -1
            self._mode = -1

        self._ple.reset_game()
        for _ in range(self._random_state.randint(15)):
            self._ple.act(self._ple.NOOP)
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen,
                   interpolation=cv2.INTER_NEAREST)

        return [4 * [48 * [48 * [0]]]]

    def act(self, action):
        action = self._actions[action]

        reward = 0
        for _ in range(self._frameSkip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break

        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen,
                   interpolation=cv2.INTER_NEAREST)

        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if self.inTerminalState() == False:
            self._mode_episode_count += 1
        print("== Mean score per episode is {} over {} episodes ==".format(
            self._mode_score / self._mode_episode_count, self._mode_episode_count))

    def inputDimensions(self):
        return [(4, 48, 48)]

    def observationType(self, subject):
        return np.uint8

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._reducedScreen)]

    def inTerminalState(self):
        return self._ple.game_over()

# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

# Use "fancy" for full background, random bird color and random pipe color;
# use "fixed" (default) for black background and constant bird and pipe colors.
game = FlappyBird(graphics="fixed")

# Note: if you want to see your agent act in real time, set force_fps to False.
# But don't use this setting for learning, just for display purposes.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False,
        display_screen=True)

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()

    while not p.game_over():
        state = game.getGameState()
        screen = p.getScreenRGB()
        action = FlappyPolicy(state, screen)  # Your job is to define this function.

        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)

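# The evaluation script above expects a FlappyPolicy(state, screen) function from
# FlappyAgent.py. The sketch below is only a minimal, hypothetical placeholder,
# assuming a hand-written threshold rule on the state dictionary rather than a
# trained agent; the state keys ('player_y', 'next_pipe_bottom_y') and the action
# values (119 = flap, None = do nothing) are the same ones used in the other
# snippets in this collection.
def FlappyPolicy(state, screen):
    # Flap when the bird sinks close to the bottom of the upcoming gap,
    # otherwise do nothing. The raw screen is ignored in this state-based sketch.
    if state['player_y'] > state['next_pipe_bottom_y'] - 50:
        return 119   # "up" key: flap
    return None      # no-op
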
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with the emulator
    # setupGame()
    gameClass = FlappyBird(width=288, height=512, pipe_gap=100)
    fps = 30
    frame_skip = 2
    num_steps = 1
    force_fps = False
    display_screen = True
    reward = 0.0
    nb_frames = 15000

    game = PLE(gameClass, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
               force_fps=force_fps, display_screen=display_screen)
    game.init()
    # map one-hot action indices onto PLE's actual action set (e.g. [119, None])
    action_set = game.getActionSet()

    # store the previous observations in replay memory
    D = deque()

    # printing
    logdir = "logs_" + GAME
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    a_file = open(logdir + "/readout.txt", 'w')
    h_file = open(logdir + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    r_0 = game.act(game.NOOP)
    x_t = game.getScreenGrayscale()
    terminal = game.game_over()
    if terminal:
        print("NOOOO")
        game.reset_game()
    x_t = cv2.resize(x_t, (80, 80))
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()  # needed for saver.save() below
    sess.run(tf.global_variables_initializer())
    '''
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
    '''

    epsilon = INITIAL_EPSILON
    t = 0
    while True:
        # choose an action epsilon-greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe the next state and reward
            r_t = game.act(action_set[np.argmax(a_t)])
            x_t1 = game.getScreenGrayscale()
            terminal = game.game_over()
            if terminal:
                print("NOOO2")
                game.reset_game()
            x_t1 = cv2.resize(x_t1, (80, 80))
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 1:], axis=2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal, the target only equals the reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))

        # write info to files
        '''