import numpy as np
from gym import spaces
from ple import PLE
# The snippet refers to the PLE game as PyGameWaterWorld; alias the import accordingly.
from ple.games.waterworld import WaterWorld as PyGameWaterWorld


class WaterWorld:
    def __init__(self, fps=30, display_screen=False):
        game = PyGameWaterWorld()
        self.game = PLE(game, fps=fps, display_screen=display_screen)
        self.game.init()  # PLE must be initialized before the first act(); the original omitted this
        action_set = self.game.getActionSet()
        self.action_map = {i: a for (i, a) in enumerate(action_set)}
        self.action_space = spaces.Discrete(len(self.action_map))
        self.metadata = {'render.modes': ['human', 'rgb_array']}
        box = np.ones((48, 48, 3), dtype='float32')
        self.observation_space = spaces.Box(low=box * 0, high=box * 255)

    def reset(self):
        self.game.reset_game()
        return self.game.getScreenRGB()

    def step(self, action):
        a = self.action_map[action]
        r = self.game.act(a)
        done = self.game.game_over()
        info = {}
        return self.game.getScreenRGB(), r, done, info

    def close(self):
        pass
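# Usage sketch (an assumption, not part of the original source): drive the
# WaterWorld wrapper above with a random policy for one capped episode.
if __name__ == '__main__':
    env = WaterWorld(fps=30, display_screen=False)
    obs = env.reset()
    total = 0.0
    for _ in range(1000):  # cap the episode length
        action = np.random.randint(env.action_space.n)  # random discrete action
        obs, reward, done, info = env.step(action)
        total += reward
        if done:
            break
    print('episode reward:', total)
    env.close()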
class SnakeEnv(object):
    def __init__(self):
        self.game = Snake()
        self.p = PLE(self.game, fps=30, display_screen=True)
        # self.actions = self.p.getActionSet()
        # self._action_space = list(range(self.actions[0]))
        # self._action_space.append(self.actions[-1])
        self.action_space = self.p.getActionSet()

    def reset(self):
        self.p.init()
        self.p.act(None)
        return self.p.getScreenRGB()
        # return self.p.getScreenGrayscale()

    def step(self, action):
        reward = self.p.act(self.action_space[action])
        # reward = self.p.act(119)
        # print(self.action_space[action], reward)
        return self.p.getScreenRGB(), reward, self.p.game_over()
        # return self.p.getScreenGrayscale(), reward, self.p.game_over()

    @property
    def action_space(self):
        return self._action_space

    @action_space.setter
    def action_space(self, action_space):
        self._action_space = action_space
class WrappedFlappyBird():
    def __init__(self):
        self.score_counter = 0
        self.game = FlappyBird()
        self.env = PLE(self.game, fps=30, display_screen=True)

    def frame_step(self, action_vector):
        if action_vector[0] == 1:
            self.env.act(119)   # flap
        elif action_vector[1] == 1:
            self.env.act(None)  # explicit no-op (the original passed 1, which PLE silently treats as NOOP)
        frame = self.env.getScreenRGB()
        reward = self.get_action_reward()
        game_over = self.game.game_over()
        if game_over:
            # Reset through PLE so its score bookkeeping is cleared as well.
            self.env.reset_game()
        return frame, reward, game_over

    def get_action_reward(self):
        if self.game.game_over():
            self.score_counter = 0
            return -1
        elif self.score_counter < self.game.getScore():
            self.score_counter = self.game.getScore()
            return 1
        else:
            return 0.1
def evaluate(agent):
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    eval_reward = []
    for i in range(5):
        env.init()
        env.reset_game()
        obs = list(env.getGameState().values())
        episode_reward = 0
        while True:
            # Predict the action; always pick the greedy (best) one.
            action = agent.predict(obs)
            # Overlay the current score on the frame and display it.
            observation = env.getScreenRGB()
            score = env.score()
            observation = cv2.transpose(observation)
            font = cv2.FONT_HERSHEY_SIMPLEX
            observation = cv2.putText(observation, str(int(score)), (0, 25),
                                      font, 1.2, (255, 255, 255), 2)
            cv2.imshow("ss", observation)
            cv2.waitKey(10)
            reward = env.act(actionset[action])
            obs = list(env.getGameState().values())
            done = env.game_over()
            episode_reward += reward
            if done:
                break
        eval_reward.append(episode_reward)
    cv2.destroyAllWindows()
    return np.mean(eval_reward)
def main(argv):
    try:
        opts, _ = getopt.getopt(argv, "hr")
    except getopt.GetoptError:
        print("birdML.py [-h | -r]")
        sys.exit(2)
    record = False
    for opt, arg in opts:
        if opt == '-h':
            print("-h to help")
            print("-r record")
        elif opt == '-r':
            record = True

    netb = netBrain()
    netb.summary()
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.init()
    actions = p.getActionSet()
    out = None  # video writer handle, created only when recording
    epochs = 50
    for i in range(epochs):
        lstates = []
        rewards = []
        if record:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter('Videos/test_' + str(i) + '.mov', fourcc,
                                  30.0, (288, 512))
        for d in range(10):
            while not p.game_over():
                if record:
                    obs = p.getScreenRGB()
                    obs = cv2.transpose(obs)
                    obs = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
                    out.write(obs)
                st = game.getGameState()
                gstate = list(st.values())
                gstate = np.array([np.array(gstate)])
                lstates.append(gstate[0])
                pred = netb.predict(gstate)[0]
                a = pred.argmax()
                p.act(actions[a])
                # Build a supervised target: flap when the bird is below the
                # pipe gap, do nothing when it is above it.
                if st['next_pipe_bottom_y'] < st['player_y']:
                    pred[0] = 1.0
                    pred[1] = 0.0
                elif st['next_pipe_top_y'] > st['player_y']:
                    pred[0] = 0.0
                    pred[1] = 1.0
                rewards.append(pred)
            p.reset_game()
        netb.fit(np.array(lstates), np.array(rewards), batch_size=10, epochs=10)
        if record:
            out.release()
import os

import cv2
import numpy as np
import torch
from ple import PLE
from ple.games.flappybird import FlappyBird
from ple.games.pixelcopter import Pixelcopter


class Game:
    def __init__(self, game="pixelcopter", fps=30):
        os.environ['SDL_VIDEODRIVER'] = 'dummy'  # headless pygame
        self.game_name = game
        if game == "flappy":
            engine = FlappyBird()
        elif game == "pixelcopter":
            engine = Pixelcopter()
        else:
            assert False, "This game is not available"
        engine.rewards["loss"] = -5  # reward at terminal state
        self.reward_terminal = -5
        self.game = PLE(engine, fps=fps, display_screen=False)
        self.game.init()
        self.game.act(0)  # start the game by providing an arbitrary key as input
        self.key_input = self.game.getActionSet()
        self.reward = 0

    def game_over(self):
        return self.game.game_over()

    def reset_game(self):
        self.game.reset_game()
        self.game.act(0)  # start the game

    def get_image(self):
        return self.game.getScreenRGB()

    def get_torch_image(self):
        image = self.game.getScreenRGB()
        if self.game_name == "flappy":
            image = image[:, :-96, :]  # remove the ground
            image = cv2.cvtColor(cv2.resize(image, (84, 84)), cv2.COLOR_BGR2GRAY)
            image = np.reshape(image, (84, 84, 1))
        elif self.game_name == "pixelcopter":
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            image = np.reshape(image, (48, 48, 1))
            image[image > 0] = 1
        image = image.transpose(2, 0, 1)  # HWC -> CHW
        image = image.astype(np.float32)
        image = torch.from_numpy(image)
        return image

    def act(self, action_idx):
        self.reward = self.game.act(self.key_input[action_idx])
        return self.reward
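# Usage sketch (an assumption, not part of the original source): one random
# episode through the Game wrapper above, reading the stacked torch frames.
if __name__ == '__main__':
    env = Game(game="pixelcopter", fps=30)
    env.reset_game()
    total = 0.0
    frame = env.get_torch_image()  # torch.FloatTensor in CHW layout
    while not env.game_over():
        idx = np.random.randint(len(env.key_input))  # random action index
        total += env.act(idx)
        frame = env.get_torch_image()
    print('episode reward:', total, '| frame shape:', tuple(frame.shape))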
class SnakeEnv():
    def __init__(self, height=32, width=32, fps=15, frame_history_size=4):
        # Create the game environment and initialize the attribute values.
        self.game = Snake(height=height, width=width, init_length=4)
        reward_dict = {"positive": 1.0, "negative": -1.0, "tick": 0.0,
                       "loss": -1.0, "win": 1.0}
        self.environment = PLE(self.game, fps=fps, reward_values=reward_dict,
                               num_steps=2)
        self.init_env()  # initialize the game
        # The list of actions an agent is allowed to take.
        self.allowed_actions = self.environment.getActionSet()
        # Number of actions allowed in this env (the trailing no-op is excluded).
        self.num_actions = len(self.allowed_actions) - 1
        self.frame_hist = frame_history(height=height, width=width,
                                        frame_history_size=frame_history_size,
                                        num_channels=3)
        # Shape of the game input screen.
        self.input_shape = self.frame_hist.get_history().shape

    def init_env(self):
        # Initialize the variables and screen of the game.
        self.environment.init()

    def get_current_state(self):
        # Return the current screen of the game (snake and food positions)
        # together with a history of past frames.
        cur_frame = np.transpose(self.environment.getScreenRGB(), (2, 0, 1))
        # cur_frame = np.transpose(np.expand_dims(self.environment.getScreenGrayscale(), axis=0), (2, 0, 1))
        self.frame_hist.push(cur_frame)
        return self.frame_hist.get_history()

    def check_game_over(self):
        # Check whether the game has terminated.
        return self.environment.game_over()

    def reset(self):
        # Reset the game to its initial values and refresh the screen with a
        # new small snake and a random food position.
        self.environment.reset_game()
        _ = self.environment.act(None)
        self.frame_hist.reset(np.transpose(self.environment.getScreenRGB(), (2, 0, 1)))
        # self.frame_hist.reset(np.transpose(np.expand_dims(self.environment.getScreenGrayscale(), axis=0), (2, 0, 1)))
        return self.frame_hist.get_history()

    def take_action(self, action):
        # Let the snake take the chosen action of moving in some direction.
        reward = self.environment.act(self.allowed_actions[action])
        next_state = self.get_current_state()
        done = self.check_game_over()
        return next_state, reward, done, 0
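# Usage sketch (an assumption; it presumes the external frame_history helper
# used above is importable): a random rollout through SnakeEnv.
if __name__ == '__main__':
    env = SnakeEnv(height=32, width=32, fps=15, frame_history_size=4)
    state = env.reset()
    done = False
    while not done:
        action = np.random.randint(env.num_actions)  # index into allowed_actions
        state, reward, done, _ = env.take_action(action)
    print('final state shape:', state.shape)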
def run_a_game(self, game):
    from ple import PLE
    p = PLE(game, display_screen=True)
    agent = NaiveAgent(p.getActionSet())
    p.init()
    reward = p.act(p.NOOP)
    for i in range(NUM_STEPS):
        obs = p.getScreenRGB()
        reward = p.act(agent.pickAction(reward, obs))
class GameEnv(object):
    def __init__(self, display_screen):
        self.width = IMAGE_WIDTH
        self.height = IMAGE_HEIGHT
        self.count = 0
        self.p = PLE(FlappyBird(), fps=30, display_screen=display_screen)
        self.p.init()
        self._update_state()
        self.score = 0

    def pre_process_image(self, image):
        self.count += 1
        image = color.rgb2gray(image)
        image = transform.resize(image, (self.width, self.height))
        image = exposure.rescale_intensity(image, out_range=(0, 255))
        image = image.astype('float') / 255.0
        return image.reshape(1, self.width, self.height, 1)

    def _update_state(self):
        image = self.p.getScreenRGB()
        image = self.pre_process_image(image)
        state = getattr(self, 'state', None)
        if state is None:
            self.state = np.concatenate([image] * 4, axis=3)
        else:
            # Roll the frame stack: drop the oldest frame and prepend the
            # newest one (the original assigned the new frame into three
            # channels at once, so the history never rotated).
            self.state = np.append(image, self.state[:, :, :, :3], axis=3)

    def get_state(self):
        return self.state

    def step(self, action):
        if action == 1:
            _ = self.p.act(119)   # flap
        else:
            _ = self.p.act(None)  # no-op
        self._update_state()
        done = False
        if self.p.game_over():
            done = True
            self.p.reset_game()
            reward = -1
        else:
            reward = 0.1
        return_score = self.score + reward
        self.score = 0 if done else self.score + reward
        return self.state, reward, done, return_score

    def get_score(self):
        return self.score
def main_naive():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    my_agent = naive.NaiveAgent(allowed_actions=env.getActionSet())
    env.init()
    reward = 0.0
    nb_frames = 10000
    for i in range(nb_frames):
        if env.game_over():
            env.reset_game()
        observation = env.getScreenRGB()
        action = my_agent.pickAction(reward, observation)
        reward = env.act(action)
def run():
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    # agent = myAgentHere(allowed_actions=p.getActionSet())
    p.init()
    reward = 0.0
    for i in range(150):
        if p.game_over():
            p.reset_game()
        observation = p.getScreenRGB()
        new_image = convert_image(observation)
        cv.imwrite("Imagenes/Gray_Image" + str(i) + ".jpg", new_image)
        action = None
        reward = p.act(action)
def test_model_G(nb_games, model):
    # Use "fancy" for the full background with random bird and pipe colors;
    # use "fixed" (default) for a black background and constant colors.
    game = FlappyBird(graphics="fixed")
    p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True,
            display_screen=False)
    p.init()
    reward = 0.0
    cumulated = np.zeros((nb_games))
    list_actions = [0, 119]
    for i in range(nb_games):
        p.reset_game()
        # Initialize the 4-frame stack once per game and roll it forward
        # each step (the original rebuilt it from the current frame on every
        # iteration, so the network never saw real frame history).
        screen_x = process_screen(p.getScreenRGB())
        stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
        while not p.game_over():
            x = np.stack(stacked_x, axis=-1)
            action = list_actions[np.argmax(
                model.predict(np.expand_dims(x, axis=0)))]
            reward = p.act(action)
            cumulated[i] = cumulated[i] + reward
            stacked_x.append(process_screen(p.getScreenRGB()))
    avg_score = np.mean(cumulated)
    print('Average : ' + str(avg_score))
    mx_score = np.max(cumulated)
    print('Max : ' + str(mx_score))
    return avg_score, mx_score
def test():
    game2 = FlappyBird()
    p2 = PLE(game2, fps=30, frame_skip=1, num_steps=1, force_fps=True,
             display_screen=False)
    p2.init()
    reward = 0.0
    nb_games = 10
    cumulated = np.zeros((nb_games))
    for i in range(nb_games):
        p2.reset_game()
        while not p2.game_over():
            state = game2.getGameState()
            screen = p2.getScreenRGB()
            action = FlappyPolicy(state, screen)
            reward = p2.act(action)
            cumulated[i] = cumulated[i] + reward
    return np.mean(cumulated)
def main(args):
    logs_path = args.logs_path
    video_path = args.video_path
    restore = args.restore
    train = args.train

    # Initialize the PLE environment (headless).
    os.putenv('SDL_VIDEODRIVER', 'fbcon')
    os.environ["SDL_VIDEODRIVER"] = "dummy"

    # Design the reward.
    reward_values = {
        "positive": 1,
        "tick": 0.1,
        "loss": -1,
    }
    env = PLE(FlappyBird(), fps=30, display_screen=False,
              reward_values=reward_values)
    action_set = env.getActionSet()
    reply_buffer = Reply_Buffer(Config.reply_buffer_size)
    agent = Agent(action_set)
    reward_logs = []
    loss_logs = []

    # Restore the model if requested.
    if restore:
        agent.restore(restore)

    for episode in range(1, Config.total_episode + 1):
        # Reset the env.
        env.reset_game()
        env.act(0)
        obs = convert(env.getScreenGrayscale())
        state = np.stack([[obs for _ in range(4)]], axis=0)
        t_alive = 0
        total_reward = 0
        if episode % Config.save_video_frequency == 0 and \
                episode > Config.initial_observe_episode:
            agent.stop_epsilon()
            frames = [env.getScreenRGB()]

        while not env.game_over():
            action = agent.take_action(state)
            reward = env.act(action_set[action])
            if episode % Config.save_video_frequency == 0 and \
                    episode > Config.initial_observe_episode:
                frames.append(env.getScreenRGB())
            obs = convert(env.getScreenGrayscale())
            obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]])
            state_new = np.append(state[:, 1:, ...], obs, axis=1)
            action_onehot = np.zeros(len(action_set))
            action_onehot[action] = 1
            t_alive += 1
            total_reward += reward
            reply_buffer.append((state, action_onehot, reward, state_new,
                                 env.game_over()))
            state = state_new

        # Save video
        # if episode % Config.save_video_frequency == 0 and episode > Config.initial_observe_episode:
        #     os.makedirs(video_path, exist_ok=True)
        #     clip = make_video(frames, fps=60).rotate(-90)
        #     clip.write_videofile(os.path.join(video_path, 'env_{}.mp4'.format(episode)), fps=60)
        #     agent.restore_epsilon()
        #     print('Episode: {} t: {} Reward: {:.3f}'.format(episode, t_alive, total_reward))

        if episode > Config.initial_observe_episode and train:
            # Save the model.
            if episode % Config.save_logs_frequency == 0:
                agent.save(episode, logs_path)
                np.save(os.path.join(logs_path, 'loss.npy'), np.array(loss_logs))
                np.save(os.path.join(logs_path, 'reward.npy'), np.array(reward_logs))
            # Update the target network.
            if episode % Config.update_target_frequency == 0:
                agent.update_target_network()
            # Sample a batch from the replay buffer.
            batch_state, batch_action, batch_reward, batch_state_new, batch_over = \
                reply_buffer.sample(Config.batch_size)
            # Update the policy network.
            loss = agent.update_Q_network(batch_state, batch_action,
                                          batch_reward, batch_state_new,
                                          batch_over)
            loss_logs.extend([[episode, loss]])
            reward_logs.extend([[episode, total_reward]])
            # Print reward and loss.
            if episode % Config.show_loss_frequency == 0:
                print('Episode: {} t: {} Reward: {:.3f} Loss: {:.3f}'.format(
                    episode, t_alive, total_reward, loss))
            agent.update_epsilon()
class Environment(object):
    def __init__(self, env_name, args, atari_wrapper=False, test=False, seed=595):
        game = FlappyBird(width=144, height=256, pipe_gap=80)
        self.test = test
        # Define the reward shaping.
        reward_func = {
            "positive": 1,
            "negative": -1.0,
            "tick": 1,
            "loss": -5.0,
            "win": 1.0
        }
        self.p = PLE(game, fps=30, display_screen=False, force_fps=True,
                     reward_values=reward_func, rng=seed)
        self.observation = np.zeros((144, 256, 4, 3))  # 4-frame RGB history
        self.action_space = self.p.getActionSet()

    def reset(self):
        '''Returns the 4-frame RGB history flattened to shape (144, 256, 12).'''
        self.p.reset_game()
        observation = self.p.getScreenRGB()
        self.observation[:, :, 0:-1, :] = self.observation[:, :, 1:, :]
        self.observation[:, :, -1, :] = observation
        return self.observation.reshape(144, 256, 12)

    def step(self, action):
        reward = self.p.act(action)
        observation = self.p.getScreenRGB()
        done = self.p.game_over()
        self.observation[:, :, 0:-1, :] = self.observation[:, :, 1:, :]
        self.observation[:, :, -1, :] = observation
        return self.observation.reshape(144, 256, 12), reward, done, None

    def get_action_space(self):
        return self.action_space

    def get_random_action(self):
        # The action set is a plain list, not a gym space, so sample with
        # numpy (the original called .sample(), which lists do not have).
        return self.action_space[np.random.randint(len(self.action_space))]
dqn_target = load_model(filepath=path_model)

# Init game
game = FlappyBird(graphics="fixed")
# TODO: consider changing frame_skip? Some Atari games use frame_skip=4, others 3.
# The original branched on params.DISPLAY_GAME but passed the string
# 'store_false' (which is truthy) in the else case; pass the flag directly.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True,
        display_screen=params.DISPLAY_GAME)
p.init()

# Training
p.reset_game()
screen_x = process_screen(p.getScreenRGB())
stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
x = np.stack(stacked_x, axis=-1)
replay_memory = MemoryBuffer(params.REPLAY_MEMORY_SIZE, screen_x.shape, (1,))

# Evaluation barrier
mean_score = 0
training_score = 0

# Deep Q-learning with experience replay
for step in range(params.TOTAL_STEPS):
    logger_train.debug("Step {} / {} ----> epsilon={}".format(
        step, params.TOTAL_STEPS, epsilon(step)))
    print("Step {} / {} ----> epsilon={}".format(
        step, params.TOTAL_STEPS, epsilon(step)))
    if step % params.EVALUATION_PERIOD == 0 and step > 0 and \
            params.EVALUATION and mean_score < 120:
        logger_train.info("Evaluating...")
def experiment(device, reward_system, PIPEGAP, BATCH_SIZE, learning_rate,
               MEMORY_SIZE, GAMMA, EPS_START, EPS_END, EPS_DECAY, OBSERVE,
               FRAME_PER_ACTION, TARGET_UPDATE, num_episodes,
               save_model=False, load_model=False, load_model_path_prefix=None):
    expected_q_value = 0
    policy_net = RL.DQN().to(device)
    target_net = RL.DQN().to(device)
    if load_model:
        policy_net.load_state_dict(
            torch.load(load_model_path_prefix + "_policy_net.mdl"))
        target_net.load_state_dict(
            torch.load(load_model_path_prefix + "_target_net.mdl"))
    else:
        target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
    memory = RL.ReplayMemory(MEMORY_SIZE)

    # Set up the game environment.
    game = FlappyBird.FlappyBird(pipe_gap=PIPEGAP)
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              reward_values=reward_system)

    # Set up the plot.
    RLplot.plot_init()
    episode_durations = []

    # Main loop with game execution.
    env.init()
    steps_done = 0
    infinity = False
    for i_episode in range(num_episodes):
        # Initialize the environment and state.
        env.reset_game()
        state = env.getScreenRGB()
        state = RLip.BCHW_format(state)
        frames = (state, state, state, state)
        state = RLip.last_4_frames(state, frames[1], frames[2], frames[3])
        for t in count():
            # Select an action.
            action, steps_done = RL.select_action(state, policy_net, steps_done,
                                                  device, EPS_START, EPS_END,
                                                  EPS_DECAY, OBSERVE)
            if steps_done % FRAME_PER_ACTION != 0:
                action = torch.tensor([[1]], device=device, dtype=torch.long)

            # Perform the action.
            reward = env.act(env.getActionSet()[action[0, 0]])
            next_state = env.getScreenRGB()
            done = env.game_over()
            reward = torch.tensor([reward], device=device)

            # Format the next state for the network.
            if not done:
                next_state = RLip.BCHW_format(next_state)
                frames = (next_state, frames[0], frames[1], frames[2])
                next_state = RLip.last_4_frames(next_state, frames[1],
                                                frames[2], frames[3])
            else:
                next_state = None

            # Store the transition in memory.
            memory.push(state, action, next_state, reward)

            # Move to the next state.
            state = next_state

            # Print a log of the training state.
            if steps_done <= OBSERVE:
                state_of_training = "observe"
            elif steps_done > OBSERVE and steps_done <= OBSERVE + EPS_DECAY:
                state_of_training = "explore"
            else:
                state_of_training = "train"
            print("TIMESTEP", steps_done, "/ STATE", state_of_training,
                  "/ ACTION", action[0, 0].data, "/ REWARD", reward[0].data,
                  "/ Expected_Q", expected_q_value)

            # Perform one step of the optimization (on the policy network).
            if steps_done > OBSERVE:
                RL.optimize_model(policy_net, target_net, memory, optimizer,
                                  device, BATCH_SIZE, GAMMA)
                if done:
                    episode_durations.append(t + 1)
                    RLplot.plot_durations(episode_durations)
                    break
                if t > 10000:
                    infinity = True
                    episode_durations.append(t + 1)
                    RLplot.plot_durations(episode_durations)
                    break
            else:
                if done:
                    break

        # Update the target network.
        if i_episode % TARGET_UPDATE == 0 and steps_done > OBSERVE:
            target_net.load_state_dict(policy_net.state_dict())
        if infinity:
            break

    # End of training: save the experiment result.
    data = {
        "data": episode_durations,
        'pipe_gap': PIPEGAP,
        'reward_values': reward_system,
        'BATCH_SIZE': BATCH_SIZE,
        'learning_rate': learning_rate,
        'MEMORY_SIZE': MEMORY_SIZE,
        'GAMMA': GAMMA,
        'EPS_START': EPS_START,
        'EPS_END': EPS_END,
        'EPS_DECAY': EPS_DECAY,
        'OBSERVE': OBSERVE,
        'FRAME_PER_ACTION': FRAME_PER_ACTION,
        'TARGET_UPDATE': TARGET_UPDATE,
        'num_episodes': num_episodes
    }
    filenameprefix = './result/Expe_' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S')
    filename = filenameprefix + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

    # Save the model if requested.
    if save_model:
        torch.save(policy_net.state_dict(), filenameprefix + '_policy_net.mdl')
        torch.save(target_net.state_dict(), filenameprefix + '_target_net.mdl')

    # Save the plot figure.
    plotname = filenameprefix + '.png'
    RLplot.plot_end(plotname)
class PLEFlappyBird():
    """
    PyGame Learning Environment wrapper for use only with FlappyBird.
    Does pre-processing specific to the FlappyBird game.
    """

    def __init__(self, render=False, seed=0, pipe_gap=100):
        self.seed = seed
        print('SEED: {}'.format(self.seed))
        game = FlappyBird(pipe_gap=pipe_gap)
        self.env = PLE(game, fps=30, display_screen=render, rng=seed)
        self.env.init()
        self.full_state = np.zeros((1, 4, 80, 80), dtype=np.uint8)
        self.frame_sleep = 0.02

    def _prepro(self, frame):
        """Pre-process a 288x512x3 uint8 frame into an 80x80 uint8 frame."""
        frame = frame[:, :, 2]      # drop to one color channel
        frame = frame.T             # rotate 90 degrees
        frame[frame == 140] = 0     # filter out the background shades
        frame[frame == 147] = 0
        frame[frame == 160] = 0
        frame[frame == 194] = 0
        frame[frame == 210] = 0
        frame[frame != 0] = 255     # set everything else to 255
        frame = cv2.resize(frame, (80, 80))  # downsample
        return frame

    def _add_frame(self, frame):
        """Push a single frame onto the 4-frame history stack."""
        self.full_state[:, 3, :, :] = self.full_state[:, 2, :, :]
        self.full_state[:, 2, :, :] = self.full_state[:, 1, :, :]
        self.full_state[:, 1, :, :] = self.full_state[:, 0, :, :]
        self.full_state[:, 0, :, :] = frame

    def reset(self):
        """Reset the environment."""
        self.env.reset_game()
        frame = self._prepro(self.env.getScreenRGB())
        frame = np.expand_dims(frame, axis=0)
        # Fill the whole history with the first frame.
        for _ in range(4):
            self._add_frame(frame)
        return self.full_state.copy()

    def step(self, action):
        """Take a step in the environment."""
        reward = self.env.act(action)
        frame = self._prepro(self.env.getScreenRGB())
        done = self.env.game_over()
        frame = np.expand_dims(frame, axis=0)
        self._add_frame(frame)
        return self.full_state.copy(), reward, done

    def render(self):
        """
        Render the environment to visualize the agent interacting.
        Does nothing because rendering is handled by setting
        display_screen=True when creating the PLE() object.
        """
        pass
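# Usage sketch (an assumption, not part of the original source): a random
# rollout over the PLEFlappyBird wrapper above. PLE's FlappyBird action set
# is [119 (flap), None (no-op)].
if __name__ == '__main__':
    env = PLEFlappyBird(render=False, seed=0)
    state = env.reset()  # (1, 4, 80, 80) uint8 frame stack
    done, score = False, 0.0
    while not done:
        action = 119 if np.random.rand() < 0.1 else None  # flap ~10% of frames
        state, reward, done = env.step(action)
        score += reward
    print('episode reward:', score)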
class OriginalGameEnv(gym.Env):
    def __init__(self, task={}):
        self._task = task
        os.environ['SDL_VIDEODRIVER'] = 'dummy'
        import importlib
        game_module = importlib.import_module('ple.games.originalgame')
        game = getattr(game_module, 'originalGame')()
        self.game_state = PLE(game, fps=30, display_screen=False)
        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        self.screen_width, self.screen_height = self.game_state.getScreenDims()
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(self.screen_width, self.screen_height, 3))
        self.num_actions = len(self._action_set)
        self.viewer = None

    def seed(self, seed=None):
        if not seed:
            seed = np.random.randint(2 ** 31 - 1)
        rng = np.random.RandomState(seed)
        self.game_state.rng = rng
        self.game_state.game.rng = self.game_state.rng
        self.game_state.init()
        return [seed]

    def reset_task(self, task):
        pass

    def render(self, mode='human'):
        img = self._get_image()
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)

    def reset(self):
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(self.screen_width, self.screen_height, 3))
        self.game_state.reset_game()
        state = self._get_image()
        return state

    def _get_image(self):
        # Hack to fix the rotated image returned by PLE.
        image_rotated = np.fliplr(np.rot90(self.game_state.getScreenRGB(), 3))
        return image_rotated

    def step(self, action):
        reward = self.game_state.act(self._action_set[action])
        state = self._get_image()
        terminal = self.game_state.game_over()
        return state, reward, terminal, {}
from ple.games.flappybird import FlappyBird
from ple import PLE
import random

game = FlappyBird()
p = PLE(game, fps=30, display_screen=True, force_fps=False)
p.init()

nb_frames = 1000
reward = 0.0
for f in range(nb_frames):
    if p.game_over():  # check if the game is over
        p.reset_game()
    obs = p.getScreenRGB()
    action = random.sample(p.getActionSet(), 1)[0]
    reward = p.act(action)
    print(action, reward)
# env1.reset()
# for _ in range(1000):
#     env.render()
#     env.step(env.action_space.sample())    # take a random action
#     env1.render()
#     env1.step(env1.action_space.sample())  # take a random action

# from ple.games.pong import Pong
# from ple import PLE
# game = Pong()
# p = PLE(game, fps=30, display_screen=True, force_fps=False)
# p.init()

from ple.games.flappybird import FlappyBird
from ple import PLE

game = FlappyBird()
p = PLE(game, fps=30, display_screen=True)
p.init()

nb_frames = 1000  # assumed value; nb_frames (and agent) were left undefined in the snippet
reward = 0.0
for i in range(nb_frames):
    if p.game_over():
        p.reset_game()
    observation = p.getScreenRGB()
    action = agent.pickAction(reward, observation)
    reward = p.act(action)
""" def __init__(self, actions): self.actions = actions def pickAction(self, reward, obs): return self.actions[np.random.randint(0, len(self.actions))] ################################### game = Doom(scenario="take_cover") env = PLE(game) agent = NaiveAgent(env.getActionSet()) env.init() reward = 0.0 for f in range(15000): #if the game is over if env.game_over(): env.reset_game() action = agent.pickAction(reward, env.getScreenRGB()) reward = env.act(action) if f > 2000: env.display_screen = True env.force_fps = False if f > 2250: env.display_screen = True env.force_fps = True
import numpy as np
import pygame
from pygame.locals import *


class TestAgent():
    def __init__(self, actions):
        self.actions = actions

    def doAction(self, reward, obs):
        # Act only while a key is held down; otherwise no-op.
        for event in pygame.event.get():
            if event.type == KEYDOWN:
                return self.actions[0]
        return None


game = RunningMinion()
# game = WaterWorld()
p = PLE(game, fps=30, display_screen=True)
agent = TestAgent(p.getActionSet())
p.init()
reward = 0.0
nb_frames = 2000

for i in range(nb_frames):
    if p.game_over():
        p.reset_game()
    obser = p.getScreenRGB()
    action = agent.doAction(reward, obser)
    reward = p.act(action)
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True,
        display_screen=True)
p.init()

episode_counter = 0
counter = 0  # counter to control the reduction of epsilon

# Store the previous observations in replay memory.
D = deque()

# First action: don't flap.
p.act(ACTIONS[0])
x_t = p.getScreenRGB()
terminal = p.game_over()

x_t = skimage.color.rgb2gray(x_t)
x_t = skimage.transform.resize(x_t, (80, 80))
x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))

s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
# In Keras, the batch dimension must be explicit: reshape to 1*80*80*4.
s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])

# Switch to training mode.
epsilon = INITIAL_EPSILON
t = 0
# Load the model (episode_reward: 1785.0).
save_path = r'.\model_dir\model_6700_2823.0.ckpt'
agent.restore(save_path)

obs = list(env.getGameState().values())
# # preprocess obs
# obs = preprocess(obs)
episode_reward = 0
while True:
    # Predict the action; always pick the greedy (best) one.
    action = agent.predict(obs)
    # The frames render too fast; sleep to slow playback if needed.
    # time.sleep(0.02)  # delay in seconds

    # Show the score in a separate window.
    observation = env.getScreenRGB()
    score = env.score()
    # Convert RGB to BGR for OpenCV.
    observation = cv2.cvtColor(observation, cv2.COLOR_RGB2BGR)
    # Rotate 90 degrees.
    observation = cv2.transpose(observation)
    font = cv2.FONT_HERSHEY_SIMPLEX
    observation = cv2.putText(observation, "score:" + str(int(score)),
                              (0, 30), font, 0.6, (0, 0, 255), 2)
    cv2.imshow("flappybird", observation)
    cv2.waitKey(5)

    reward = env.act(actionset[action])
    obs = list(env.getGameState().values())
    # # preprocess obs
    # obs = preprocess(obs)
    # (Tail of a flap-decision helper; its def header is not part of this snippet.)
    if next_pipe_bottom_y - 8 < player_pos_y:
        return True
    return False


agent = NaiveAgent(p.getActionSet())
print(p.getActionSet())
reward = 0.0

for i in range(nb_frames):
    if p.game_over():
        p.reset_game()
    observation = p.getScreenRGB()
    action = agent.pickAction(reward, observation)
    reward = p.act(action)
    state = game.getGameState()
    player_y = state["player_y"]
    distance = state["next_pipe_dist_to_player"]
    width = state["next_pipe_width"]
    # player_x = state["player_x"]
    pipe_x = state["next_pipe_x"]
    # dist = previousState["next_pipe_dist_to_player"]
    print(distance)
    print(width)
    # print(player_x)
    print(pipe_x)
    # print(player_y)
# You're not allowed to change this file.
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

# Use "fancy" for the full background with random bird and pipe colors;
# use "fixed" (default) for a black background and constant colors.
game = FlappyBird(graphics="fixed")
# Note: if you want to see your agent act in real time, set force_fps to
# False. But don't use this setting for learning, just for display purposes.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False,
        display_screen=True)

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    while not p.game_over():
        state = game.getGameState()
        screen = p.getScreenRGB()
        action = FlappyPolicy(state, screen)  # Your job is to define this function.
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
class Agent:
    LEARNING_RATE = 1e-6
    BATCH_SIZE = 32
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    MEMORIES = deque()
    MEMORY_SIZE = 300
    COPY = 1000
    T_COPY = 0
    INITIAL_IMAGES = np.zeros((80, 80, 4))
    # Based on the documentation, the state features have 8 dimensions.
    # The output has 2 dimensions: 0 = do nothing, 1 = jump.

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.model = Model(self.OUTPUT_SIZE, self.LEARNING_RATE)
        self.model_negative = Model(self.OUTPUT_SIZE, self.LEARNING_RATE)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.trainable = tf.trainable_variables()
        self.rewards = []

    def _assign(self):
        # Copy the online network's weights into the target network.
        for i in range(len(self.trainable) // 2):
            assign_op = self.trainable[i + len(self.trainable) // 2].assign(
                self.trainable[i])
            self.sess.run(assign_op)  # the original called a bare sess.run

    def _memorize(self, state, action, reward, new_state, dead):
        self.MEMORIES.append((state, action, reward, new_state, dead))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _get_image(self, image):
        r, g, b = image[:, :, 0], image[:, :, 1], image[:, :, 2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return imresize(gray, size=(80, 80))

    def _select_action(self, state):
        if np.random.rand() < self.EPSILON:
            action = np.random.randint(self.OUTPUT_SIZE)
        else:
            action = self.get_predicted_action([state])
        return action

    def _construct_memories(self, replay):
        # Double-DQN targets: the online network picks the argmax action,
        # the target ("negative") network evaluates it.
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.predict(states)
        Q_new = self.predict(new_states)
        Q_new_negative = self.sess.run(
            self.model_negative.logits,
            feed_dict={self.model_negative.X: new_states})
        replay_size = len(replay)
        X = np.empty((replay_size, 80, 80, 4))
        Y = np.empty((replay_size, self.OUTPUT_SIZE))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, dead_r = replay[i]
            target = Q[i]
            target[action_r] = reward_r
            if not dead_r:
                target[action_r] += self.GAMMA * Q_new_negative[
                    i, np.argmax(Q_new[i])]
            X[i] = state_r
            Y[i] = target
        return X, Y

    def predict(self, inputs):
        return self.sess.run(self.model.logits,
                             feed_dict={self.model.X: inputs})

    def save(self, checkpoint_name):
        self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_predicted_action(self, sequence):
        prediction = self.predict(np.array(sequence))[0]
        return np.argmax(prediction)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            state = self._get_image(self.env.getScreenRGB())
            for k in range(self.INITIAL_IMAGES.shape[2]):
                self.INITIAL_IMAGES[:, :, k] = state
            dead = False
            while not dead:
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign()
                action = self._select_action(self.INITIAL_IMAGES)
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                state = self._get_image(self.env.getScreenRGB())
                new_state = np.append(state.reshape([80, 80, 1]),
                                      self.INITIAL_IMAGES[:, :, :3], axis=2)
                dead = self.env.game_over()
                self._memorize(self.INITIAL_IMAGES, action, reward,
                               new_state, dead)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                X, Y = self._construct_memories(replay)
                # The graph nodes live on the Model objects; the original
                # referenced self.cost/self.optimizer/self.X/self.Y, which
                # this Agent never defines.
                cost, _ = self.sess.run(
                    [self.model.cost, self.model.optimizer],
                    feed_dict={self.model.X: X, self.model.Y: Y})
                self.INITIAL_IMAGES = new_state
                self.T_COPY += 1
            self.rewards.append(total_reward)
            self.EPSILON = self.MIN_EPSILON + (
                1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
            if (i + 1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
def evaluate(agent1, agent2, agent3):
    input("Press Enter to start the match")
    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
    frame_number = 0
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    eval_reward = []
    for i in range(5):
        output_movie = cv2.VideoWriter(videoname + '_' + str(i) + '.mp4',
                                       fourcc, 20, (288, 512))
        env.init()
        env.reset_game()
        dstate = env.getGameState()
        obs = list(dstate.values())
        last_obs = np.zeros_like(obs[0:8])
        episode_reward = 0
        while True:
            # Each agent sees its own 8-feature slice of the observation.
            obs1 = obs[0:8]
            obs2 = obs[8:16]
            obs3 = obs[16:24]
            action1 = agent1.predict(obs1)
            action2 = agent2.predict(obs2)
            action3 = agent3.predict(last_obs, obs3)
            # Pack the three binary decisions into a single bitmask action.
            finalaction = 0
            if action1 == 0:
                finalaction += 1
            if action2 == 0:
                finalaction += 2
            if action3 == 0:
                finalaction += 4
            if finalaction == 0:
                finalaction = None
            # Overlay the score, record the frame, and display it.
            score = env.score()
            observation = env.getScreenRGB()
            observation = cv2.transpose(observation)
            font = cv2.FONT_HERSHEY_SIMPLEX
            observation = cv2.putText(observation, str(int(score)), (0, 25),
                                      font, 1.2, (255, 255, 255), 2)
            ss = observation.shape
            observation = cv2.resize(observation, (ss[1] * 2, ss[0] * 2))
            output_movie.write(observation)
            cv2.imshow("ss", observation)
            cv2.waitKey(30)
            # Take the greedy joint action.
            reward = env.act(finalaction)
            last_obs = obs3
            dstate = env.getGameState()
            obs = list(dstate.values())
            done = env.game_over()
            episode_reward += reward
            if done:
                break
        eval_reward.append(episode_reward)
    cv2.destroyAllWindows()
    output_movie.release()
    input()
    return np.mean(eval_reward)
import FlappyPolicy

game = FlappyBird()
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False,
        display_screen=True)
p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    while not p.game_over():
        state = game.getGameState()
        screen = p.getScreenRGB()
        action = FlappyPolicy(state, screen)  # Your job is to define this function.
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
# The head of this call was missing from the snippet; it is reconstructed
# to match the parallel example below that uses the same keyword arguments.
env = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
          force_fps=force_fps, display_screen=display_screen)

# our Naive agent!
agent = NaiveAgent(env.getActionSet())

# init agent and game.
env.init()

# lets do a random number of NOOP's
for i in range(np.random.randint(0, max_noops)):
    reward = env.act(env.NOOP)

# start our training loop
for f in range(nb_frames):
    # if the game is over
    if env.game_over():
        env.reset_game()
    obs = env.getScreenRGB()
    action = agent.pickAction(reward, obs)
    reward = env.act(action)
    # if f % 50 == 0:
    #     p.saveScreen("tmp/screen_capture.png")
    print(f)
    if f > 50:
        env.display_screen = True
        env.force_fps = True
class Agent:
    LEARNING_RATE = 1e-6
    BATCH_SIZE = 32
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    MEMORIES = deque()
    MEMORY_SIZE = 300
    INITIAL_IMAGES = np.zeros((80, 80, 4))
    # Based on the documentation, the state features have 8 dimensions.
    # The output has 2 dimensions: 0 = do nothing, 1 = jump.

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState

        def conv_layer(x, conv, stride=1):
            return tf.nn.conv2d(x, conv, [1, stride, stride, 1],
                                padding='SAME')

        def pooling(x, k=2, stride=2):
            return tf.nn.max_pool(x, ksize=[1, k, k, 1],
                                  strides=[1, stride, stride, 1],
                                  padding='SAME')

        self.X = tf.placeholder(tf.float32, [None, 80, 80, 4])
        self.Y = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE])
        w_conv1 = tf.Variable(tf.truncated_normal([8, 8, 4, 32], stddev=0.1))
        b_conv1 = tf.Variable(tf.truncated_normal([32], stddev=0.01))
        conv1 = tf.nn.relu(conv_layer(self.X, w_conv1, stride=4) + b_conv1)
        pooling1 = pooling(conv1)
        w_conv2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.1))
        b_conv2 = tf.Variable(tf.truncated_normal([64], stddev=0.01))
        conv2 = tf.nn.relu(conv_layer(pooling1, w_conv2, stride=2) + b_conv2)
        w_conv3 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=0.1))
        b_conv3 = tf.Variable(tf.truncated_normal([64], stddev=0.01))
        conv3 = tf.nn.relu(conv_layer(conv2, w_conv3) + b_conv3)
        pulling_size = int(conv3.shape[1]) * int(conv3.shape[2]) * int(
            conv3.shape[3])
        conv3 = tf.reshape(conv3, [-1, pulling_size])

        # Dueling head: split the features into an advantage ("action")
        # stream and a value ("validation") stream.
        self.tensor_action, self.tensor_validation = tf.split(conv3, 2, 1)
        w_action1 = tf.Variable(
            tf.truncated_normal([pulling_size // 2, 256], stddev=0.1))
        w_action2 = tf.Variable(
            tf.truncated_normal([256, self.OUTPUT_SIZE], stddev=0.1))
        w_validation1 = tf.Variable(
            tf.truncated_normal([pulling_size // 2, 256], stddev=0.1))
        w_validation2 = tf.Variable(tf.truncated_normal([256, 1], stddev=0.1))
        fc_action1 = tf.nn.relu(tf.matmul(self.tensor_action, w_action1))
        fc_action2 = tf.matmul(fc_action1, w_action2)
        fc_validation1 = tf.nn.relu(
            tf.matmul(self.tensor_validation, w_validation1))
        # The original multiplied fc_validation2 by w_validation2 before
        # fc_validation2 existed; the first factor must be fc_validation1.
        fc_validation2 = tf.matmul(fc_validation1, w_validation2)
        self.logits = fc_validation2 + tf.subtract(
            fc_action2, tf.reduce_mean(fc_action2, axis=1, keep_dims=True))
        self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.LEARNING_RATE).minimize(self.cost)

        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []

    def _memorize(self, state, action, reward, new_state, dead):
        self.MEMORIES.append((state, action, reward, new_state, dead))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _get_image(self, image):
        r, g, b = image[:, :, 0], image[:, :, 1], image[:, :, 2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return imresize(gray, size=(80, 80))

    def _select_action(self, state):
        if np.random.rand() < self.EPSILON:
            action = np.random.randint(self.OUTPUT_SIZE)
        else:
            action = self.get_predicted_action([state])
        return action

    def _construct_memories(self, replay):
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.predict(states)
        Q_new = self.predict(new_states)
        replay_size = len(replay)
        X = np.empty((replay_size, 80, 80, 4))
        Y = np.empty((replay_size, self.OUTPUT_SIZE))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, dead_r = replay[i]
            target = Q[i]
            target[action_r] = reward_r
            if not dead_r:
                target[action_r] += self.GAMMA * np.amax(Q_new[i])
            X[i] = state_r
            Y[i] = target
        return X, Y

    def predict(self, inputs):
        return self.sess.run(self.logits, feed_dict={self.X: inputs})

    def save(self, checkpoint_name):
        self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_predicted_action(self, sequence):
        prediction = self.predict(np.array(sequence))[0]
        return np.argmax(prediction)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            state = self._get_image(self.env.getScreenRGB())
            for k in range(self.INITIAL_IMAGES.shape[2]):
                self.INITIAL_IMAGES[:, :, k] = state
            dead = False
            while not dead:
                action = self._select_action(self.INITIAL_IMAGES)
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                state = self._get_image(self.env.getScreenRGB())
                new_state = np.append(state.reshape([80, 80, 1]),
                                      self.INITIAL_IMAGES[:, :, :3], axis=2)
                dead = self.env.game_over()
                self._memorize(self.INITIAL_IMAGES, action, reward,
                               new_state, dead)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                X, Y = self._construct_memories(replay)
                cost, _ = self.sess.run([self.cost, self.optimizer],
                                        feed_dict={self.X: X, self.Y: Y})
                self.INITIAL_IMAGES = new_state
            self.rewards.append(total_reward)
            self.EPSILON = self.MIN_EPSILON + (
                1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
            if (i + 1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
reward = 0.0
max_noops = 20
nb_frames = 15000

# make a PLE instance.
p = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
        force_fps=force_fps, display_screen=display_screen)

# our Naive agent!
agent = NaiveAgent(p.getActionSet())

# init agent and game.
p.init()

# lets do a random number of NOOP's
for i in range(np.random.randint(0, max_noops)):
    reward = p.act(p.NOOP)

# start our training loop
for f in range(nb_frames):
    # if the game is over
    if p.game_over():
        p.reset_game()
    obs = p.getScreenRGB()
    action = agent.pickAction(reward, obs)
    reward = p.act(action)
    if f % 50 == 0:
        p.saveScreen("screen_capture.png")
class PLEEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, game_name='FlappyBird', display_screen=True):
        # Open up a game state to communicate with the emulator.
        import importlib
        game_module_name = ('ple.games.%s' % game_name).lower()
        game_module = importlib.import_module(game_module_name)
        game = getattr(game_module, game_name)()
        self.game_state = PLE(game, fps=30, display_screen=display_screen)
        self.game_state.init()
        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        self.screen_width, self.screen_height = self.game_state.getScreenDims()
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(self.screen_width, self.screen_height, 3))
        self.viewer = None

    def _step(self, a):
        reward = self.game_state.act(self._action_set[a])
        state = self._get_image()
        terminal = self.game_state.game_over()
        return state, reward, terminal, {}

    def _get_image(self):
        # Hack to fix the rotated image returned by PLE.
        image_rotated = np.fliplr(np.rot90(self.game_state.getScreenRGB(), 3))
        return image_rotated

    @property
    def _n_actions(self):
        return len(self._action_set)

    def _reset(self):
        # Returns the initial observation (the raw screen image).
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(self.screen_width, self.screen_height, 3))
        self.game_state.reset_game()
        state = self._get_image()
        return state

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        img = self._get_image()
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)

    def _seed(self, seed):
        rng = np.random.RandomState(seed)
        self.game_state.rng = rng
        self.game_state.game.rng = self.game_state.rng
        self.game_state.init()
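# Usage sketch (an assumption, not part of the original source): the
# underscored methods follow the old gym (pre-0.9) convention, where they
# back reset()/step()/render(); here they are simply called directly.
if __name__ == '__main__':
    env = PLEEnv(game_name='FlappyBird', display_screen=False)
    env._seed(0)
    state = env._reset()
    done, total = False, 0.0
    while not done:
        state, reward, done, info = env._step(env.action_space.sample())
        total += reward
    print('episode reward:', total)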
class Agent:
    MEMORY_SIZE = 300
    BATCH = 32
    POPULATION_SIZE = 15
    SIGMA = 0.1
    LEARNING_RATE = 0.03
    EPSILON = 1
    MIN_EPSILON = 0.1
    WATCHING = 10000
    FEATURES = 8
    GAMMA = 0.99
    MEMORIES = deque()
    INITIAL_IMAGES = np.zeros((80, 80, 4))
    # Based on the documentation, the state features have 8 dimensions.

    def __init__(self, model, screen=False, forcefps=True):
        self.model = model
        self.game = MonsterKong()
        self.env = PLE(self.game, fps=30, display_screen=screen,
                       force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.es = Deep_Evolution_Strategy(self.model.get_weights(),
                                          self.get_reward,
                                          self.POPULATION_SIZE, self.SIGMA,
                                          self.LEARNING_RATE)

    def _get_image(self, image):
        r, g, b = image[:, :, 0], image[:, :, 1], image[:, :, 2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return imresize(gray, size=(80, 80))

    def _map_action(self, action):
        # Map a discrete action index onto a pygame key code.
        if action == 0:
            return 97   # 'a'
        if action == 1:
            return 100  # 'd'
        if action == 2:
            return 119  # 'w'
        if action == 3:
            return 115  # 's'
        if action == 4:
            return 32   # space

    def _memorize(self, state, action, reward, new_state, done):
        self.MEMORIES.append((state, action, reward, new_state, done))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _construct_memories(self, replay):
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.model.predict(states)
        Q_new = self.model.predict(new_states)
        replay_size = len(replay)
        X = np.empty((replay_size, 80, 80, 4))
        Y = np.empty((replay_size, 5))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, done_r = replay[i]
            target = Q[i]
            target[action_r] = reward_r
            if not done_r:
                target[action_r] += self.GAMMA * np.amax(Q_new[i])
            X[i] = state_r
            Y[i] = target
        return X, Y

    def get_predicted_action(self, sequence):
        if random.random() > self.EPSILON:
            prediction = np.argmax(self.model.predict(np.array(sequence))[0])
        else:
            prediction = np.random.randint(5)
        self.EPSILON -= (self.EPSILON / self.WATCHING)
        return prediction

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def save(self, checkpoint_name):
        with open('%s-weight.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.model.get_weights(), fopen)

    def load(self, checkpoint_name):
        with open('%s-weight.p' % (checkpoint_name), 'rb') as fopen:
            self.model.set_weights(pickle.load(fopen))

    def get_reward(self, weights):
        # Fitness function for the evolution strategy: play one game with the
        # candidate weights, then return the negative TD error on a replay batch.
        self.model.set_weights(weights)
        self.env.reset_game()
        state = self._get_image(self.env.getScreenRGB())
        for i in range(self.INITIAL_IMAGES.shape[2]):
            self.INITIAL_IMAGES[:, :, i] = state
        dead = False
        while not dead:
            action = self.get_predicted_action([self.INITIAL_IMAGES])
            real_action = self._map_action(action)
            reward = self.env.act(real_action)
            reward += random.choice([0.0001, -0.0001])  # tiny noise to break ties
            state = self._get_image(self.env.getScreenRGB())
            new_state = np.append(state.reshape([80, 80, 1]),
                                  self.INITIAL_IMAGES[:, :, :3], axis=2)
            dead = self.env.game_over()
            self._memorize(self.INITIAL_IMAGES, action, reward, new_state, dead)
            self.INITIAL_IMAGES = new_state
        batch_size = min(len(self.MEMORIES), self.BATCH)
        replay = random.sample(self.MEMORIES, batch_size)
        X, Y = self._construct_memories(replay)
        actions = self.model.predict(X)
        return -np.mean(np.square(Y - actions))

    def fit(self, iterations, checkpoint):
        self.es.train(iterations, print_every=checkpoint)

    def play(self, debug=False, not_realtime=False):
        total_reward = 0.0
        current_reward = 0
        self.env.force_fps = not_realtime
        self.env.reset_game()
        state = self._get_image(self.env.getScreenRGB())
        for k in range(self.INITIAL_IMAGES.shape[2]):
            self.INITIAL_IMAGES[:, :, k] = state
        done = False
        while not done:
            # The original called a nonexistent self.predict and used
            # FlappyBird-style actions (119/None); route through the model
            # and the 5-way MonsterKong mapping instead, and roll the frame
            # stack forward so the policy sees fresh observations.
            action = np.argmax(self.model.predict(
                np.array([self.INITIAL_IMAGES]))[0])
            real_action = self._map_action(action)
            if debug and total_reward > current_reward:
                print('action %d, total rewards: %f' % (action, total_reward))
                current_reward = total_reward
            total_reward += self.env.act(real_action)
            state = self._get_image(self.env.getScreenRGB())
            self.INITIAL_IMAGES = np.append(state.reshape([80, 80, 1]),
                                            self.INITIAL_IMAGES[:, :, :3],
                                            axis=2)
            done = self.env.game_over()
        print('game over!')