class PLEEnv(Env):
    def __init__(self, game, _id, render=True, reset_done=True, num_steps=100):
        super().__init__(_id, render, reset_done)
        self.num_steps = num_steps
        self.game = game
        self.start()

    def start(self):
        if not self.env_instance:
            self.env_instance = PLE(self.game, fps=30, display_screen=self.render)
            self.env_instance.init()

    def step(self, action):
        reward = self.env_instance.act(action)
        obs = self.env_instance.getGameState()
        done = self.env_instance.game_over()
        return obs, reward, done

    def reset(self):
        self.env_instance.reset_game()
        obs = self.env_instance.getGameState()
        return obs

    def close(self):
        pass

    def restart(self):
        self.close()
        self.reset()
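# A minimal usage sketch for PLEEnv (illustrative, not part of the original):
# it assumes the Env base class stores _id/render/reset_done and leaves
# env_instance set to None before start() runs, and that a PLE game object
# such as FlappyBird is available.
from ple.games.flappybird import FlappyBird

env = PLEEnv(FlappyBird(), _id=0, render=False)
obs = env.reset()
obs, reward, done = env.step(None)  # None is a valid no-op action in PLE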
def evaluate(agent):
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    eval_reward = []
    for i in range(5):
        env.init()
        env.reset_game()
        obs = list(env.getGameState().values())
        episode_reward = 0
        while True:
            action = agent.predict(obs)  # predict the action; always pick the greedy one
            observation = env.getScreenRGB()
            score = env.score()
            # action = agent.pickAction(reward, observation)
            observation = cv2.transpose(observation)
            font = cv2.FONT_HERSHEY_SIMPLEX
            observation = cv2.putText(observation, str(int(score)), (0, 25),
                                      font, 1.2, (255, 255, 255), 2)
            cv2.imshow("ss", observation)
            cv2.waitKey(10)
            reward = env.act(actionset[action])
            obs = list(env.getGameState().values())
            done = env.game_over()
            episode_reward += reward
            if done:
                break
        eval_reward.append(episode_reward)
    cv2.destroyAllWindows()
    return np.mean(eval_reward)
class PleEnvAdapter(EnvAdapter):
    """Pygame Learning Environment adapter."""

    def __init__(self, *args, **kwargs):
        super(PleEnvAdapter, self).__init__(*args, **kwargs)
        if not self.render:
            os.putenv('SDL_VIDEODRIVER', 'fbcon')
            os.environ["SDL_VIDEODRIVER"] = "dummy"
        Game = envs_lookup_table[self.env_name]
        self.env = PLE(Game(), display_screen=self.render, force_fps=not self.render)
        self.env.init()

    def get_input_shape(self):
        return (len(self.env.getGameState()),)

    def reset(self):
        self.env.reset_game()

    def step(self, action) -> (object, float, bool):
        reward = self.env.act(self.env.getActionSet()[action])
        observation = self.env.getGameState()
        observation = [val for key, val in observation.items()]
        done = self.env.game_over()
        return observation, reward, done

    def get_n_actions(self) -> int:
        return len(self.env.getActionSet())

    def get_random_action(self):
        return random.randint(0, len(self.env.getActionSet()) - 1)
def test_movement_up():
    game = Pong()
    p = PLE(game, display_screen=True, fps=20, force_fps=1)
    p.init()
    time.sleep(.5)
    oldState = p.getGameState()
    p.act(game.actions["up"])
    newState = p.getGameState()
    # moving up makes the y-velocity more negative
    assert oldState["player_velocity"] > newState["player_velocity"]
class Game(gym.Env):
    def __init__(self, display_screen=False, force_fps=True):
        os.environ["SDL_VIDEODRIVER"] = "dummy"
        game = FlappyBird()
        # define and initialize the environment
        self.env = PLE(game, fps=30, display_screen=display_screen, force_fps=force_fps)
        self.env.init()
        # list of actions in the environment
        self.actions = self.env.getActionSet()
        # number of actions
        self.action_space = spaces.Discrete(len(self.actions))

    def step(self, action):
        """Take the chosen action and compute the reward."""
        reward = self.env.act(self.actions[action])
        state = self.getGameState()
        terminal = self.env.game_over()
        # Crashing ends the game with a reward of -1000;
        # surviving the frame gives +1.
        if terminal:
            reward = -1000
        else:
            reward = 1
        return state, reward, terminal, {}

    def getGameState(self):
        '''PLE returns the game state as a dictionary. Return a reduced form
        with only the information needed to define the state.'''
        state = self.env.getGameState()
        h_dist = state['next_pipe_dist_to_player']
        v_dist = state['next_pipe_bottom_y'] - state['player_y']
        vel = state['player_vel']
        return ' '.join([str(vel), str(h_dist), str(v_dist)])

    def reset(self):
        """Reset the environment to start a new game."""
        self.env.reset_game()
        # return the reduced state, for consistency with step()
        # (the original returned the raw state dict here)
        return self.getGameState()

    def seed(self, seed):
        rng = np.random.RandomState(seed)
        self.env.rng = rng
        self.env.game.rng = self.env.rng
        self.env.init()
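# A short rollout sketch for the gym wrapper above; the random policy and the
# headless setting are illustrative only.
env = Game(display_screen=False)
state = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # sample a random action index
    state, reward, done, info = env.step(action)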
def test():
    # create the environment
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env.getGameState())
    act_dim = len(env.getActionSet())
    print('action set:', env.getActionSet())
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # create the replay buffer (DQN experience replay)
    rpm = ReplayMemory(MEMORY_SIZE)

    # build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm, obs_dim=obs_dim, act_dim=act_dim,
                  e_greed=0.3, e_greed_decrement=1e-6)

    # load the model
    save_path = './DQN/checkpoints/episode_V14600.ckpt'
    print('checkpoints:', save_path)
    if os.path.exists(save_path):
        logger.info('load ckpt success!')
        agent.restore(save_path)
    else:
        logger.error('load ckpt error!')

    action_set = env.getActionSet()
    env.init()
    episode_reward = 0
    steps = 0
    while not env.game_over():
        steps += 1
        if steps == 1:  # skip the first frame after init
            continue
        obs = list(env.getGameState().values())
        action_idx = agent.predict(obs)  # always pick the greedy action
        act = action_set[action_idx]
        reward = env.act(act)
        episode_reward += reward
        reward_str = str(int(episode_reward))
        drawText(env.game.screen, reward_str, 288, 0, 48, (255, 0, 0), (255, 255, 255))
    env.reset_game()
    logger.info('[Test] steps:{}, reward:{}'.format(steps, episode_reward))
def main():
    env = FlappyBird()
    penv = PLE(env, fps=30, display_screen=True, force_fps=True)
    # penv.init()
    np.random.seed(0)
    obs_shape = len(penv.getGameState())
    IMG_shape = penv.getScreenGrayscale().shape
    action_dim = len(penv.getActionSet())
    print(obs_shape, action_dim)

    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.15,  # explore 0.1
        e_greed_decrement=1e-6)  # probability of exploring decreases during training

    # load the model
    if os.path.exists('./dqn_model.ckpt'):
        save_path = './dqn_model.ckpt'
        agent.restore(save_path)
        print("model loaded successfully")

    eval_reward = evaluate(agent, penv)
class Env:
    def __init__(self):
        # initializing the instance of FlappyBird class
        self.game = FlappyBird(pipe_gap=100)
        # then pass this object into the PLE constructor and create an instance of that
        self.env = PLE(self.game, fps=30, display_screen=False)
        # init does some necessary things under the hood
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary
        self.action_map = self.env.getActionSet()

    # function which takes an action
    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value; we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
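# A minimal random-policy loop for the Env wrapper above; the action choice is
# illustrative only (index 0/1 maps through action_map to PLE's actions).
env = Env()
obs = env.reset()
done = False
while not done:
    action = np.random.randint(len(env.action_map))
    obs, reward, done = env.step(action)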
class Env:
    def __init__(self):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=True)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary

        # by convention we want to use (0, 1),
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]

    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        # don't bother returning an info dictionary like gym
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value; we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
def main_test():
    final_score = 0
    previous_action = 1
    model = build_neural_network_model()  # overwritten by load_model() below
    game = FlappyBird(width=288, height=512, pipe_gap=100)
    env = PLE(game, fps=30, display_screen=True, state_preprocessor=process_state)
    model = load_model("model.h5")
    env.init()
    passed = 0
    old_y = 0
    for i in range(game_steps):
        if i == game_steps - 1:
            print("Score: {}".format(final_score))
        if env.game_over():
            print("Final Score: {}".format(final_score))
            time.sleep(1)
            final_score = 0
            env.reset_game()
        observation = env.getGameState()
        vector = model.predict(np.matrix(list(observation[0].values())))
        a_star = np.argmax(vector[0])
        print(vector[0][0], vector[0][1], a_star)
        time.sleep(0.05)
        env_reward = env.act(env.getActionSet()[a_star])
        if env_reward == 1:
            final_score += 1
class Env:
    def __init__(self):
        self.game = FlappyBird(pipe_gap=110)
        self.env = PLE(self.game, fps=30, display_screen=False)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary

        # by convention we want to use (0, 1),
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]

    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value; we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
def main():
    env = FlappyBird()
    penv = PLE(env, fps=30, display_screen=True, force_fps=True)
    # penv.init()
    np.random.seed(0)
    obs_shape = len(penv.getGameState())
    IMG_shape = penv.getScreenGrayscale().shape
    action_dim = len(penv.getActionSet())
    print(obs_shape, action_dim)

    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.15,  # explore 0.1
        e_greed_decrement=1e-6)  # probability of exploring decreases during training

    # load the model
    if os.path.exists('./dqn_model.ckpt'):
        save_path = './dqn_model.ckpt'
        agent.restore(save_path)
        print("model loaded successfully")

    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, penv, rpm)

    max_episode = 1000

    # start training
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, penv, rpm)
            episode += 1
        eval_reward = evaluate(agent, penv)
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))
        # save a checkpoint
        save_path = './model/dqn_model_{}_{}.ckpt'.format(episode, eval_reward)
        agent.save(save_path)

    # training finished, save the final model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
def test():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env.getGameState())
    action_dim = 2  # only the up key plus a no-op, so 2 actions

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_dim,
        act_dim=action_dim,
        e_greed=0.2,  # explore
        e_greed_decrement=1e-6)

    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')

    # test part: run 5 episodes and average
    eval_reward = []
    for i in range(5):
        env.init()
        episode_reward = 0
        isOver = False
        step = 0
        while not isOver:
            if step == 0:
                reward = env.act(None)
                done = False
            else:
                time.sleep(0.01)  # Windows runs too fast, slow it down
                obs = list(env.getGameState().values())
                action = agent.predict(obs)
                if action == 1:
                    act = actions["up"]
                else:
                    act = None
                reward = env.act(act)
                isOver = env.game_over()
            episode_reward += reward
            step += 1
        eval_reward.append(episode_reward)
        if step > MAX_STEP:
            break
        env.reset_game()
    return np.mean(eval_reward)
def train():
    # create the environment
    game = FlappyBird()
    env_1 = PLE(game, fps=30, display_screen=False)
    env_2 = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env_1.getGameState())
    act_dim = len(env_1.getActionSet())
    print('action set:', env_1.getActionSet())
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # create the replay buffer (DQN experience replay)
    rpm = ReplayMemory(MEMORY_SIZE)

    # build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm, obs_dim=obs_dim, act_dim=act_dim,
                  e_greed=0.3, e_greed_decrement=1e-6)

    # load the model
    save_path = './flappybird.ckpt'
    if os.path.exists(save_path):
        agent.restore(save_path)

    # pre-fill the replay buffer so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env_1, agent, rpm)

    max_episode = 2000

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train
        for i in range(0, 100):
            total_reward, steps = run_episode(env_1, agent, rpm)
            episode += 1

        # test
        eval_reward, steps = evaluate(env_2, agent)
        logger.info('[episode:{}], e_greed:{:.6f}, steps:{}, test_reward:{}'.format(
            episode, agent.e_greed, steps, eval_reward))

        # save a checkpoint
        ckpt = './models/episode_{}.ckpt'.format(episode)
        agent.save(ckpt)

    # training finished, save the final model
    save_path = './flappybird.ckpt'
    agent.save(save_path)
def test():
    game = Snake(600, 600)
    p = PLE(game, fps=60, state_preprocessor=process_state, force_fps=True,
            display_screen=True, frame_skip=2,
            reward_values={
                "positive": 100.0,
                "negative": -50.0,
                "tick": -0.1,
                "loss": -70.0,
                "win": 5.0
            })
    agent = Agent(alpha=float(sys.argv[1]), gamma=float(sys.argv[2]), n_actions=3,
                  epsilon=0.01, batch_size=100, input_shape=6, epsilon_dec=0.99999,
                  epsilon_end=0.001, memory_size=500000, file_name=sys.argv[3],
                  activations=[str(sys.argv[4]), str(sys.argv[5])])
    p.init()
    agent.load_game()
    scores = []
    for _ in range(200):
        if p.game_over():
            p.reset_game()
        apples = 0
        initial_direction = "Right"
        while not p.game_over():
            old_state = np.array(vision(list(p.getGameState()[0]), initial_direction))
            action = agent.choose_action(old_state)
            possible_directions = prepare_corect_directions(initial_direction)
            possible_directions_tuples = list(
                zip(possible_directions.keys(), possible_directions.values()))
            direction = possible_directions_tuples[action]
            initial_direction = direction[1]
            reward = p.act(direction[0])
            if reward > 50.0:
                apples += reward
        scores.append(apples)
    return scores
def main(train=False):
    # Don't modify anything in this function.
    # See the constants defined at the top of this file if you'd like to
    # change the FPS, screen size, or round length.
    game = Pong(width=WIDTH, height=HEIGHT, MAX_SCORE=MAX_SCORE)
    if train:
        p = PLE(game, fps=FPS, display_screen=False, force_fps=True)
    else:
        p = PLE(game, fps=FPS, display_screen=True, force_fps=False)
    p.init()

    agent_rounds = 0
    cpu_rounds = 0
    agent_score = 0
    cpu_score = 0
    num_frames = 0
    while True:
        if p.game_over():
            if game.score_counts['agent'] > game.score_counts['cpu']:
                agent_rounds += 1
                print('AGENT won round')
            else:
                cpu_rounds += 1
                print('CPU won round')
            if agent_rounds == NUM_ROUNDS or cpu_rounds == NUM_ROUNDS:
                break
            p.reset_game()
        obs = p.getGameState()
        action = agent(normalize(obs))
        reward = p.act(ACTION_MAP[action])
        if reward > 0:
            agent_score += 1
            print('AGENT scored')
        elif reward < 0:
            cpu_score += 1
            print('CPU scored')
        num_frames += 1

    winner = 'AGENT' if agent_rounds > cpu_rounds else 'CPU'
    print('Winner:', winner)
    print('Num frames :', num_frames)
    print('AGENT rounds won:', agent_rounds)
    print('CPU rounds won:', cpu_rounds)
    print('AGENT total score:', agent_score)
    print('CPU total score:', cpu_score)
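# The script above assumes agent, normalize, and ACTION_MAP are defined
# elsewhere. These stand-ins are hypothetical: a pass-through normalizer and a
# rule-based paddle that simply tracks the ball's y position.
import pygame

ACTION_MAP = {0: pygame.K_UP, 1: pygame.K_DOWN, 2: None}

def normalize(obs):
    # pass-through; a real version would rescale by WIDTH/HEIGHT
    return obs

def agent(obs):
    if obs['ball_y'] < obs['player_y']:
        return 0  # move up
    if obs['ball_y'] > obs['player_y']:
        return 1  # move down
    return 2      # stay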
def main():
    env = PLE(Pixelcopter(), fps=30, display_screen=True, state_preprocessor=None)
    action_dim = len(env.getActionSet())
    obs_shape = len(env.getGameState())

    rpm = ReplayMemory(MEMORY_SIZE)  # DQN experience replay buffer

    # build the agent with the PARL framework
    model = Model(act_dim=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.1,  # act randomly with some probability, to explore
        e_greed_decrement=1e-6)  # anneal exploration as training converges

    # load the model
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    # pre-fill the replay buffer so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 30000

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        eval_reward, max_reward = evaluate(env, agent, render=False)  # render=True to watch
        logger.info('episode:{} e_greed:{} test_reward:{} max_reward:{}'.format(
            episode, agent.e_greed, eval_reward, max_reward))

    # training finished, save the model
    save_path = './dqn_model.ckpt'
    agent.save(save_path)
def view_agent(agent):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    for i in range(200000):
        if p.game_over():
            p.reset_game()
        time.sleep(0.03)
        action = agent.pick_action(p.getGameState())
        p.act(action)
        if p.game_over():
            break
def main():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env.init()
    action_dim = len(env.getActionSet())
    obs_shape = len(env.getGameState())

    rpm = ReplayMemory(MEMORY_SIZE)

    model = FlappyBirdModel(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = FlappyBirdAgent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.2,
        e_greed_decrement=1e-6)  # probability of exploring decreases during training

    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    max_episode = 50000

    # start train
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 100):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        # evaluation part
        eval_reward = evaluate(agent, env)
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))

        # learning rate adjustment
        if episode % 100 == 0:
            if algorithm.lr >= 5e-4:
                algorithm.lr *= 0.995
            if algorithm.lr <= 5e-4 and algorithm.lr >= 1e-4:
                algorithm.lr *= 0.99
            print('learning rate:', algorithm.lr)

        # save model
        save_path = './fb_dqn_model.ckpt'
        agent.save(save_path)
def __init__(self, game, size=10, mutationrate=0.1, crossoverrate=0.1):
    p = PLE(game, fps=30, display_screen=False)
    p.init()
    self.population = np.array([
        NNAgent(inputs=p.getGameState().keys(), actions=p.getActionSet())
        for i in range(size)
    ])
    for agent in self.population:
        agent.fitness = None
    self.fitness = [None] * size  # was `[] * size`, which is always the empty list
    self.generation = 0
    self.popsize = size
    self.mutationrate = mutationrate
    self.crossoverrate = crossoverrate
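# Illustrative only: this __init__ presumably belongs to a genetic-algorithm
# population class (call it Population here); NNAgent and the evolution loop
# are assumed to be defined elsewhere.
from ple.games.flappybird import FlappyBird

population = Population(FlappyBird(), size=20, mutationrate=0.1, crossoverrate=0.1)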
def main():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env_evaluate = PLE(game, fps=30, display_screen=False)
    obs_dim = len(env.getGameState())
    action_dim = 2  # only the up key plus a no-op, so 2 actions

    # rpm = ReplayMemory(MEMORY_SIZE, obs_dim, action_dim)
    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_dim,
        act_dim=action_dim,
        e_greed=0.2,  # explore
        e_greed_decrement=1e-6)  # probability of exploring decreases during training

    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')

    # while rpm.size() < MEMORY_WARMUP_SIZE:  # warm up replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    max_episode = 5000

    # start train
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1
        eval_reward = evaluate(agent, env_evaluate)
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))
        agent.save('./model_dir')
def play(file_name, number_of_games=1):
    game = FlappyBird(width=game_width, height=game_height, pipe_gap=game_pipe_gap)
    p = PLE(game, display_screen=True, force_fps=False, frame_skip=6)
    p.init()
    network = Network()
    network.load(file_name, rename=False)
    for i in range(number_of_games):
        if i > 0:
            p.reset_game()
        while not p.game_over():
            state = p.getGameState()
            actions_q_values = network.Q(state).tolist()
            action_taken_index = np.argmax(actions_q_values)
            p.act(None if action_taken_index == 0 else 119)
def play_with_saved_agent(agent_file_path, agent_file_name, test_rounds=20):
    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              state_preprocessor=process_state)
    my_agent = load_agent(env, agent_file_path, agent_file_name)
    env.init()
    print("Testing model:", agent_file_name)
    total_reward = 0.0
    for _ in range(test_rounds):
        my_agent.start_episode()
        episode_reward = 0.0
        while not env.game_over():
            state = env.getGameState()
            reward, action = my_agent.act(state, epsilon=0.00)
            episode_reward += reward
        print("Agent score {:0.1f} reward for episode.".format(episode_reward))
        total_reward += episode_reward
        my_agent.end_episode()
    return total_reward / test_rounds
def main(w, seed=SEED, headless=False):
    """Let an agent play Flappy Bird."""
    if headless:
        display_screen = False
        force_fps = True
    else:
        display_screen = True
        force_fps = False
    game = PLE(FLAPPYBIRD, display_screen=display_screen, force_fps=force_fps, rng=seed)
    game.init()
    game.reset_game()
    FLAPPYBIRD.rng.seed(seed)

    agent_score = 0
    num_frames = 0
    while True:
        if game.game_over():
            break
        obs = game.getGameState()
        x = normalize(obs)
        action = agent(x, w)
        reward = game.act(ACTION_MAP[action])
        if reward > 0:
            agent_score += 1
        num_frames += 1
    print('Frames :', num_frames)
    print('Score :', agent_score)
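# Hypothetical stand-ins for the helpers main() relies on; none of these are
# from the original. A linear policy maps the normalized state to flap/no-op.
import numpy as np
from ple.games.flappybird import FlappyBird

FLAPPYBIRD = FlappyBird()
SEED = 0
ACTION_MAP = {0: None, 1: 119}  # 119 is pygame's K_w, the flap key in PLE

def normalize(obs):
    # crude scaling of the raw state dict; a real version would use
    # game-specific ranges
    x = np.array(list(obs.values()), dtype=np.float64)
    return x / (np.abs(x).max() + 1e-8)

def agent(x, w):
    # flap when the weighted sum of features is positive
    return int(np.dot(x, w) > 0)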
def play_with_saved_agent(agent_file_path, agent_file_name, test_rounds=20):
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              state_preprocessor=process_state)
    my_agent = load_agent(env, agent_file_path, agent_file_name)
    env.init()
    print("Testing model:", agent_file_name)
    total_reward = 0.0
    for _ in range(test_rounds):
        my_agent.start_episode()
        episode_reward = 0.0
        while not env.game_over():
            state = env.getGameState()
            reward, action = my_agent.act(state, epsilon=0.05)
            episode_reward += reward
        print("Agent score {:0.1f} reward for episode.".format(episode_reward))
        total_reward += episode_reward
        my_agent.end_episode()
    return total_reward / test_rounds
# coding: utf-8
from ple.games.pong import Pong
from ple import PLE
import numpy as np


def get_obs(env):
    # game_state = env.getGameState()
    # obs = list(game_state.values())
    obs = env.getScreenGrayscale() / 255.0
    return obs.astype(np.float64).ravel()  # np.float is deprecated; use np.float64


if __name__ == '__main__':
    game = Pong(width=128, height=96, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    # build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    p.getScreenGrayscale()
    game_state = p.getGameState()
    print(game_state)
if __name__ == "__main__": game = Catcher(width=320, height=320) env = PLE(game, display_screen=True, state_preprocessor=process_state) agent = DQNAgent(env) agent.load("./save/catcher.h5") #초기화 #pylab.title("reward") #pylab.xlabel("episodes") #pylab.ylabel("rewards") env.init() scores, time = [], [] for e in range(EPISODES): env.reset_game() state = env.getGameState() state = np.array([list(state[0])]) score = 0 for time_t in range(20000): action = agent.act(state) reward = env.act(action) #액션 선택 score += reward next_state = env.getGameState() next_state = np.array([list(next_state[0])]) action = [K_a, None, K_d].index(action) agent.remember(state, action, reward, next_state, env.game_over()) state = next_state
def agent_training(agent_file_path, agent_file_name, fig_path, num_steps_train_total=5000):
    # training parameters
    num_epochs = 5
    num_steps_train_epoch = num_steps_train_total // num_epochs  # steps per epoch of training
    num_steps_test = 100
    update_frequency = 10  # step frequency of model training/updates

    epsilon = 0.15  # fraction of the time we perform a random action, to help exploration
    epsilon_steps = 1000  # decay steps
    epsilon_min = 0.1
    epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

    # memory settings
    max_memory_size = 10000
    min_memory_size = 60  # number needed before model training starts

    game = RunningMinion()
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              state_preprocessor=process_state)
    my_agent = init_agent(env)
    memory = utils.ReplayMemory(max_memory_size, min_memory_size)
    env.init()

    # logging configuration and figure plotting
    logging.basicConfig(filename='../learning.log', filemode='w',
                        level=logging.DEBUG, format='%(levelname)s:%(message)s')
    logging.info('========================================================')
    logging.info('Training started for total training steps: ' + str(num_steps_train_total) + '.\n')

    learning_rewards = [0]
    testing_rewards = [0]

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train_epoch:
            episode_reward = 0.0
            my_agent.start_episode()
            while not env.game_over() and steps < num_steps_train_epoch:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=epsilon)
                memory.add([state, action, reward, env.game_over()])
                if steps % update_frequency == 0:
                    loss = memory.train_agent_batch(my_agent)
                    if loss is not None:
                        losses.append(loss)
                        # was np.max(...), which treats the second argument as an axis
                        epsilon = max(epsilon_min, epsilon - epsilon_rate)
                episode_reward += reward
                steps += 1
            if steps < num_steps_train_epoch:
                learning_rewards.append(episode_reward)
            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))
            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Train Epoch {:02d}: Epsilon {:0.4f} | Avg. Loss {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, epsilon, np.mean(losses), np.sum(rewards) / num_episodes))

        steps, num_episodes = 0, 0
        losses, rewards = [], []

        # testing loop
        while steps < num_steps_test:
            episode_reward = 0.0
            my_agent.start_episode()
            while not env.game_over() and steps < num_steps_test:
                state = env.getGameState()
                reward, action = my_agent.act(state, epsilon=0.05)
                episode_reward += reward
                testing_rewards.append(testing_rewards[-1] + reward)
                steps += 1
                # done watching after 500 steps
                if steps > 500:
                    env.display_screen = False
            if num_episodes % 5 == 0:
                logging.info("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))
            if steps < num_steps_test:
                testing_rewards.append(episode_reward)
            rewards.append(episode_reward)
            num_episodes += 1
            my_agent.end_episode()

        logging.info("Test Epoch {:02d}: Best Reward {:0.3f} | Avg. Reward {:0.3f}\n"
                     .format(epoch, np.max(rewards), np.sum(rewards) / num_episodes))

    logging.info("Training complete.\n\n")
    plot_figure(fig_path, learning_rewards, 'reward', 'reward_in_training', num_steps_train_total)
    plot_figure(fig_path, testing_rewards, 'reward', 'reward_in_testing', num_steps_train_total)
    save_agent(my_agent, agent_file_path, agent_file_name)
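# Example invocation of the training routine above; the paths and step budget
# are illustrative.
agent_training('./agents/', 'minion_agent.pkl', './figs/', num_steps_train_total=5000)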
memory = ReplayMemory(max_memory_size, min_memory_size)

env.init()

for epoch in range(1, num_epochs + 1):
    steps, num_episodes = 0, 0
    losses, rewards = [], []
    env.display_screen = False

    # training loop
    while steps < num_steps_train:
        episode_reward = 0.0
        agent.start_episode()
        while not env.game_over() and steps < num_steps_train:
            state = env.getGameState()
            reward, action = agent.act(state, epsilon=epsilon)
            memory.add([state, action, reward, env.game_over()])
            if steps % update_frequency == 0:
                loss = memory.train_agent_batch(agent)
                if loss is not None:
                    losses.append(loss)
                    # was np.max(...), which treats the second argument as an axis
                    epsilon = max(epsilon_min, epsilon - epsilon_rate)
            episode_reward += reward
            steps += 1
        if num_episodes % 5 == 0:
memory = ReplayMemory(max_memory_size, min_memory_size)

env.init()

for epoch in range(1, num_epochs + 1):
    steps, num_episodes = 0, 0
    losses, rewards = [], []
    env.display_screen = False

    # training loop
    while steps < num_steps_train:
        episode_reward = 0.0
        agent.start_episode()
        while not env.game_over() and steps < num_steps_train:
            state = env.getGameState()
            reward, action = agent.act(state, epsilon=epsilon)
            memory.add([state, action, reward, env.game_over()])
            if steps % update_frequency == 0:
                loss = memory.train_agent_batch(agent)
                if loss is not None:
                    losses.append(loss)
                    # was np.max(...), which treats the second argument as an axis
                    epsilon = max(epsilon_min, epsilon - epsilon_rate)
            episode_reward += reward
            steps += 1
        if num_episodes % 5 == 0:
            print("Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward))
import numpy as np
import parl

from flappy_bird_model import FlappyBirdModel
from flappy_bird_agent import FlappyBirdAgent
from ple.games.flappybird import FlappyBird
from ple import PLE

LEARNING_RATE = 0.001
GAMMA = 0.99

game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)
action_dim = len(env.getActionSet())
obs_shape = len(env.getGameState())

model = FlappyBirdModel(act_dim=action_dim)
algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
agent = FlappyBirdAgent(algorithm,
                        obs_dim=obs_shape,
                        act_dim=action_dim,
                        e_greed=0.2,
                        e_greed_decrement=1e-6)

# load model
save_path = './fb_dqn_model.ckpt'
agent.restore(save_path)
class Agent:
    LEARNING_RATE = 1e-6
    BATCH_SIZE = 32
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    MEMORIES = deque()
    MEMORY_SIZE = 300
    COPY = 1000
    T_COPY = 0
    INITIAL_IMAGES = np.zeros((80, 80, 4))

    # based on the documentation, the state has 8 features;
    # the output is 2-dimensional: 0 = do nothing, 1 = jump

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.model = Model(self.OUTPUT_SIZE, self.LEARNING_RATE)
        self.model_negative = Model(self.OUTPUT_SIZE, self.LEARNING_RATE)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.trainable = tf.trainable_variables()
        self.rewards = []

    def _assign(self):
        # copy the online network's weights into the target network
        for i in range(len(self.trainable) // 2):
            assign_op = self.trainable[i + len(self.trainable) // 2].assign(self.trainable[i])
            self.sess.run(assign_op)  # was the bare name `sess`

    def _memorize(self, state, action, reward, new_state, dead):
        self.MEMORIES.append((state, action, reward, new_state, dead))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _get_image(self, image):
        r, g, b = image[:, :, 0], image[:, :, 1], image[:, :, 2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return imresize(gray, size=(80, 80))

    def _select_action(self, state):
        if np.random.rand() < self.EPSILON:
            action = np.random.randint(self.OUTPUT_SIZE)
        else:
            action = self.get_predicted_action([state])
        return action

    def _construct_memories(self, replay):
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.predict(states)
        Q_new = self.predict(new_states)
        # target-network Q-values (double DQN); assumes Model exposes logits/X
        Q_new_negative = self.sess.run(self.model_negative.logits,
                                       feed_dict={self.model_negative.X: new_states})
        replay_size = len(replay)
        X = np.empty((replay_size, 80, 80, 4))
        Y = np.empty((replay_size, self.OUTPUT_SIZE))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, dead_r = replay[i]
            target = Q[i]
            target[action_r] = reward_r
            if not dead_r:
                target[action_r] += self.GAMMA * Q_new_negative[i, np.argmax(Q_new[i])]
            X[i] = state_r
            Y[i] = target
        return X, Y

    def predict(self, inputs):
        return self.sess.run(self.model.logits, feed_dict={self.model.X: inputs})

    def save(self, checkpoint_name):
        self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_predicted_action(self, sequence):
        prediction = self.predict(np.array(sequence))[0]
        return np.argmax(prediction)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            state = self._get_image(self.env.getScreenRGB())
            for k in range(self.INITIAL_IMAGES.shape[2]):
                self.INITIAL_IMAGES[:, :, k] = state
            dead = False
            while not dead:
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign()
                action = self._select_action(self.INITIAL_IMAGES)
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                state = self._get_image(self.env.getScreenRGB())
                new_state = np.append(state.reshape([80, 80, 1]),
                                      self.INITIAL_IMAGES[:, :, :3], axis=2)
                dead = self.env.game_over()
                self._memorize(self.INITIAL_IMAGES, action, reward, new_state, dead)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                X, Y = self._construct_memories(replay)
                # cost/optimizer/X/Y are assumed to live on the Model wrapper
                # (the original referenced them on self, where they don't exist)
                cost, _ = self.sess.run([self.model.cost, self.model.optimizer],
                                        feed_dict={self.model.X: X, self.model.Y: Y})
                self.INITIAL_IMAGES = new_state
                self.T_COPY += 1
            self.rewards.append(total_reward)
            self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
            if (i + 1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
class Agent:
    LEARNING_RATE = 0.003
    BATCH_SIZE = 32
    INPUT_SIZE = 8
    LAYER_SIZE = 500
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    INITIAL_FEATURES = np.zeros((4, INPUT_SIZE))
    MEMORIES = deque()
    MEMORY_SIZE = 300
    COPY = 1000
    T_COPY = 0

    # based on the documentation, the state has 8 features;
    # the output is 2-dimensional: 0 = do nothing, 1 = jump

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.actor = Actor('actor', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE)
        self.actor_target = Actor('actor-target', self.INPUT_SIZE, self.OUTPUT_SIZE,
                                  self.LAYER_SIZE)
        self.critic = Critic('critic', self.INPUT_SIZE, self.OUTPUT_SIZE,
                             self.LAYER_SIZE, self.LEARNING_RATE)
        self.critic_target = Critic('critic-target', self.INPUT_SIZE, self.OUTPUT_SIZE,
                                    self.LAYER_SIZE, self.LEARNING_RATE)
        self.grad_critic = tf.gradients(self.critic.logits, self.critic.Y)
        self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE])
        weights_actor = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
        self.grad_actor = tf.gradients(self.actor.logits, weights_actor,
                                       -self.actor_critic_grad)
        grads = zip(self.grad_actor, weights_actor)
        self.optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE).apply_gradients(grads)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []

    def _assign(self, from_name, to_name):
        from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name)
        to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name)
        for i in range(len(from_w)):
            assign_op = to_w[i].assign(from_w[i])
            self.sess.run(assign_op)  # was the bare name `sess`

    def _memorize(self, state, action, reward, new_state, dead, rnn_state):
        self.MEMORIES.append((state, action, reward, new_state, dead, rnn_state))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _construct_memories_and_train(self, replay):
        # replay entries: (state, action, reward, new_state, dead, rnn_state)
        # train actor
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        init_values = np.array([a[-1] for a in replay])
        Q = self.sess.run(self.actor.logits,
                          feed_dict={self.actor.X: states,
                                     self.actor.hidden_layer: init_values})
        Q_target = self.sess.run(self.actor_target.logits,
                                 feed_dict={self.actor_target.X: states,
                                            self.actor_target.hidden_layer: init_values})
        grads = self.sess.run(self.grad_critic,
                              feed_dict={self.critic.X: states,
                                         self.critic.hidden_layer: init_values,
                                         self.critic.Y: Q})
        self.sess.run(self.optimizer,
                      feed_dict={self.actor.X: states,
                                 self.actor.hidden_layer: init_values,
                                 self.actor_critic_grad: grads})
        # train critic
        rewards = np.array([a[2] for a in replay]).reshape((-1, 1))
        rewards_target = self.sess.run(self.critic_target.logits,
                                       feed_dict={self.critic_target.X: new_states,
                                                  self.critic_target.hidden_layer: init_values,
                                                  self.critic_target.Y: Q_target})
        for i in range(len(replay)):
            if not replay[i][4]:  # the dead flag (the original tested replay[0][-1], the rnn state)
                rewards[i, 0] += self.GAMMA * rewards_target[i, 0]
        cost, _ = self.sess.run([self.critic.cost, self.critic.optimizer],
                                feed_dict={self.critic.X: states,
                                           self.critic.hidden_layer: init_values,
                                           self.critic.Y: Q,
                                           self.critic.REWARD: rewards})
        return cost

    def save(self, checkpoint_name):
        self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            dead = False
            init_value = np.zeros((1, 2 * 512))
            state = self.get_state()
            for k in range(self.INITIAL_FEATURES.shape[0]):  # k, not i: don't shadow the epoch index
                self.INITIAL_FEATURES[k, :] = state
            while not dead:
                if (self.T_COPY + 1) % self.COPY == 0:
                    self._assign('actor', 'actor-target')
                    self._assign('critic', 'critic-target')
                if np.random.rand() < self.EPSILON:
                    action = np.random.randint(self.OUTPUT_SIZE)
                else:
                    # the actor network is assumed to expose logits and an RNN last_state
                    # (the original referenced self.model, which doesn't exist here)
                    action, last_state = self.sess.run(
                        [self.actor.logits, self.actor.last_state],
                        feed_dict={self.actor.X: [self.INITIAL_FEATURES],
                                   self.actor.hidden_layer: init_value})
                    action, init_value = np.argmax(action[0]), last_state[0]
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                # reshape to 2-D so the stacked feature window keeps shape (4, INPUT_SIZE)
                new_state = np.append(self.get_state().reshape((1, -1)),
                                      self.INITIAL_FEATURES[:3, :], axis=0)
                dead = self.env.game_over()
                self._memorize(state, action, reward, new_state, dead, init_value)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                cost = self._construct_memories_and_train(replay)
                self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
                self.T_COPY += 1
            self.rewards.append(total_reward)
            if (i + 1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
class Agent:
    LEARNING_RATE = 0.003
    BATCH_SIZE = 32
    INPUT_SIZE = 8
    LAYER_SIZE = 500
    OUTPUT_SIZE = 2
    EPSILON = 1
    DECAY_RATE = 0.005
    MIN_EPSILON = 0.1
    GAMMA = 0.99
    MEMORIES = deque()
    MEMORY_SIZE = 300

    # based on the documentation, the state has 8 features;
    # the output is 2-dimensional: 0 = do nothing, 1 = jump

    def __init__(self, screen=False, forcefps=True):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps)
        self.env.init()
        self.env.getGameState = self.game.getGameState
        self.X = tf.placeholder(tf.float32, (None, self.INPUT_SIZE))
        self.Y = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE))
        input_layer = tf.Variable(tf.random_normal([self.INPUT_SIZE, self.LAYER_SIZE]))
        bias = tf.Variable(tf.random_normal([self.LAYER_SIZE]))
        output_layer = tf.Variable(tf.random_normal([self.LAYER_SIZE, self.OUTPUT_SIZE]))
        feed_forward = tf.nn.relu(tf.matmul(self.X, input_layer) + bias)
        self.logits = tf.matmul(feed_forward, output_layer)
        self.cost = tf.reduce_sum(tf.square(self.Y - self.logits))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.LEARNING_RATE).minimize(self.cost)
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        self.rewards = []

    def _memorize(self, state, action, reward, new_state, done):
        self.MEMORIES.append((state, action, reward, new_state, done))
        if len(self.MEMORIES) > self.MEMORY_SIZE:
            self.MEMORIES.popleft()

    def _select_action(self, state):
        if np.random.rand() < self.EPSILON:
            action = np.random.randint(self.OUTPUT_SIZE)
        else:
            action = self.get_predicted_action([state])
        return action

    def _construct_memories(self, replay):
        states = np.array([a[0] for a in replay])
        new_states = np.array([a[3] for a in replay])
        Q = self.predict(states)
        Q_new = self.predict(new_states)
        replay_size = len(replay)
        X = np.empty((replay_size, self.INPUT_SIZE))
        Y = np.empty((replay_size, self.OUTPUT_SIZE))
        for i in range(replay_size):
            state_r, action_r, reward_r, new_state_r, done_r = replay[i]
            target = Q[i]
            target[action_r] = reward_r
            if not done_r:
                target[action_r] += self.GAMMA * np.amax(Q_new[i])
            X[i] = state_r
            Y[i] = target
        return X, Y

    def predict(self, inputs):
        return self.sess.run(self.logits, feed_dict={self.X: inputs})

    def save(self, checkpoint_name):
        self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen:
            pickle.dump(self.rewards, fopen)

    def load(self, checkpoint_name):
        self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name))
        with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen:
            self.rewards = pickle.load(fopen)

    def get_predicted_action(self, sequence):
        prediction = self.predict(np.array(sequence))[0]
        return np.argmax(prediction)

    def get_state(self):
        state = self.env.getGameState()
        return np.array(list(state.values()))

    def get_reward(self, iterations, checkpoint):
        for i in range(iterations):
            total_reward = 0
            self.env.reset_game()
            done = False
            while not done:
                state = self.get_state()
                action = self._select_action(state)
                real_action = 119 if action == 1 else None
                reward = self.env.act(real_action)
                total_reward += reward
                new_state = self.get_state()
                done = self.env.game_over()
                self._memorize(state, action, reward, new_state, done)
                batch_size = min(len(self.MEMORIES), self.BATCH_SIZE)
                replay = random.sample(self.MEMORIES, batch_size)
                X, Y = self._construct_memories(replay)
                cost, _ = self.sess.run([self.cost, self.optimizer],
                                        feed_dict={self.X: X, self.Y: Y})
            self.rewards.append(total_reward)
            self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i)
            if (i + 1) % checkpoint == 0:
                print('epoch:', i + 1, 'total rewards:', total_reward)
                print('epoch:', i + 1, 'cost:', cost)

    def fit(self, iterations, checkpoint):
        self.get_reward(iterations, checkpoint)
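# A short usage sketch for the feed-forward DQN agent above; the iteration and
# checkpoint counts are illustrative.
agent = Agent(screen=False, forcefps=True)
agent.fit(iterations=500, checkpoint=100)
agent.save('flappybird-dqn')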