def train(FRAME_TRAIN=1000005):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    ob = game.getGameState()
    state = np.reshape(np.asarray(list(ob.values())), [1, 8])
    total_reward = 0
    agent = DDQN_Agent.DeepQAgent()
    agent.load('model95000')
    batch_size = 32
    my_timer = time.time()
    prev_frame = 0
    data = []
    for i in range(FRAME_TRAIN):
        if p.game_over():
            data.append(total_reward)
            p.reset_game()
            print("Total reward = {}, Frame = {}, epsilon = {}, frame/second = {}"
                  .format(total_reward, i, agent.epsilon,
                          (i - prev_frame) / (time.time() - my_timer)))
            total_reward = 0
            prev_frame = i
            my_timer = time.time()

        # Get action from agent
        action = agent.act(state)

        # Take action
        reward = p.act(p.getActionSet()[action])

        # Make the reward space less sparse
        if reward < 0:
            reward = -1
        total_reward += reward
        next_state = np.asarray(list(game.getGameState().values()))
        next_state = np.reshape(next_state, [1, 8])

        # Remember and replay
        agent.remember(state, action, reward, next_state, p.game_over())
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        state = next_state

        # Save the model and sync the target network
        if i % 5000 == 0:
            print("Updating weights")
            agent.save('newmodel' + str(i))
            agent.target_model.set_weights(agent.model.get_weights())

        # Plot score
        if i % 1000 == 0:
            plot(data)
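The loop above depends on a DDQN_Agent.DeepQAgent class defined elsewhere. A minimal sketch of the interface it uses (epsilon-greedy act, a replay buffer, a Double-DQN replay step, save/load, and a separate target network) might look like the following; the layer sizes and hyperparameters are illustrative assumptions, not the original implementation.

# Hypothetical sketch of the agent interface used by train(); the real
# DDQN_Agent.DeepQAgent is not shown in this file.
import random
from collections import deque

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


class DeepQAgent:
    def __init__(self, state_size=8, action_size=2):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=100000)
        self.gamma = 0.99    # discount factor (assumed)
        self.epsilon = 0.1   # exploration rate (assumed)
        self.model = self._build_model()
        self.target_model = self._build_model()

    def _build_model(self):
        model = Sequential([
            Dense(64, activation='relu', input_shape=(self.state_size,)),
            Dense(64, activation='relu'),
            Dense(self.action_size, activation='linear'),
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=1e-4))
        return model

    def act(self, state):
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        return int(np.argmax(self.model.predict(state, verbose=0)[0]))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in batch:
            target = self.model.predict(state, verbose=0)
            if done:
                target[0][action] = reward
            else:
                # Double DQN: the online net picks the action,
                # the target net evaluates it
                best = int(np.argmax(self.model.predict(next_state, verbose=0)[0]))
                target[0][action] = reward + self.gamma * \
                    self.target_model.predict(next_state, verbose=0)[0][best]
            self.model.fit(state, target, epochs=1, verbose=0)

    def save(self, name):
        self.model.save_weights(name)

    def load(self, name):
        self.model.load_weights(name)
        self.target_model.set_weights(self.model.get_weights())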
def main(argv):
    try:
        opts, _ = getopt.getopt(argv, "hr")
    except getopt.GetoptError:
        print("birdML.py [-h | -r]")
        sys.exit(2)
    record = False
    for opt, arg in opts:
        if opt == '-h':
            print("-h for help")
            print("-r to record")
        elif opt == '-r':
            record = True
    netb = netBrain()
    netb.summary()
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.init()
    actions = p.getActionSet()
    out = None
    epochs = 50
    for i in range(epochs):
        lstates = []
        rewards = []
        if record:
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter('Videos/test_' + str(i) + '.mov',
                                  fourcc, 30.0, (288, 512))
        for d in range(10):
            while not p.game_over():
                if record:
                    obs = p.getScreenRGB()
                    obs = cv2.transpose(obs)
                    obs = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
                    out.write(obs)
                st = game.getGameState()
                gstate = np.array([np.array(list(st.values()))])
                lstates.append(gstate[0])
                pred = netb.predict(gstate)[0]
                a = pred.argmax()
                p.act(actions[a])
                # Build the supervised target: flap when below the gap,
                # do nothing when above it
                if st['next_pipe_bottom_y'] < st['player_y']:
                    pred[0] = 1.0
                    pred[1] = 0.0
                elif st['next_pipe_top_y'] > st['player_y']:
                    pred[0] = 0.0
                    pred[1] = 1.0
                rewards.append(pred)
            p.reset_game()
        netb.fit(np.array(lstates), np.array(rewards), batch_size=10, epochs=10)
        if record:
            out.release()
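netBrain() is defined elsewhere. Given that the loop feeds it the 8-value game state and reads a 2-way action prediction that is then overwritten with one-hot targets, a plausible reconstruction (layer sizes assumed) is:

# Hypothetical reconstruction of netBrain(): 8 state features in, 2 action scores out.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


def netBrain():
    model = Sequential([
        Dense(32, activation='relu', input_shape=(8,)),
        Dense(32, activation='relu'),
        Dense(2, activation='softmax'),  # one score per action (flap / no-op)
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model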
def train(FRAME_TRAIN=1000005):
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    ob = game.getGameState()
    state = np.reshape(np.asarray(list(ob.values())), [1, 8])
    total_reward = 0
    agent = DDQN_Agent.DeepQAgent()
    agent.load("model95000")
    batch_size = 32
    my_timer = time.time()
    prev_frame = 0
    data = []
    for i in range(FRAME_TRAIN):
        if p.game_over():
            data.append(total_reward)
            p.reset_game()
            print("Total reward = {}, Frame = {}, epsilon = {}, frame/second = {}"
                  .format(total_reward, i, agent.epsilon,
                          (i - prev_frame) / (time.time() - my_timer)))
            total_reward = 0
            prev_frame = i
            my_timer = time.time()

        # Get action from agent
        action = agent.act(state)

        # Take action
        reward = p.act(p.getActionSet()[action])

        # Make the reward space less sparse
        if reward < 0:
            reward = -1
        total_reward += reward
        next_state = np.asarray(list(game.getGameState().values()))
        next_state = np.reshape(next_state, [1, 8])
        state = next_state
        # time.sleep(0.3)

        # Plot score
        if i % 1000 == 0:
            plot(data)
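plot(data) is not defined in either snippet above. A minimal matplotlib version, assuming it just refreshes the per-episode total-reward curve without blocking the loop, could be:

# Hypothetical plot() helper: draws the per-episode total reward curve.
import matplotlib.pyplot as plt


def plot(data):
    plt.clf()
    plt.plot(data)
    plt.xlabel('Episode')
    plt.ylabel('Total reward')
    plt.pause(0.001)  # non-blocking refresh so training keeps running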
def play(self, fast=True):
    """Use athlete to play.

    Args:
        fast <bool>: set to True to hide the screen and run at full speed
    """
    game = FlappyBird()
    env = PLE(game, fps=30, frame_skip=1, num_steps=1,
              force_fps=fast, display_screen=not fast)
    env.init()
    pipes = []
    i = 0
    while i < 100:
        env.reset_game()
        pipes.append(0)
        while not env.game_over():
            A = self.act(game.getGameState())
            r = env.act(ACTIONS[A])
            if r == 1.:
                pipes[-1] += 1
        if not fast:
            # Display mode: report after every game and keep playing
            # (i is never incremented here, so this loops indefinitely)
            print('\n- Score: {} pipes'.format(pipes[-1]))
            print('- Played {} games'.format(len(pipes)))
            print('- Average score: {} pipes'.format(
                np.round(np.mean(pipes), decimals=1)))
        else:
            i += 1
    print('\n- Max score: {} pipes'.format(np.max(pipes)))
    print('- Games < 15 pipes: {}'.format(
        len(tuple(filter(lambda x: x < 15, pipes)))))
    print('- Played {} games'.format(100))
    print('- Average score: {} pipes'.format(
        np.round(np.mean(pipes), decimals=1)))
def test_model_G(nb_games, model):
    # Use "fancy" for full background, random bird color and random pipe color;
    # use "fixed" (default) for black background and constant bird and pipe colors.
    game = FlappyBird(graphics="fixed")
    p = PLE(game, fps=30, frame_skip=1, num_steps=1,
            force_fps=True, display_screen=False)
    p.init()
    reward = 0.0
    cumulated = np.zeros((nb_games))
    list_actions = [0, 119]
    for i in range(nb_games):
        p.reset_game()
        while not p.game_over():
            state = game.getGameState()
            screen_x = process_screen(p.getScreenRGB())
            # Note: the deque is rebuilt every step, so the four stacked
            # frames are identical copies of the current screen rather
            # than a rolling history.
            stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
            x = np.stack(stacked_x, axis=-1)
            action = list_actions[np.argmax(
                model.predict(np.expand_dims(x, axis=0)))]
            reward = p.act(action)
            cumulated[i] = cumulated[i] + reward
    avg_score = np.mean(cumulated)
    print('Average : ' + str(avg_score))
    mx_score = np.max(cumulated)
    print('Max : ' + str(mx_score))
    return avg_score, mx_score
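process_screen is external to this function. A common choice in Flappy Bird DQN pipelines, and a reasonable guess at what it does here, is grayscale conversion plus a crop and downscale; the exact crop geometry and target size below are assumptions.

# Hypothetical process_screen(): grayscale, crop the playing field, downscale.
import cv2
import numpy as np


def process_screen(screen_rgb):
    gray = cv2.cvtColor(screen_rgb, cv2.COLOR_RGB2GRAY)
    cropped = gray[60:, 25:310]            # drop the score area (assumed geometry)
    small = cv2.resize(cropped, (80, 80))  # network input size (assumed)
    return small.astype(np.float32) / 255.0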
def test():
    game2 = FlappyBird()
    p2 = PLE(game2, fps=30, frame_skip=1, num_steps=1,
             force_fps=True, display_screen=False)
    p2.init()
    reward = 0.0
    nb_games = 10
    cumulated = np.zeros((nb_games))
    for i in range(nb_games):
        p2.reset_game()
        while not p2.game_over():
            state = game2.getGameState()
            screen = p2.getScreenRGB()
            action = FlappyPolicy(state, screen)
            reward = p2.act(action)
            cumulated[i] = cumulated[i] + reward
    return np.mean(cumulated)
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

# Use "fancy" for full background, random bird color and random pipe color;
# use "fixed" (default) for black background and constant bird and pipe colors.
game = FlappyBird(graphics="fixed")
# Note: if you want to see your agent act in real time, set force_fps to False.
# But don't use this setting for learning, just for display purposes.
p = PLE(game, fps=30, frame_skip=1, num_steps=1,
        force_fps=False, display_screen=True)

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    while not p.game_over():
        state = game.getGameState()
        screen = p.getScreenRGB()
        action = FlappyPolicy(state, screen)  # Your job is to define this function.
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
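The script deliberately leaves FlappyPolicy to the reader. As a baseline illustration (not the intended learned policy), a hand-coded heuristic that flaps whenever the bird sinks toward the bottom of the next gap already clears some pipes; the margin below is an assumption.

# Hypothetical baseline FlappyPolicy: flap (119) when below the gap, else no-op (None).
def FlappyPolicy(state, screen):
    if state['player_y'] > state['next_pipe_bottom_y'] - 50:  # margin is assumed
        return 119   # flap
    return None      # do nothing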
        # Reconstructed head of the truncated update (same pattern as update_Q below):
        self.Q_values[game_current_state[0], game_current_state[1],
                      game_current_state[2], action] = \
            (1 - self._alpha) * self.Q_values[game_current_state[0], game_current_state[1],
                                              game_current_state[2], action] \
            + self._alpha * (reward + self._gamma * np.max(
                self.Q_values[game_next_state[0], game_next_state[1], game_next_state[2]]))


if __name__ == "__main__":
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    # Create a QLAgent object
    agent = QLAgent(flappy_actions=p.getActionSet(), grid_size=10)
    p.init()
    # Get the current state values (state array)
    game_current_state = agent.get_current_state(game.getGameState())
    # Initialize the episode counter to 0
    number_of_episods = 0
    # Initialize the maximum score variable to 0
    maximum_score = 0
    # Loop over the episodes
    while True:
        # Get the optimal action for the current state
        maximum_action = agent.get_action(game_current_state)
        # Get the score in the current episode
        current_score = p.score()
        # Track the maximum score by comparing with the current score
        maximum_score = max(current_score, maximum_score)
        # Get the reward by performing the action above (reward is either 1 or -1000)
        reward = agent.perform_action(p, maximum_action)
    for ea in partie:
        old_state, action, reward, futur_state = ea
        # Offline update
        # Course formula: Q(s,a) = Q(s,a) + alpha * (R + gamma * max(Q(s',a)) - Q(s,a))
        Q_function[old_state[0]][old_state[1]][old_state[2]][action] = (
            Q_function[old_state[0]][old_state[1]][old_state[2]][action]
            + alpha * (reward
                       + gamma * max(Q_function[futur_state[0]][futur_state[1]][futur_state[2]])
                       - Q_function[old_state[0]][old_state[1]][old_state[2]][action]))
    partie = []
    p.reset_game()
    state = game.getGameState()
    RS = reduce_state(state)
else:
    # For the very first game
    partie = []
    p.reset_game()
    state = game.getGameState()
    RS = reduce_state(state)

while not p.game_over():
    epsilon = np.random.uniform(0, 101)
    if epsilon > epsilon_act:
        # Q-greedy
        qval = Q_function[RS[0]][RS[1]][RS[2]]
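reduce_state maps the continuous game state to the three indices used to address Q_function; its definition is not shown. A plausible discretization, with bucket sizes and offsets as pure assumptions, is:

# Hypothetical reduce_state(): discretize the state into the three
# Q_function indices used above; all constants are assumed.
def reduce_state(state):
    dist = int(state['next_pipe_dist_to_player']) // 30                       # horizontal bucket
    dy = (int(state['player_y'] - state['next_pipe_bottom_y']) + 512) // 30   # vertical offset bucket, shifted non-negative
    vel = int(state['player_vel']) + 16                                       # velocity index, shifted non-negative
    return (dist, dy, vel)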
    def update_Q(self, s, s_prime, reward, action):
        self.Q[s[0], s[1], s[2], action] = (
            (1 - self._alpha) * self.Q[s[0], s[1], s[2], action]
            + self._alpha * (reward + self._lambda
                             * np.max(self.Q[s_prime[0], s_prime[1], s_prime[2]])))


if __name__ == "__main__":
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    agent = Agent(action_space=p.getActionSet(), grid_size=10)
    p.init()
    s = agent.get_current_state(game.getGameState())
    episodes = 0
    max_score = 0
    while True:
        # Find the optimal action based on the current state
        max_action = agent.optimal_action(s)
        current_score = p.score()
        max_score = max(current_score, max_score)
        # Perform the optimal action and return the reward
        reward = agent.act(p, max_action)
        # Get the next game state after performing the optimal action
        s_prime = agent.get_current_state(game.getGameState())
class GymFlappy(gym.Env, EzPickle):
    def __init__(self, config=None):
        EzPickle.__init__(self)
        # Aid options
        self.pre_play = True
        self.force_calm = False
        self.positive_counts = 0
        self.display_screen = False
        if config:
            self.display_screen = config['display_screen']
        self.observation_space = spaces.Box(0, 1, shape=(8,), dtype=np.float32)
        self.action_space = weightedDiscrete(2)  # spaces.Discrete(2)
        self.vel_max = 15
        self.vel_min = -15
        self.dist_max = 500
        self.dist_min = 0
        self.y_max = 500
        self.y_min = 0
        self.game = FlappyBird(graphics="fancy")
        self.p = PLE(self.game, fps=30, frame_skip=1, num_steps=1,
                     force_fps=True, display_screen=self.display_screen, rng=0)
        self.p.rng = self.game.rng
        self.game.player.rng = self.game.rng
        self.p.init()
        self.current_t = 0
        self.max_t = 1000

    def _get_obs(self):
        state = self.game.getGameState()
        obs = np.empty((8,))
        obs[0] = (state["player_y"] - self.y_min) / (self.y_max - self.y_min)
        obs[1] = (state["next_pipe_dist_to_player"] - self.dist_min) / (self.dist_max - self.dist_min)
        obs[2] = (state["next_pipe_top_y"] - self.y_min) / (self.y_max - self.y_min)
        obs[3] = (state["next_pipe_bottom_y"] - self.y_min) / (self.y_max - self.y_min)
        obs[4] = (state["next_next_pipe_dist_to_player"] - self.dist_min) / (self.dist_max - self.dist_min)
        obs[5] = (state["next_next_pipe_top_y"] - self.y_min) / (self.y_max - self.y_min)
        obs[6] = (state["next_next_pipe_bottom_y"] - self.y_min) / (self.y_max - self.y_min)
        obs[7] = (state["player_vel"] - self.vel_min) / (self.vel_max - self.vel_min)
        return obs

    def reset(self):
        self.current_t = 0
        self.p.reset_game()
        if self.pre_play:
            # Skip the first second of the game
            ini_fc = self.force_calm
            self.force_calm = False
            for i in range(25):
                a = 0
                if i % 10 == 0:
                    a = 1
                self.step(np.array([a]))
            self.force_calm = ini_fc
        return self._get_obs()

    def step(self, action):
        self.current_t += 1
        reward = self.p.act(119 if action == 1 else 0)
        if self.force_calm:
            # Ensure each action is followed by a no-op
            for i in range(1):
                r = self.p.act(0)
                reward += r
        done = self.current_t >= self.max_t or self.p.game_over()
        done = done or self._double_check_done()
        info = {}
        return self._get_obs(), reward, done, info

    def __getstate__(self):
        dc = lambda x: copy.deepcopy(x)
        # Get all game attributes
        _game_state = self.game.__dict__
        _player_state = self.game.player.__dict__
        _pipe_state = self.game.pipe_group.__dict__
        pipe_sprites = self.game.pipe_group.spritedict
        pipe_xs = []
        pipe_ys = []
        pipe_rects = []
        for _, sprite in enumerate(pipe_sprites):
            pipe_xs.append(dc(sprite.x))
            pipe_ys.append(dc(sprite.gap_start))
            pipe_rects.append(dc(pipe_sprites[sprite]))
        lives = dc(self.game.lives)
        score = dc(self.game.getScore())
        pscore = dc(self.p.previous_score)
        # Remove images (heavy, and they require additional serialization):
        __game_state = {}
        __player_state = {}
        for attr in _game_state:
            if attr in ['screen', 'images', 'clock', 'player', 'backdrop', 'pipe_group']:
                pass
            else:
                __game_state[attr] = _game_state[attr]
        for attr in _player_state:
            if attr in ['image', 'image_assets']:
                pass
            else:
                __player_state[attr] = _player_state[attr]
        # Accommodate multiple envs in parallel
        game_state = dc(__game_state)
        player_state = dc(__player_state)
        pipe_state = _pipe_state
        # This is a non-PLE parameter that needs to be reset too
        envtime = dc(self.current_t)
        rng_state = self.game.rng.get_state()
        stategroup = (game_state, player_state, pipe_state,
                      (pipe_xs, pipe_rects, pipe_ys),
                      lives, envtime, rng_state, score, pscore)
        return stategroup

    def __setstate__(self, stategroup):
        '''Stategroup required (ugly yet somewhat functional):
            0 game_state dictionary (game.__dict__)
            1 player_state dictionary (game.player.__dict__)
            2 pipe_state dictionary (game.pipe_group.__dict__)
            3 x positions of pipes in game (list)
            4 lives (game.lives, used in game.game_over())
            5 current time (self.current_t)
            6 rng state
            7 game score
            8 PLE previous score
        '''
        # Use update to preserve the images we didn't save
        self.game.__dict__.update(stategroup[0])
        self.game.player.__dict__.update(stategroup[1])
        # self.game.pipe_group.__dict__.update(stategroup[2])  # was introducing reference crossing
        pipe_sprites = self.game.pipe_group.spritedict
        for i, sprite in enumerate(pipe_sprites):
            sprite.x = stategroup[3][0][i]
            pipe_sprites[sprite] = stategroup[3][1][i]
            sprite.gap_start = stategroup[3][2][i]
        self.game.lives = stategroup[4]
        # Prevent the Gym env from returning false dones
        self.current_t = stategroup[5]
        self.game.rng.set_state(stategroup[6])
        # Restore score bookkeeping so rewards stay consistent
        self.game.score = stategroup[7]
        self.p.previous_score = stategroup[8]
        return self._get_obs()

    def get_state(self):
        return self.__getstate__()

    def set_state(self, state):
        return self.__setstate__(state)

    def reset_counts(self):
        self.positive_counts = 0

    def _double_check_done(self):
        '''Manually inspect the game to detect collisions (ugly but necessary).'''
        # Check pipe collisions
        for p in self.game.pipe_group:
            hit = pygame.sprite.spritecollide(self.game.player,
                                              self.game.pipe_group, False)
            is_in_pipe = (p.x - p.width / 2 - 20) <= self.game.player.pos_x < (p.x + p.width / 2)
            for h in hit:
                # Check whether the player is within the gap
                top_pipe_check = ((self.game.player.pos_y - self.game.player.height / 2 + 12)
                                  <= h.gap_start) and is_in_pipe
                bot_pipe_check = ((self.game.player.pos_y + self.game.player.height)
                                  > h.gap_start + self.game.pipe_gap) and is_in_pipe
                boom = bot_pipe_check or top_pipe_check
                if boom:
                    return True
        # Floor limit
        if self.game.player.pos_y >= 0.79 * self.game.height - self.game.player.height:
            return True
        # Went above the screen
        if self.game.player.pos_y <= 0:
            return True
        return False
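Because GymFlappy implements __getstate__/__setstate__, the environment can be snapshotted and restored mid-episode, which is what get_state/set_state expose. A short usage sketch:

# Usage sketch: snapshot the environment mid-episode and rewind to it later.
env = GymFlappy(config={'display_screen': False})
obs = env.reset()
snapshot = env.get_state()               # serializes game, player, pipes, rng and score
obs_next, reward, done, info = env.step(1)
obs_restored = env.set_state(snapshot)   # rewinds; returns the observation at the snapshot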
batchSize = 256  # mini-batch size
jeu = FlappyBird()
p = PLE(jeu, fps=30, frame_skip=1, num_steps=1,
        force_fps=True, display_screen=True)
p.init()
i = 0
while True:
    p.reset_game()
    state = jeu.getGameState()
    state = np.array(list(state.values()))
    while not jeu.game_over():
        # Q-learning: the model (a neural network initialized earlier)
        # approximates Q(s, .)
        qval = model.predict(state.reshape(1, len(state)), batch_size=batchSize)
        if random.random() < epsilon:
            # Exploration/exploitation strategy
            action = np.random.randint(0, 2)
        else:
            # Choose the best action from the Q(s, a) values
            qval_av_action = [-9999] * 2
            for ac in range(0, 2):
                qval_av_action[ac] = qval[0][ac]
            action = np.argmax(qval_av_action)
        # Take action, observe new state S'
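The snippet stops right after choosing the action. Under the usual Keras Q-learning recipe (an assumption about the continuation, not the original code), the step that follows would act, observe the next state, and fit the network toward the TD target:

# Hypothetical continuation (inside the inner while loop): act, observe S',
# and fit the network toward the TD target. gamma is assumed defined earlier.
reward = p.act(p.getActionSet()[action])
new_state = np.array(list(jeu.getGameState().values()))
target = qval.copy()
if jeu.game_over():
    target[0][action] = reward
else:
    maxQ = np.max(model.predict(new_state.reshape(1, len(new_state))))
    target[0][action] = reward + gamma * maxQ
model.fit(state.reshape(1, len(state)), target, verbose=0)
state = new_state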
def train(self, episodes=1000):
    """Train the athlete.

    Args:
        episodes <int>: number of episodes to iterate over
    """
    # Initialize exploration-only data
    epsilon = 1
    epsilon_decay = 1 / episodes
    jumprate = 0.1

    # For log purposes only
    self.print_data = dict({
        'hits': 0,
        'games_played': 1,
        'below_15': 0,
        'pipes_below_15': 0,
        'ep': 0,
        'pipes': 0,
        'episodes': episodes
    })

    # Start game
    game = FlappyBird()
    env = PLE(game, fps=30, frame_skip=1, num_steps=1,
              force_fps=True, display_screen=False)
    env.init()

    for _ in range(episodes):
        self.print_data['ep'] += 1
        self.print_data['pipes'] = 0

        # Reset game
        env.reset_game()
        S = self.state2coord(game.getGameState())

        while not env.game_over():
            # Has the state been visited already?
            if self.Q.get(S) is None:
                self.Q[S] = [0, 0]

            # Exploration
            if rd() < epsilon:
                # Use a jump rate to orient exploration
                A = UP if rd() < jumprate else DOWN
            else:
                # Reinforcement
                A = np.argmax(self.Q.get(S))

            # Perform action and get reward
            r = env.act(ACTIONS[A])
            if r == 1.0:
                # For log purposes only
                self.print_data['pipes'] += 1

            # Bias the reward to orient exploration
            R = self.biase_reward(r)
            S_ = self.state2coord(game.getGameState())

            # Perform Q update
            self.update_q(S, A, R, S_)

            # Change state
            S = S_

        # Decrease exploration rate
        epsilon -= epsilon_decay

        # For log purposes only
        self.print_status()
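update_q and biase_reward are defined elsewhere on the class. Given that self.Q is a dict of [value_down, value_up] pairs, the tabular update the loop relies on is presumably the standard one; a sketch with assumed learning rate and discount:

# Hypothetical update_q(): standard tabular Q-learning step over the dict-based table.
def update_q(self, S, A, R, S_):
    alpha, gamma = 0.1, 0.95       # learning rate and discount (assumed)
    if self.Q.get(S_) is None:     # make sure the successor state exists
        self.Q[S_] = [0, 0]
    td_target = R + gamma * max(self.Q[S_])
    self.Q[S][A] += alpha * (td_target - self.Q[S][A])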
qvalues = json.load(fil)
fil.close()

# Load the FlappyBird game
game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1,
        force_fps=True, display_screen=True)
p.init()

# Repeat the game nb_games times
for i in range(nb_games):
    p.reset_game()
    while not p.game_over():
        state = game.getGameState()
        bucle += 1
        current_state = etat(state)
        # As games are played, epsilon decreases
        if bucle % 100 == 0:
            epsilon = epsilon * 0.9
        # Two options: epsilon < random means the next action is taken
        # from the Q-values
        if epsilon < rd.random():
            action_index = np.argmax(qvalues[current_state])
            action = action_index * 119
        # Otherwise the action is random; so as epsilon decreases,
        # the Q-values are selected more often
        else:
    def update(self, action, reward, observation, episode_over):
        if episode_over:
            future = -5
        else:
            future = np.max(self.q[observation])
        # print "Old Q value [", self.state, action, "] = ", self.q[self.state][action]
        self.q[self.state][action] += self.config["learning_rate"] * (
            reward + self.config["discount"] * future - self.q[self.state][action])
        # print "New Q value [", self.state, action, "] = ", self.q[self.state][action]
        self.state = observation


game = FlappyBird()
p = PLE(game, fps=30, display_screen=True)
agent = TabularQAgent(action_space=p.getActionSet())
# print "action set = ", p.getActionSet()
p.init()
observation = game.getGameState()
observation = ((int(observation["player_y"]) - int(observation["next_pipe_bottom_y"])),
               int(observation["next_pipe_dist_to_player"]),
               int(observation["player_vel"]))
agent.state = observation
max_score = -10
episode_count = 0
output = open("out.txt", "w")
frame_count = 0
batch_sum = 0
# print "Initial State: ", observation
while True:
    frame_count += 1
    episode_over = False
    action = agent.pickAction()
    # print "Action = ", action
    reward = p.act(p.getActionSet()[action])
myAgent = NaiveAgent(p.getActionSet())
Q = myAgent.createStateActionPolicy(game)
# print(Q)
# myAgent.setTrainedQTable(Q)
starting_episode = 1004
episodes = 5000
# episodes = 1
alpha = 0.1
discount_factor = 0.9
obs = game.getGameState()
# print(obs)


def sarsa():
    # gp = None
    max_reward = -1000
    # with open("output.txt", "a") as file:
    gp = None
    for episode in range(starting_episode, episodes):
        p.reset_game()
    else:
        r = -1000
    return r


if __name__ == "__main__":
    episodes = 2_000_000_000
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=False)
    p.init()
    agent = Agent(p.getActionSet())
    max_score = 0
    for episode in range(episodes):
        p.reset_game()
        state = agent.get_state(game.getGameState())
        agent.update_greedy()
        while True:
            action = agent.get_best_action(state)
            reward = agent.act(p, action)
            next_state = agent.get_state(game.getGameState())
            agent.update_q_table(state, action, next_state, reward)
            current_score = p.score()
            state = next_state
            if p.game_over():
                max_score = max(current_score, max_score)
                print('Episodes: %s, Current score: %s, Max score: %s'
                      % (episode, current_score, max_score))
                if current_score > 300:
                    np.save("{}_{}.npy".format(current_score, episode), agent.q_table)
                break
import DFS.DFS as dfs

game = FlappyBird()
game.pipe_gap = 150
p = PLE(game, fps=30, display_screen=True, force_fps=False)
p.init()
print(p.getActionSet())
flappyVariables = {
    "player_height": game.player.height,
    "pipe_gap": game.pipe_gap,
    "game_max_drop": game.player.MAX_DROP_SPEED,
    "game_gravity": game.player.GRAVITY,
    "game_flap_power": game.player.FLAP_POWER,
}
myAgent = SimpleAgent(flappyVariables)
nb_frames = 1000
for f in range(nb_frames):
    if p.game_over():  # check if the game is over
        exit()
        p.reset_game()  # unreachable: the script exits on the first game over
    obs = p.getScreenRGB()
    # if f == 1:
    #     steps = dfs.get_steps_by_frame(game.getGameState())
    #     print("\n STEPS", steps)
    action = myAgent.chooseAction(game.getGameState())
    p.act(action)
    # p.act(None)
scoreMC = np.zeros((nb_epochs))

# Filename for saving the neural network
filename = "dqn_3_"

"""-----------------"""
""" Deep Q-Learning """
"""-----------------"""
for id_game in range(total_games):
    if id_game % evaluation_period == 0:
        epoch += 1
        scoreMC[epoch] = MCeval(dqn, 50, gamma)
        dqn.save(filename + str(epoch) + ".dqf")
        print(">>> Eval n°%d | score = %f" % (epoch, scoreMC[epoch]))

    p.reset_game()  # new game
    state_x = process_state(game.getGameState())
    id_frame = 0
    score = 0
    alea = 0
    while not game.game_over():
        id_frame += 1
        step += 1

        # Choose the action to perform: 0 or 1
        if np.random.rand() < epsilon(step):
            # Random action
            alea += 1
            action = np.random.choice([0, 1])
        else:
            # Best possible action
            action = greedy_action(dqn, state_x)

        # Play the action, observe the reward and the next state
        reward = p.act(actions[action])
        reward = clip_reward(reward)
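epsilon(step) and clip_reward are helpers from the surrounding file. Typical definitions, with the decay horizon, floor, and clipping range as assumptions:

# Hypothetical helpers: linearly decayed exploration rate and reward clipping.
def epsilon(step, eps_start=1.0, eps_end=0.01, decay_steps=100000):
    if step >= decay_steps:
        return eps_end
    return eps_start + (eps_end - eps_start) * step / decay_steps


def clip_reward(r):
    return max(-1.0, min(1.0, r))  # keep rewards in [-1, 1]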