def run_a_game(self, game):
    from ple import PLE
    p = PLE(game, display_screen=True)
    agent = NaiveAgent(p.getActionSet())
    p.init()
    reward = p.act(p.NOOP)
    for i in range(NUM_STEPS):
        obs = p.getScreenRGB()
        reward = p.act(agent.pickAction(reward, obs))
class Env:
    def __init__(self):
        self.game = FlappyBird(pipe_gap=125)
        self.env = PLE(self.game, fps=30, display_screen=True)
        self.env.init()
        self.env.getGameState = self.game.getGameState  # maybe not necessary

        # by convention we want to use (0, 1)
        # but the game uses (None, 119)
        self.action_map = self.env.getActionSet()  # [None, 119]

    def step(self, action):
        action = self.action_map[action]
        reward = self.env.act(action)
        done = self.env.game_over()
        obs = self.get_observation()
        # don't bother returning an info dictionary like gym
        return obs, reward, done

    def reset(self):
        self.env.reset_game()
        return self.get_observation()

    def get_observation(self):
        # game state returns a dictionary which describes
        # the meaning of each value; we only want the values
        obs = self.env.getGameState()
        return np.array(list(obs.values()))

    def set_display(self, boolean_value):
        self.env.display_screen = boolean_value
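# A minimal usage sketch for the Env wrapper above (not part of the original
# code): it runs a few episodes with a random policy. It assumes FlappyBird,
# PLE and numpy are imported as in the other snippets; the episode count and
# the random policy are illustrative assumptions.
import numpy as np

def random_rollout(n_episodes=3):
    env = Env()
    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        total = 0.0
        while not done:
            action = np.random.randint(2)  # 0 = no-op, 1 = flap, per action_map
            obs, reward, done = env.step(action)
            total += reward
        print('episode reward:', total)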
def main_naive():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    my_agent = naive.NaiveAgent(allowed_actions=env.getActionSet())
    env.init()

    reward = 0.0
    nb_frames = 10000
    for i in range(nb_frames):
        if env.game_over():
            env.reset_game()
        observation = env.getScreenRGB()
        action = my_agent.pickAction(reward, observation)
        reward = env.act(action)
def train(self, scratch, game, display): p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=display) fname = None if not scratch: fname = self.load() else: delete_files(self.DATA_DIREC) f0, step, nb_save, nb_games = init_train(fname, self.DATA_DIREC) eps_tau = (self.NB_FRAMES - f0) // self.EPS_RATE scores = [] while step < self.NB_FRAMES: if len(scores) == self.SCORE_FREQ: print_scores(scores, self.SCORE_FREQ) scores = [] p.reset_game() state = game.getGameState() state_arr = self.state_to_arr(state) # state_arr = self.scaler.transform(state_arr.reshape(1, -1)) gscore = 0 nb_games += 1 while not p.game_over(): step += 1 if step != 0 and (step % self.SAVE_FREQ) == 0: self.save( chr(97 + nb_save) + '_' + str(step) + '_' + str(nb_games)) nb_save += 1 if step != 0 and (step % self.EPS_UPDATE_FREQ) == 0: self.epsilon = update_epsilon(step, f0, self.EPS0, eps_tau, self.NB_FRAMES) print('WEIGHTS ABS MEAN') print(abs(np.mean(self.model.get_weights()[0], axis=1))) # 1) In s, choose a (GLIE actor) qvals = self.get_qvals(state) act = self.greedy_action(qvals, self.epsilon) # 2) Observe r, s′ bare_reward = p.act(ACTIONS[act]) reward = self.reward_engineering(bare_reward) new_state = game.getGameState() new_state_arr = self.state_to_arr(state) self.replay_memory.append( (state_arr, act, reward, new_state_arr)) if (len(self.replay_memory) == self.BUFFER_SIZE and step % self.TRAIN_FREQ == 0): X_train = [] y_train = [] # TEST: TRAIN ONLY WITH A SMALL BUFFER BATCH replay_memory_copy = list(self.replay_memory)[:] random.shuffle(replay_memory_copy) for frame in replay_memory_copy[:self.BATCH_SIZE]: s_arr_1, act_x, bare_reward_x, s_arr_2 = frame reward_x = self.reward_engineering(bare_reward_x) old_qval = self.model.predict(s_arr_1, batch_size=1) qval_new = self.model.predict(s_arr_2, batch_size=1) max_qval = np.max(qval_new) # terminal state if bare_reward < 0: delta = reward_x else: delta = reward_x + self.GAMMA * max_qval y = np.zeros((1, len(ACTIONS))) y[0][:] = old_qval[0][:] y[0][act_x] = old_qval[0][act_x] + self.ALPHA * delta X_train.append(s_arr_1.reshape(len(STATES), )) y_train.append(y.reshape(len(ACTIONS), )) X_train = np.array(X_train) y_train = np.array(y_train) self.model.fit(X_train, y_train, batch_size=self.BATCH_SIZE, epochs=2, verbose=False) # 5) s <- s' state = new_state state_arr = new_state_arr if bare_reward > 0: gscore += 1 scores.append(gscore) self.save(chr(97 + nb_save) + '_' + str(step) + '_' + str(nb_games))
class Agent: LEARNING_RATE = 0.003 BATCH_SIZE = 32 INPUT_SIZE = 8 LAYER_SIZE = 500 OUTPUT_SIZE = 2 EPSILON = 1 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() COPY = 1000 T_COPY = 0 MEMORY_SIZE = 300 # based on documentation, features got 8 dimensions # output is 2 dimensions, 0 = do nothing, 1 = jump def __init__(self, screen=False, forcefps=True): self.game = FlappyBird(pipe_gap=125) self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps) self.env.init() self.env.getGameState = self.game.getGameState self.model = Model(self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.model_negative = Model(self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver(tf.global_variables()) self.trainable = tf.trainable_variables() self.rewards = [] def _assign(self): for i in range(len(self.trainable)//2): assign_op = self.trainable[i+len(self.trainable)//2].assign(self.trainable[i]) sess.run(assign_op) def _memorize(self, state, action, reward, new_state, done): self.MEMORIES.append((state, action, reward, new_state, done)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _select_action(self, state): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action = self.get_predicted_action([state]) return action def _construct_memories(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) Q = self.predict(states) Q_new = self.predict(new_states) Q_new_negative = sess.run(self.model_negative.logits, feed_dict={self.model_negative.X:new_states}) replay_size = len(replay) X = np.empty((replay_size, self.INPUT_SIZE)) Y = np.empty((replay_size, self.OUTPUT_SIZE)) for i in range(replay_size): state_r, action_r, reward_r, new_state_r, done_r = replay[i] target = Q[i] target[action_r] = reward_r if not done_r: target[action_r] += self.GAMMA * Q_new_negative[i, np.argmax(Q_new[i])] X[i] = state_r Y[i] = target return X, Y def predict(self, inputs): return self.sess.run(self.model.logits, feed_dict={self.model.X:inputs}) def save(self, checkpoint_name): self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" %(checkpoint_name)) with open('%s-acc.p'%(checkpoint_name), 'wb') as fopen: pickle.dump(self.rewards, fopen) def load(self, checkpoint_name): self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" %(checkpoint_name)) with open('%s-acc.p'%(checkpoint_name), 'rb') as fopen: self.rewards = pickle.load(fopen) def get_predicted_action(self, sequence): prediction = self.predict(np.array(sequence))[0] return np.argmax(prediction) def get_state(self): state = self.env.getGameState() return np.array(list(state.values())) def get_reward(self, iterations, checkpoint): for i in range(iterations): total_reward = 0 self.env.reset_game() done = False while not done: if (self.T_COPY + 1) % self.COPY == 0: self._assign() state = self.get_state() action = self._select_action(state) real_action = 119 if action == 1 else None reward = self.env.act(real_action) total_reward += reward new_state = self.get_state() done = self.env.game_over() self._memorize(state, action, reward, new_state, done) batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) replay = random.sample(self.MEMORIES, batch_size) X, Y = self._construct_memories(replay) cost, _ = self.sess.run([self.model.cost, self.model.optimizer], feed_dict={self.model.X: X, self.model.Y:Y}) 
self.T_COPY += 1 self.rewards.append(total_reward) self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch:', i + 1, 'total rewards:', total_reward) print('epoch:', i + 1, 'cost:', cost) def fit(self, iterations, checkpoint): self.get_reward(iterations, checkpoint)
    qval = Q_function[RS[0]][RS[1]][RS[2]]
    if qval[0] == qval[1]:
        # Since it is possible that, during learning, not every state has been
        # "discovered", and that at initialisation of Q_function qval[0] == qval[1],
        # in that case I choose to do nothing, because flapping is riskier than
        # doing nothing.
        Action = None
    else:
        # choose best action from Q(s,a) values
        Action = bool_to_act(qval.argmax())
else:
    # epsilon-greedy
    Action = bool_to_act(np.random.randint(0, 2))

r = 5 * p.act(Action) + 1  # the reward function is 1 if the game is not over and 6 if a pipe has been passed
cumulated[k] = cumulated[k] + (r - 1) / 5
new_state = game.getGameState()
ns = reduce_state(new_state)
partie.append([RS, act_to_bool(Action), r, ns])  # record the state, the action, the reward and the next state
RS = ns  # update the state

# save the model every 500 epochs
if k % 500 == 0:
    print("saving model")
    f_myfile = open('Q_function_avecrandom.pickle', 'wb')
def play(size_image):
    sess = tf.InteractiveSession()
    img_size = 80
    net = NetworkOld(img_size)

    # open up a game state to communicate with emulator
    game = flappybird.prepare_game()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    reward = 0.0

    # get the first state by doing nothing and preprocess the image to 80x80x4
    actions = p.getActionSet()
    p.act(actions[1])
    s_t = preprocessing.transform_image(p.getScreenRGB(), img_size)

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("../saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    t = 0
    while t < MAX_ITE:
        if p.game_over():
            p.reset_game()
            terminal = True
        else:
            terminal = False

        # choose an action epsilon greedily
        readout_t = net.readout.eval(feed_dict={net.s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = np.argmax(readout_t)
        a_t[action_index] = 1

        # run the selected action and observe next state and reward
        action = int(np.argmax(a_t))
        if action == 0:
            action = 1
        else:
            action = 0
        r_t = p.act(actions[action])
        s_t1 = preprocessing.transform_image_stacked(p.getScreenRGB(), s_t, img_size)

        # update the old values
        s_t = s_t1
        t += 1

        print("TIMESTEP", t, "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t), " / SCORE", p.score())
class PLEEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, game_name, display_screen=True):
        # set headless mode
        os.environ['SDL_VIDEODRIVER'] = 'dummy'
        # open up a game state to communicate with emulator
        import importlib
        game_module_name = ('ple.games.%s' % game_name).lower()
        game_module = importlib.import_module(game_module_name)
        game = getattr(game_module, game_name)()
        self.game_state = PLE(game, fps=30, frame_skip=2,
                              display_screen=display_screen)
        self.game_state.init()
        self._action_set = self.game_state.getActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))
        self.screen_width, self.screen_height = self.game_state.getScreenDims()
        self.observation_space = spaces.Box(low=0, high=255,
                                            shape=(self.screen_width, self.screen_height, 3))
        self.viewer = None
        self.count = 0

    def step(self, a):
        reward = self.game_state.act(self._action_set[a])
        state = self._get_image()
        # import scipy.misc
        # scipy.misc.imsave('outfile' + str(self.count) + '.jpg', state)
        # self.count = self.count + 1
        terminal = self.game_state.game_over()
        # print(randomAction)
        # print(a, self._action_set[a])
        return state, reward, terminal, {}

    def _get_image(self):
        # image_rotated = self.game_state.getScreenRGB()
        # Hack to fix the rotated image returned by ple
        image_rotated = np.fliplr(np.rot90(self.game_state.getScreenRGB(), 3))
        return image_rotated

    @property
    def n_actions(self):
        return len(self._action_set)

    # return: (states, observations)
    def reset(self):
        self.observation_space = spaces.Box(low=0, high=255,
                                            shape=(self.screen_width, self.screen_height, 3))
        self.game_state.reset_game()
        state = self._get_image()
        return state

    def render(self, mode='human', close=False):
        # print('HERE')
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        img = self._get_image()
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)

    def seed(self, seed):
        rng = np.random.RandomState(seed)
        self.game_state.rng = rng
        self.game_state.game.rng = self.game_state.rng
        self.game_state.init()
class Agent: GENERATION = 0; AGENT_HISTORY_LENGTH = 1 NUM_OF_ACTIONS = 2 POPULATION_SIZE = 15 EPS_AVG = 1 SIGMA = 0.1 LEARNING_RATE = 0.03 INITIAL_EXPLORATION = 0.0 FINAL_EXPLORATION = 0.0 EXPLORATION_DEC_STEPS = 100000 def __init__(self): self.model = Model() self.game = FlappyBird(pipe_gap=125) self.env = PLE(self.game, fps=30, display_screen=False) self.env.init() self.env.getGameState = self.game.getGameState self.es = EvolutionStrategy(self.model.get_weights(), self.get_reward, self.POPULATION_SIZE, self.SIGMA, self.LEARNING_RATE) self.exploration = self.INITIAL_EXPLORATION def get_predicted_action(self, sequence): prediction = self.model.predict(np.array(sequence)) x = np.argmax(prediction) return 119 if x == 1 else None def load(self, filename='weights.pkl'): with open(filename, 'rb') as fp: self.model.set_weights(pickle.load(fp)) self.es.weights = self.model.get_weights() def get_observation(self): state = self.env.getGameState() return np.array(state.values()) def save(self, filename='weights.pkl'): with open(filename, 'wb') as fp: pickle.dump(self.es.get_weights(), fp) def play(self, episodes): self.env.display_screen = True self.model.set_weights(self.es.weights) for episode in xrange(episodes): self.env.reset_game() observation = self.get_observation() sequence = [observation] * self.AGENT_HISTORY_LENGTH done = False score = 0 while not done: action = self.get_predicted_action(sequence) reward = self.env.act(action) observation = self.get_observation() sequence = sequence[1:] sequence.append(observation) done = self.env.game_over() if self.game.getScore() > score: score = self.game.getScore() self.GENERATION = self.GENERATION + 1 print self.GENERATION print "score: %d" % score self.env.display_screen = False def train(self, iterations): self.es.run(iterations, print_step=10) def get_reward(self, weights): total_reward = 0.0 self.model.set_weights(weights) for episode in xrange(self.EPS_AVG): self.env.reset_game() observation = self.get_observation() sequence = [observation] * self.AGENT_HISTORY_LENGTH done = False while not done: self.exploration = max(self.FINAL_EXPLORATION, self.exploration - self.INITIAL_EXPLORATION / self.EXPLORATION_DEC_STEPS) if random.random() < self.exploration: action = random.choice([119, None]) else: action = self.get_predicted_action(sequence) reward = self.env.act(action) reward += random.choice([0.0001, -0.0001]) total_reward += reward observation = self.get_observation() sequence = sequence[1:] sequence.append(observation) done = self.env.game_over() return total_reward / self.EPS_AVG
from ple.games.SpaceInvadersGame import SpaceInvadersGame
from ple import PLE

game = SpaceInvadersGame()
p = PLE(game, fps=30, display_screen=True)
# agent = myAgentHere()
p.init()

reward = 0.0
for i in range(100):
    if p.game_over():
        p.reset_game()
    observation = p.getScreenRGB()
    # action = agent.pickAction(reward, observation)
    allowed_actions = p.getActionSet()
    reward = p.act(allowed_actions[4])
class MonsterKongEnv(gym.Env): metadata = {'render.modes': ['human']} def __init__(self, map_config): self.map_config = map_config self.game = MonsterKong(self.map_config) self.fps = 30 self.frame_skip = 1 self.num_steps = 1 self.force_fps = True self.display_screen = True self.nb_frames = 500 self.reward = 0.0 self.episode_end_sleep = 0.2 if map_config.has_key('fps'): self.fps = map_config['fps'] if map_config.has_key('frame_skip'): self.frame_skip = map_config['frame_skip'] if map_config.has_key('force_fps'): self.force_fps = map_config['force_fps'] if map_config.has_key('display_screen'): self.display_screen = map_config['display_screen'] if map_config.has_key('episode_length'): self.nb_frames = map_config['episode_length'] if map_config.has_key('episode_end_sleep'): self.episode_end_sleep = map_config['episode_end_sleep'] self.current_step = 0 self._seed() self.p = PLE(self.game, fps=self.fps, frame_skip=self.frame_skip, num_steps=self.num_steps, force_fps=self.force_fps, display_screen=self.display_screen, rng=self.rng) self.p.init() self._action_set = self.p.getActionSet()[1:] self.action_space = spaces.Discrete(len(self._action_set)) (screen_width, screen_height) = self.p.getScreenDims() self.observation_space = spaces.Box(low=0, high=255, shape=(screen_height, screen_width, 3)) def _seed(self, seed=24): self.rng = seed def _step(self, action_taken): reward = 0.0 action = self._action_set[action_taken] reward += self.p.act(action) obs = self.p.getScreenRGB() done = self.p.game_over() info = {'PLE': self.p} self.current_step += 1 if self.current_step >= self.nb_frames: done = True return obs, reward, done, info def _reset(self): self.current_step = 0 # Noop and reset if done start_done = True while start_done: self.p.reset_game() _, _, start_done, _ = self._step(4) #self.p.init() if self.p.display_screen: self._render() if self.episode_end_sleep > 0: time.sleep(self.episode_end_sleep) return self.p.getScreenRGB() def _render(self, mode='human', close=False): if close: return # TODO: implement close original = self.p.display_screen self.p.display_screen = True self.p._draw_frame() self.p.display_screen = original
# preprocess obs
obs = preprocess(obs)
episode_reward = 0
while True:
    # predict the action, always pick the best one
    action = agent.predict(obs)
    # sleep because the frames go by too fast
    # time.sleep(0.02)
    # # open a new window to display the score
    # observation = env.getScreenRGB()
    # score = env.score()
    # # format conversion
    # observation = cv2.cvtColor(observation, cv2.COLOR_RGB2BGR)
    # # rotate by 90 degrees
    # observation = cv2.transpose(observation)
    # font = cv2.FONT_HERSHEY_SIMPLEX
    # observation = cv2.putText(observation, "score:" + str(int(score)), (0, 30), font, 0.6, (0, 0, 255), 2)
    # cv2.imshow("flappybird", observation)
    # cv2.waitKey(10)
    reward = env.act(actionset[action])
    obs = list(env.getGameState().values())
    # preprocess obs
    obs = preprocess(obs)
    done = env.game_over()
    episode_reward += reward
    if done:
        break
print("episode_reward:", episode_reward)
cv2.destroyAllWindows()
class Catcher_Env:
    def __init__(self, random_seed=0, init_lives=3, normalise=True, display=False):
        self._random_seed = random_seed
        self._game = Catcher(init_lives=init_lives)
        self._normalise = normalise
        self._display = display

        if self._display == False:
            os.putenv('SDL_VIDEODRIVER', 'fbcon')
            os.environ["SDL_VIDEODRIVER"] = "dummy"

        if self._normalise:
            self._env = PLE(self._game, fps=30,
                            state_preprocessor=self._normalise_ob,
                            display_screen=display)
        else:
            self._env = PLE(self._game, fps=30,
                            state_preprocessor=self._ob,
                            display_screen=display)
        self._env.init()
        self._actions = self._env.getActionSet()
        self._env.rng.seed(random_seed)

        # Tracker
        self._cum_reward = 0

    def _ob(self, state):
        return np.array([
            state['player_x'], state['player_vel'],
            state['fruit_x'], state['fruit_y']
        ])

    def _normalise_ob(self, state):
        state = np.array([
            state['player_x'], state['player_vel'],
            state['fruit_x'], state['fruit_y']
        ])
        state[0] = (state[0] - 26) / 26  # makes range -1 1
        state[1] = (state[1]) / 8        # makes range -1 1
        state[2] = (state[2] - 26) / 26  # makes range -1 1
        state[3] = (state[3] - 20) / 45  # makes range -1 1
        return state

    def reset(self):
        self._cum_reward = 0
        self._env.reset_game()
        return self._env.getGameState()

    def action_set(self):
        return self._actions

    def num_actions(self):
        return len(self._actions)

    def episode_return(self):
        return self._cum_reward

    def act(self, a):
        reward = self._env.act(self._actions[a])
        if reward == -6:
            reward = -1
        self._cum_reward += reward
        next_obs = self._env.getGameState()
        terminal = self._env.game_over()
        if self._cum_reward >= 200:
            self._cum_reward = 200
            terminal = True
        return reward, next_obs, terminal
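# A minimal random-action rollout against the Catcher_Env wrapper above.
# This is an illustrative sketch, not part of the original code; it assumes
# numpy is imported and that Catcher/PLE are importable as in the snippet.
import numpy as np

env = Catcher_Env(random_seed=0, normalise=True, display=False)
obs = env.reset()
terminal = False
while not terminal:
    a = np.random.randint(env.num_actions())  # pick a random action index
    reward, obs, terminal = env.act(a)        # act() returns (reward, next_obs, terminal)
print('episode return:', env.episode_return())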
(1, ))  # don't quite like this one but...

# Ready log
logFile = open(log_output, 'w')
logFile.write('Step,Episode,Loss,Mean_Reward,Time \n')
game, episode_reward, mean_reward = 0, 0, 0
start_time = time.time()

# Passes
epoch, loss = 0, float('Inf')
while epoch < NB_EPOCHS:
    # select action
    a = epsilon_greedy_action(model, features, epoch)
    # get reward
    r = p.act(actions[a])
    episode_reward += r
    screen_y = process_screen(p.getScreenRGB())
    d = p.game_over()
    replay_memory.append(screen_x, a, r, screen_y, d)
    # train
    if epoch > BATCH and epoch % ACCELERATE_TRAINING == 0 and epoch > OBSERVE:
        X, A, R, Y, D = replay_memory.minibatch(BATCH)
        QY = model_target.predict(Y)
        QYmax = QY.max(1).reshape((BATCH, 1))
        update = R + GAMMA * (1 - D) * QYmax
        QX = model.predict(X)
        QX[np.arange(BATCH), A.ravel()] = update.ravel()
        loss = float(model.train_on_batch(x=X, y=QX))
    # transfer weights between networks
class Agent: LEARNING_RATE = 0.003 EPISODE = 500 LAYER_SIZE = 500 EPSILON = 1 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 INPUT_SIZE = 8 # based on documentation, features got 8 dimensions OUTPUT_SIZE = 2 # output is 2 dimensions, 0 = do nothing, 1 = jump def __init__(self, screen=False, forcefps=True): self.game = FlappyBird(pipe_gap=125) self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps) self.env.init() self.env.getGameState = self.game.getGameState self.X = tf.placeholder(tf.float32, (None, self.INPUT_SIZE)) self.REWARDS = tf.placeholder(tf.float32, (None)) self.ACTIONS = tf.placeholder(tf.int32, (None)) input_layer = tf.Variable( tf.random_normal([self.INPUT_SIZE, self.LAYER_SIZE])) bias = tf.Variable(tf.random_normal([self.LAYER_SIZE])) output_layer = tf.Variable( tf.random_normal([self.LAYER_SIZE, self.OUTPUT_SIZE])) feed_forward = tf.nn.relu(tf.matmul(self.X, input_layer) + bias) self.logits = tf.nn.softmax(tf.matmul(feed_forward, output_layer)) indexes = tf.range(0, tf.shape(self.logits)[0]) * tf.shape( self.logits)[1] + self.ACTIONS responsible_outputs = tf.gather(tf.reshape(self.logits, [-1]), indexes) self.cost = -tf.reduce_mean(tf.log(responsible_outputs) * self.REWARDS) self.optimizer = tf.train.AdamOptimizer( learning_rate=self.LEARNING_RATE).minimize(self.cost) self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver(tf.global_variables()) self.rewards = [] def predict(self, inputs): return self.sess.run(self.logits, feed_dict={self.X: inputs}) def save(self, checkpoint_name): self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name)) with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen: pickle.dump(self.rewards, fopen) def load(self, checkpoint_name): self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name)) with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen: self.rewards = pickle.load(fopen) def get_predicted_action(self, sequence): prediction = self.predict(np.array(sequence))[0] return np.argmax(prediction) def get_state(self): state = self.env.getGameState() return np.array(list(state.values())) def get_reward(self, iterations, checkpoint): for i in range(iterations): ep_history = [] for k in range(self.EPISODE): total_reward = 0 self.env.reset_game() done = False state = self.get_state() sequence = [state] while not done: action = self._select_action(state) real_action = 119 if action == 1 else None reward = self.env.act(real_action) reward += random.choice([0.0001, -0.0001]) total_reward += reward next_state = self.get_state() ep_history.append( [state, action, total_reward, next_state]) state = next_state sequence = [state] done = self.env.game_over() ep_history = np.array(ep_history) ep_history[:, 2] = discount_rewards(ep_history[:, 2]) sess.run(self.optimizer, feed_dict={ self.X: np.vstack(ep_history[:, 0]), self.REWARDS: ep_history[:, 2], self.ACTIONS: ep_history[:, 1] }) self.rewards.append(total_reward) if (i + 1) % checkpoint == 0: print('epoch:', i + 1, 'total rewards:', total_reward) print('epoch:', i + 1, 'cost:', cost) def fit(self, iterations, checkpoint): self.get_reward(iterations, checkpoint) def play(self, debug=False, not_realtime=False): total_reward = 0.0 current_reward = 0 self.env.force_fps = not_realtime self.env.reset_game() done = False while not done: state = self.get_state() action = self._select_action(state) real_action = 119 if action == 1 else None action_string = 'eh, jump!' 
if action == 1 else 'erm, do nothing..' if debug and total_reward > current_reward: print(action_string, 'total rewards:', total_reward) current_reward = total_reward total_reward += self.env.act(real_action) done = self.env.game_over() print('game over!')
class Agent: LEARNING_RATE = 0.003 BATCH_SIZE = 32 INPUT_SIZE = 8 LAYER_SIZE = 500 OUTPUT_SIZE = 2 EPSILON = 1 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() MEMORY_SIZE = 300 # based on documentation, features got 8 dimensions # output is 2 dimensions, 0 = do nothing, 1 = jump def __init__(self, screen=False, forcefps=True): self.game = FlappyBird(pipe_gap=125) self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps) self.env.init() self.env.getGameState = self.game.getGameState self.X = tf.placeholder(tf.float32, (None, None, input_size)) self.Y = tf.placeholder(tf.float32, (None, output_size)) cell = tf.nn.rnn_cell.LSTMCell(512, state_is_tuple=False) self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * 512)) self.rnn, self.last_state = tf.nn.dynamic_rnn( inputs=self.X, cell=cell, dtype=tf.float32, initial_state=self.hidden_layer) self.tensor_action, self.tensor_validation = tf.split( self.rnn[:, -1, :], 2, 1) self.feed_action = tf.matmul(self.tensor_action, action_layer) self.feed_validation = tf.matmul(self.tensor_validation, action_layer) self.logits = self.feed_validation + tf.subtract( self.feed_action, tf.reduce_mean(self.feed_action, axis=1, keep_dims=True)) self.cost = tf.reduce_sum(tf.square(self.Y - self.logits)) self.optimizer = tf.train.AdamOptimizer( learning_rate=self.LEARNING_RATE).minimize(self.cost) self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver(tf.global_variables()) self.rewards = [] def _memorize(self, state, action, reward, new_state, dead): self.MEMORIES.append((state, action, reward, new_state, dead)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _select_action(self, state): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action = self.get_predicted_action([state]) return action def _construct_memories(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) Q = self.predict(states) Q_new = self.predict(new_states) replay_size = len(replay) X = np.empty((replay_size, self.INPUT_SIZE)) Y = np.empty((replay_size, self.OUTPUT_SIZE)) for i in range(replay_size): state_r, action_r, reward_r, new_state_r, dead_r = replay[i] target = Q[i] target[action_r] = reward_r if not dead_r: target[action_r] += self.GAMMA * np.amax(Q_new[i]) X[i] = state_r Y[i] = target return X, Y def predict(self, inputs): return self.sess.run(self.logits, feed_dict={self.X: inputs}) def save(self, checkpoint_name): self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name)) with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen: pickle.dump(self.rewards, fopen) def load(self, checkpoint_name): self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name)) with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen: self.rewards = pickle.load(fopen) def get_predicted_action(self, sequence): prediction = self.predict(np.array(sequence))[0] return np.argmax(prediction) def get_state(self): state = self.env.getGameState() return np.array(list(state.values())) def get_reward(self, iterations, checkpoint): for i in range(iterations): total_reward = 0 self.env.reset_game() dead = False init_value = np.zeros((1, 2 * 512)) state = self.get_state() for i in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[i, :] = state while not dead: if (self.T_COPY + 1) % self.COPY == 0: self._assign() if np.random.rand() < self.EPSILON: action = 
np.random.randint(self.OUTPUT_SIZE) else: action, last_state = sess.run(self.model.logits, self.model.last_state, feed_dict={ self.model.X: [self.INITIAL_FEATURES], self.model.hidden_layer: init_values }) action, init_value = np.argmax(action[0]), last_state[0] real_action = 119 if action == 1 else None reward = self.env.act(real_action) total_reward += reward new_state = self.get_state() dead = self.env.game_over() self._memorize(state, action, reward, new_state, dead, init_value) batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) replay = random.sample(self.MEMORIES, batch_size) X, Y, init_values = self._construct_memories(replay) cost, _ = self.sess.run([self.cost, self.optimizer], feed_dict={ self.X: X, self.Y: Y, self.hidden_layer: init_values }) if (i + 1) % checkpoint == 0: print('epoch:', i + 1, 'total rewards:', total_reward) print('epoch:', i + 1, 'cost:', cost) def fit(self, iterations, checkpoint): self.get_reward(iterations, checkpoint)
def trainNetwork(s, readout, h_fc1, sess): # define the cost function a = tf.placeholder("float", [None, ACTIONS]) y = tf.placeholder("float", [None]) readout_action = tf.reduce_sum(tf.mul(readout, a), reduction_indices = 1) cost = tf.reduce_mean(tf.square(y - readout_action)) train_step = tf.train.AdamOptimizer(1e-6).minimize(cost) # open up a game state to communicate with emulator #setupGame() gameClass = FlappyBird(width=288, height=512, pipe_gap=100) fps = 30 frame_skip = 2 num_steps = 1 force_fps = False display_screen = True reward = 0.0 nb_frames = 15000 game = PLE(gameClass, fps=fps, frame_skip=frame_skip, num_steps=num_steps, force_fps=force_fps, display_screen=display_screen) game.init() # store the previous observations in replay memory D = deque() # printing logdir = "logs_" + GAME if not os.path.exists(logdir): os.makedirs(logdir) a_file = open(logdir + "/readout.txt", 'w') h_file = open(logdir + "/hidden.txt", 'w') # get the first state by doing nothing and preprocess the image to 80x80x4 r_0 = game.act(game.NOOP) x_t = game.getScreenGrayscale() terminal = game.game_over() if terminal: print "NOOOO" game.reset_game() x_t = cv2.resize(x_t, (80, 80)) ret, x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY) s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2) # saving and loading networks #saver = tf.train.Saver() sess.run(tf.initialize_all_variables()) ''' checkpoint = tf.train.get_checkpoint_state("saved_networks") if checkpoint and checkpoint.model_checkpoint_path: saver.restore(sess, checkpoint.model_checkpoint_path) print "Successfully loaded:", checkpoint.model_checkpoint_path else: print "Could not find old network weights" ''' epsilon = INITIAL_EPSILON t = 0 while True: # choose an action epsilon greedily readout_t = readout.eval(feed_dict = {s : [s_t]})[0] a_t = np.zeros([ACTIONS]) action_index = 0 if random.random() <= epsilon or t <= OBSERVE: action_index = random.randrange(ACTIONS) a_t[random.randrange(ACTIONS)] = 1 else: action_index = np.argmax(readout_t) a_t[action_index] = 1 # scale down epsilon if epsilon > FINAL_EPSILON and t > OBSERVE: epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE for i in range(0, K): # run the selected action and observe next state and reward r_t = game.act(np.argmax(a_t)) x_t1 = game.getScreenGrayscale() terminal = game.game_over() if terminal: print "NOOO2" game.reset_game() x_t1 = cv2.resize(x_t1, (80, 80)) ret, x_t1 = cv2.threshold(x_t1,1,255,cv2.THRESH_BINARY) x_t1 = np.reshape(x_t1, (80, 80, 1)) s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2) # store the transition in D D.append((s_t, a_t, r_t, s_t1, terminal)) if len(D) > REPLAY_MEMORY: D.popleft() # only train if done observing if t > OBSERVE: # sample a minibatch to train on minibatch = random.sample(D, BATCH) # get the batch variables s_j_batch = [d[0] for d in minibatch] a_batch = [d[1] for d in minibatch] r_batch = [d[2] for d in minibatch] s_j1_batch = [d[3] for d in minibatch] y_batch = [] readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch}) for i in range(0, len(minibatch)): # if terminal only equals reward if minibatch[i][4]: y_batch.append(r_batch[i]) else: y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i])) # perform gradient step train_step.run(feed_dict = { y : y_batch, a : a_batch, s : s_j_batch}) # update the old values s_t = s_t1 t += 1 # save progress every 10000 iterations if t % 10000 == 0: saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t) # print info state = "" if t <= OBSERVE: state = "observe" elif t > OBSERVE and t <= 
OBSERVE + EXPLORE: state = "explore" else: state = "train" print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t) # write info to files '''
if (action == 1):
    action_value = 119
else:
    action_value = None

if (i > 1):
    for j in range(37 - 1, 0, -1):
        x_wall[j] = int(x_wall[j - 1])
        y_wall[j] = int(y_wall[j - 1])
        v_wall[j] = int(v_wall[j - 1])
        a_wall[j] = int(a_wall[j - 1])
x_wall[0] = int(x)
y_wall[0] = int(y)
v_wall[0] = int(v)
a_wall[0] = int(action)

# reward is +1 if the bird flies past the pipe
reward = p.act(action_value)
my_reward = 0
if (reward == 1):
    my_reward = r_1
    cumulated[i] += 1
    for j in range(1, 40):
        Q[int(y_wall[j]), int(x_wall[j]), int(v_wall[j]), int(a_wall[j])] += alpha * (
            my_reward + np.max(Q[int(y_wall[j - 1]), int(x_wall[j - 1]), int(v_wall[j - 1]), int(a_wall[j - 1])]))

# bad result : -100
if (reward < 0):
    my_reward = r_2
    if (x == 20):
        for j in range(0, 27):
            Q[int(y_wall[j]), int(x_wall[j]), int(v_wall[j]), int(a_wall[j])] += alpha * (
                my_reward + np.max(Q[int(y_wall[j - 1]), int(x_wall[j - 1]), int(v_wall[j - 1]), int(a_wall[j - 1])]))
    else:
        for j in range(0, 6):
reward = 0.0
max_noops = 20
nb_frames = 15000

# make a PLE instance.
p = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
        force_fps=force_fps, display_screen=display_screen)

# our Naive agent!
agent = NaiveAgent(p.getActionSet())

# init agent and game.
p.init()

# lets do a random number of NOOP's
for i in range(np.random.randint(0, max_noops)):
    reward = p.act(p.NOOP)

# start our training loop
for f in range(nb_frames):
    # if the game is over
    if p.game_over():
        p.reset_game()

    obs = p.getScreenRGB()
    action = agent.pickAction(reward, obs)
    reward = p.act(action)

    if f % 50 == 0:
        p.saveScreen("screen_capture.png")
class Agent: POPULATION_SIZE = 15 SIGMA = 0.1 LEARNING_RATE = 0.03 INITIAL_IMAGES = np.zeros((80, 80, 4)) EPSILON = 0.4 INITIAL_EPSILON = 0.01 WATCHING = 10000 # based on documentation, features got 8 dimensions # output is 5 dimensions, 0 = left, 1 = right, 2 = up, 3 = down, 4 = space def __init__(self, model, screen=False, forcefps=True): self.model = model self.game = MonsterKong() self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps) self.env.init() self.env.getGameState = self.game.getGameState self.es = Deep_Evolution_Strategy(self.model.get_weights(), self.get_reward, self.POPULATION_SIZE, self.SIGMA, self.LEARNING_RATE) self.rewards = [] def _get_image(self, image): r, g, b = image[:, :, 0], image[:, :, 1], image[:, :, 2] gray = 0.2989 * r + 0.5870 * g + 0.1140 * b return imresize(gray, size=(80, 80)) def _map_action(self, action): if action == 0: return 97 if action == 1: return 100 if action == 2: return 119 if action == 3: return 115 if action == 4: return 32 def get_predicted_action(self, sequence): if random.random() > self.EPSILON: prediction = np.argmax(self.model.predict(np.array(sequence))[0]) else: prediction = np.random.randint(5) self.EPSILON -= (self.EPSILON / self.WATCHING) return prediction def save(self, checkpoint_name): with open('%s-weight.p' % (checkpoint_name), 'wb') as fopen: pickle.dump(self.model.get_weights(), fopen) with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen: pickle.dump(self.rewards, fopen) def load(self, checkpoint_name): with open('%s-weight.p' % (checkpoint_name), 'rb') as fopen: self.model.set_weights(pickle.load(fopen)) with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen: self.rewards = pickle.load(fopen) def get_state(self): state = self.env.getGameState() return np.array(list(state.values())) def get_reward(self, weights): self.model.weights = weights total_reward = 0.0 self.env.reset_game() state = self._get_image(self.env.getScreenRGB()) for i in range(self.INITIAL_IMAGES.shape[2]): self.INITIAL_IMAGES[:, :, i] = state done = False while not done: action = self.get_predicted_action([self.INITIAL_IMAGES]) real_action = self._map_action(action) reward = self.env.act(real_action) reward += random.choice([0.0001, -0.0001]) total_reward += reward state = self._get_image(self.env.getScreenRGB()) self.INITIAL_IMAGES = np.append(state.reshape([80, 80, 1]), self.INITIAL_IMAGES[:, :, :3], axis=2) done = self.env.game_over() self.rewards.append(total_reward) return total_reward def fit(self, iterations, checkpoint): self.es.train(iterations, print_every=checkpoint) def play(self, debug=False, not_realtime=False): total_reward = 0.0 current_reward = 0 self.env.force_fps = not_realtime self.env.reset_game() state = self._get_image(self.env.getScreenRGB()) for i in range(self.INITIAL_IMAGES.shape[2]): self.INITIAL_IMAGES[:, :, i] = state done = False while not done: action = self.get_predicted_action(self.INITIAL_IMAGES) real_action = self._map_action(action) if debug and total_reward > current_reward: print(action_string, 'total rewards:', total_reward) current_reward = total_reward total_reward += self.env.act(real_action) state = self._get_image(self.env.getScreenRGB()) self.INITIAL_IMAGES = np.append(state.reshape([80, 80, 1]), self.INITIAL_IMAGES[:, :, :3], axis=2) done = self.env.game_over() print('game over!')
class Agent: LEARNING_RATE = 0.003 BATCH_SIZE = 32 EPSILON = 1 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() COPY = 1000 T_COPY = 0 MEMORY_SIZE = 300 INITIAL_FEATURES = np.zeros((4, INPUT_SIZE)) INPUT_SIZE = 8 LAYER_SIZE = 500 OUTPUT_SIZE = 2 # based on documentation, features got 8 dimensions # output is 2 dimensions, 0 = do nothing, 1 = jump def __init__(self, screen=False, forcefps=True): self.game = FlappyBird(pipe_gap=125) self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps) self.env.init() self.env.getGameState = self.game.getGameState # input_size, output_size, layer_size, learning_rate, name self.model = Model(self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE, 'real_model') self.model_negative = Model(self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE, 'negative_model') self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver(tf.global_variables()) self.rewards = [] def _assign(self, from_name, to_name): from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name) to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name) for i in range(len(from_w)): assign_op = to_w[i].assign(from_w[i]) sess.run(assign_op) def _memorize(self, state, action, reward, new_state, dead, rnn_state): self.MEMORIES.append( (state, action, reward, new_state, dead, rnn_state)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _construct_memories(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) init_values = np.array([a[-1] for a in replay]) Q = sess.run(self.model.logits, feed_dict={ self.model.X: states, self.model_negative.hidden_layer: init_values }) Q_new = sess.run(self.model.logits, feed_dict={ self.model.X: new_states, self.model.hidden_layer: init_values }) Q_new_negative = sess.run(self.model_negative.logits, feed_dict={ self.model_negative.X: new_states, self.model_negative.hidden_layer: init_values }) replay_size = len(replay) X = np.empty((replay_size, self.INPUT_SIZE)) Y = np.empty((replay_size, self.OUTPUT_SIZE)) for i in range(replay_size): state_r, action_r, reward_r, new_state_r, dead_r = replay[i] target = Q[i] target[action_r] = reward_r if not dead_r: target[action_r] += self.GAMMA * Q_new_negative[ i, np.argmax(Q_new[i])] X[i] = state_r Y[i] = target return X, Y def save(self, checkpoint_name): self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name)) with open('%s-acc.p' % (checkpoint_name), 'wb') as fopen: pickle.dump(self.rewards, fopen) def load(self, checkpoint_name): self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" % (checkpoint_name)) with open('%s-acc.p' % (checkpoint_name), 'rb') as fopen: self.rewards = pickle.load(fopen) def get_state(self): state = self.env.getGameState() return np.array(list(state.values())) def get_reward(self, iterations, checkpoint): for i in range(iterations): total_reward = 0 self.env.reset_game() dead = False init_value = np.zeros((1, 2 * 512)) state = self.get_state() for i in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[i, :] = state while not dead: if (self.T_COPY + 1) % self.COPY == 0: self._assign('real_model', 'target_model') if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action, last_state = sess.run(self.model.logits, self.model.last_state, feed_dict={ self.model.X: [self.INITIAL_FEATURES], self.model.hidden_layer: 
init_values }) action, init_value = np.argmax(action[0]), last_state[0] real_action = 119 if action == 1 else None reward = self.env.act(real_action) total_reward += reward new_state = np.append(self.get_state(), self.INITIAL_FEATURES[:3, :], axis=0) dead = self.env.game_over() self._memorize(state, action, reward, new_state, dead, init_value) batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) replay = random.sample(self.MEMORIES, batch_size) X, Y, init_values = self._construct_memories(replay) cost, _ = self.sess.run( [self.model.cost, self.model.optimizer], feed_dict={ self.model.X: X, self.model.Y: Y, self.model.hidden_layer: init_values }) self.T_COPY += 1 self.rewards.append(total_reward) self.EPSILON = self.MIN_EPSILON + ( 1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i + 1) % checkpoint == 0: print('epoch:', i + 1, 'total rewards:', total_reward) print('epoch:', i + 1, 'cost:', cost) def fit(self, iterations, checkpoint): self.get_reward(iterations, checkpoint)
class PLEEnv(gym.Env): metadata = {'render.modes': ['human', 'rgb_array']} def __init__(self, game_name='FlappyBird', display_screen=True): # open up a game state to communicate with emulator import importlib game_module_name = ('ple.games.%s' % game_name).lower() game_module = importlib.import_module(game_module_name) game = getattr(game_module, game_name)() self.game = game self.game_state = PLE(game, fps=30, display_screen=display_screen) self.game_state.init() # increase gap for checking #self.game.pipe_gap = 115 #self.game.player.height = 14 self._action_set = self.game_state.getActionSet() self.action_space = spaces.Discrete(len(self._action_set)) self.screen_width, self.screen_height = self.game_state.getScreenDims() #print(self.screen_width, self.screen_height) self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3)) self.viewer = None def _step(self, a): reward = self.game_state.act(self._action_set[a]) state = self._get_image() terminal = self.game_state.game_over() return state, reward, terminal, {} def _get_image(self): image_rotated = np.fliplr( np.rot90(self.game_state.getScreenRGB(), 3)) # Hack to fix the rotated image returned by ple return image_rotated @property def _n_actions(self): return len(self._action_set) # return: (states, observations) def _reset(self): self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3)) self.game_state.reset_game() state = self._get_image() return state def _render(self, mode='human', close=False): if close: if self.viewer is not None: self.viewer.close() self.viewer = None return img = self._get_image() if mode == 'rgb_array': return img elif mode == 'human': from gym.envs.classic_control import rendering if self.viewer is None: self.viewer = rendering.SimpleImageViewer() self.viewer.imshow(img) def _seed(self, seed): rng = np.random.RandomState(seed) self.game_state.rng = rng self.game_state.game.rng = self.game_state.rng self.game_state.init()
class Agent: def __init__(self, hyper: dict, game: PyGameWrapper): self.hyper = hyper self.game = game self.p = PLE(game, fps=30, display_screen=True) self.p.init() self.memory = ReplayBuffer(hyper['obs_dim'], hyper['capacity'], hyper['batch_size']) self.epsilon_decay = hyper['epsilon_decay'] self.epsilon = hyper['max_epsilon'] self.max_epsilon = hyper['max_epsilon'] self.min_epsilon = hyper['min_epsilon'] self.gamma = torch.tensor(hyper['gamma']).to(Pytorch.device()) self.target_update = hyper['target_update'] self.dqn = Network(hyper['obs_dim'], hyper['action_dim']).to(Pytorch.device()) self.dqn_target = Network(hyper['obs_dim'], hyper['action_dim']).to(Pytorch.device()) self.dqn_target.load_state_dict(self.dqn.state_dict()) self.dqn_target.eval() self.optimizer = optim.Adam(self.dqn.parameters()) self.transition = list() self.is_test = hyper['test'] self.epochs = hyper['epochs'] self.batch_size = hyper['batch_size'] self.epoch_log = hyper['epoch_log'] def select_action(self, state: np.ndarray) -> int: def random_action(scale=1): action_max = int(scale * 100) r = random.randint(0, 100) if r <= action_max: return 1 return 0 """ 使用贪心( ε—greedy )搜索方法来对环境进行探索 以 ε—greedy搜索以概率 ε 从所有可能的动作中随机选取一个动作 以 1- ε 的概率选择已知的最好的动作(即当前状态下,Q值最大的那个动作) 在初期, ε 的值应更大一些(即注重对环境的探索),随后逐渐减小 ε 的值(即注重对于Q值表的使用) self.epsilon会随着回合数减小,实现 ε 的值随着回合数的增加而递减。 """ if self.epsilon > np.random.random(): selected_action = random_action() else: # 神经网络得到动作 selected_action = self.dqn( torch.FloatTensor(state).to(Pytorch.device())).argmax() selected_action = selected_action.detach().cpu().item() if not self.is_test: self.transition = [state, selected_action] return selected_action def step(self, action: int): reward = self.p.act(action) # 存储当前状态、行动、奖励、下一步状态、结束状态 if not self.is_test: self.transition += [reward, self.state(), self.p.game_over()] self.memory.store(*self.transition) return reward def update_model(self): samples = self.memory.sample_batch() loss = self._compute_dqn_loss(samples) loss = loss / len(samples) self.optimizer.zero_grad() loss.backward() self.optimizer.step() return loss.item() def _compute_dqn_loss(self, samples: Dict[str, np.ndarray]) -> torch.Tensor: """Return dqn loss.""" device = Pytorch.device() state = torch.FloatTensor(samples["obs"]).to(device) next_state = torch.FloatTensor(samples["next_obs"]).to(device) action = torch.LongTensor(samples["acts"].reshape(-1, 1)).to(device) reward = torch.FloatTensor(samples["rews"].reshape(-1, 1)).to(device) done = torch.FloatTensor(samples["done"].reshape(-1, 1)).to(device) # G_t = r + gamma * v(s_{t+1}) if state != Terminal # = r otherwise curr_q_value = self.dqn(state).gather(1, action) next_q_value = self.dqn_target(next_state).max( dim=1, keepdim=True)[0].detach() mask = 1 - done target = (reward + self.gamma * next_q_value * mask).to(device) loss = func.smooth_l1_loss(curr_q_value, target) return loss def _target_hard_update(self): """Hard update: target <- local.""" self.dqn_target.load_state_dict(self.dqn.state_dict()) def state(self): obs = self.game.getGameState() return np.array([ obs['player_y'], obs['player_vel'], obs['next_pipe_dist_to_player'], obs['next_pipe_top_y'], obs['next_pipe_bottom_y'], obs['next_next_pipe_dist_to_player'], obs['next_next_pipe_top_y'], obs['next_next_pipe_bottom_y'] ]) def train(self, ): self.is_test = False epsilons, losses, reward_records, update_cnt = [], [], [], 0, for frame_idx in range(1, self.epochs + 1): self.p.reset_game() reward = 0 while not self.p.game_over(): # 选取动作 state = self.state() action = 
self.select_action(state) # 执行动作,获取更新的环境状态、奖励、是否完成等,并存储 step_reward = self.step(action) reward = reward + step_reward # 计算损失函数,梯度下降 loss = self.update_model() losses.append(loss) update_cnt += 1 # 减少ε self.epsilon = max( self.min_epsilon, self.epsilon - (self.max_epsilon - self.min_epsilon) * self.epsilon_decay) epsilons.append(self.epsilon) # 更新target神经网络 if update_cnt % self.target_update == 0: self._target_hard_update() reward_records.append(reward) if frame_idx % self.epoch_log == 0: avg_score = '%.2f' % np.mean(reward_records) logger.info("Epoch: %s, Score: %s, Avg-Score: %s, Loss: %s" % (frame_idx, reward, avg_score, loss)) def test(self) -> None: self.is_test = True self.p.reset_game() total_reward = 0 while not self.p.game_over(): action = self.select_action(self.state()) total_reward += self.step(action) logger.info("Total-Reward: %s" % total_reward)
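# Illustrative sketch of the `hyper` dict read by the DQN Agent above in
# __init__() and train(). The key names come from that snippet; the numeric
# values are assumptions for illustration, not the original author's settings.
hyper = {
    'obs_dim': 8,            # FlappyBird game state has 8 features
    'action_dim': 2,         # flap / do nothing
    'capacity': 10000,       # replay buffer size (assumed)
    'batch_size': 32,        # assumed
    'gamma': 0.99,           # discount factor (assumed)
    'max_epsilon': 1.0,
    'min_epsilon': 0.1,
    'epsilon_decay': 0.005,  # assumed
    'target_update': 100,    # hard-update period for dqn_target (assumed)
    'epochs': 1000,          # number of training episodes (assumed)
    'epoch_log': 10,         # logging period (assumed)
    'test': False,
}
# agent = Agent(hyper, FlappyBird())  # assumes FlappyBird is importable as elsewhere
# agent.train()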
def train_agent(number_of_episodes):
    game = FlappyBird()
    rewards = {
        "positive": 1.0,
        "negative": 0.0,
        "tick": 0.0,
        "loss": -5.0,
        "win": 0.0
    }
    env = PLE(game=game, fps=30, display_screen=False, reward_values=rewards)

    # Reset environment at the beginning
    env.reset_game()

    training_score = 0
    max_training_score = 0
    episode_number = 1
    state_action_reward = ()
    results = []
    state_transition = 0
    every_100th = 1

    while number_of_episodes > 0:
        # Get current state
        state = BasicQLearningAgent.get_state(env.game.getGameState())

        # Select action in state "state"
        action = basic_q_agent.compute_action_from_q_values(state)
        if action is None:
            raise IllegalActionException("Illegal action occurred.")

        """
        After choosing action, get reward. PLE environment method act()
        returns the reward that the agent has accumulated while performing
        the action.
        """
        reward = env.act(env.getActionSet()[action])
        training_score += reward
        max_training_score = max(training_score, max_training_score)
        game_over = env.game_over()

        # observe the result
        if state_action_reward:
            basic_q_agent.update(state_action_reward[0], state_action_reward[1],
                                 state, state_action_reward[2])
            state_transition += 1
        state_action_reward = (state, action, reward)

        if game_over:
            print("===========================")
            print("Episode: " + str(episode_number))
            print("Training score: " + str(training_score))
            print("Max. training score: " + str(max_training_score))
            print("===========================\n")
            if every_100th == 100:
                results.append((episode_number, training_score))
                every_100th = 0
            episode_number += 1
            every_100th += 1
            number_of_episodes -= 1
            training_score = 0
            state_transition = 0
            env.reset_game()

    f = open("basicq_150000.txt", "w")
    f.write(str(basic_q_agent.Q_matrix))
    f.close()
    f = open("results_150000.txt", "w")
    f.write(str(results))
    f.close()
p = PLE(game, fps=fps, force_fps=False)
agent = NaiveAgent(p.getActionSet())
reward = 0.0
h = HashState(game.getGameState())  # sets up hash value

# reads in arguments/file names
f = sys.argv[1]
o = open(f, 'r')
array = agent.file_to_array(o, h.seed)

# if no third argument was given, use '2'
arg = 2
if len(sys.argv) == 3:
    arg = int(sys.argv[2])

if arg == 1:
    # if just using table contents, not learning
    while True:
        if p.game_over():
            p.reset_game()
        obs = game.getGameState()
        mid = obs['frog_y'] > 261.0
        obs_value = h.add_table(obs, mid)
        action = p.act(agent.actions[np.argmax(array[obs_value])])
else:
    # if 0, starts table from scratch, otherwise resumes from file
    if arg == 0:
        array = None
    # runs learning
    obs = game.getGameState()
    action = agent.pickAction(array, obs)
class Agent: LEARNING_RATE = 1e-6 BATCH_SIZE = 32 OUTPUT_SIZE = 2 EPSILON = 1 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() MEMORY_SIZE = 300 INITIAL_IMAGES = np.zeros((80, 80, 4)) # based on documentation, features got 8 dimensions # output is 2 dimensions, 0 = do nothing, 1 = jump def __init__(self, screen=False, forcefps=True): self.game = FlappyBird(pipe_gap=125) self.env = PLE(self.game, fps=30, display_screen=screen, force_fps=forcefps) self.env.init() self.env.getGameState = self.game.getGameState self.actor = Actor('actor', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE) self.actor_target = Actor('actor-target', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE) self.critic = Critic('critic', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.critic_target = Critic('critic-target', self.INPUT_SIZE, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.grad_critic = tf.gradients(self.critic.logits, self.critic.Y) self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE]) weights_actor = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor') self.grad_actor = tf.gradients(self.actor.logits, weights_actor, -self.actor_critic_grad) grads = zip(self.grad_actor, weights_actor) self.optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE).apply_gradients(grads) self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver(tf.global_variables()) self.rewards = [] def _assign(self, from_name, to_name): from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name) to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name) for i in range(len(from_w)): assign_op = to_w[i].assign(from_w[i]) sess.run(assign_op) def _memorize(self, state, action, reward, new_state, dead): self.MEMORIES.append((state, action, reward, new_state, dead)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _get_image(self, image): r, g, b = image[:,:,0], image[:,:,1], image[:,:,2] gray = 0.2989 * r + 0.5870 * g + 0.1140 * b return imresize(gray, size = (80, 80)) def _select_action(self, state): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: prediction = self.sess.run(self.actor.logits_actor, feed_dict={self.actor.X:[state]})[0] action = np.argmax(prediction) return action def _construct_memories_and_train(self, replay): # state_r, action_r, reward_r, new_state_r, dead_r = replay # train actor states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) Q = self.sess.run(self.actor.logits, feed_dict={self.actor.X: states}) Q_target = self.sess.run(self.actor_target.logits, feed_dict={self.actor_target.X: states}) grads = self.sess.run(self.grad_critic, feed_dict={self.critic.X:states, self.critic.Y:Q}) self.sess.run(self.optimizer, feed_dict={self.actor.X:states, self.actor_critic_grad:grads}) # train critic rewards = np.array([a[2] for a in replay]).reshape((-1, 1)) rewards_target = self.sess.run(self.critic_target.logits, feed_dict={self.critic_target.X:new_states,self.critic_target.Y:Q_target}) for i in range(len(replay)): if not replay[0][-1]: rewards[i,0] += self.GAMMA * rewards_target cost, _ = self.sess.run([self.critic.cost, self.critic.optimizer), feed_dict={self.critic.X:states, self.critic.Y:Q, self.critic.REWARD:rewards}) return cost def predict(self, inputs): return self.sess.run(self.logits, feed_dict={self.X:inputs}) def save(self, checkpoint_name): 
self.saver.save(self.sess, os.getcwd() + "/%s.ckpt" %(checkpoint_name)) with open('%s-acc.p'%(checkpoint_name), 'wb') as fopen: pickle.dump(self.rewards, fopen) def load(self, checkpoint_name): self.saver.restore(self.sess, os.getcwd() + "/%s.ckpt" %(checkpoint_name)) with open('%s-acc.p'%(checkpoint_name), 'rb') as fopen: self.rewards = pickle.load(fopen) def get_predicted_action(self, sequence): prediction = self.predict(np.array(sequence))[0] return np.argmax(prediction) def get_state(self): state = self.env.getGameState() return np.array(list(state.values())) def get_reward(self, iterations, checkpoint): for i in range(iterations): total_reward = 0 self.env.reset_game() state = self._get_image(self.env.getScreenRGB()) for k in range(self.INITIAL_IMAGES.shape[2]): self.INITIAL_IMAGES[:,:,k] = state dead = False while not dead: if (self.T_COPY + 1) % self.COPY == 0: self._assign('actor', 'actor-target') self._assign('critic', 'critic-target') action = self._select_action(self.INITIAL_IMAGES) real_action = 119 if action == 1 else None reward = self.env.act(real_action) total_reward += reward new_state = self.get_state() state = self._get_image(self.env.getScreenRGB()) new_state = np.append(state.reshape([80, 80, 1]), self.INITIAL_IMAGES[:, :, :3], axis = 2) dead = self.env.game_over() self._memorize(self.INITIAL_IMAGES, action, reward, new_state, dead) batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) replay = random.sample(self.MEMORIES, batch_size) cost = self._construct_memories_and_train(replay) self.INITIAL_IMAGES = new_state self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) self.T_COPY += 1 self.rewards.append(total_reward) if (i+1) % checkpoint == 0: print('epoch:', i + 1, 'total rewards:', total_reward) print('epoch:', i + 1, 'cost:', cost) def fit(self, iterations, checkpoint): self.get_reward(iterations, checkpoint)
def main_train(learning=True):
    final_score = 0
    previous_action = 1
    model = build_neural_network_model()
    game = FlappyBird(width=288, height=512, pipe_gap=100)
    env = PLE(game, fps=30, display_screen=True, state_preprocessor=process_state)
    env.init()
    passed = 0
    old_y = 0

    for i in range(game_steps):
        if i % 10000 == 0:
            print("STEP {} / {}".format(i, game_steps))
        if i == game_steps - 1:
            print("Score: {}".format(final_score))
        if env.game_over():
            print("Final Score: {}".format(final_score))
            final_score = 0
            env.reset_game()

        observation = env.getGameState()
        current_state = observation[0]

        # make sure both actions have a Q-value entry for the current state
        if str(current_state) not in q_dictionary:
            q_dictionary[str(current_state)] = dict()
        if 0 not in q_dictionary[str(current_state)]:
            q_dictionary[str(current_state)][0] = 0
        if 1 not in q_dictionary[str(current_state)]:
            q_dictionary[str(current_state)][1] = 0

        # tabular Q-learning update for both actions, using the simulated next state
        for action in [0, 1]:
            returned_object = generate_next_state(previous_action, current_state,
                                                  action, passed, old_y)
            if returned_object[0] == 0:
                raise NameError("Error. {}".format(returned_object[1]))
            next_state = returned_object[1]
            reward = returned_object[2]

            if str(next_state) not in q_dictionary:
                q_dictionary[str(next_state)] = dict()
            if 0 not in q_dictionary[str(next_state)]:
                q_dictionary[str(next_state)][0] = 0
            if 1 not in q_dictionary[str(next_state)]:
                q_dictionary[str(next_state)][1] = 0

            q_dictionary[str(current_state)][action] += LEARNING_RATE * (
                reward
                + DISCOUNT_FACTOR * max(q_dictionary[str(next_state)][0],
                                        q_dictionary[str(next_state)][1])
                - q_dictionary[str(current_state)][action])

        action_to_take = 0
        if q_dictionary[str(current_state)][1] > q_dictionary[str(current_state)][0]:
            action_to_take = 1

        # build the neural-network targets from the two simulated transitions
        returned_object = generate_next_state(previous_action, current_state, 0,
                                              passed, old_y)
        if returned_object[0] == 0:
            raise NameError("Error. {}".format(returned_object[1]))
        reward_to_take = returned_object[2]
        next_state = returned_object[1]
        vector = model.predict(np.matrix(list(next_state.values())))
        target_to_learn = list()
        target_to_learn.append(reward_to_take + DISCOUNT_FACTOR * vector[0][0])

        returned_object = generate_next_state(previous_action, current_state, 1,
                                              passed, old_y)
        if returned_object[0] == 0:
            raise NameError("Error. {}".format(returned_object[1]))
        reward_to_take = returned_object[2]
        next_state = returned_object[1]
        vector = model.predict(np.matrix(list(next_state.values())))
        target_to_learn.append(reward_to_take + DISCOUNT_FACTOR * vector[0][1])

        model.fit(np.matrix(list(current_state.values())), np.matrix(target_to_learn))

        # remember that a pipe is about to be passed, for the state simulator
        if observation[0]['next_pipe_dist_to_player'] - 4 < 0:
            passed = 4
            old_y = observation[0]['next_pipe_top_y']

        env_reward = env.act(env.getActionSet()[action_to_take])
        if env_reward == 1:
            final_score += 1

        previous_action = action_to_take
        if passed != 0:
            passed -= 1

    print("Saving the model")
    model.save("model.h5", overwrite=True)
""" def __init__(self, actions): self.actions = actions def pickAction(self, reward, obs): return self.actions[np.random.randint(0, len(self.actions))] ################################### game = Doom(scenario="take_cover") env = PLE(game) agent = NaiveAgent(env.getActionSet()) env.init() reward = 0.0 for f in range(15000): #if the game is over if env.game_over(): env.reset_game() action = agent.pickAction(reward, env.getScreenRGB()) reward = env.act(action) if f > 2000: env.display_screen = True env.force_fps = False if f > 2250: env.display_screen = True env.force_fps = True
def train(self, scratch, game, display):
    p = PLE(game, fps=30, frame_skip=1, num_steps=1,
            force_fps=True, display_screen=display)
    fname = None
    if not scratch:
        fname = self.load()
    else:
        delete_files(self.DATA_DIREC)
    f0, step, nb_save, nb_games = init_train(fname, self.DATA_DIREC)
    eps_tau = (self.NB_FRAMES - f0) // self.EPS_RATE
    scores = []
    while step < self.NB_FRAMES:
        if len(scores) == self.SCORE_FREQ:
            print_scores(scores, self.SCORE_FREQ)
            scores = []
        p.reset_game()
        screen = self.process_screen(p.getScreenRGB())
        last_screens_buff = deque([screen] * 4, maxlen=NB_LAST_SCREENS)
        last_screens = np.stack(last_screens_buff, axis=-1)
        nb_games += 1
        score = 0
        while not p.game_over():
            step += 1
            if step != 0 and (step % self.SAVE_FREQ) == 0:
                self.save(chr(97 + nb_save) + '_' + str(step) + '_' + str(nb_games))
                nb_save += 1
            if step != 0 and (step % self.EPS_UPDATE_FREQ) == 0:
                self.epsilon = update_epsilon(step, f0, self.EPS0, eps_tau,
                                              self.NB_FRAMES)
            # 1) In s, choose a (GLIE actor)
            qvals = self.get_qvals(last_screens)
            act = self.greedy_action(qvals, self.epsilon)
            # 2) Observe r, s'
            bare_reward = p.act(ACTIONS[act])
            if bare_reward > 0:
                score += 1
            reward = self.reward_engineering(bare_reward)
            screen_new = self.process_screen(p.getScreenRGB())
            # update replay_memory
            self.replay_memory.append(screen, act, screen_new, reward)
            if len(self.replay_memory.buff) > self.MIN_REPLAY_MEMORY_SIZE:
                # build minibatch and do one gradient step on the online network
                ls, actions, ls_new, r, terms = self.replay_memory.minibatch()
                qvals_new = self.model_target.predict(ls_new)
                qvals_new_max = qvals_new.max(1).reshape((self.BATCH_SIZE, 1))
                delta = r + (1 - terms) * self.GAMMA * qvals_new_max
                qvals = self.model.predict(ls)
                qvals[np.arange(self.BATCH_SIZE), actions.ravel()] = delta.ravel()
                self.model.train_on_batch(x=ls, y=qvals)
                # periodically refresh the target network from disk
                if step % self.TARGET_FREQ == 0:
                    self.model.save(filepath=self.DATA_DIREC + 'target.h5')
                    self.model_target = load_model(filepath=self.DATA_DIREC + 'target.h5')
            last_screens_buff.append(screen_new)
            last_screens = np.stack(last_screens_buff, axis=-1)
            screen = screen_new
        scores.append(score)
for i in state["creep_pos"]["BAD"]: if math.sqrt((i[0] - x)**2 + (i[1] - y)**2) <= game.AGENT_RADIUS: color = bad_hit_color pygame.draw.circle(screen, color, (int(x), int(y)), 4) # Here <<< pygame.display.update() next_state = p.getGameState() state = next_state p.display_screen = False p.reset_game() # running the game p.act(-1) # this line fixes a weird error(first reward is 1.99)(WHY?? no idea) state = p.getGameState() reward = 0.0 oldest_state = 0 # play_game(1000) for episode in range(nr_episodes): p.reset_game() if episode % 5 == 0 and episode != 0: network.save_model() play_game(100) if (episode == nr_episodes / 2): play_game(10) for step in range(nr_steps_per_episode): epsilon = get_next_epsilon(epsilon) if p.game_over():
def train(self, scratch, game, display):
    p = PLE(game, fps=30, frame_skip=1, num_steps=1,
            force_fps=True, display_screen=display)
    t1 = time.time()
    fname = None
    if not scratch:
        fname = self.load()
    else:
        delete_files(self.DATA_DIREC)
    f0, step, nb_save, nb_games = init_train(fname, self.DATA_DIREC)
    eps_tau = (self.NB_FRAMES - f0) // 8
    scores = []
    while step < self.NB_FRAMES:
        if len(scores) == self.SCORE_FREQ:
            print('States visited:', len(self.Q))
            print_scores(scores, self.SCORE_FREQ)
            scores = []
        p.reset_game()
        state = game.getGameState()
        state_tp = self.discretize(state)
        if state_tp not in self.Q:
            self.Q[state_tp] = [0, 0]
        act = 1
        episode = deque([], self.SIZE_FIFO)
        elig = {}
        gscore = 0
        nb_games += 1
        while not p.game_over():
            step += 1
            if step != 0 and (step % self.SAVE_FREQ) == 0:
                self.save('Q_' + chr(97 + nb_save) + '_' + str(step) + '_'
                          + str(nb_games) + '.p')
                nb_save += 1
            if step != 0 and (step % self.EPS_UPDATE_FREQ) == 0:
                self.epsilon = update_epsilon(step, f0, self.EPS0, eps_tau,
                                              self.NB_FRAMES)
            # 1) Observe r, s′
            bare_reward = p.act(ACTIONS[act])
            reward = self.reward_engineering(bare_reward)
            new_state = game.getGameState()
            new_state_tp = self.discretize(new_state)
            # 2) Choose a′ (GLIE actor) using Q
            if new_state_tp not in self.Q:
                self.Q[new_state_tp] = [0, 0]
            qvals = self.get_qvals(new_state)
            new_act = self.greedy_action(qvals, self.epsilon)
            # 3) Temporal difference: δ = r + γQ(s′,a′) − Q(s,a)
            delta = (reward + self.GAMMA * self.Q[new_state_tp][new_act]
                     - self.Q[state_tp][act])
            # 4) Update Q along the eligibility trace
            episode.append((state_tp, act))
            elig[(state_tp, act)] = 1
            for (state_tp_ep, act_ep) in episode:
                self.Q[state_tp_ep][act_ep] += (
                    self.ALPHA * delta * elig[(state_tp_ep, act_ep)])
                elig[(state_tp_ep, act_ep)] *= self.LAMBDA
            # 5) s <- s', a <- a'
            state = new_state
            state_tp = new_state_tp
            act = new_act
            if bare_reward > 0:
                gscore += 1
        scores.append(gscore)
    t2 = time.time()
    # Unicode code point of 'a': 97
    self.save('Q_' + chr(97 + nb_save) + '_' + str(step) + '_' + str(nb_games) + '.p')
    print()
    print('Number of played games:', nb_games)
    print('Training completed in', (t2 - t1) / 60, 'minutes')
    print()
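Both training loops above call `update_epsilon(step, f0, EPS0, eps_tau, NB_FRAMES)`, which is not shown. One possible implementation, assuming an exponential anneal with time constant `eps_tau` and an arbitrary floor of 0.01 (the `nb_frames` argument is accepted but unused in this sketch), is:

import numpy as np

def update_epsilon(step, f0, eps0, eps_tau, nb_frames):
    # hypothetical schedule: decay exponentially from eps0, counting frames
    # from f0 (where training resumed), and never drop below a small floor
    eps = eps0 * np.exp(-(step - f0) / eps_tau)
    return float(max(0.01, eps))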
import random

from pygame.constants import K_a, K_b, K_s, K_w

from ple import PLE

NO_OP = 296


def pickAction(actions):
    return random.choice(actions)


# Pong_2Player is a custom two-player variant of Pong (import not shown)
game = Pong_2Player()
p = PLE(game, fps=30, display_screen=True, force_fps=False)
p.init()

actions_1 = [K_w, K_s, NO_OP]
actions_2 = [K_a, K_b, NO_OP]

nb_frames = 1000
reward_1 = 0.0
reward_2 = 0.0

for f in range(nb_frames):
    if p.game_over():  # check if the game is over
        p.reset_game()

    obs = p.getScreenRGB()
    action_1 = pickAction(actions_1)
    reward_1 = p.act(action_1)
    action_2 = pickAction(actions_2)
    reward_2 = p.act(action_2)
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False.
# But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    while not p.game_over():
        state = game.getGameState()
        screen = p.getScreenRGB()
        action = FlappyPolicy(state, screen)  # Your job is to define this function.
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
print("Average : ", average_score)
print("Max score : ", max_score)
def train_bird(load_weights=False, train=True):
    game = PLE(FlappyBird(), fps=30, display_screen=True)
    FLAP = 119
    agent = Agent(load_weights, train)
    # weights_filepath = 'weights/trained_weights.hdf5'
    # if agent.load_weights:
    #     agent.network.load_weights(weights_filepath)
    games_played = 0
    total_score = 0

    # Training
    while games_played < agent.runs:
        total_score = 0
        # Play a total of 100 games before updating weights
        for i in range(100):
            score = 0
            game.init()
            while not game.game_over():
                # Epsilon-greedy exploration
                old_state = agent.get_state(game)
                if random.uniform(0, 1) < agent.epsilon:
                    final_action = to_categorical(randint(0, 1), num_classes=2)  # [1,0] or [0,1]
                else:
                    prediction = agent.network.predict(old_state.reshape((1, 5)))
                    final_action = to_categorical(np.argmax(prediction[0]), num_classes=2)

                game.act(game.getActionSet()[np.argmax(final_action)])

                if game.getActionSet()[np.argmax(final_action)] == FLAP:
                    reward = agent.get_reward_after_flap(game)
                else:
                    reward = agent.get_reward(game)
                score += reward

                new_state = agent.get_state(game)
                if agent.train:
                    agent.remember(old_state, final_action, reward, new_state,
                                   game.game_over())

            print(f'Score: {score} Epsilon: {agent.epsilon} Gamma: {agent.gamma}')
            total_score += score

        if agent.train:
            agent.replay_new(agent.memory, agent.batch_size)
        games_played += 1
        print(f'GAME {games_played} Score: {total_score}')

        # Adjust epsilon for greedy exploration
        if not agent.train:
            agent.epsilon = 0.0
            agent.gamma = 0.9
        else:
            if agent.epsilon > 0.05:
                agent.epsilon = 1 - (games_played * agent.epsilon_decay)
            if agent.gamma <= 0.9:
                agent.gamma = games_played * agent.gamma_decay
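The `Agent` class used by `train_bird` is not shown; its `get_state` presumably compresses PLE's state dictionary into the five inputs expected by `old_state.reshape((1, 5))`. A hedged sketch, where the choice of features is an assumption, could be:

import numpy as np

def get_state(self, p):
    # hypothetical 5-feature state for the network above; p is the PLE instance
    s = p.getGameState()
    return np.array([s['player_y'],
                     s['player_vel'],
                     s['next_pipe_dist_to_player'],
                     s['next_pipe_top_y'],
                     s['next_pipe_bottom_y']])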
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fixed")
# use "fancy" for full background, random bird color and random pipe color,
# use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False.
# But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()
    while not p.game_over():
        state = game.getGameState()
        screen = p.getScreenRGB()
        action = FlappyPolicy(state, screen)  # Your job is to define this function.
        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
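As a baseline for `FlappyPolicy`, a hand-crafted rule using only the state dictionary might look like the sketch below; the key code 119 is the flap action used by the other FlappyBird snippets in this document, and the gap-midpoint threshold is an arbitrary choice:

def FlappyPolicy(state, screen):
    # naive hand-crafted baseline: flap whenever the bird is below the
    # midpoint of the next pipe gap, otherwise do nothing
    gap_center = (state['next_pipe_top_y'] + state['next_pipe_bottom_y']) / 2.0
    if state['player_y'] > gap_center:
        return 119   # flap
    return None      # no-op

A learned policy would replace this rule with a lookup in a Q-table or a forward pass through a trained network, keeping the same (state, screen) signature.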
steps = []
step = 0
plt.ion()

for epoch in range(epochs):
    p.reset_game()
    for it in range(1000):
        if p.game_over():
            p.reset_game()
            print("Score: " + str(p.score()))

        current_state = game.getGameState()
        processed_current_state = process_state(current_state)

        action = agent.act(processed_current_state)
        reward = p.act(actions[action])
        rewards.append(reward)

        next_state = game.getGameState()
        game_over = p.game_over()
        processed_next_state = process_state(next_state)

        agent.remember(processed_current_state, action, reward,
                       processed_next_state, game_over)
        if len(agent.memory) > 25:
            agent.replay(25)

    steps.append(epoch)
    epsilons.append(agent.epsilon)
    avg_rewards.append(np.average(rewards))
    plt.plot(steps, avg_rewards, 'r')
class MyEnv(Environment):
    VALIDATION_MODE = 0

    def __init__(self, rng, game=None, frame_skip=4,
                 ple_options={"display_screen": True, "force_fps": True, "fps": 30}):
        self._mode = -1
        self._mode_score = 0.0
        self._mode_episode_count = 0
        self._frameSkip = frame_skip if frame_skip >= 1 else 1
        self._random_state = rng

        if game is None:
            raise ValueError("Game must be provided")

        self._ple = PLE(game, **ple_options)
        self._ple.init()

        w, h = self._ple.getScreenDims()
        self._screen = np.empty((h, w), dtype=np.uint8)
        self._reducedScreen = np.empty((48, 48), dtype=np.uint8)
        self._actions = self._ple.getActionSet()

    def reset(self, mode):
        if mode == MyEnv.VALIDATION_MODE:
            if self._mode != MyEnv.VALIDATION_MODE:
                self._mode = MyEnv.VALIDATION_MODE
                self._mode_score = 0.0
                self._mode_episode_count = 0
            else:
                self._mode_episode_count += 1
        elif self._mode != -1:  # and thus mode == -1
            self._mode = -1

        self._ple.reset_game()
        for _ in range(self._random_state.randint(15)):
            self._ple.act(self._ple.NOOP)
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen,
                   interpolation=cv2.INTER_NEAREST)
        return [4 * [48 * [48 * [0]]]]

    def act(self, action):
        action = self._actions[action]
        reward = 0
        for _ in range(self._frameSkip):
            reward += self._ple.act(action)
            if self.inTerminalState():
                break
        self._screen = self._ple.getScreenGrayscale()
        cv2.resize(self._screen, (48, 48), self._reducedScreen,
                   interpolation=cv2.INTER_NEAREST)
        self._mode_score += reward
        return np.sign(reward)

    def summarizePerformance(self, test_data_set):
        if not self.inTerminalState():
            self._mode_episode_count += 1
        print("== Mean score per episode is {} over {} episodes ==".format(
            self._mode_score / self._mode_episode_count, self._mode_episode_count))

    def inputDimensions(self):
        return [(4, 48, 48)]

    def observationType(self, subject):
        return np.uint8

    def nActions(self):
        return len(self._actions)

    def observe(self):
        return [np.array(self._reducedScreen)]

    def inTerminalState(self):
        return self._ple.game_over()
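`MyEnv` follows the `Environment` interface of a DQN framework such as DeeR; assuming that base class is importable and all abstract methods are covered by the class above, wiring it to a PLE game might look like:

import numpy as np
from ple.games.flappybird import FlappyBird

# construct the wrapper around a PLE game without rendering
rng = np.random.RandomState(123456)
env = MyEnv(rng, game=FlappyBird(), frame_skip=4,
            ple_options={"display_screen": False, "force_fps": True, "fps": 30})
print(env.nActions(), env.inputDimensions())  # e.g. 2 actions, [(4, 48, 48)] input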
"player_vel": self.playerVelY, "next_pipe_dist_to_player": pipes[0][1], "next_pipe_top_y": pipes[0][0][0]["y"] + pipeHeight, "next_pipe_bottom_y": pipes[0][0][1]["y"], "next_next_pipe_dist_to_player": pipes[1][1], "next_next_pipe_top_y": pipes[1][0][0]["y"] + pipeHeight, "next_next_pipe_bottom_y": pipes[1][0][1]["y"] } return state def getScore(self): return self.score if __name__ == '__main__': from ple import PLE pygame.init() game = FlappyClone(black=False) env = PLE(game, display_screen=True, force_fps=False, fps=30) env.init() while True: if env.game_over(): print("Dead") env.reset_game() # print(game.getGameState()) reward = env.act(None)