def playFlappyBird():
    """Run the DeepQN agent on Flappy Bird and plot its cumulative reward.

    Every frame is handed to the agent through ``setPerception``.  Each
    frame whose reward differs from 0.1 is counted as an event: the event
    counter and the running reward total are recorded.  After 1000 such
    events the loop stops and the recorded curve is plotted.
    """
    num_actions = 2
    brain = DeepQN(num_actions)
    flappyBird = game.GameState()

    # Seed the agent with the frame produced by the [1, 0] action.
    idle_action = np.array([1, 0])  # do nothing
    first_frame, _reward0, terminal = flappyBird.frame_step(idle_action)
    first_frame = cv2.resize(first_frame, (80, 80))
    first_frame = cv2.cvtColor(first_frame, cv2.COLOR_BGR2GRAY)
    _ret, first_frame = cv2.threshold(first_frame, 1, 255, cv2.THRESH_BINARY)
    brain.setInitState(first_frame)

    total_reward = 0
    event_count = 0
    event_axis = []
    reward_axis = []

    while event_count != 1000:
        action = brain.getAction()
        frame, reward, terminal = flappyBird.frame_step(action)
        frame = preprocess(frame)
        brain.setPerception(frame, action, reward, terminal)

        # presumably 0.1 is the per-frame survival reward, so only
        # scoring / crash frames are recorded -- TODO confirm against game
        if reward != 0.1:
            event_count += 1
            event_axis.append(event_count)
            total_reward += reward
            reward_axis.append(total_reward)

    plt.plot(event_axis, reward_axis)
    plt.show()
def play():
    """Run the DQN agent on Flappy Bird forever.

    The first frame (obtained with the ``[1, 0]`` action) is resized to
    80x80, converted to grayscale, binarised and cast to float64 before
    it initialises the agent.  After that the loop never terminates: each
    iteration prints the agent's ``timeStep``, asks the agent for an
    action, steps the game and feeds the preprocessed frame back through
    ``setPerception``.
    """
    actions = 2
    agent = DQN(actions)
    flappyBird = game.GameState()

    # Obtain the initial state.
    action0 = np.array([1, 0])
    observation0, reward0, is_terminal = flappyBird.frame_step(action0)
    observation0 = cv2.cvtColor(cv2.resize(observation0, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
    ret, observation0 = cv2.threshold(observation0, 1, 255,
                                      cv2.THRESH_BINARY)
    observation0 = np.asarray(observation0, dtype=np.float64)
    agent.setInitState(observation0)

    while True:
        action = agent.getAction()
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print(agent.timeStep)
        nextObservation, reward, terminal = flappyBird.frame_step(action)
        nextObservation = preprocess(nextObservation)
        agent.setPerception(nextObservation, action, reward, terminal)
def TrainFlappyBird():
    """Train the DQN AI on Flappy Bird until the process is interrupted.

    A fresh game and a ``DQN_AI`` in ``'train'`` mode are created, the AI
    is initialised with the first preprocessed frame, and then every game
    frame is passed to ``Q_CNN_Train``.  The loop has no exit condition;
    stop it with Ctrl+C.
    """
    flappybird = game.GameState()
    AI_player = DQN_AI(num_actions=2, mode='train')

    # First observation comes from the idle action.
    idle = np.array([1, 0])
    first_frame, reward, terminal = flappybird.frame_step(idle)
    AI_player.Current_State_Initialze(observation=Preprocess(first_frame))

    while True:
        print('time step: %d' % AI_player.time_step)
        chosen = AI_player.AI_Action()
        raw_frame, reward, terminal = flappybird.frame_step(chosen)
        frame = Preprocess(raw_frame)
        AI_player.Q_CNN_Train(action=chosen,
                              reward=reward,
                              observation=frame,
                              terminal=terminal)
def trainNet(self):
    """Run the epsilon-greedy play/train loop; never returns.

    Each step evaluates ``self.tg_net`` on the current frame stack, picks
    an action (random with probability ``epsilon``, otherwise the argmax
    of the Q-values; action 0 on frames that are not a multiple of
    ``FRAME_PER_ACTION``), advances the game, and pushes the new frame
    onto the front of the stack.  When ``args.mode == "train"`` the
    transition is stored in ``self.replayMemory``, epsilon is annealed
    after ``OBSERVE`` steps, the memory is capped at ``REPLAY_MEMORY`` and
    ``self.trainStep()`` is called once ``self.timestep > OBSERVE``.
    Outside train mode epsilon is 0.

    NOTE(review): the source had lost its indentation; the epsilon
    annealing, memory cap and trainStep call are assumed to sit inside the
    train-mode branch -- confirm against the original file.
    """

    def to_binary_frame(colored):
        # cv2.resize expects dsize as (width, height); the result then has
        # shape (HEIGHT, WIDTH), matching the frame stack below.
        gray = cv2.cvtColor(cv2.resize(colored, (WIDTH, HEIGHT)),
                            cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
        return binary

    game_state = game.GameState()

    # Get the first state by doing nothing and fill the whole stack with it.
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = to_binary_frame(x_t)
    s_t = np.zeros([FRAME, HEIGHT, WIDTH])
    for i in range(FRAME):
        s_t[i, :, :] = x_t

    training = args.mode == "train"
    epsilon = INITIAL_EPSILON if training else 0

    while True:
        input_frame = np.reshape(s_t, (1, FRAME, HEIGHT, WIDTH))
        self.tg_net.forward(
            mx.io.DataBatch([mx.nd.array(input_frame, self.ctx)], []))
        qvalue = np.squeeze(self.tg_net.get_outputs()[0].asnumpy())

        a_t = np.zeros([ACTIONS])
        action_index = 0
        if self.timestep % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                action_index = random.randrange(ACTIONS)
            else:
                action_index = np.argmax(qvalue)
            a_t[action_index] = 1
        else:
            a_t[0] = 1  # do nothing

        # Run the selected action and observe next state and reward.
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = np.reshape(to_binary_frame(x_t1_colored), (1, HEIGHT, WIDTH))
        # Newest frame goes first; the oldest frame is dropped.
        s_t1 = np.vstack((x_t1, s_t[:(FRAME - 1), :, :]))

        if training:
            # Store the transition in replay memory.
            self.replayMemory.append((s_t, a_t, r_t, s_t1, terminal))
            # Scale down epsilon.
            if epsilon > FINAL_EPSILON and self.timestep > OBSERVE:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
            if len(self.replayMemory) > REPLAY_MEMORY:
                self.replayMemory.popleft()
            # Only train if done observing.
            if self.timestep > OBSERVE:
                self.trainStep()

        if self.timestep <= OBSERVE:
            state = "observe"
        elif self.timestep <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print("TIMESTEP", self.timestep, "/ STATE", state,
              "/ EPSILON", epsilon, "/ ACTION", action_index,
              "/ REWARD", r_t, "/ Q ", qvalue)

        s_t = s_t1
        self.timestep += 1
def playFlappyBird():
    """Let a human play Flappy Bird and save every episode to file.

    The action is ``[1, 0]`` unless the UP key is pressed during the
    frame, in which case it is ``[0, 1]``.  QUIT or ESCAPE shuts pygame
    down and exits the process.  Each frame's
    ``[observation, action, reward, terminal]`` record is collected; when
    a terminal frame is seen the collected episode is handed to
    ``save2file`` and a new episode buffer is started.
    """
    episodeMemory = []

    # A BrainDQN is created and initialised but takes no decisions below.
    brain = BrainDQN(2)
    flappyBird = game.GameState()

    idle = np.array([1, 0])  # do nothing
    first_frame, reward0, terminal = flappyBird.frame_step(idle)
    first_frame = cv2.resize(first_frame, (80, 80))
    first_frame = cv2.cvtColor(first_frame, cv2.COLOR_BGR2GRAY)
    ret, first_frame = cv2.threshold(first_frame, 1, 255, cv2.THRESH_BINARY)
    brain.setInitState(first_frame)

    while True:
        action = np.array([1, 0])
        for event in pygame.event.get():
            key_pressed = event.type == KEYDOWN
            if key_pressed and event.key == K_UP:
                action = np.array([0, 1])
            if event.type == QUIT or (key_pressed and event.key == K_ESCAPE):
                pygame.quit()
                sys.exit()

        frame, reward, terminal = flappyBird.frame_step(action)
        frame = preprocess(frame)

        # The record is stored in both cases; a terminal frame additionally
        # flushes the episode and starts a new one.
        episodeMemory.append([frame, action, reward, terminal])
        if terminal:
            save2file(episodeMemory)
            episodeMemory = []
def test_DQN(dqn, episode):
    """Evaluate the DQN over five games and report the average length.

    Training is switched off with ``dqn.close_train()``.  For each game
    the model's ``time_step`` is zeroed and its state reset; the model
    then plays with ``get_action_optim`` until a terminal frame, sliding
    each preprocessed frame into ``dqn.currt_state``.

    param dqn: dqn model
    param episode: current episode (only used in the printed message)
    return: accumulated ``time_step`` divided by the number of games
    """
    case_num = 5  # number of evaluation games
    dqn.close_train()

    total_steps = 0
    for _ in range(case_num):
        dqn.time_step = 0
        flappyBird = game.GameState()
        frame, _reward, terminal = flappyBird.frame_step([1, 0])
        frame = preprocess(frame)
        dqn.reset_state()

        # Play until the game ends.
        while True:
            chosen = dqn.get_action_optim()
            frame, _reward, terminal = flappyBird.frame_step(chosen)
            if terminal:
                break  # game over
            frame = preprocess(frame)
            # Drop the oldest frame, append the newest.
            older = dqn.currt_state[1:, :, :]
            dqn.currt_state = np.append(older, frame, axis=0)
            dqn.increase_step()

        total_steps += dqn.time_step

    ave_step = total_steps / case_num
    print("episode:{}, average game steps:{}".format(episode, ave_step))
    return ave_step
def play_game(model_file_name, cuda=False, best=True):
    """Play one game of Flappy Bird with a pretrained DQN model.

    model_file_name -- checkpoint file holding the DQN weights
    cuda -- when true the model is built with ``cuda=True`` and moved to
            the GPU via ``model.cuda()``
    best -- accepted for interface compatibility; not used by this function

    The model is created with ``epsilon=0.`` and ``mem_size=0``, put in
    eval mode, and plays with ``get_optim_action`` until the first
    terminal frame.  The total number of time steps is printed at the end.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('load pretrained model file: ' + model_file_name)
    model = BrainDQN(epsilon=0., mem_size=0, cuda=cuda)
    load_checkpoint(model_file_name, model)
    model.set_eval()

    bird_game = game.GameState()
    model.set_initial_state()
    if cuda:
        model = model.cuda()

    while True:
        action = model.get_optim_action()
        o, r, terminal = bird_game.frame_step(action)
        if terminal:
            break
        o = preprocess(o)
        # Slide the window: drop the oldest frame, append the new one with
        # a leading axis added.
        model.current_state = np.append(model.current_state[1:, :, :],
                                        o.reshape((1, ) + o.shape),
                                        axis=0)
        model.increase_time_step()

    print('total time step is {}'.format(model.time_step))
def play_flappybird():
    """Run a non-learning CNN ``TFBrain`` on Flappy Bird forever.

    The brain decides an action every ``frame_per_action`` frames (1, so
    every frame) and ``[1, 0]`` is used otherwise.  An ``Experience`` is
    built for each transition but is not stored anywhere by this
    function.
    """
    brain = TFBrain({
        'network_type': 'cnn',
        'learning': False,
        'num_actions': 2,
        'lookback_window': 3
    })
    brain.show_configs()

    bird_env = game.GameState()
    frame_per_action = 1

    # Initial state from the [1, 0] action.
    action = np.array([1, 0])
    frame, reward, chain_end = bird_env.frame_step(action)
    frame = preprocess(frame)
    # Keep only the first two axes of the preprocessed frame.
    frame = np.reshape(frame, frame.shape[:2])
    state = init_state(frame)

    frame_index = 0
    while True:
        decide_now = frame_index % frame_per_action == 0
        if decide_now:
            action = brain.decide(state, determistic=True)
        else:
            action = np.array([1, 0])  # do nothing
        frame_index += 1

        next_frame, reward, chain_end = bird_env.frame_step(action)
        next_state = proceed_state(state, preprocess(next_frame))
        experience = Experience(state, action, reward, next_state, chain_end)
        state = next_state
def __init__(self, name, sess, ac_parms, globalAC, game_name):
    """Create a worker with its own environment, AC network and memory.

    name -- worker name, also used as the network scope
    sess -- session object passed on to the network
    ac_parms -- dict read for 'n_actions', 'n_features', 'OPT_A', 'OPT_C'
    globalAC -- global actor-critic network handed to the local one
    game_name -- gym environment id given to ``gym.make``
    """
    super(Worker, self).__init__()

    self.name, self.sess = name, sess
    self.ac_parms, self.globalAC = ac_parms, globalAC

    # Unwrapped gym environment.
    environment = gym.make(game_name)
    self.env = environment.unwrapped

    params = self.ac_parms
    self.AC = ACNet4CartPole(n_actions=params['n_actions'],
                             n_features=params['n_features'],
                             sess=self.sess,
                             globalAC=globalAC,
                             scope=self.name,
                             OPT_A=params['OPT_A'],
                             OPT_C=params['OPT_C'])

    self.memory = A3CMemory()
    self.GameState = game.GameState()
def test():
    """Play one game with the saved Keras model, printing the max Q-value.

    The model is loaded from ``'model'``.  The state is a stack of four
    80x80 binarised frames along axis 0; each step the greedy action
    (argmax of the prediction) is played, the best Q-value is printed and
    the newest frame is pushed to the front of the stack.  The loop ends
    after the first terminal frame.
    """
    model = load_model('model')
    game_state = game.GameState()

    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=0)

    t = 0
    while True:
        # Add a leading batch axis for predict().
        predict_batch = s_t[np.newaxis, :]
        readout_t = model.predict(predict_batch)[0]

        a_t = np.zeros([ACTIONS])
        action_index = np.argmax(readout_t)
        q_value = np.max(readout_t)
        # Single-argument print(...) works on Python 2 and Python 3.
        print(q_value)
        a_t[action_index] = 1

        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)),
                            cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (1, 80, 80))
        # Newest frame first, keep the three most recent older frames.
        s_t = np.append(x_t1, s_t[:3, :, :], axis=0)

        t += 1
        if terminal:
            break
def playFlappyBird(mode):
    """Drive the DQN on Flappy Bird in ``"train"`` or ``"test"`` mode.

    In ``"train"`` mode every transition is passed to ``setPerception``.
    In ``"test"`` mode the frame is passed to ``interfere``, shown in an
    OpenCV window, and the loop ends when ESC (key code 27) is read.

    NOTE(review): the source had lost its indentation; the imshow/waitKey
    calls are assumed to belong to the test branch -- confirm.
    """
    env = game.GameState()
    Deep_Q_Network = DQN(ACTIONS)

    # NOTE(review): index 1 is set here although the variable was called
    # "do_nothing"; verify which index is the idle action for this game.
    first_action = np.zeros(ACTIONS)
    first_action[1] = 1
    first_frame, reward0, terminal = env.frame_step(first_action)
    first_frame = cv2.resize(first_frame, (80, 80))
    first_frame = cv2.cvtColor(first_frame, cv2.COLOR_BGR2GRAY)
    ret, first_frame = cv2.threshold(first_frame, 1, 255, cv2.THRESH_BINARY)
    Deep_Q_Network.setInitState(first_frame)

    while True:
        action = Deep_Q_Network.getAction()
        frame, reward, terminal = env.frame_step(action)
        frame = preprocess(frame)

        if mode == "train":
            # NOTE(review): this binds a function-local name that is never
            # read; it does not change any module-level INITIAL_EPSILON.
            INITIAL_EPSILON = 1.0
            Deep_Q_Network.setPerception(frame, action, reward, terminal)

        if mode == "test":
            Deep_Q_Network.interfere(frame)
            cv2.imshow("", frame)
            pressed = cv2.waitKey(1)
            if pressed == 27:
                break
def play(model_file_name, config):
    """Play one Flappy Bird game with a pretrained agent.

    model_file_name -- checkpoint loaded into ``agent.model``
    config -- configuration passed to ``Agent``

    The agent acts with ``optimal_action`` until the game reports a
    terminal frame; the number of steps taken is printed.  The reward is
    accumulated but not reported.
    """
    print('load pretrained model file: ' + model_file_name)
    agent = Agent(config)
    load_checkpoint(model_file_name, agent.model)
    bird_game = game.GameState()

    total_reward = 0.
    time_count = 0.

    def push(stack, frame):
        # Drop the oldest frame and append the new one with a leading axis.
        return np.append(stack[1:, :, :],
                         frame.reshape((1, ) + frame.shape),
                         axis=0)

    # 1. init S
    action = [1, 0]  # do nothing
    state = init_state()
    obs, reward, terminal = bird_game.frame_step(action)
    state = push(state, preprocess(obs))

    while not terminal:
        action = agent.optimal_action(state)
        next_obs, reward, terminal = bird_game.frame_step(action)
        state = push(state, preprocess(next_obs))
        total_reward += reward
        time_count += 1

    print('total time step is {}'.format(time_count))
def test_dqn(model, episode):
    """Play five games greedily and return the average number of steps.

    model -- object providing ``optimal_action(state)``
    episode -- current training episode (only used in the printed message)

    A step is counted for every non-terminal frame after the initial one.
    """
    num_cases = 5
    steps_sum = 0.

    for _case in range(num_cases):
        steps = 0
        flappyBird = game.GameState()

        raw, _r, terminal = flappyBird.frame_step([1, 0])
        first = preprocess(raw)
        state = init_state()
        state = np.append(state[1:, :, :],
                          first.reshape((1, ) + first.shape),
                          axis=0)

        while True:
            chosen = model.optimal_action(state)
            raw, _r, terminal = flappyBird.frame_step(chosen)
            if terminal:
                break
            frame = preprocess(raw)
            # Slide the window by one frame.
            newest = frame.reshape((1, ) + frame.shape)
            state = np.append(state[1:, :, :], newest, axis=0)
            steps += 1

        steps_sum += steps

    ave_time = steps_sum
    ave_time /= 5
    print('testing: episode: {}, average time: {}'.format(episode, ave_time))
    return ave_time
def play():
    """Train/play the DQN on Flappy Bird forever, printing game statistics.

    After every finished game a status line (time in game, game number,
    epsilon, score) is printed.  Whenever the game counter reaches a
    multiple of 10, the average score of the games finished since the
    previous report is printed and the window is reset.

    Fixes relative to the previous version:
    * the window average is divided by the number of games in the window,
      not by the ever-growing total game count;
    * the remembered score is reset after each game, so a game that never
      reports a non-zero score no longer re-counts the previous game's
      score (the old code reset ``score``, which is overwritten on the
      next frame anyway);
    * print calls use one pre-formatted string so the output is the same
      on Python 2 and Python 3.
    """
    network = DQN(2)  # init network with 2 actions

    # Init the game and the first state: the first frame repeated 4 times.
    flappy_bird = game.GameState()
    action = np.array([1, 0])
    state, reward, done, _ = flappy_bird.frame_step(action)
    state = preprocess(state)
    state = state.reshape((80, 80))
    network.current_state = np.stack((state, state, state, state), axis=2)

    last_score = 0      # last non-zero score seen in the current game
    window_total = 0.0  # sum of scores since the previous report
    window_games = 0    # games finished since the previous report
    num_game = 1

    while True:
        action = network.get_action()
        next_state, reward, done, score = flappy_bird.frame_step(action)

        if score != 0:
            last_score = score

        if done:
            print("TIME %s GAME NUM %s EPSILON %s SCORE %s"
                  % (network.time_in_game, num_game, network.epsilon, score))
            window_total += last_score
            window_games += 1
            num_game += 1
            last_score = 0
            if num_game % 10 == 0:
                print("\nlast %s game avg score %s \n"
                      % (window_games, window_total / window_games))
                window_total = 0.0
                window_games = 0

        next_state = preprocess(next_state)
        network.process(next_state, action, reward, done)
def run_network(s, readout, h_fc1, sess):
    """Play Flappy Bird greedily with an already built network, forever.

    s -- input placeholder fed with the current 80x80x4 frame stack
    readout -- tensor whose ``eval`` yields the Q-values
    h_fc1, sess -- accepted but not used inside this function

    The loop has no exit condition; the terminal flag returned by the
    game is ignored.
    """

    def binarize(colored):
        # Resize to 80x80, convert to grayscale, threshold to 0/255.
        gray = cv2.cvtColor(cv2.resize(colored, (80, 80)), cv2.COLOR_BGR2GRAY)
        _ret, binary = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
        return binary

    game_state = game.GameState()

    # First state: do nothing, then repeat the first frame four times.
    idle = np.zeros(ACTIONS)
    idle[0] = 1
    first_colored, r_0, terminal = game_state.frame_step(idle)
    first = binarize(first_colored)
    s_t = np.stack((first, first, first, first), axis=2)

    while True:
        # Greedy action from the network output.
        q_values = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        a_t[np.argmax(q_values)] = 1

        # Run the selected action and observe the next frame.
        colored, r_t, terminal = game_state.frame_step(a_t)
        newest = np.reshape(binarize(colored), (80, 80, 1))
        # Newest frame goes first on the last axis.
        s_t = np.append(newest, s_t[:, :, :3], axis=2)
def playFlappyBird():
    """Run and train a ``BrainDQN`` agent on Flappy Bird indefinitely.

    The agent is initialised with the first binarised 80x80 frame; then
    each frame's (observation, action, reward, terminal) is handed to
    ``setPerception``.  The loop has no exit condition.
    """
    brain = BrainDQN()
    flappyBird = game.GameState()

    # Initial step with the [1, 0] action.
    # NOTE(review): the original author was unsure whether this is really
    # "do nothing"; confirm against the game implementation.
    start_action = np.array([1, 0])
    frame0, reward0, terminal = flappyBird.frame_step(start_action)

    # Grayscale 80x80, then threshold to a binary image.
    frame0 = cv2.resize(frame0, (80, 80))
    frame0 = cv2.cvtColor(frame0, cv2.COLOR_BGR2GRAY)
    ret, frame0 = cv2.threshold(frame0, 1, 255, cv2.THRESH_BINARY)
    brain.setInitState(frame0)

    while True:
        chosen = brain.getAction()
        frame, reward, terminal = flappyBird.frame_step(chosen)
        frame = preprocess(frame)
        # Hand the transition to the agent.
        brain.setPerception(frame, chosen, reward, terminal)
def main():
    """Run ``MAX_EPISODE`` episodes of Flappy Bird with the PyTorch DQN.

    When ``TRAINING`` is true each transition is stored and, once the
    global step counter exceeds 200, ``brain.learn()`` is called every
    frame.  After each episode a summary line is printed; at the end the
    network is saved and the environment is closed.
    """
    begin_time = datetime.datetime.now()
    env = game.GameState()
    brain = rl_brain_pytorch.DeepQNetwork()
    step = 0

    for episode in range(rl_brain_pytorch.MAX_EPISODE):
        # Start the episode with the do-nothing action.
        first_frame, _, _ = env.frame_step([1, 0])
        observation = preprocess(first_frame, False)
        brain.reset(observation)
        score = 0.0

        while True:
            action = brain.choose_action(observation)
            raw_next, reward, done = env.frame_step(action)
            if reward == 1:
                score += 1
            observation_ = preprocess(raw_next, True)

            if TRAINING:
                brain.store_transition(observation, action, reward, done,
                                       observation_)
                # Learning can start once some memory has been collected.
                if step > 200:
                    brain.learn()

            if done:
                break
            observation = observation_
            step += 1

        elapsed = datetime.datetime.now() - begin_time
        print("episode {} over. exec time:{} step:{} score:{}".format(episode, elapsed, step, score))

    brain.saveNet()
    env.exit("game over")
def test_dqn(model, episode):
    """Test the behavior of the DQN during training.

    model -- dqn model; switched to non-trainable via
             ``set_trainable(False)`` and left that way on return
    episode -- current training episode (only used in the printed message)

    Five games are played with ``model.optimal_action()``; the model's
    ``time_step`` at the end of each game is averaged and returned.
    """
    model.set_trainable(False)
    accumulated = 0.

    for _case in range(5):
        model.time_step = 0
        flappyBird = game.GameState()
        frame, _r, terminal = flappyBird.frame_step([1, 0])
        frame = preprocess(frame)

        while True:
            chosen = model.optimal_action()
            frame, _r, terminal = flappyBird.frame_step(chosen)
            if terminal:
                break
            frame = preprocess(frame)
            # Slide the state window: drop the oldest, append the newest.
            newest = frame.reshape((1, ) + frame.shape)
            model.current_state = np.append(model.current_state[1:, :, :],
                                            newest,
                                            axis=0)
            model.increase_timestep()

        accumulated += model.time_step

    ave_time = accumulated
    ave_time /= 5
    print('testing: episode: {}, average time: {}'.format(episode, ave_time))
    return ave_time
def launch_game(self):
    """Create the game and store the first frame in ``self.initial_state``.

    The new ``GameState`` is kept in ``self.game_state``.  An int32
    action vector of length ``self.number_of_actions`` is built, printed
    while still all zeros, then index 0 is set and the game is stepped
    once.  The reward and done flag of that step are discarded.
    """
    self.game_state = game.GameState()
    # Single-argument print(...) works on both Python 2 and Python 3.
    print(self.game_state)

    actions = np.zeros(self.number_of_actions, dtype='int32')
    print(actions)
    actions[0] = 1

    self.initial_state, reward, done = self.game_state.frame_step(actions)
def trainAgentEpisodic(agent):
    """Train ``agent`` for ``NUM_OF_EPISODES`` episodes with periodic tests.

    The replay memory is first filled via ``fillMemory``.  After every
    20th training episode one test episode is run (``training=False``)
    and its score recorded.  At the end the mean and standard deviation
    of the test scores are printed and the scores are plotted.

    Fix: the x-axis is now derived from the number of recorded scores.
    Previously ``range(0, NUM_OF_EPISODES, 20)`` was used, which has one
    element more than ``episode_scores`` whenever ``NUM_OF_EPISODES`` is
    not a multiple of 20.  For multiples of 20 the plotted positions are
    unchanged.
    """
    test_interval = 20  # run one test episode every this many episodes

    # Open up a game state to communicate with the emulator.
    game_state = game.GameState()
    fillMemory(game_state, agent, OBSERVATION)

    episode_scores = []
    with trange(0, NUM_OF_EPISODES) as episodes:
        episodes.set_description('Training...')
        steps = 0
        for episode in episodes:
            _, stp = runEpisode(game_state, agent, training=True)
            steps += stp
            if (episode + 1) % test_interval == 0:
                episodes.set_description('Testing...')
                score, _ = runEpisode(game_state, agent, training=False)
                episode_scores.append(score)
                episodes.set_description(
                    'Reward {:.2f} | Epsilon: {:.6f} | Steps {!s} | Training...'
                    .format(np.mean(episode_scores), agent.get_epsilon(),
                            steps))

    print('\nMean: {:.3f} Std: {:.3}'.format(np.mean(episode_scores),
                                             np.std(episode_scores)))

    # One x position per recorded test score: 0, 20, 40, ...
    test_positions = range(0, test_interval * len(episode_scores),
                           test_interval)
    plt.plot(test_positions, episode_scores, 'ro')
    plt.ylabel('Score')
    plt.show()

    print("Episode finished!")
    print("************************")
def playGame():
    """Drive Flappy Bird from camera frames until ESC is pressed.

    Each iteration reads the keyboard (ESC releases the capture and video
    writer, closes the OpenCV windows and returns; F / N switch
    ``APP_CONFIG['save']`` off / on), processes one frame, decides
    whether to jump via ``isMouth``, and then steps the game once for
    every started 8 ms of processing time so the game keeps pace with the
    frame processing.
    """
    # Open up a game state to communicate with the emulator.
    game_state = game.GameState()

    while True:
        started = time.time()

        keys = pygame.key.get_pressed()
        if keys[K_ESCAPE]:
            cap.release()
            videoSaver.release()
            cv2.destroyAllWindows()
            return
        if keys[K_f]:
            APP_CONFIG['save'] = False
            print('off')
        if keys[K_n]:
            APP_CONFIG['save'] = True
            print('on')

        frame = processFrames()
        shouldJump = isMouth(excludeFace(frame))
        saveFrame(frame)

        # Elapsed processing time in whole milliseconds.
        elapsed_ms = int((time.time() - started) * 1000)
        print(elapsed_ms)

        # One game step for elapsed_ms, elapsed_ms - 8, ... while >= 0.
        for _ in range(elapsed_ms // 8 + 1):
            game_state.frame_step(getAction(shouldJump))

        drawFrame(frame)
def TrainFlappyBirdResume():
    """Resume DQN training on Flappy Bird from a previously saved model.

    A ``DQN_AI`` in ``'train'`` mode is created, its saved model is
    loaded and its ``epsilon`` is set to 0 (adjust here if a different
    value is wanted after resuming).  Training then proceeds frame by
    frame with no exit condition; stop it with Ctrl+C.
    """
    flappybird = game.GameState()

    AI_player = DQN_AI(num_actions=2, mode='train')
    AI_player.Load_Model()
    AI_player.epsilon = 0

    # Initialise the AI state with the first observation (idle action).
    idle = np.array([1, 0])
    first_frame, reward, terminal = flappybird.frame_step(idle)
    AI_player.Current_State_Initialze(observation=Preprocess(first_frame))

    while True:
        print('time step: %d' % AI_player.time_step)
        chosen = AI_player.AI_Action()
        raw_frame, reward, terminal = flappybird.frame_step(chosen)
        frame = Preprocess(raw_frame)
        AI_player.Q_CNN_Train(action=chosen,
                              reward=reward,
                              observation=frame,
                              terminal=terminal)
def playFlappyBird():
    """Run and train ``BrainDQN`` on Flappy Bird until ESC is pressed.

    The shape of the first binarised frame and every chosen action are
    printed.  An OpenCV window named ``'Video'`` is refreshed each frame
    and the loop ends when ``cv2.waitKey`` reports ESC (27).
    """
    # Step 1: init BrainDQN
    actions = 2
    brain = BrainDQN(actions)

    # Step 2: init Flappy Bird Game
    flappyBird = game.GameState()

    # Step 3.1: obtain init state
    action0 = np.array([1, 0])  # do nothing
    observation0, reward0, terminal = flappyBird.frame_step(action0)
    observation0 = cv2.cvtColor(cv2.resize(observation0, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
    ret, observation0 = cv2.threshold(observation0, 1, 255,
                                      cv2.THRESH_BINARY)
    # Single-argument print(...) works on both Python 2 and Python 3.
    print(observation0.shape)
    brain.setInitState(observation0)

    # Step 3.2: run the game
    while True:
        action = brain.getAction()
        print(action)
        nextObservation, reward, terminal = flappyBird.frame_step(action)
        nextObservation = preprocess(nextObservation)
        brain.setPerception(nextObservation, action, reward, terminal)

        # NOTE(review): a scalar 0 is passed as the image; presumably the
        # window exists only so waitKey can receive key presses -- confirm
        # whether the processed frame should be shown instead.
        cv2.imshow('Video', 0)
        if cv2.waitKey(1) & 0xFF == 27:
            break
def playFlappyBird():
    """Run and train ``BrainDQN`` on Flappy Bird, showing processed frames.

    Each preprocessed frame is converted by ``showThreshImg`` and shown
    in an OpenCV window named ``"process"`` before the transition is
    handed to the agent.  The loop has no exit condition.
    """
    brain = BrainDQN(2)
    flappyBird = game.GameState()

    # Initial state from the do-nothing action.
    idle = np.array([1, 0])
    frame0, reward0, terminal = flappyBird.frame_step(idle)
    frame0 = cv2.resize(frame0, (80, 80))
    frame0 = cv2.cvtColor(frame0, cv2.COLOR_BGR2GRAY)
    ret, frame0 = cv2.threshold(frame0, 1, 255, cv2.THRESH_BINARY)
    brain.setInitState(frame0)

    while True:
        # The agent answers the current state with an action a(t).
        chosen = brain.getAction()
        # The game executes it and returns the reward r(t) and the
        # resulting observation o(t+1).
        frame, reward, terminal = flappyBird.frame_step(chosen)
        frame = preprocess(frame)

        # Original note: the image obtained from the game is rotated and
        # mirrored; showThreshImg prepares it for display.
        preview = showThreshImg(frame)
        cv2.imshow("process", preview)
        cv2.waitKey(1)

        brain.setPerception(frame, chosen, reward, terminal)
def func_Train_Network(var_Build_Models_, parm_Args_):
    """Create the game state and the replay memory for training.

    var_Build_Models_, parm_Args_ -- accepted but not read in the visible
    body.  Both created objects are local and nothing is returned.
    """
    # Open the game on the emulator.
    var_State_Game_ = imp_Game_.GameState()

    # Replay memory that will hold the old observations.
    var_D_ = imp_DQ()
def playFlappyBird():
    """Run and train ``BrainDQN`` on Flappy Bird forever.

    The game's ``frame_step`` returns four values here; the fourth
    (``screenCap``) is not used.

    NOTE(review): the inner loop has no ``break``, so the outer loop body
    (re-initialising the agent state) only ever runs once -- confirm
    whether a break on ``terminal`` was intended.
    """
    brain = BrainDQN(2)
    flappyBird = game.GameState()

    while True:
        # (Re)initialise the agent from a do-nothing step.
        idle = np.array([1, 0])
        frame0, reward, terminal, screenCap = flappyBird.frame_step(idle)
        frame0 = cv2.resize(frame0, (80, 80))
        frame0 = cv2.cvtColor(frame0, cv2.COLOR_BGR2GRAY)
        ret, frame0 = cv2.threshold(frame0, 1, 255, cv2.THRESH_BINARY)
        brain.setInitState(frame0)

        # Run the game.
        while True:
            chosen = brain.getAction()
            step_result = flappyBird.frame_step(chosen)
            frame, reward, terminal, screenCap = step_result
            frame = preprocess(frame)
            brain.setPerception(frame, chosen, reward, terminal)
def playFlappyBird(pretrained):
    """Run and train ``BrainDQN`` on Flappy Bird, optionally from a file.

    pretrained -- parameter file passed as ``param_file``; when it equals
                  the empty string the brain is created without one

    The ``saved_networks`` directory is created if it does not exist.
    The play loop has no exit condition.
    """
    checkpoint_dir = 'saved_networks'
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    num_actions = 2
    if pretrained == "":
        brain = BrainDQN(num_actions)
    else:
        brain = BrainDQN(num_actions, param_file=pretrained)

    flappyBird = game.GameState()

    # Initial state from the do-nothing action.
    idle = np.array([1, 0])
    frame0, reward0, terminal = flappyBird.frame_step(idle)
    frame0 = cv2.resize(frame0, (80, 80))
    frame0 = cv2.cvtColor(frame0, cv2.COLOR_BGR2GRAY)
    ret, frame0 = cv2.threshold(frame0, 1, 255, cv2.THRESH_BINARY)
    brain.setInitState(frame0)

    while True:
        chosen = brain.getAction()
        frame, reward, terminal = flappyBird.frame_step(chosen)
        frame = preprocess(frame)
        brain.setPerception(frame, chosen, reward, terminal)
def test(self, num_actions):
    """Evaluate the restored model for ``FLAGS.num_eval_episodes`` episodes.

    num_actions -- length of the one-hot action vector sent to the game

    Weights are restored from ``FLAGS.checkpoint_path``.  The state is an
    (80, 80, 4) stack of grayscale frames; the batch axis is added when
    feeding (``[state]``).  Each episode plays greedily until the game
    reports done and prints the accumulated reward.

    Fixes: the new frame is reshaped to (rows, cols, 1) so it has the same
    number of dimensions as the state it is appended to (the former 4-D
    reshape made ``np.append(..., axis=2)`` raise ValueError);
    ``xrange`` is replaced by ``range``; a no-op reshape of the initial
    state is removed.
    """
    self.saver.restore(self.session, FLAGS.checkpoint_path)
    print("Restored model weights from ", FLAGS.checkpoint_path)

    game_state = game.GameState()
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t1_colored, r_0, terminal = game_state.frame_step(do_nothing)
    x_t1 = skimage.color.rgb2gray(x_t1_colored)
    x_t1 = skimage.transform.resize(x_t1, (80, 80))
    # First frame repeated four times along the last axis.
    state = np.stack((x_t1, x_t1, x_t1, x_t1), axis=2)

    for i_episode in range(FLAGS.num_eval_episodes):
        episode_reward = 0
        done = False
        while not done:
            q_values = self.q_values.eval(session=self.session,
                                          feed_dict={self.state: [state]})
            action_index = np.argmax(q_values)
            action = np.zeros([num_actions])
            action[action_index] = 1

            x_t1_colored, reward, done = game_state.frame_step(action)
            x_t1 = skimage.color.rgb2gray(x_t1_colored)
            x_t1 = skimage.transform.resize(x_t1, (80, 80))
            # Give the frame a trailing channel axis, same rank as state.
            x_t1 = x_t1.reshape(x_t1.shape[0], x_t1.shape[1], 1)
            # Newest frame first, keep the three most recent older frames.
            state = np.append(x_t1, state[:, :, :3], axis=2)
            episode_reward += reward

        print("Finished episode " + str(i_episode + 1) + " with score " +
              str(episode_reward))
def train(self, num_actions):
    """Initialise the networks and run the actor-learner threads to the end.

    num_actions -- forwarded to every ``actor_learner_thread``

    All variables are initialised once, then ``self.update_target`` is
    run.  One game instance and one thread are created per
    ``FLAGS.num_concurrent``; the call blocks until all threads finish.

    Fix: the former second initialisation
    (``tf.initialize_all_variables()``, deprecated) ran *after*
    ``update_target`` and re-initialised every variable, discarding
    whatever ``update_target`` had just written -- presumably the copy of
    the online weights into the target network.  It has been removed.
    """
    # Initialise all variables, then the target network weights.
    self.session.run(tf.global_variables_initializer())
    self.session.run(self.update_target)

    # Initialise learning rate and the lock shared by the threads.
    self.lr = FLAGS.learning_rate
    self.threadLock = threading.Lock()

    game_states = [game.GameState() for _ in range(FLAGS.num_concurrent)]

    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)

    # Start num_concurrent actor-learner training threads.
    actor_learner_threads = [
        threading.Thread(target=self.actor_learner_thread,
                         args=(game_states[thread_id], thread_id,
                               num_actions))
        for thread_id in range(FLAGS.num_concurrent)
    ]
    for t in actor_learner_threads:
        t.start()
    for t in actor_learner_threads:
        t.join()
def trainModel(predict_model, actual_model, mode):
    """Play Flappy Bird with a double-DQN pair and, in 'Train' mode, train.

    predict_model -- network that selects actions and is trained
    actual_model -- network used to value the selected next action; it
                    receives predict_model's weights every 1000 steps
    mode -- ``'Train'`` enables exploration, training, weight syncing and
            logging; any other value only plays greedily

    The loop has no exit condition.  In 'Train' mode a log file named
    after the current UTC time is written, one record per line.

    Fixes: missing colon after ``if mode == 'Train'``; Python 2 ``file()``
    and ``print`` statements; the log file is now closed when the loop is
    interrupted and each record ends with a newline; the training batch
    is shaped from ``rows``/``cols``/``stack`` instead of a hard-coded
    84x84x4, matching the shape the states are reshaped to.

    NOTE(review): the source had lost its indentation; the weight sync is
    assumed to sit at loop level (not inside the ``t > 3000`` block).
    """
    training = mode == 'Train'

    fb = game.GameState()
    replay_mem = []

    i_0, r_0, isDead = fb.frame_step(0)
    i_0 = processImage(i_0)
    state_0 = np.stack((i_0, i_0, i_0, i_0), axis=2)
    state_t = state_0.reshape(1, rows, cols, stack)

    t = 0
    log = None
    if training:
        log = open(strftime("%Y-%m-%d-%H:%M:%S", gmtime()), 'w')

    try:
        while True:
            loss = 0
            q_max = 0

            # Epsilon-greedy action selection (exploration only in training).
            if random.random() < ep and training:
                print('taking random action')
                flap = random.randint(0, 1)
            else:
                q = predict_model.predict(state_t)
                flap = np.argmax(q[0])
                q_max = max(q[0])

            i_t, r_t, isDead = fb.frame_step(flap)
            i_t = processImage(i_t)
            i_t = i_t.reshape(1, rows, cols, 1)
            # Newest frame first, keep the three most recent older frames.
            state_t1 = np.append(i_t, state_t[:, :, :, :3], axis=3)

            replay_mem.append((state_t, state_t1, flap, r_t, isDead))
            if t > 50000:
                # Drop the oldest transition.
                del replay_mem[0]

            if t > 3000 and training:
                batch = random.sample(replay_mem, batch_size)
                X = np.zeros((batch_size, rows, cols, stack))
                Y = np.zeros((batch_size, 2))
                for i, sample in enumerate(batch):
                    s_now, s_next, action, reward, dead = sample
                    X[i:i + 1] = s_now
                    Y[i] = predict_model.predict(s_now)[0]
                    if dead == True:
                        Y[i, action] = reward
                    else:
                        # Double DQN: predict_model picks the action,
                        # actual_model supplies its value.
                        max_a = np.argmax(predict_model.predict(s_next)[0])
                        q_next = actual_model.predict(s_next)[0][max_a]
                        Y[i, action] = reward + discount_factor * q_next
                loss = predict_model.train_on_batch(X, Y)

            if t % 1000 == 0 and training:
                print('sync two models...')
                actual_model.set_weights(predict_model.get_weights())
                predict_model.save_weights("model-ddqn.h5", overwrite=True)

            record = ('T = ' + str(t) + ' Loss = ' + str(loss) +
                      ' q-max = ' + str(q_max) + ' reward = ' + str(r_t) +
                      ' action = ' + str(flap))
            print(record)
            if training:
                log.write(record + '\n')

            t += 1
            state_t = state_t1
    finally:
        # Reached on KeyboardInterrupt or any error, since the loop itself
        # never ends.
        if log is not None:
            log.close()