import json
import random
from collections import deque

import numpy as np
import skimage.color
import skimage.exposure
import skimage.transform
from keras.optimizers import Adam


def train_network(model, args):
    # open up a game state to communicate with the emulator
    game_state = GameState()

    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)

    x_t = skimage.color.rgb2gray(x_t)
    x_t = skimage.transform.resize(x_t, (80, 80))
    x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))
    x_t = x_t / 255.0

    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # In Keras, the network expects a leading batch dimension: 1x80x80x4
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])

    if args['mode'] == 'Run':
        OBSERVE = 999999999  # keep observing, never train
        epsilon = FINAL_EPSILON
        print("Now we load weights")
        model.load_weights("model.h5")
        adam = Adam(lr=LEARNING_RATE)
        model.compile(loss='mse', optimizer=adam)
        print("Weights loaded successfully")
    else:  # training mode
        OBSERVE = OBSERVATION
        epsilon = INITIAL_EPSILON

    t = 0
    while True:
        loss = 0
        Q_sa = 0
        action_index = 0
        r_t = 0
        a_t = np.zeros([ACTIONS])

        # choose an action epsilon-greedily
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                # input a stack of 4 images, get the Q-value prediction
                q = model.predict(s_t)
                max_Q = np.argmax(q)
                action_index = max_Q
                a_t[max_Q] = 1

        # reduce epsilon gradually once training has started
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe the next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)

        x_t1 = skimage.color.rgb2gray(x_t1_colored)
        x_t1 = skimage.transform.resize(x_t1, (80, 80))
        x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))
        x_t1 = x_t1 / 255.0

        x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  # 1x80x80x1
        # drop the oldest frame from the stack and append the newest one
        s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

        # store the transition in D
        D.append((s_t, action_index, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # now we do the experience replay
            state_t, action_t, reward_t, state_t1, terminal_batch = zip(*minibatch)
            state_t = np.concatenate(state_t)
            state_t1 = np.concatenate(state_t1)
            targets = model.predict(state_t)
            Q_sa = model.predict(state_t1)
            # Bellman update; np.invert zeroes the future reward on
            # terminal transitions
            targets[range(BATCH), action_t] = reward_t + GAMMA * np.max(
                Q_sa, axis=1) * np.invert(np.array(terminal_batch))

            loss += model.train_on_batch(state_t, targets)

        s_t = s_t1
        t = t + 1

        # save progress every 1000 iterations
        if t % 1000 == 0:
            print("Now we save the model")
            model.save_weights("model.h5", overwrite=True)
            with open("model.json", "w") as outfile:
                json.dump(model.to_json(), outfile)

        # print info
        if t <= OBSERVE:
            state = "observe"
        elif t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state,
              "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX", np.max(Q_sa), "/ Loss", loss)

    # unreachable while the loop above runs forever
    print("Episode finished!")
    print("************************")
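# The function above relies on module-level hyperparameters defined elsewhere
# in the source. A plausible configuration is sketched below, with values
# modeled on common DQN-for-Flappy-Bird setups -- these are illustrative
# assumptions, not the author's exact settings:

ACTIONS = 2             # number of valid actions (e.g. flap / do nothing)
GAMMA = 0.99            # discount factor for future rewards
OBSERVATION = 3200      # timesteps to observe before training starts
EXPLORE = 3000000       # frames over which to anneal epsilon
FINAL_EPSILON = 0.0001  # final value of epsilon
INITIAL_EPSILON = 0.1   # starting value of epsilon
REPLAY_MEMORY = 50000   # number of previous transitions to remember
BATCH = 32              # minibatch size for experience replay
FRAME_PER_ACTION = 1    # act on every frame
LEARNING_RATE = 1e-4    # Adam learning rate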
import torch

# initialize epsilon, its decay schedule, and the replay buffer
epsilon = agent.initial_epsilon
decay = 0.999996
replay_memory = []

# open the game and grab the initial state as a float tensor
game_state = GameState()
state = torch.tensor(game_state.initial_state()[0], dtype=torch.float32)

losses = []
loss_counter = 0
epoch_loss = 0
max_score, old_score = 0, 0

for epoch in range(agent.number_of_iterations):
    check_exit()
    clock.tick(10)

    # forward pass through the network, then pick an action from its output
    output = agent(state)
    action = select_action(output)
    new_state, new_reward, is_state_terminal, score = game_state.frame_step(action)

    # track the best score seen so far
    if score > max_score:
        old_score = max_score
        max_score = score

    game_state.print_console_data(
        f"current score: {score}, max score: {max_score}")
    state = torch.tensor(new_state, dtype=torch.float32)
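# select_action is not shown in this fragment. Below is a minimal
# epsilon-greedy sketch of what it might look like, assuming `output` is a
# 1-D tensor of per-action Q-values and that it returns an action index;
# the function body, its parameters, and their defaults are assumptions,
# not the author's implementation. Whether frame_step expects an index or a
# one-hot vector depends on the GameState implementation, which is not shown.

import random


def select_action(output, epsilon=0.1):
    # with probability epsilon, explore with a uniformly random action;
    # otherwise exploit the action with the highest predicted Q-value
    if random.random() <= epsilon:
        return random.randrange(output.shape[0])
    return int(torch.argmax(output).item())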