def populate_memory():
    memory = Memory(max_size=memory_size)
    state = World.player

    # Make a bunch of random actions and store the experiences
    for ii in range(pretrain_length):
        if ii % 1000 == 0:
            print("Pre-training...")

        # Make a random action
        action = random.choice(actions)
        state, action, reward, next_state = move(action)

        if World.has_restarted():
            # The simulation ends so no next state
            next_state = np.zeros(state.shape)

            # Add experience to memory
            memory.add((state, action, reward, next_state))
            World.restart_game(board_rs=False)
        else:
            # Add experience to memory
            memory.add((state, action, reward, next_state))
            state = next_state

    return memory, state

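# populate_memory() assumes a `Memory` replay buffer exposing add() and sample().
# That class is not shown in this section; the sketch below is only one minimal
# way to implement it (deque-backed, uniform sampling) and is an assumption, not
# necessarily the version used elsewhere in this project.
from collections import deque

import numpy as np


class Memory:
    def __init__(self, max_size=1000):
        # Oldest experiences are discarded once the buffer is full
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        # experience is a (state, action, reward, next_state) tuple
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Uniformly sample a mini-batch of stored experiences
        idx = np.random.choice(len(self.buffer), size=batch_size, replace=False)
        return [self.buffer[i] for i in idx]
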
def run(saver):
    time.sleep(0.1)
    test_episodes = 10
    test_max_steps = 600

    with tf.Session(graph=graph1) as sess:
        saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))

        for ep in range(1, test_episodes):
            t = 0
            state = World.player
            while t < test_max_steps:
                time.sleep(0.1)

                # Get action from Q-network
                feed = {mainQN.inputs_: state.reshape((1, *state.shape))}
                Qs = sess.run(mainQN.output, feed_dict=feed)
                # Map the argmax index back to an entry of `actions`,
                # which is what move() compares against
                action = actions[np.argmax(Qs)]
                print(action)

                # Take action, get new state and reward
                state, action, reward, next_state = move(action)

                if World.has_restarted():
                    t = test_max_steps
                    print("Game restart, score: ", reward)
                    World.restart_game()
                    time.sleep(0.01)
                else:
                    state = next_state
                    t += 1

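# Both run() and train() assume a Q-network object (mainQN / TargetNetwork) with
# inputs_, output, actions_, targetQs_, loss and opt attributes. Its architecture
# is not part of this section; the sketch below is just a plausible fully-connected
# TF1-style version, and the layer sizes, names and learning rate are assumptions.
import tensorflow as tf


class QNetwork:
    def __init__(self, name, state_size, action_size, hidden_size=64, learning_rate=0.0001):
        with tf.variable_scope(name):
            # Current state, chosen action indices and TD targets
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            self.actions_ = tf.placeholder(tf.int32, [None], name='actions')
            self.targetQs_ = tf.placeholder(tf.float32, [None], name='target')

            # Two hidden layers, then one Q-value per action
            h1 = tf.layers.dense(self.inputs_, hidden_size, activation=tf.nn.relu)
            h2 = tf.layers.dense(h1, hidden_size, activation=tf.nn.relu)
            self.output = tf.layers.dense(h2, action_size, activation=None)

            # Q-value of the action that was actually taken
            one_hot = tf.one_hot(self.actions_, action_size)
            self.Q = tf.reduce_sum(self.output * one_hot, axis=1)

            # Mean squared TD error
            self.loss = tf.reduce_mean(tf.square(self.targetQs_ - self.Q))
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
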
def main():
    memory, state = populate_memory()
    rewards_list, saver = train(memory)
    #plot(rewards_list)

    t = threading.Thread(target=run, args=(saver,))
    #t = threading.Thread(target=train, args=(memory))
    t.daemon = True
    t.start()

    World.start_game()
    plot(rewards_list)

def run():
    global discount, epsilon
    time.sleep(0.1)
    alpha = .08  # learning rate
    t = 1  # time
    ep = 0
    p = 0.001
    total_r = 0
    while True:
        #board = World.board_image
        #board.show()
        s = World.player
        s = (s[0], s[1])

        # choose an action:
        max_act, _ = max_Q(s)
        #max_act = policy(max_act)

        # perform action and get new state and received reward
        (s, a, r, s_2) = move(max_act)
        _, maxQ = max_Q(s_2)
        update_Q(s, a, alpha, r, discount, s_2, maxQ)

        t += 1
        total_r += r
        if World.has_restarted():
            print("World restart, score: ", total_r)
            rewards_list.append((ep, total_r))
            total_r = 0
            print(World.player)
            World.restart_game()
            #board = World.board_image
            time.sleep(0.1)
            t = 1

            epsilon *= 0.995
            epsilon = max(0.05, epsilon)
            ep += 1
            if epsilon < 0.3:
                p = 0.1

        # update learning rate
        #alpha = pow(t, -0.1)
        time.sleep(0.01)

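# run() relies on a helper max_Q(s) returning (best_action, best_value) for state s,
# which is not defined in this section. A minimal greedy sketch over the global Q
# dict is shown below; exploration (the epsilon decayed in run()) is presumably
# handled elsewhere, e.g. by the commented-out policy() call, so it is left out here.
def max_Q(s):
    # Pick the action with the highest stored Q-value for this state
    best_action = max(Q[s], key=Q[s].get)
    return best_action, Q[s][best_action]
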
def move(act):
    s_1 = World.player
    # Reward is the change in World.score caused by this move
    reward = -World.score

    # Up, down, left, right
    if act == actions[0]:
        World.try_move(0, -1)
    if act == actions[1]:
        World.try_move(0, 1)
    if act == actions[2]:
        World.try_move(-1, 0)
    if act == actions[3]:
        World.try_move(1, 0)

    s_2 = World.player
    reward += World.score
    return s_1, act, reward, s_2

def train(memory):
    time.sleep(0.1)

    # Now train with experiences
    saver = tf.train.Saver()
    rewards_list = []
    with tf.Session(graph=graph1) as sess:
        # Initialize variables
        sess.run(tf.global_variables_initializer())

        tau = 0
        update_target = update_target_graph()
        sess.run(update_target)

        step = 0
        loss = 0  # so the episode-end print works even before the first update
        for ep in range(1, train_episodes):
            total_reward = 0
            t = 0
            state = World.player
            while t < max_steps:
                step += 1
                tau += 1

                # Explore or Exploit
                explore_p = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * step)
                if explore_p > np.random.rand():
                    # Make a random action
                    action = random.choice(actions)
                else:
                    # Get action from Q-network
                    feed = {mainQN.inputs_: state.reshape((1, *state.shape))}
                    Qs = sess.run(mainQN.output, feed_dict=feed)
                    # Map the argmax index back to an entry of `actions`
                    action = actions[np.argmax(Qs)]

                # Take action, get new state and reward
                state, action, reward, next_state = move(action)
                total_reward += reward

                if World.has_restarted():
                    # the episode ends so no next state
                    next_state = np.zeros(state.shape)
                    t = max_steps

                    print('Episode: {}'.format(ep),
                          'Total reward: {:.4f}'.format(total_reward),
                          'Training loss: {:.4f}'.format(loss),
                          'Explore P: {:.4f}'.format(explore_p))
                    rewards_list.append((ep, total_reward))

                    # Add experience to memory
                    memory.add((state, action, reward, next_state))

                    # Start new episode
                    World.restart_game(board_rs=False)
                else:
                    # Add experience to memory
                    memory.add((state, action, reward, next_state))
                    state = next_state
                    t += 1

                # Sample mini-batch from memory
                batch = memory.sample(batch_size)
                states = np.array([each[0] for each in batch])
                # Feed action indices (not the action objects themselves) to the network
                acts = np.array([actions.index(each[1]) for each in batch])
                rewards = np.array([each[2] for each in batch])
                next_states = np.array([each[3] for each in batch])

                # DOUBLE DQN logic:
                # Use the main DQN to select the action to take at next_state (a': the action with the highest Q-value)
                # Use the target network to calculate the Q-value of Q(s', a')

                # Calculate target Q-values for all actions of next_state
                Qs_target_next_state = sess.run(TargetNetwork.output,
                                                feed_dict={TargetNetwork.inputs_: next_states})

                # Get Q values for next_state from the main network
                Qs_next_state = sess.run(mainQN.output, feed_dict={mainQN.inputs_: next_states})

                target_Qs_batch = []
                # Set target_Qs to r for states where the episode ends
                episode_ends = (next_states == np.zeros(states[0].shape)).all(axis=1)

                for i in range(0, len(batch)):
                    terminal = episode_ends[i]
                    # Select the action with the online network, evaluate it with the target network
                    action = np.argmax(Qs_next_state[i])
                    if terminal:
                        target_Qs_batch.append(rewards[i])
                    else:
                        target = rewards[i] + gamma * Qs_target_next_state[i][action]
                        target_Qs_batch.append(target)

                #targets = rewards + gamma * np.max(target_Qs, axis=1)
                targets_mb = np.array([each for each in target_Qs_batch])

                loss, _ = sess.run([mainQN.loss, mainQN.opt],
                                   feed_dict={mainQN.inputs_: states,
                                              mainQN.targetQs_: targets_mb,
                                              mainQN.actions_: acts})

                if tau > max_tau:
                    print("Model updated")
                    tau = 0
                    update_target = update_target_graph()
                    sess.run(update_target)

        save_path = saver.save(sess, "checkpoints/model_dqn.ckpt")
        print("Model saved in path: %s" % save_path)

    return rewards_list, saver

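# train() calls update_target_graph() to copy the online network's weights into the
# target network every max_tau steps, but the helper is not defined in this section.
# The sketch below is one common TF1 implementation; the scope names "main" and
# "target" are assumptions and would have to match the names the two networks were
# actually built with.
import tensorflow as tf


def update_target_graph(from_scope="main", to_scope="target"):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

    # One assign op per variable; running the returned list performs a hard copy
    return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]
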
def update_Q(s, a, alpha, reward, gamma, s_2, maxQ):
    # Standard Q-learning update:
    # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (reward + gamma * max_a' Q(s_2, a'))
    Q[s][a] = (1 - alpha) * Q[s][a] + alpha * (reward + gamma * maxQ)
    World.set_cell_score(s, a, Q[s][a])

discount = .8
actions = World.actions  # ["up", "down", "left", "right"]
states = []  # states = (x, y) location
Q = {}

# Set state-coordinates
for i in range(World.x):
    for j in range(World.y):
        states.append((i, j))

# Init Q matrix and set colors for cells
for state in states:
    temp = {}
    for action in actions:
        temp[action] = 0.1
        World.set_cell_score(state, action, temp[action])
    Q[state] = temp

for (i, j, c, w) in World.specials:
    for action in actions:
        Q[(i, j)][action] = w
        World.set_cell_score((i, j), action, w)


def move(action):
    s_1 = World.player
    s_1 = (s_1[0], s_1[1])
    reward = -World.score

    # Up, down, left, right
    if action == actions[0]:
        World.try_move(0, -1)