# Shared imports for the training routines below. The `game` emulator
# module and constants such as ACTIONS, K, GAMMA, INITIAL_EPSILON,
# FINAL_EPSILON, OBSERVE, EXPLORE, REPLAY_MEMORY, BATCH, Lr and GAME are
# defined elsewhere in this project.
import random
from collections import deque

import cv2
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


def trainNetwork(myAgent, sess):
    """Train the policy-gradient agent to play the Pong game.

    Args:
        myAgent: the policy network, exposing readout, gradients,
            gradient_holders, update_batch and its input placeholders.
        sess: the TensorFlow session.
    """
    # Open up a game state to communicate with the emulator.
    game_state = game.GameState()

    # Initialize the state of the game with a "do nothing" action.
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)

    # Preprocess the first frame (resize to 80x80, grayscale, binarize)
    # and stack it 4 times to form the initial state.
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # Initialize the episode history.
    ep_history = []

    # Initialize a saver and all variables.
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    # Restore the latest checkpoint if one exists.
    checkpoint = tf.train.get_checkpoint_state("saved_networks_policy_gradient")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # Initialize the gradient buffer that accumulates gradients per episode.
    gradBuffer = sess.run(tf.trainable_variables())
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0

    # Initialize the epsilon value for the exploration phase.
    epsilon = INITIAL_EPSILON

    # Initialize the iteration counter and the per-episode score history.
    t = 0
    score = []

    while True:
        # Choose an action epsilon-greedily.
        readout_t = myAgent.readout.eval(
            feed_dict={myAgent.state_in: [s_t]})[0]
        action_index = get_action_index(readout_t, epsilon, t)
        a_t = np.zeros([ACTIONS])
        a_t[action_index] = 1

        # Scale down epsilon during the exploitation phase.
        epsilon = scale_down_epsilon(epsilon, t)

        # Repeat the selected action for K frames.
        for i in range(0, K):
            # Run the selected action and observe the next state and reward.
            s_t1, r_t, terminal = run_selected_action(a_t, s_t, game_state)

            # Store the transition in the episode history.
            ep_history.append([s_t, a_t, r_t, s_t1])

            if terminal:
                score.append(game_state.bar1_pre - game_state.bar2_pre)
                break

        # If the episode is over, update the policy from the whole episode.
        if terminal:
            s_j = [d[0] for d in ep_history]
            a_j = [d[1] for d in ep_history]
            r_j = [d[2] for d in ep_history]
            s_j1 = [d[3] for d in ep_history]

            # Compute the discounted rewards.
            r_j = discount_rewards(r_j)
            s_j = np.reshape(np.vstack(s_j), [-1, 80, 80, 4])
            feed_dict = {
                myAgent.reward_holder: r_j,
                myAgent.action_holder: a_j,
                myAgent.state_in: s_j
            }

            # Accumulate the policy gradients for this episode.
            grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
            for idx, grad in enumerate(grads):
                gradBuffer[idx] += grad

            # Apply the accumulated gradients.
            feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
            _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)

            # Clear the gradient buffer and the episode history.
            for ix, grad in enumerate(gradBuffer):
                gradBuffer[ix] = grad * 0
            ep_history = []

        # Update the state.
        s_t = s_t1

        # Update the number of iterations.
        t += 1

        # Save a checkpoint every 10000 iterations.
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks_policy_gradient/' + GAME + '-dqn',
                       global_step=t)

        # Plot the score history every 2000000 iterations.
        if t % 2000000 == 0:
            plt.plot(np.arange(len(score)), score)
            plt.ylabel('Total reward')
            plt.xlabel('episode')
            plt.savefig("policy_gradient.png")
            plt.show()

        # Print info.
        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", action_index,
              "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t))
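# The policy-gradient loop above calls a discount_rewards helper that is
# not shown in this section. The following is a minimal sketch, assuming a
# discount factor constant GAMMA defined elsewhere in the project; the
# repo's actual implementation may differ.
def discount_rewards(r):
    """Return the discounted cumulative rewards for one episode."""
    r = np.asarray(r, dtype=np.float32)
    discounted_r = np.zeros_like(r)
    running_add = 0.0
    # Walk backwards through the episode, accumulating the discounted
    # return G_t = r_t + GAMMA * G_{t+1}.
    for i in reversed(range(r.size)):
        running_add = running_add * GAMMA + r[i]
        discounted_r[i] = running_add
    return discounted_r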
def trainNetwork(s, readout, sess):
    """Train the artificial agent using Q-learning to play the Pong game.

    Args:
        s: the current state, formed by 4 frames of the playground.
        readout: the Q value for each possible action in the current state.
        sess: the TensorFlow session.
    """
    # Placeholder for the action.
    a = tf.placeholder("float", [None, ACTIONS])
    # Placeholder for the target Q value.
    y = tf.placeholder("float", [None])

    # Compute the loss.
    cost = compute_cost(y, a, readout)

    # Training operation.
    train_step = tf.train.AdamOptimizer(Lr).minimize(cost)

    # Open up a game state to communicate with the emulator.
    game_state = game.GameState()

    # Initialize the replay memory.
    D = deque()

    # Initialize the action vector.
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1

    # Initialize the state of the game.
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # Save and load model checkpoints.
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("saved_networks_q_learning")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # Initialize the epsilon value for the exploration phase.
    epsilon = INITIAL_EPSILON

    # Initialize the iteration counter and the per-episode score history.
    t = 0
    score = []

    while True:
        # Choose an action epsilon-greedily.
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        action_index = get_action_index(readout_t, epsilon, t)
        a_t = np.zeros([ACTIONS])
        a_t[action_index] = 1

        # Scale down epsilon during the exploitation phase.
        epsilon = scale_down_epsilon(epsilon, t)

        # Run the selected action and update the replay memory.
        for i in range(0, K):
            # Run the selected action and observe the next state and reward.
            s_t1, r_t, terminal = run_selected_action(a_t, s_t, game_state)

            if terminal:
                score.append(game_state.bar1_pre - game_state.bar2_pre)

            # Store the transition in the replay memory D.
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # Start training once the observation phase is over.
        if t > OBSERVE:
            # Sample a minibatch to train on.
            minibatch = random.sample(D, BATCH)

            # Get the batch variables.
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            # Compute the target Q-value.
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            target_q_batch = compute_target_q(s_j1_batch, r_batch,
                                              readout_j1_batch, minibatch)

            # Perform a gradient step.
            train_step.run(feed_dict={
                y: target_q_batch,
                a: a_batch,
                s: s_j_batch
            })

        # Update the state.
        s_t = s_t1

        # Update the number of iterations.
        t += 1

        # Save a checkpoint every 10000 iterations.
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks_q_learning/' + GAME + '-dqn',
                       global_step=t)

        # Determine the current phase for logging.
        if t <= OBSERVE:
            state = "observe"
        elif t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        # Plot the score history every 1000000 iterations.
        if t % 1000000 == 0:
            plt.plot(np.arange(len(score)), score)
            plt.ylabel('Total reward')
            plt.xlabel('episode')
            plt.savefig("q_learning.png")
            plt.show()

        # Print info.
        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))
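# compute_target_q is called above but defined elsewhere. Below is a
# minimal sketch, assuming the standard Q-learning target with discount
# factor GAMMA: y_j = r_j for terminal transitions, and
# r_j + GAMMA * max_a' Q(s'_j, a') otherwise. The terminal flag is read
# from the fifth entry of each transition, matching how D is populated.
def compute_target_q(s_j1_batch, r_batch, readout_j1_batch, minibatch):
    target_q_batch = []
    for i in range(len(minibatch)):
        terminal = minibatch[i][4]
        if terminal:
            # No future reward after a terminal transition.
            target_q_batch.append(r_batch[i])
        else:
            target_q_batch.append(
                r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))
    return target_q_batch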
def trainNetwork(s, readout, sess):
    """Train the artificial agent using Q-learning to play the Pong game.

    Args:
        s: the current state, a low-dimensional feature vector describing
            the playground.
        readout: the Q value for each possible action in the current state.
        sess: the TensorFlow session.
    """
    # Placeholder for the action.
    a = tf.placeholder("float", [None, ACTIONS])
    # Placeholder for the target Q value.
    y = tf.placeholder("float", [None])

    # Compute the loss.
    cost = compute_cost(y, a, readout)

    # Training operation.
    train_step = tf.train.AdamOptimizer(Lr).minimize(cost)

    # Open up a game state to communicate with the emulator.
    game_state = game.GameState()

    # Initialize the replay memory.
    D = deque()

    # Initialize the action vector.
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1

    # Initialize the state of the game: ball position, ball velocity,
    # and paddle position.
    s_t = np.array([0.5, 0.5, 0.03, 0.01, 0.5 - paddle_height / 2])

    # Save and load model checkpoints.
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("saved_networks_q_learning")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # Initialize the epsilon value for the exploration phase.
    epsilon = INITIAL_EPSILON

    # Initialize the iteration counter.
    t = 0

    while True:
        # Choose an action epsilon-greedily.
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        action_index = get_action_index(readout_t, epsilon, t)
        a_t = np.zeros([ACTIONS])
        a_t[action_index] = 1

        # Scale down epsilon during the exploitation phase.
        epsilon = scale_down_epsilon(epsilon, t)

        # Run the selected action and update the replay memory.
        for i in range(0, K):
            # Run the selected action and observe the next state and reward.
            s_t1, r_t, terminal = run_selected_action(a_t, s_t, game_state)

            # Store the transition in the replay memory D.
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # Start training once the observation phase is over.
        if t > OBSERVE:
            # Sample a minibatch to train on.
            minibatch = random.sample(D, BATCH)

            # Get the batch variables.
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]
            terminal_batch = [d[4] for d in minibatch]

            # Compute the target Q-value.
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            target_q_batch = compute_target_q(r_batch, readout_j1_batch,
                                              terminal_batch)

            # Perform a gradient step.
            train_step.run(feed_dict={
                y: target_q_batch,
                a: a_batch,
                s: s_j_batch
            })

        # Update the state.
        s_t = s_t1

        # Update the number of iterations.
        t += 1

        # Save a checkpoint every 10000 iterations.
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks_q_learning/' + GAME + '-dqn',
                       global_step=t)

        # Determine the current phase for logging.
        if t <= OBSERVE:
            state = "observe"
        elif t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        # Print info.
        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))
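# All three training loops share the epsilon-greedy helpers below, which
# are not shown in this section. These are minimal sketches, assuming the
# usual linear annealing schedule with constants OBSERVE, EXPLORE,
# INITIAL_EPSILON and FINAL_EPSILON defined elsewhere in the project.
def get_action_index(readout_t, epsilon, t):
    # Act randomly with probability epsilon (and throughout the pure
    # observation phase); otherwise act greedily on the Q estimates.
    if random.random() <= epsilon or t <= OBSERVE:
        return random.randrange(ACTIONS)
    return int(np.argmax(readout_t))


def scale_down_epsilon(epsilon, t):
    # Linearly anneal epsilon from INITIAL_EPSILON down to FINAL_EPSILON
    # over the EXPLORE iterations that follow the observation phase.
    if epsilon > FINAL_EPSILON and t > OBSERVE:
        epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
    return epsilon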