j = 0
actions = []
# The Q-Network
while j < max_epLength:  # If the network takes more moves than needed for the field, cancel the episode
    j += 1
    # Choose an action greedily from the Q-network (with probability e, take a random action instead)
    if np.random.rand(1) < e or total_steps < pre_train_steps:
        a = np.random.randint(0, num_actions)
    else:
        a = sess.run(mainQN.predict,
                     feed_dict={
                         mainQN.input: np.reshape(s, [-1, field_size, field_size, 2])
                     })[0]
    y, x = np.unravel_index(a, (field_size, field_size))
    s1, r, d = game.action(y, x, False)
    total_steps += 1
    episodeBuffer.add(
        np.reshape(np.array([s, a, r, s1, d]), [1, 5]))  # Save the experience to the episode buffer.

    if total_steps > pre_train_steps and total_steps % update_freq == 0:
        trainBatch = myBuffer.sample(batch_size)  # Get a random batch of experiences.
        # Below we perform the Double-DQN update to the target Q-values
        Q1 = sess.run(mainQN.predict,
                      feed_dict={
                          mainQN.input: np.reshape(np.stack(trainBatch[:, 3]),
                                                   [-1, field_size, field_size, 2])
                      })
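The snippet stops right after computing Q1, the main network's greedy action choices for the successor states in trainBatch[:, 3]. To make the Double-DQN target concrete, here is a minimal, self-contained NumPy sketch of how such targets are typically formed: the main network selects the action, the target network evaluates it, and terminal transitions mask out the bootstrap term. The names gamma, q_main_next, and q_target_next (and the toy numbers) are illustrative assumptions, not values taken from this code.

import numpy as np

# Toy batch of 3 transitions: reward, done flag, and next-state Q-values
# from the main and target networks (2 actions each). All values illustrative.
gamma = 0.99                                  # assumed discount factor
rewards = np.array([0.0, 1.0, -1.0])
dones = np.array([0.0, 0.0, 1.0])             # 1 marks a terminal transition
q_main_next = np.array([[0.2, 0.5], [0.1, 0.4], [0.3, 0.0]])
q_target_next = np.array([[0.3, 0.6], [0.2, 0.1], [0.4, 0.2]])

a_max = np.argmax(q_main_next, axis=1)        # selection: main network picks the action
double_q = q_target_next[np.arange(len(a_max)), a_max]  # evaluation: target network scores it
target_q = rewards + gamma * double_q * (1 - dones)     # bootstrap term masked for terminals
print(target_q)                               # [0.594, 1.099, -1.0]

Splitting action selection (main network) from action evaluation (target network) is the point of Double DQN: it reduces the overestimation bias that arises when a single network both picks and scores the maximizing action.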