    dt = 1
else:
    dt = 0
totalR += rt

# store the transition in the experience-replay buffer
if NEW_EXPERIENCE:
    R.StoreTransition(st, np.array([a_index]), np.array([rt]), st_next, dt)
st = st_next

E_local = [0]  # placeholder error while still in the observation phase
if episode_i > OBSERVATION_PHASE:
    E_local = []
    for mini_batch in xrange(BATCHES):
        # sample a mini-batch of transitions from the replay buffer
        s_batch, a_batch, r_batch, stag_batch, terminal_batch, num = R.SampleMiniBatch(MINI_BATCH)
        Y = Q.evaluate(sess, s_batch)

        # Double DQN update: the online network Q selects the next action,
        # the target network Q_target evaluates it
        Q_next_arg = Q.evaluate(sess, stag_batch)
        Q_next_argmax = np.argmax(Q_next_arg, 1)
        Q_next_target = Q_target.evaluate(sess, stag_batch)
        a_batch = a_batch.astype(int)
        for i in range(num):
            Y[i, a_batch[i, 0]] = r_batch[i, 0] + \
                GAMMA * Q_next_target[i, Q_next_argmax[i]] * (1 - terminal_batch[i])

        if ONLY_OUTPUT:
            # train only the network's output layer
            error = Q.train_output(sess, s_batch, Y)
        else:
            error = Q.train(sess, s_batch, Y)
        E_local.append(error)  # collect per-batch error (presumably aggregated into totalE)
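# Note: the target computed above is the Double DQN rule (van Hasselt et al., 2016):
#   y_i = r_i + GAMMA * Q_target(s'_i, argmax_a Q(s'_i, a)) * (1 - terminal_i)
# Selecting the action with the online network but scoring it with the target
# network damps the overestimation bias of the vanilla DQN max-operator.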
L.AddRecord('network_middle',
            simulator.SimulateNeuralEpisode(Q, sess, env_middle, False))
L.AddRecord('network_right',
            simulator.SimulateNeuralEpisode(Q, sess, env_right, False))
L.AddRecord('policy_left',
            simulator.SimulatePolicyEpisode(policy, discretizer, env_left, False))
L.AddRecord('policy_middle',
            simulator.SimulatePolicyEpisode(policy, discretizer, env_middle, False))
L.AddRecord('policy_right',
            simulator.SimulatePolicyEpisode(policy, discretizer, env_right, False))
L.AddRecord('total_reward', totalR)
L.AddRecord('error', totalE)

# estimate the value of the greedy policy as the average max_a Q(s, a)
# over a validation batch of V_EST states (np.max, not np.argmax, which
# would average action indices instead of Q-values)
s_est, _, _, _, _, num = R_val.SampleMiniBatch(V_EST)
Q_est = Q.evaluate(sess, s_est)
Q_est_max = np.max(Q_est, 1)
V_est = Q_est_max.sum() / num
L.AddRecord('estimated_value', V_est)

# periodically copy the online weights into the target network
if steps >= C_STEPS:
    Ws, bs = Q.get_weights()
    Q_target.assign(sess, Ws, bs)
    print('updating target network')
    steps = 0
steps += 1

# update reward log
if not onPolicy: