mdp.add_frame(st_next)
st_next = mdp.get_MDP_state()
dt = 1 if Done else 0
totalR += rt

# store the transition in the experience replay buffer
R.StoreTransition(st, at, np.array([rt]), st_next, dt)
st = st_next

if episode_i > OBSERVATION_PHASE:
    for mini_batch in xrange(BATCHES):
        # sample a mini-batch of transitions
        s_batch, a_batch, r_batch, stag_batch, terminal_batch, _ = R.SampleMiniBatch(MINI_BATCH)

        # critic target: y = r + gamma * Q'(s', mu'(s')), zeroed at terminal states
        Q_next = Critic.target_predict(stag_batch, Actor.target_predict(stag_batch))
        Y = r_batch + GAMMA * Q_next * (1 - terminal_batch)
        Critic.train(Y, s_batch, a_batch)

        # actor update along dQ/da, evaluated at the current policy's actions
        a_for_grad = Actor.predict(s_batch)
        grads = Critic.gradients(s_batch, a_for_grad)
        Actor.train(s_batch, grads)

        # soft-update both target networks
        Actor.target_train()
        Critic.target_train()

if Done:
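# --- Illustrative sketch, not part of the original code: one common way the
# Actor.target_train() / Critic.target_train() soft updates are implemented in
# DDPG, i.e. Polyak averaging theta' <- tau * theta + (1 - tau) * theta'.
# The weight lists and the TAU value below are assumptions for illustration,
# not the author's actual variables.
import numpy as np

TAU = 0.001  # assumed soft-update rate

def soft_update(online_weights, target_weights, tau=TAU):
    # blend each online weight array into its target counterpart
    return [tau * w + (1.0 - tau) * w_t
            for w, w_t in zip(online_weights, target_weights)]

# usage sketch with dummy weights, run after each training step:
online = [np.ones((4, 4)), np.ones(4)]
target = [np.zeros((4, 4)), np.zeros(4)]
target = soft_update(online, target)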
st_next = mdp.get_MDP_state()
dt = 1 if Done else 0
totalR += rt

# store the transition in the experience replay buffer
R.StoreTransition(st, np.array([a_index]), np.array([rt]), st_next, dt)
st = st_next
E_local = [0]

if episode_i > OBSERVATION_PHASE:
    for mini_batch in xrange(BATCHES):
        # sample a mini-batch of transitions
        s_batch, a_batch, r_batch, stag_batch, terminal_batch, _ = R.SampleMiniBatch(MINI_BATCH)

        # start from the online network's predictions; only the taken-action
        # entries are overwritten with the bootstrapped target
        Y = Q.evaluate(sess, s_batch)

        # Double-DQN target (disabled): pick a' with the online network,
        # evaluate it with the target network
        # Q_next_arg = Q.evaluate(sess, stag_batch)
        # Q_next_argmax = np.argmax(Q_next_arg, 1)
        # Q_next_target = Q_target.evaluate(sess, stag_batch)
        # a_batch = a_batch.astype(int)
        # for i in range(MINI_BATCH):
        #     Y[i, a_batch[i, 0]] = r_batch[i, 0] + GAMMA * Q_next_target[i, Q_next_argmax[i]] * (1 - terminal_batch[i])
        # error = Q.train(sess, s_batch, Y)

        # old (vanilla) DQN target: evaluate s' with the target network
        Q_next = Q_target.evaluate(sess, stag_batch)
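# --- Illustrative sketch, not part of the original code: how the vanilla DQN
# target begun by `Q_next = Q_target.evaluate(sess, stag_batch)` is typically
# finished. The per-row update mirrors the commented-out Double-DQN block
# above; the helper name and signature are assumptions.
import numpy as np

def dqn_targets(Y, a_batch, r_batch, Q_next, terminal_batch, gamma):
    # overwrite only the taken-action entry of each row with
    # r + gamma * max_a' Q_target(s', a'), zeroed at terminal states
    a_batch = a_batch.astype(int)
    for i in range(Y.shape[0]):
        Y[i, a_batch[i, 0]] = (r_batch[i, 0]
                               + gamma * np.max(Q_next[i]) * (1 - terminal_batch[i]))
    return Y

# usage sketch: Y = dqn_targets(Y, a_batch, r_batch, Q_next, terminal_batch, GAMMA)
# followed by error = Q.train(sess, s_batch, Y)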