# Sample a mini-batch of transitions (s, a, r, s1, done) from the replay buffer
train_batch = exp_buffer.sample(batch_size)

# Double DQN: the main network selects the best next action,
# the target network evaluates its Q-value
pred_act, _ = main_qn.predict_act(
    np.vstack(train_batch[:, 3]), batch_size, sess)
_, q_vals = target_qn.predict_act(
    np.vstack(train_batch[:, 3]), batch_size, sess)

# end_multiplier is 0 for terminal transitions, 1 otherwise,
# so the bootstrap term is dropped at episode ends
end_multiplier = -(train_batch[:, 4] - 1)
double_q = q_vals[range(batch_size), pred_act]

# Target: r + gamma * Q_target(s1, argmax_a Q_main(s1, a)) * (1 - done)
target_q_val = train_batch[:, 2] + gamma * double_q * end_multiplier

# Train the main network toward the Double DQN targets
in_frames = np.vstack(train_batch[:, 0])
acts = train_batch[:, 1]
main_qn.update_nn(in_frames, target_q_val, acts, batch_size,
                  sess, summ_writer, step_value)

step_value = sess.run(inc_global_step)

# Advance to the next state and accumulate the episode reward
s = s1
s_frame = s1_frame
ep_rewards.append(reward)

total_step += 1
# Periodically copy the main network weights into the target network
if total_step % update_target_step == 0:
    sess.run(update_qn_op)

if done:
    # Discounted return and undiscounted score for the finished episode
    disc_r = discounted_reward(ep_rewards, gamma)
    score = discounted_reward(ep_rewards, 1)