        r_ = -1.0
    replay_buffer.add(new_inputs, a, r_, new_inputs_)
else:
    # HER "final" strategy: relabel the transition with the last state
    # actually reached in the episode as the substitute goal
    g_ = episode_experience[bit_size - 1][3]
    new_inputs = np.concatenate([s, g_], axis=-1)
    new_inputs_ = np.concatenate([s_, g_], axis=-1)
    # sparse reward: 0 when the relabelled goal is reached, -1 otherwise
    if (np.array(s_) == np.array(g_)).all():
        r_ = 0.0
    else:
        r_ = -1.0
    replay_buffer.add(new_inputs, a, r_, new_inputs_)

losses = []
for k in range(optimisation_steps):
    state, action, reward, next_state = replay_buffer.sample()
    # Q-values of the next states under the (periodically synced) target network
    target_net_Q = sess.run(target_model.out,
                            feed_dict={target_model.inp: next_state})
    if FLAGS.DDQN:
        # Double DQN: the main network selects the greedy action,
        # the target network evaluates it
        main_net_predict = sess.run(model.predict,
                                    feed_dict={model.inp: next_state})
        doubleQ = np.reshape(
            target_net_Q[range(main_net_predict.shape[0]), main_net_predict],
            [-1])
        target_reward = np.clip(
            np.reshape(reward, [-1]) + gamma * doubleQ,
            -1. / (1 - gamma), 0)
    else:
        # standard DQN target: bootstrap with the max of the target
        # network's Q-values, clipped to the same range as the DDQN branch
        target_reward = np.clip(
            np.reshape(reward, [-1]) + gamma * np.max(target_net_Q, axis=-1),
            -1. / (1 - gamma), 0)
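
    # Why clip the TD target to [-1/(1-gamma), 0] (a note, inferred from the
    # reward scheme above): with per-step rewards in {-1, 0}, the discounted
    # return of any trajectory lies in that interval; the worst case is an
    # endless run of -1 rewards, whose discounted sum is -1/(1-gamma), and the
    # best case is reaching the goal immediately, which yields 0. Clipping
    # therefore keeps bootstrapped Q-value estimates inside the range of
    # returns the environment can actually produce.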