                                r_ = -1.0
                            replay_buffer.add(new_inputs, a, r_, new_inputs_)
                    else:
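                        # HER 'final'-style relabelling (a sketch of the intent): substitute the
                        # episode's last achieved state for the goal; index 3 holds s_, and the
                        # episode is assumed to run exactly bit_size steps.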
                        g_ = episode_experience[bit_size - 1][3]
                        new_inputs = np.concatenate([s, g_], axis=-1)
                        new_inputs_ = np.concatenate([s_, g_], axis=-1)
                        if (np.array(s_) == np.array(g_)).all():
                            r_ = 0.0
                        else:
                            r_ = -1.0
                        replay_buffer.add(new_inputs, a, r_, new_inputs_)

        losses = []
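        # Train the Q-network on minibatches sampled from the (relabelled) replay buffer.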
        for k in range(optimisation_steps):
            state, action, reward, next_state = replay_buffer.sample()
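            # Q-values for the sampled next states, computed by the target network.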
            target_net_Q = sess.run(target_model.out,
                                    feed_dict={target_model.inp: next_state})

            if FLAGS.DDQN:
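                # Double DQN: the online network selects the greedy next action,
                # and the target network evaluates it, reducing overestimation bias.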
                main_net_predict = sess.run(model.predict,
                                            feed_dict={model.inp: next_state})
                doubleQ = np.reshape(
                    target_net_Q[np.arange(main_net_predict.shape[0]),
                                 main_net_predict], [-1])
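                # Per-step rewards are in {-1, 0}, so returns lie in [-1/(1-gamma), 0];
                # the TD target is clipped to that range.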
                target_reward = np.clip(
                    np.reshape(reward, [-1]) + gamma * doubleQ,
                    -1. / (1 - gamma), 0)
            else:
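                # Vanilla DQN target: bootstrap from the target network's maximum Q-value.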
                target_reward = np.clip(
                    np.reshape(reward, [-1]) +