def _link(self, prev_link, prev_precedence_weights, write_weights):
    """Calculates the new link graphs.

    For each write head, the link is a directed graph (represented by a matrix
    with entries in the range [0, 1]) whose vertices are the memory locations,
    and an edge indicates the temporal ordering of writes.

    Args:
        prev_link: `[batch_size, num_writes, memory_size, memory_size]`
            previous link graphs for each write head.
        prev_precedence_weights: `[batch_size, num_writes, memory_size]`
            the previous "aggregated" write weights for each write head.
        write_weights: `[batch_size, num_writes, memory_size]`
            containing the new locations in memory written to.

    Returns:
        New link graphs for each write head,
        `[batch_size, num_writes, memory_size, memory_size]`.
    """
    with tf.name_scope('link'):
        batch_size = tf.shape(prev_link)[0]
        # Broadcastable copies of the write weights, indexed by the row (i)
        # and column (j) of the link matrix.
        write_weights_i = tf.expand_dims(write_weights, 3)
        write_weights_j = tf.expand_dims(write_weights, 2)
        prev_precedence_weights_j = tf.expand_dims(prev_precedence_weights, 2)
        # Scale the old link down in proportion to how strongly either
        # endpoint was just written to, then add the new edges.
        prev_link_scale = 1 - write_weights_i - write_weights_j
        new_link = write_weights_i * prev_precedence_weights_j
        link = prev_link_scale * prev_link + new_link
        # Return the link with the diagonal set to zero, to remove
        # self-looping edges.
        return tf.matrix_set_diag(
            link,
            tf.zeros([batch_size, self._num_writes, self._memory_size],
                     dtype=link.dtype))
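
The same update is easy to check in isolation; a minimal NumPy sketch for a single write head, with the batch and head dimensions dropped (illustrative, not part of the original):

import numpy as np

def link_update(prev_link, prev_precedence, write_weights):
    # prev_link: [memory_size, memory_size]; the other two: [memory_size].
    w_i = write_weights[:, None]   # column vector: edges into slot i
    w_j = write_weights[None, :]   # row vector: edges out of slot j
    link = (1 - w_i - w_j) * prev_link + w_i * prev_precedence[None, :]
    np.fill_diagonal(link, 0.0)    # no self-looping edges
    return link

# Writing fully to slot 2 right after slot 0 creates the edge link[2, 0] = 1.
prev_precedence = np.array([1.0, 0.0, 0.0])
write = np.array([0.0, 0.0, 1.0])
print(link_update(np.zeros((3, 3)), prev_precedence, write))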

    def grad(grad_e, grad_v):
        """Gradient for SelfAdjointEigV2.

        `e` and `v` are the eigenvalues and eigenvectors produced by the
        forward op in the enclosing scope; `grad_e` and `grad_v` are the
        incoming gradients with respect to them.
        """
        with tf.control_dependencies([grad_e, grad_v]):
            ediffs = tf.expand_dims(e, -2) - tf.expand_dims(e, -1)

            # Avoid NaNs from reciprocals when eigenvalues are close.
            safe_recip = tf.where(ediffs**2 < 1e-10, tf.zeros_like(ediffs),
                                  tf.reciprocal(ediffs))
            f = tf.matrix_set_diag(safe_recip, tf.zeros_like(e))
            grad_a = tf.matmul(
                v,
                tf.matmul(tf.matrix_diag(grad_e) +
                          f * tf.matmul(v, grad_v, adjoint_a=True),
                          v,
                          adjoint_b=True))
        # The forward op only depends on the lower triangular part of a, so
        # here we symmetrize and take the lower triangle.
        grad_a = tf.linalg.band_part(grad_a + tf.linalg.adjoint(grad_a), -1, 0)
        grad_a = tf.linalg.set_diag(grad_a, 0.5 * tf.matrix_diag_part(grad_a))
        return grad_a
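
This closure is only a fragment; a minimal sketch of how it would typically be wired up, assuming a TF1-style tf.custom_gradient wrapper around the forward eigendecomposition (eig_with_grad is an illustrative name, not from the original):

import tensorflow as tf

@tf.custom_gradient
def eig_with_grad(a):
    # Forward pass: eigendecomposition of a self-adjoint matrix. `e` and `v`
    # are the free variables captured by the `grad` closure above.
    e, v = tf.linalg.eigh(a)

    def grad(grad_e, grad_v):
        ...  # body exactly as in the snippet above

    return (e, v), grad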
Example #3
    def __init__(self, posts, **kwargs):
        FactorisedPosterior.__init__(self, posts, **kwargs)

        # The full covariance matrix is formed from the Cholesky decomposition
        # to ensure that it remains positive definite.
        #
        # To achieve this, we have to create PxP tensor variables for
        # each vertex, but we then extract only the lower triangular
        # elements and train only on these. The diagonal elements
        # are constructed by the FactorisedPosterior.
        if kwargs.get("init", None):
            # We are initializing from an existing posterior.
            # The FactorisedPosterior will already have extracted the mean and
            # diagonal of the covariance matrix - we need the Cholesky
            # decomposition of the covariance to initialize the off-diagonal terms.
            self.log.info(" - Initializing posterior covariance from input posterior")
            _mean, cov = kwargs["init"]
            covar_init = tf.cholesky(cov)
        else:
            covar_init = tf.zeros([self.nvertices, self.nparams, self.nparams], dtype=tf.float32)

        self.off_diag_vars_base = self.log_tf(tf.Variable(covar_init, validate_shape=False,
                                                          name='%s_off_diag_vars' % self.name))
        if kwargs.get("suppress_nan", True):
            self.off_diag_vars = tf.where(tf.is_nan(self.off_diag_vars_base),
                                          tf.zeros_like(self.off_diag_vars_base),
                                          self.off_diag_vars_base)
        else:
            self.off_diag_vars = self.off_diag_vars_base
        self.off_diag_cov_chol = tf.matrix_set_diag(tf.matrix_band_part(self.off_diag_vars, -1, 0),
                                                    tf.zeros([self.nvertices, self.nparams]),
                                                    name='%s_off_diag_cov_chol' % self.name)

        # Combine diagonal and off-diagonal elements into full matrix
        self.cov_chol = tf.add(tf.matrix_diag(self.std), self.off_diag_cov_chol,
                               name='%s_cov_chol' % self.name)

        # Form the covariance matrix from the Cholesky decomposition:
        # cov = chol^T . chol is symmetric positive semi-definite by construction.
        self.cov = tf.matmul(tf.transpose(self.cov_chol, perm=(0, 2, 1)), self.cov_chol,
                             name='%s_cov' % self.name)

        self.cov_chol = self.log_tf(self.cov_chol)
        self.cov = self.log_tf(self.cov)
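
The reason this parameterisation keeps the covariance valid is easy to verify in isolation; a minimal NumPy sketch, independent of the class above (illustrative only):

import numpy as np

rng = np.random.default_rng(0)
# Any lower-triangular factor L with a positive diagonal gives a valid
# covariance C = L^T L: symmetric and positive definite by construction.
L = np.tril(rng.normal(size=(4, 4)))
np.fill_diagonal(L, np.abs(L.diagonal()) + 1e-6)
C = L.T @ L
assert np.allclose(C, C.T)
assert np.linalg.eigvalsh(C).min() > 0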
Example #4
def linear_covariance(x_mean, x_cov, A, b):
    """Propagates the covariance of x through the affine map y = xA + b,
    where A and b carry elementwise means/variances (A.mean, A.var, b.var).

    Off-diagonal entries come from propagating x_cov through A.mean alone;
    the diagonal additionally picks up the contributions of Var(A) and Var(b).
    """
    x_var_diag = tf.matrix_diag_part(x_cov)
    xx_mean = x_var_diag + x_mean * x_mean  # E[x_i^2] = Var(x_i) + E[x_i]^2

    # Diagonal contribution of the weight uncertainty:
    # sum_i E[x_i^2] * Var(A_ij).
    term1_diag = tf.matmul(xx_mean, A.var)

    # A.mean^T x_cov A.mean, computed batch-wise via flattened matmuls.
    flat_xCov = tf.reshape(x_cov, [-1, A.shape[0]])  # [b*x, x]
    xCov_A = tf.matmul(flat_xCov, A.mean)  # [b*x, y]
    xCov_A = tf.reshape(xCov_A, [-1, A.shape[0], A.shape[1]])  # [b, x, y]
    xCov_A = tf.transpose(xCov_A, [0, 2, 1])  # [b, y, x]
    xCov_A = tf.reshape(xCov_A, [-1, A.shape[0]])  # [b*y, x]
    A_xCov_A = tf.matmul(xCov_A, A.mean)  # [b*y, y]
    A_xCov_A = tf.reshape(A_xCov_A, [-1, A.shape[1], A.shape[1]])  # [b, y, y]

    term2 = A_xCov_A
    term2_diag = tf.matrix_diag_part(term2)

    # Diagonal contribution of the bias uncertainty.
    term3_diag = b.var

    result_diag = term1_diag + term2_diag + term3_diag
    return tf.matrix_set_diag(term2, result_diag)
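
A quick way to sanity-check the shapes is with toy inputs; a minimal sketch assuming `A` and `b` are simple containers with `mean`/`var`/`shape` attributes (the `Gaussian` namedtuple here is illustrative, not from the original):

import collections
import numpy as np
import tensorflow as tf

Gaussian = collections.namedtuple("Gaussian", ["mean", "var", "shape"])

x_dim, y_dim, batch = 3, 2, 5
A = Gaussian(mean=tf.constant(np.random.randn(x_dim, y_dim), tf.float32),
             var=tf.constant(np.abs(np.random.randn(x_dim, y_dim)), tf.float32),
             shape=[x_dim, y_dim])
b = Gaussian(mean=None, var=tf.ones([y_dim]), shape=[y_dim])

x_mean = tf.zeros([batch, x_dim])
x_cov = tf.eye(x_dim, batch_shape=[batch])
y_cov = linear_covariance(x_mean, x_cov, A, b)  # shape [batch, y_dim, y_dim]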
Example #5
def model_definition(vector_dimension,
                     label_count,
                     slot_vectors,
                     value_vectors,
                     use_delex_features=False,
                     use_softmax=True,
                     value_specific_decoder=False,
                     learn_belief_state_update=True):
    """
    This method defines the model and returns the required TensorFlow operations.

    slot_vectors, value_vectors should be of size [label_count + 2, 300].
    For None, we should just pass zero vectors for both. 

    Then, replicate using these vectors the old NBT and then combine each value's (including NONE) into softmax. 


    List of values learned by this model: 

    1) h_utterance_representation, which uses a CNN to learn a representation of the utterance r.  
    2) candidates_transform, which includes w_candidates and b_candidates, which transforms candidate values to vector c.
    3) w_joint_hidden_layer and b_joint_hidden_layer, which collapses the interaction of r and c to an intermediate vector. 
    4) w_joint_presoftmax and b_joint_presoftmax, which collapse the intermediate layer to a single feature. 
    5) sysreq_w_hidden_layer and sysreq_b_hidden_layer, which compute intermediate sysreq representation.
    6) TODO: sysreq_w_softmax and sysreq_b_softmax, which map this to final decision. -- currently not size independent. 
    7) TODO: confirm_w1_hidden_layer, confirm_b1_hidden_layer, confirm_w1_softmax, confirm_b1_softmax: for confirmations. -- currently does not work. 
    8) a_memory, b_memory, a_current, b_current: for the belief state updates, composed into matrix.   

    If all of these are initialised and then supplied to each of the models, we could train them together (batch of each slot), and just save
    these variables, then at test time, just load them (as session even), and then initialise all of the models with them. 

    """

    print "=========================== Model declaration ==========================="
    if use_softmax:
        label_size = label_count + 1  # 1 is for NONE, dontcare is added to the ontology.
    else:
        label_size = label_count

    # these are actual NN hyperparameters that we might want to tune at some point:
    hidden_units_1 = 100
    longest_utterance_length = 40

    summary_feature_count = 10

    print "Hidden layer size:", hidden_units_1, "Label Size:", label_size, "Use Softmax:", use_softmax, "Use Delex Features:", use_delex_features

    utterance_representations_full = tf.placeholder(
        tf.float32, [None, longest_utterance_length, vector_dimension
                     ])  # full feature vector, which we want to convolve over.
    utterance_representations_delex = tf.placeholder(tf.float32,
                                                     [None, label_size])
    #    utterance_representations_delex = tf.placeholder(tf.float32, [None, label_size, 40, vector_dimension])

    system_act_slots = tf.placeholder(
        tf.float32,
        shape=(None, vector_dimension))  # just slots, for requestables.

    system_act_confirm_slots = tf.placeholder(tf.float32,
                                              shape=(None, vector_dimension))
    system_act_confirm_values = tf.placeholder(tf.float32,
                                               shape=(None, vector_dimension))

    #slot_values =  tf.placeholder(tf.float32, shape=(None, vector_dimension))
    #candidate_values = tf.placeholder(tf.float32, shape=(None, vector_dimension))

    # Initial (distributional) vectors. Needed for L2 regularisation.
    W_slots = tf.constant(slot_vectors, name="W_slots_init")
    W_values = tf.constant(value_vectors, name="W_values_init")

    # output label, i.e. True / False, 1-hot encoded:
    y_ = tf.placeholder(tf.float32, [None, label_size])

    y_past_state = tf.placeholder(tf.float32, [None, label_size])

    # dropout placeholder, 0.5 for training, 1.0 for validation/testing:
    keep_prob = tf.placeholder("float")

    # constants useful for evaluation variables further below:
    ones = tf.constant(1.0, dtype="float")
    zeros = tf.constant(0.0, dtype="float")

    filter_sizes = [1, 2, 3]
    num_filters = 300
    # hidden_utterance_size is the dimensionality of the CNN output.
    hidden_utterance_size = num_filters  #* len(filter_sizes)

    #candidate_sum = candidate_values + slot_values # to avoid summing these two multiple times later.

    #w_candidates = tf.Variable(tf.random_normal([vector_dimension, vector_dimension]))
    #b_candidates = tf.Variable(tf.zeros([vector_dimension]))

    #candidates = tf.nn.sigmoid(tf.matmul(candidate_sum, w_candidates) + b_candidates)
    #candidates = tf.nn.sigmoid(tf.matmul(candidate_values, w_candidates) + b_candidates)

    # filter needs to be of shape: filter_height = 1,2,3, filter_width=300, in_channel=1, out_channel=num_filters
    # filter just dot products - in images these then overlap from different regions - we don't have that.
    h_utterance_representation = define_CNN_model(
        utterance_representations_full, num_filters, vector_dimension,
        longest_utterance_length)

    #candidate_sum = W_slots + W_values # size [label_size, vector_dimension]

    w_candidates = tf.Variable(
        tf.random_normal([vector_dimension, vector_dimension]))
    b_candidates = tf.Variable(tf.zeros([vector_dimension]))

    # multiply to get: [label_size, vector_dimension]
    candidates_transform = tf.nn.sigmoid(
        tf.matmul(W_values, w_candidates) + b_candidates)

    # Next, multiply each candidate [label_size, vector_dimension] with the utterance representation [None, vector_dimension] to get [None, label_size, vector_dimension],
    # or utterance [None, vector_dimension] X [vector_dimension, label_size] to get [None, label_size].
    #h_utterance_representation_candidate_interaction = tf.Variable(tf.zeros([None, label_size, vector_dimension]))

    list_of_value_contributions = []

    # get interaction of utterance with each value:
    for value_idx in range(0, label_count):
        list_of_value_contributions.append(
            tf.multiply(h_utterance_representation,
                        candidates_transform[value_idx, :]))

    h_utterance_representation_candidate_interaction = tf.reshape(
        tf.transpose(tf.stack(list_of_value_contributions), [1, 0, 2]),
        [-1, vector_dimension])
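    # stack gives [label_count, None, vector_dimension]; the transpose moves
    # the batch dimension first and the reshape flattens the result into rows
    # for a single matmul below.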
    # the same transform now runs across each value's vector, multiplying.
    w_joint_hidden_layer = tf.Variable(
        tf.random_normal([vector_dimension, hidden_units_1]))
    b_joint_hidden_layer = tf.Variable(tf.zeros([hidden_units_1]))

    # now multiply [None, label_size, vector_dimension] by [vector_dimension, hidden_units_1], to get [None, label_size, hidden_units_1]
    hidden_layer_joint = tf.nn.sigmoid(
        tf.reshape(
            tf.matmul(h_utterance_representation_candidate_interaction,
                      w_joint_hidden_layer) + b_joint_hidden_layer,
            [-1, label_count, hidden_units_1]))
    hidden_layer_joint_with_dropout = tf.nn.dropout(hidden_layer_joint,
                                                    keep_prob)

    # next initialise parameters that go into a softmax, i.e. mapping [None, label_size, hidden_units_1] -> [None, label_size]
    w_joint_presoftmax = tf.Variable(tf.random_normal([hidden_units_1,
                                                       1]))  # collapse to 1
    b_joint_presoftmax = tf.Variable(tf.zeros([1]))  # collapse to 1

    y_presoftmax = tf.reshape(
        tf.matmul(
            tf.reshape(hidden_layer_joint_with_dropout, [-1, hidden_units_1]),
            w_joint_presoftmax) + b_joint_presoftmax, [-1, label_count])

    # for now we do not implement this

    sysreq_contributions = []  # a list of contributions for each of the values
    confirm_contributions = []  # a list of contributions for each of the values

    # =================== NETWORK FOR SYSTEM REQUESTS ==========================

    # is the current slot offered
    system_act_candidate_interaction = tf.multiply(
        W_slots[0, :],
        system_act_slots)  # only multiply with slots for the requests.
    dot_product_sysreq = tf.reduce_mean(system_act_candidate_interaction, 1)

    #full_ones = tf.ones([tf.shape(dot_product_sysreq)[0], 1])
    #dot_product = tf.cast(tf.equal(dot_product_sysreq, full_ones), "float32")

    decision = tf.multiply(tf.expand_dims(dot_product_sysreq, 1),
                           h_utterance_representation)

    sysreq_w_hidden_layer = tf.Variable(
        tf.random_normal([vector_dimension, hidden_units_1]))
    sysreq_b_hidden_layer = tf.Variable(tf.zeros([hidden_units_1]))

    # allow each value to learn to map different utterances to yes. Mainly dontcare.
    for value_idx in range(0, label_count):

        sysreq_hidden_layer_1 = tf.nn.sigmoid(
            tf.matmul(decision, sysreq_w_hidden_layer) + sysreq_b_hidden_layer)
        sysreq_hidden_layer_1_with_dropout = tf.nn.dropout(
            sysreq_hidden_layer_1, keep_prob)

        sysreq_w_softmax = tf.Variable(tf.random_normal([hidden_units_1, 1]))
        sysreq_b_softmax = tf.Variable(tf.zeros([1]))

        sysreq_contribution = tf.matmul(sysreq_hidden_layer_1_with_dropout,
                                        sysreq_w_softmax) + sysreq_b_softmax

        sysreq_contributions.append(sysreq_contribution)

    sysreq = tf.concat(sysreq_contributions, 1)

    # =================== NETWORK FOR CONFIRMATIONS ==========================

    # here, we do want to tie across all values, as it will get a different signal depending on whether both things match.
    confirm_w1_hidden_layer = tf.Variable(
        tf.random_normal([vector_dimension, hidden_units_1]))
    confirm_b1_hidden_layer = tf.Variable(tf.zeros([hidden_units_1]))

    confirm_w1_softmax = tf.Variable(tf.random_normal([hidden_units_1, 1]))
    confirm_b1_softmax = tf.Variable(tf.zeros([1]))

    for value_idx in range(0, label_count):

        dot_product = tf.multiply(
            tf.reduce_mean(
                tf.multiply(W_slots[0, :], system_act_confirm_slots), 1),
            tf.reduce_mean(
                tf.multiply(W_values[value_idx, :], system_act_confirm_values),
                1))  # dot product: slot equality and value equality

        full_ones = tf.ones(tf.shape(dot_product))
        dot_product = tf.cast(tf.equal(dot_product, full_ones), "float32")

        decision = tf.multiply(tf.expand_dims(dot_product, 1),
                               h_utterance_representation)

        confirm_hidden_layer_1 = tf.nn.sigmoid(
            tf.matmul(decision, confirm_w1_hidden_layer) +
            confirm_b1_hidden_layer)
        confirm_hidden_layer_1_with_dropout = tf.nn.dropout(
            confirm_hidden_layer_1, keep_prob)

        confirm_contribution = tf.matmul(
            confirm_hidden_layer_1_with_dropout,
            confirm_w1_softmax) + confirm_b1_softmax
        confirm_contributions.append(confirm_contribution)

    sysconf = tf.concat(confirm_contributions, 1)

    if use_softmax:

        append_zeros_none = tf.zeros([tf.shape(y_presoftmax)[0], 1])
        y_presoftmax = tf.concat([y_presoftmax, append_zeros_none], 1)

        append_zeros = tf.zeros([tf.shape(y_presoftmax)[0], 1])
        sysreq = tf.concat([sysreq, append_zeros], 1)
        sysconf = tf.concat([sysconf, append_zeros], 1)

        y_presoftmax = y_presoftmax + sysconf + sysreq

    if use_delex_features:
        y_presoftmax = y_presoftmax + utterance_representations_delex

    # value-specific decoder (currently disabled by the `and False`):
    if value_specific_decoder and False:

        h_utterance_representation_for_full_softmax = define_CNN_model(
            utterance_representations_full, num_filters, vector_dimension,
            longest_utterance_length)

        h_utterance_dropout = tf.nn.dropout(
            h_utterance_representation_for_full_softmax, keep_prob)

        ss_w_hidden_layer = tf.Variable(
            tf.random_normal([vector_dimension, hidden_units_1]))
        ss_b_hidden_layer = tf.Variable(tf.zeros([hidden_units_1]))

        ss_hidden_layer_1 = tf.nn.relu(
            tf.matmul(h_utterance_dropout, ss_w_hidden_layer) +
            ss_b_hidden_layer)
        ss_hidden_layer_1_with_dropout = tf.nn.dropout(ss_hidden_layer_1,
                                                       keep_prob)

        ss_w_softmax = tf.Variable(
            tf.random_normal([hidden_units_1, label_size]))
        ss_b_softmax = tf.Variable(tf.zeros([label_size]))

        ss_contribution = tf.matmul(ss_hidden_layer_1_with_dropout,
                                    ss_w_softmax) + ss_b_softmax

        y_presoftmax += ss_contribution

    # update_coefficient is always part of the return value, so it needs a default.
    update_coefficient = tf.constant(0.49)

    if use_softmax:

        if False:
            # Rule-based belief state update, originally gated on
            # learn_belief_state_update; currently disabled.

            if value_specific_decoder:  # value-specific update

                update_coefficient = tf.constant(0.8)

                ss_W_memory = tf.Variable(
                    tf.random_normal([label_size, label_size]))

                ss_W_current = tf.Variable(
                    tf.random_normal([label_size, label_size]))

                y_combine = tf.matmul(y_past_state, ss_W_memory) + tf.matmul(
                    y_presoftmax, ss_W_current)

            else:
                #constrained
                update_coefficient = tf.constant(0.7)

                a_memory = tf.Variable(tf.random_normal([1, 1]))
                diag_memory = a_memory * tf.diag(tf.ones(label_size))

                b_memory = tf.Variable(tf.random_normal([1, 1]))
                non_diag_memory = tf.matrix_set_diag(
                    b_memory * tf.ones([label_size, label_size]),
                    tf.zeros(label_size))

                W_memory = diag_memory + non_diag_memory
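                # W_memory is a*I + b*(ones - I): one shared weight for a
                # label's own past probability and another shared weight for
                # all other labels. W_current below is built the same way.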

                a_current = tf.Variable(tf.random_normal([1, 1]))
                diag_current = a_current * tf.diag(tf.ones(label_size))

                b_current = tf.Variable(tf.random_normal([1, 1]))
                non_diag_current = tf.matrix_set_diag(
                    b_current * tf.ones([label_size, label_size]),
                    tf.zeros(label_size))

                W_current = diag_current + non_diag_current

                y_combine = tf.matmul(y_past_state, W_memory) + tf.matmul(
                    y_presoftmax, W_current
                )  #+ tf.matmul(sysreq, W_current_req) + tf.matmul(sysconf, W_current_conf)

            y = tf.nn.softmax(y_combine)

        else:
            # This code runs the baseline experiments reported in Footnote 2 in the paper.
            # update_coefficient scales the contribution of the current turn.
            update_coefficient = tf.Variable(0.5)
            y_combine = update_coefficient * y_presoftmax + (
                1 - update_coefficient) * y_past_state
            y = tf.nn.softmax(y_combine)

    else:
        # For requestables, we just have turn-level binary decisions.
        y = tf.nn.sigmoid(y_presoftmax)

    # ======================== LOSS IS JUST CROSS ENTROPY ==========================================

    if use_softmax:
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=y_combine, labels=y_)
    else:
        cross_entropy = tf.reduce_sum(tf.square(y - y_))

    # ============================= EVALUATION =====================================================

    if use_softmax:
        predictions = tf.cast(tf.argmax(y, 1),
                              "float32")  # will have ones where positive
        true_predictions = tf.cast(tf.argmax(y_, 1), "float32")
        correct_prediction = tf.cast(tf.equal(predictions, true_predictions),
                                     "float")

        accuracy = tf.reduce_mean(correct_prediction)
        # this will count number of positives - they are marked with 1 in true_predictions
        num_positives = tf.reduce_sum(true_predictions)
        # positives are indicated with ones.
        classified_positives = tf.reduce_sum(predictions)
        # will have ones in all places where both are predicting positives
        true_positives = tf.multiply(predictions, true_predictions)
        # if indicators for positive of both are 1, then it is positive.
        num_true_positives = tf.reduce_sum(true_positives)

        recall = num_true_positives / num_positives
        precision = num_true_positives / classified_positives
        f_score = (2 * recall * precision) / (recall + precision)

    else:
        predictions = tf.cast(tf.round(y),
                              "float32")  # will have ones where positive
        true_predictions = tf.cast(tf.round(y_), "float32")
        correct_prediction = tf.cast(tf.equal(predictions, true_predictions),
                                     "float")

        num_positives = tf.reduce_sum(true_predictions)

        classified_positives = tf.reduce_sum(predictions)
        true_positives = tf.multiply(predictions, true_predictions)
        num_true_positives = tf.reduce_sum(true_positives)
        recall = num_true_positives / num_positives
        precision = num_true_positives / classified_positives
        f_score = (2 * recall * precision) / (recall + precision)

        accuracy = tf.reduce_mean(correct_prediction)

    optimizer = tf.train.AdamOptimizer(0.001)
    train_step = optimizer.minimize(cross_entropy)

    return keep_prob, utterance_representations_full, utterance_representations_delex, \
           system_act_slots, system_act_confirm_slots, system_act_confirm_values, \
           y_, y_past_state, accuracy, f_score, precision, \
           recall, num_true_positives, num_positives, classified_positives, y, \
           predictions, true_predictions, correct_prediction, true_positives, train_step, update_coefficient
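
For context, a rough sketch of how the returned tuple would be consumed in a TF1 training loop. This assumes define_CNN_model and the rest of the original script are importable; the sizes and zero-filled inputs are purely illustrative:

import numpy as np
import tensorflow as tf

dim, n_labels = 300, 7  # illustrative sizes
slot_vecs = np.zeros((n_labels + 2, dim), dtype=np.float32)
value_vecs = np.zeros((n_labels + 2, dim), dtype=np.float32)

ops = model_definition(dim, n_labels, slot_vecs, value_vecs)
(keep_prob, utt_full, utt_delex, act_slots, act_confirm_slots,
 act_confirm_values, y_, y_past_state) = ops[:8]
train_step = ops[-2]

batch, label_size = 4, n_labels + 1  # +1 for NONE under use_softmax
feed = {
    utt_full: np.zeros((batch, 40, dim), np.float32),
    utt_delex: np.zeros((batch, label_size), np.float32),
    act_slots: np.zeros((batch, dim), np.float32),
    act_confirm_slots: np.zeros((batch, dim), np.float32),
    act_confirm_values: np.zeros((batch, dim), np.float32),
    y_: np.zeros((batch, label_size), np.float32),
    y_past_state: np.zeros((batch, label_size), np.float32),
    keep_prob: 0.5,  # dropout keep probability for training
}
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_step, feed_dict=feed)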
Example #6
def iwae(p_z,
         p_x_given_z,
         q_z,
         observations,
         num_samples,
         cvs,
         contexts=None,
         antithetic=False):
    """Computes a gradient of the IWAE estimator.

  Args:
    p_z: The prior. Should be a callable that optionally accepts a conditioning
      context and returns a tfp.distributions.Distribution which has the
      log_prob and sample methods implemented. The distribution should be over a
      [batch_size, latent_dim] space.
    p_x_given_z: The likelihood. Should be a callable that accepts as input a
      tensor of shape [num_samples, batch_size, latent_size + context_size] and
      returns a tfd.Distribution over a [num_samples, batch_size, data_dim]
      space.
    q_z: The proposal, should be a callable which accepts a batch of
      observations of shape [batch_size, data_dim] and returns a distribution
      over [batch_size, latent_dim].
    observations: A float Tensor of shape [batch_size, data_dim] containing the
      observations.
    num_samples: The number of samples for the IWAE estimator.
    cvs: Control variate variables.
    contexts: A float Tensor of shape [batch_size, context_dim] containing the
      contexts. (Optionally, none)
    antithetic: Whether to use antithetic sampling.

  Returns:
    estimators: Dictionary of tuples (objective, neg_model_loss,
      neg_inference_network_loss).
  """
    alpha, beta, gamma, delta = cvs
    batch_size = tf.shape(observations)[0]
    proposal = q_z(observations, contexts, stop_gradient=False)

    # If antithetic sampling, draw half of the samples and use the antithetics
    # for the other half. z has shape [num_samples, batch_size, latent_size].
    if antithetic:
        z_pos = proposal.sample(sample_shape=[num_samples // 2])
        z_neg = 2 * proposal.loc - z_pos
        z = tf.concat((z_pos, z_neg), axis=0)
    else:
        z = proposal.sample(sample_shape=[num_samples])

    tiled_contexts = None
    if contexts is not None:
        tiled_contexts = tf.tile(tf.expand_dims(contexts, 0),
                                 [num_samples, 1, 1])
    likelihood = p_x_given_z(z, tiled_contexts)
    # Each log_prob below is [num_samples, batch_size, latent_dim] (or
    # [..., data_dim] for the likelihood) before the reduce_sum over the
    # last axis.
    log_q_z = tf.reduce_sum(proposal.log_prob(z), axis=-1)
    prior = p_z(contexts)
    log_p_z = tf.reduce_sum(prior.log_prob(z), axis=-1)
    log_p_x_given_z = tf.reduce_sum(likelihood.log_prob(observations), axis=-1)

    log_weights = log_p_z + log_p_x_given_z - log_q_z
    log_sum_weight = tf.reduce_logsumexp(log_weights, axis=0)
    log_avg_weight = log_sum_weight - tf.log(tf.to_float(num_samples))
    normalized_weights = tf.stop_gradient(tf.nn.softmax(log_weights, axis=0))
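    # Self-normalized importance weights, excluded from the gradient path;
    # they reappear in all of the estimators below.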

    if FLAGS.image_summary:
        best_index = tf.to_int32(tf.argmax(normalized_weights, axis=0))
        indices = tf.stack((best_index, tf.range(0, batch_size)), axis=-1)
        best_images = tf.gather_nd(likelihood.probs_parameter(), indices)

        if FLAGS.dataset == "struct_mnist":
            tf.summary.image("bottom_half",
                             tf.reshape(best_images, [batch_size, -1, 28, 1]))
        else:
            tf.summary.image("output",
                             tf.reshape(best_images, [batch_size, -1, 28, 1]))
        tf.summary.image("input",
                         tf.reshape(observations, [batch_size, -1, 28, 1]))

    # Compute gradient estimators
    model_loss = log_avg_weight
    estimators = {}
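    # Each entry is a tuple (objective, model loss, inference-network loss),
    # matching the Returns section of the docstring; the three slots may use
    # different gradient estimators.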

    estimators["iwae"] = (log_avg_weight, log_avg_weight, log_avg_weight)

    stopped_z_log_q_z = tf.reduce_sum(proposal.log_prob(tf.stop_gradient(z)),
                                      axis=-1)
    estimators["rws"] = (log_avg_weight, model_loss,
                         tf.reduce_sum(normalized_weights * stopped_z_log_q_z,
                                       axis=0))

    # Doubly reparameterized
    stopped_proposal = q_z(observations, contexts, stop_gradient=True)
    stopped_log_q_z = tf.reduce_sum(stopped_proposal.log_prob(z), axis=-1)
    stopped_log_weights = log_p_z + log_p_x_given_z - stopped_log_q_z
    sq_normalized_weights = tf.square(normalized_weights)

    estimators["stl"] = (log_avg_weight, model_loss,
                         tf.reduce_sum(normalized_weights *
                                       stopped_log_weights,
                                       axis=0))
    estimators["dreg"] = (log_avg_weight, model_loss,
                          tf.reduce_sum(sq_normalized_weights *
                                        stopped_log_weights,
                                        axis=0))
    estimators["rws-dreg"] = (
        log_avg_weight, model_loss,
        tf.reduce_sum(
            (normalized_weights - sq_normalized_weights) * stopped_log_weights,
            axis=0))

    # Add normed versions
    normalized_sq_normalized_weights = (
        sq_normalized_weights /
        tf.reduce_sum(sq_normalized_weights, axis=0, keepdims=True))
    estimators["dreg-norm"] = (log_avg_weight, model_loss,
                               tf.reduce_sum(normalized_sq_normalized_weights *
                                             stopped_log_weights,
                                             axis=0))

    rws_dregs_weights = normalized_weights - sq_normalized_weights
    normalized_rws_dregs_weights = rws_dregs_weights / tf.reduce_sum(
        rws_dregs_weights, axis=0, keepdims=True)
    estimators["rws-dreg-norm"] = (log_avg_weight, model_loss,
                                   tf.reduce_sum(normalized_rws_dregs_weights *
                                                 stopped_log_weights,
                                                 axis=0))

    estimators["dreg-alpha"] = (log_avg_weight, model_loss,
                                (1 - FLAGS.alpha) * estimators["dreg"][-1] +
                                FLAGS.alpha * estimators["rws-dreg"][-1])

    # Jackknife
    loo_log_weights = tf.tile(tf.expand_dims(tf.transpose(log_weights), -1),
                              [1, 1, num_samples])
    loo_log_weights = tf.matrix_set_diag(
        loo_log_weights, -np.inf * tf.ones([batch_size, num_samples]))
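    # With the diagonal at -inf, sample i drops out of the i-th logsumexp,
    # giving the leave-one-out averages needed for the jackknife correction.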
    loo_log_avg_weight = tf.reduce_mean(
        tf.reduce_logsumexp(loo_log_weights, axis=1) -
        tf.log(tf.to_float(num_samples - 1)),
        axis=-1)
    jk_model_loss = num_samples * log_avg_weight - (num_samples -
                                                    1) * loo_log_avg_weight

    estimators["jk"] = (jk_model_loss, jk_model_loss, jk_model_loss)

    # Compute JK w/ DReG for the inference network
    loo_normalized_weights = tf.reduce_mean(tf.square(
        tf.stop_gradient(tf.nn.softmax(loo_log_weights, axis=1))),
                                            axis=-1)
    estimators["jk-dreg"] = (
        jk_model_loss, jk_model_loss, num_samples *
        tf.reduce_sum(sq_normalized_weights * stopped_log_weights, axis=0) -
        (num_samples - 1) * tf.reduce_sum(
            tf.transpose(loo_normalized_weights) * stopped_log_weights, axis=0)
    )

    # Compute control variates
    loo_baseline = tf.expand_dims(tf.transpose(log_weights), -1)
    loo_baseline = tf.tile(loo_baseline, [1, 1, num_samples])
    loo_baseline = tf.matrix_set_diag(
        loo_baseline, -np.inf * tf.ones_like(tf.transpose(log_weights)))
    loo_baseline = tf.reduce_logsumexp(loo_baseline, axis=1)
    loo_baseline = tf.transpose(loo_baseline)

    learning_signal = tf.stop_gradient(tf.expand_dims(
        log_avg_weight, 0)) - (1 - gamma) * tf.stop_gradient(loo_baseline)
    vimco = tf.reduce_sum(learning_signal * stopped_z_log_q_z, axis=0)

    first_part = alpha * vimco + (1 - alpha) * tf.reduce_sum(
        normalized_weights * stopped_log_weights, axis=0)
    second_part = ((1 - beta) * (tf.reduce_sum(
        ((1 - delta) / tf.to_float(num_samples) - normalized_weights) *
        stopped_z_log_q_z,
        axis=0)) + beta * tf.reduce_sum(
            (sq_normalized_weights - normalized_weights) * stopped_log_weights,
            axis=0))
    estimators["dreg-cv"] = (log_avg_weight, model_loss,
                             first_part + second_part)

    return estimators
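
For reference, a minimal sketch of callables satisfying the interface above: a Gaussian prior and proposal with a Bernoulli likelihood. All names here are illustrative, and it assumes the module-level absl FLAGS used by the original script (e.g. FLAGS.alpha, FLAGS.image_summary) are defined:

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions
latent_dim, data_dim = 8, 784

def p_z(contexts):
    del contexts  # unconditional prior in this sketch
    return tfd.Normal(loc=tf.zeros([latent_dim]), scale=1.0)

def p_x_given_z(z, contexts):
    del contexts
    with tf.variable_scope("p_x_given_z", reuse=tf.AUTO_REUSE):
        logits = tf.layers.dense(z, data_dim)  # acts on the last axis
    return tfd.Bernoulli(logits=logits)

def q_z(observations, contexts, stop_gradient=False):
    del contexts
    with tf.variable_scope("q_z", reuse=tf.AUTO_REUSE):
        loc = tf.layers.dense(observations, latent_dim, name="loc")
        scale = tf.nn.softplus(
            tf.layers.dense(observations, latent_dim, name="scale"))
    if stop_gradient:  # detach proposal parameters, as DReG requires
        loc, scale = tf.stop_gradient(loc), tf.stop_gradient(scale)
    return tfd.Normal(loc=loc, scale=scale)

observations = tf.placeholder(tf.float32, [None, data_dim])
estimators = iwae(p_z, p_x_given_z, q_z, observations, num_samples=64,
                  cvs=(1.0, 0.0, 0.0, 0.0))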