def _link(self, prev_link, prev_precedence_weights, write_weights):
    """Computes the updated temporal link graphs after a write.

    Each write head owns a directed graph over memory locations, stored as a
    matrix with entries in the range [0, 1]; an edge records the temporal
    ordering of writes.

    Args:
      prev_link: `[batch_size, num_writes, memory_size, memory_size]` link
        graphs from the previous step, one per write head.
      prev_precedence_weights: `[batch_size, num_writes, memory_size]`
        previous "aggregated" write weights for each write head.
      write_weights: `[batch_size, num_writes, memory_size]` new locations in
        memory written to.

    Returns:
      New link graphs for each write head, shaped
      `[batch_size, num_writes, memory_size, memory_size]`, with the diagonal
      set to zero.
    """
    with tf.name_scope('link'):
        batch_size = tf.shape(prev_link)[0]

        # Trailing-axis expansions so the two views broadcast against each
        # other into a memory_size x memory_size grid:
        # [..., memory_size, 1] and [..., 1, memory_size].
        weights_col = tf.expand_dims(write_weights, 3)
        weights_row = tf.expand_dims(write_weights, 2)
        precedence_row = tf.expand_dims(prev_precedence_weights, 2)

        # Old edges are scaled down wherever either endpoint was written to,
        # then the edges created by this write are added.
        retention = 1 - weights_col - weights_row
        fresh_edges = weights_col * precedence_row
        updated_link = retention * prev_link + fresh_edges

        # Zero the diagonal to remove self-looping edges.
        zero_diagonal = tf.zeros(
            [batch_size, self._num_writes, self._memory_size],
            dtype=updated_link.dtype)
        return tf.matrix_set_diag(updated_link, zero_diagonal)
def grad(grad_e, grad_v):
    """Gradient for SelfAdjointEigV2.

    Uses the eigenvalues `e` and eigenvectors `v` captured from the enclosing
    scope.

    Args:
      grad_e: Incoming gradient with respect to the eigenvalues.
      grad_v: Incoming gradient with respect to the eigenvectors.

    Returns:
      Gradient with respect to the input matrix, restricted to its lower
      triangle.
    """
    with tf.control_dependencies([grad_e, grad_v]):
        # Pairwise eigenvalue gaps: gaps[..., i, j] = e[..., j] - e[..., i].
        gaps = tf.expand_dims(e, -2) - tf.expand_dims(e, -1)

        # Avoid NaNs from reciprocals when eigenvalues are close: those
        # entries are replaced by zero instead of 1 / gap.
        nearly_equal = gaps**2 < 1e-10
        inverse_gaps = tf.where(nearly_equal,
                                tf.zeros_like(gaps),
                                tf.reciprocal(gaps))
        f = tf.matrix_set_diag(inverse_gaps, tf.zeros_like(e))

        inner = tf.matrix_diag(grad_e) + f * tf.matmul(
            v, grad_v, adjoint_a=True)
        full_grad = tf.matmul(v, tf.matmul(inner, v, adjoint_b=True))

        # The forward op only depends on the lower triangular part of a, so
        # symmetrize and keep the lower triangle. The diagonal is then scaled
        # by 0.5 because the symmetrization doubled it.
        symmetrized = full_grad + tf.linalg.adjoint(full_grad)
        lower = tf.linalg.band_part(symmetrized, -1, 0)
        return tf.linalg.set_diag(lower, 0.5 * tf.matrix_diag_part(lower))
def __init__(self, posts, **kwargs):
    """Set up a posterior with a full covariance matrix for each vertex.

    The base-class constructor is run first; this method then adds trainable
    off-diagonal terms and assembles `self.cov_chol` and `self.cov`.

    Args:
      posts: Forwarded unchanged to `FactorisedPosterior.__init__`.
      **kwargs: Forwarded to `FactorisedPosterior.__init__`. Two keys are
        also read here:
        init: Optional `(mean, cov)` pair; when truthy, the Cholesky factor
          of `cov` initializes the off-diagonal variables.
        suppress_nan: When true (the default), NaN entries of the
          off-diagonal variables are replaced by zeros.
    """
    FactorisedPosterior.__init__(self, posts, **kwargs)

    # The full covariance matrix is built from a triangular factor so that it
    # remains positive definite. A PxP tensor variable is created for each
    # vertex, but only its strictly lower triangular elements are used; the
    # diagonal comes from `self.std`, which the original comments say is
    # constructed by FactorisedPosterior.
    init_posterior = kwargs.get("init", None)
    if init_posterior:
        # Initializing from an existing posterior: the Cholesky decomposition
        # of its covariance provides the starting off-diagonal terms.
        # NOTE(review): tf.cholesky returns L with cov = L L^T, whereas
        # self.cov below is assembled as chol^T chol - confirm that this
        # difference in convention is intended.
        self.log.info(" - Initializing posterior covariance from input posterior")
        _mean, cov = init_posterior
        covar_init = tf.cholesky(cov)
    else:
        covar_init = tf.zeros(
            [self.nvertices, self.nparams, self.nparams], dtype=tf.float32)

    base_variable = tf.Variable(
        covar_init, validate_shape=False, name='%s_off_diag_vars' % self.name)
    self.off_diag_vars_base = self.log_tf(base_variable)

    if not kwargs.get("suppress_nan", True):
        self.off_diag_vars = self.off_diag_vars_base
    else:
        raw_vars = self.off_diag_vars_base
        self.off_diag_vars = tf.where(
            tf.is_nan(raw_vars), tf.zeros_like(raw_vars), raw_vars)

    # Keep the lower triangle and blank its diagonal, leaving only the
    # strictly lower triangular entries.
    lower_triangle = tf.matrix_band_part(self.off_diag_vars, -1, 0)
    self.off_diag_cov_chol = tf.matrix_set_diag(
        lower_triangle,
        tf.zeros([self.nvertices, self.nparams]),
        name='%s_off_diag_cov_chol' % self.name)

    # Combine diagonal and off-diagonal elements into the full factor.
    self.cov_chol = tf.add(
        tf.matrix_diag(self.std),
        self.off_diag_cov_chol,
        name='%s_cov_chol' % self.name)

    # Form the covariance matrix from the factor (transpose times factor).
    chol_transposed = tf.transpose(self.cov_chol, perm=(0, 2, 1))
    self.cov = tf.matmul(
        chol_transposed, self.cov_chol, name='%s_cov' % self.name)

    self.cov_chol = self.log_tf(self.cov_chol)
    self.cov = self.log_tf(self.cov)
def linear_covariance(x_mean, x_cov, A, b):
    """Builds the output covariance of a linear layer with uncertain weights.

    The off-diagonal entries are `A.mean^T x_cov A.mean` for each batch
    element. The diagonal additionally receives the second moment of the
    input multiplied by `A.var`, plus `b.var`.

    Args:
      x_mean: Input mean; per the shape comments presumably [b, x].
      x_cov: Input covariance; per the shape comments presumably [b, x, x].
      A: Weight object exposing `mean`, `var` and `shape` (read as [x, y]).
      b: Bias object exposing `var`.

    Returns:
      Tensor reshaped to [-1, y, y] holding the output covariance.
    """
    in_dim, out_dim = A.shape[0], A.shape[1]

    # Second moment of the input: variance plus squared mean.
    second_moment = tf.matrix_diag_part(x_cov) + x_mean * x_mean
    weight_var_diag = tf.matmul(second_moment, A.var)

    # A.mean^T x_cov A.mean computed with plain 2-D matmuls by folding the
    # batch axis into the row axis.
    cov_rows = tf.reshape(x_cov, [-1, in_dim])  # [b*x, x]
    cov_times_a = tf.matmul(cov_rows, A.mean)  # [b*x, y]
    cov_times_a = tf.reshape(cov_times_a, [-1, in_dim, out_dim])  # [b, x, y]
    a_times_cov = tf.transpose(cov_times_a, [0, 2, 1])  # [b, y, x]
    a_times_cov = tf.reshape(a_times_cov, [-1, in_dim])  # [b*y, x]
    propagated = tf.matmul(a_times_cov, A.mean)  # [b*y, y]
    propagated = tf.reshape(propagated, [-1, out_dim, out_dim])  # [b, y, y]

    diagonal = weight_var_diag + tf.matrix_diag_part(propagated) + b.var
    return tf.matrix_set_diag(propagated, diagonal)
def model_definition(vector_dimension,
                     label_count,
                     slot_vectors,
                     value_vectors,
                     use_delex_features=False,
                     use_softmax=True,
                     value_specific_decoder=False,
                     learn_belief_state_update=True):
    """
    This method defines the model and returns the required TensorFlow operations.

    slot_vectors, value_vectors should be of size [label_count + 2, 300].
    For None, we should just pass zero vectors for both.

    Then, replicate using these vectors the old NBT and then combine each value's (including NONE) into softmax.

    List of values learned by this model:

    1) h_utterance_representation, which uses a CNN to learn a representation of the utterance r.
    2) candidates_transform, which includes w_candidates and b_candidates, which transforms candidate values to vector c.
    3) w_joint_hidden_layer and b_joint_hidden_layer, which collapses the interaction of r and c to an intermediate vector.
    4) w_joint_presoftmax and b_joint_presoftmax, which collapse the intermediate layer to a single feature.
    5) sysreq_w_hidden_layer and sysreq_b_hidden_layer, which compute intermediate sysreq representation.
    6) TODO: sysreq_w_softmax and sysreq_b_softmax, which map this to final decision. -- currently not size independent.
    7) TODO: confirm_w1_hidden_layer, confirm_b1_hidden_layer, confirm_w1_softmax, confirm_b1_softmax: for confirmations. -- currently does not work.
    8) a_memory, b_memory, a_current, b_current: for the belief state updates, composed into matrix.

    If all of these are initialised and then supplied to each of the models, we could train them together (batch of each slot), and just save
    these variables, then at test time, just load them (as session even), and then initialise all of the models with them.

    Args:
        vector_dimension: Width of the word vectors; sets the placeholder and weight shapes.
        label_count: Number of candidate values; one contribution is built per value.
        slot_vectors: Slot vectors, wrapped in a constant. Only row 0 is read below.
        value_vectors: Value vectors, wrapped in a constant.
        use_delex_features: If true, the delexicalised feature placeholder is added to the pre-softmax scores.
        use_softmax: If true, one extra (NONE) label is appended and a softmax with softmax cross-entropy is used;
            otherwise a sigmoid output with a sum-of-squares loss is used.
        value_specific_decoder: Currently has no effect: every branch that reads it is disabled by a literal False.
        learn_belief_state_update: Currently unused: the check on it is commented out and replaced by `if False`.

    Returns:
        A 22-tuple: keep_prob, utterance_representations_full, utterance_representations_delex, system_act_slots,
        system_act_confirm_slots, system_act_confirm_values, y_, y_past_state, accuracy, f_score, precision, recall,
        num_true_positives, num_positives, classified_positives, y, predictions, true_predictions, correct_prediction,
        true_positives, train_step, update_coefficient.

    NOTE(review): none of the tf.Variable calls below pass a name, so TensorFlow numbers the variables in creation
    order. Any checkpoint keyed by those names presumably depends on this statement order -- verify before reordering.
    """

    print "=========================== Model declaration ==========================="

    if use_softmax:
        label_size = label_count + 1  # 1 is for NONE, dontcare is added to the ontology.
    else:
        label_size = label_count

    # these are actual NN hyperparameters that we might want to tune at some point:
    hidden_units_1 = 100
    longest_utterance_length = 40
    summary_feature_count = 10  # not referenced again in this function

    print "Hidden layer size:", hidden_units_1, "Label Size:", label_size, "Use Softmax:", use_softmax, "Use Delex Features:", use_delex_features

    # The literal 40 below matches longest_utterance_length above.
    utterance_representations_full = tf.placeholder(
        tf.float32,
        [None, 40, vector_dimension])  # full feature vector, which we want to convolve over.
    utterance_representations_delex = tf.placeholder(tf.float32, [None, label_size])
    # utterance_representations_delex = tf.placeholder(tf.float32, [None, label_size, 40, vector_dimension])

    system_act_slots = tf.placeholder(
        tf.float32, shape=(None, vector_dimension))  # just slots, for requestables.

    system_act_confirm_slots = tf.placeholder(tf.float32, shape=(None, vector_dimension))
    system_act_confirm_values = tf.placeholder(tf.float32, shape=(None, vector_dimension))

    #slot_values = tf.placeholder(tf.float32, shape=(None, vector_dimension))
    #candidate_values = tf.placeholder(tf.float32, shape=(None, vector_dimension))

    # Initial (distributional) vectors. Needed for L2 regularisation.
    # Both constants request the same name "W_init".
    W_slots = tf.constant(slot_vectors, name="W_init")
    W_values = tf.constant(value_vectors, name="W_init")

    # output label, i.e. True / False, 1-hot encoded:
    y_ = tf.placeholder(tf.float32, [None, label_size])

    # belief state carried over from the previous turn (fed by the caller):
    y_past_state = tf.placeholder(tf.float32, [None, label_size])

    # dropout placeholder, 0.5 for training, 1.0 for validation/testing:
    keep_prob = tf.placeholder("float")

    # constants useful for evaluation variables further below
    # (neither is referenced again in this function):
    ones = tf.constant(1.0, dtype="float")
    zeros = tf.constant(0.0, dtype="float")

    # hidden_utterance_size and filter_sizes are assigned but not read again in this function.
    hidden_utterance_size = vector_dimension

    filter_sizes = [1, 2, 3]
    num_filters = 300
    hidden_utterance_size = num_filters  #* len(filter_sizes)

    #candidate_sum = candidate_values + slot_values # to avoid summing these two multiple times later.

    #w_candidates = tf.Variable(tf.random_normal([vector_dimension, vector_dimension]))
    #b_candidates = tf.Variable(tf.zeros([vector_dimension]))

    #candidates = tf.nn.sigmoid(tf.matmul(candidate_sum, w_candidates) + b_candidates)
    #candidates = tf.nn.sigmoid(tf.matmul(candidate_values, w_candidates) + b_candidates)

    # filter needs to be of shape: filter_height = 1,2,3, filter_width=300, in_channel=1, out_channel=num_filters
    # filter just dot products - in images these then overlap from different regions - we don't have that.
    # define_CNN_model is defined outside this block; its output is multiplied element-wise with
    # vector_dimension-wide candidate vectors below, so presumably it is [None, vector_dimension] -- TODO confirm.
    h_utterance_representation = define_CNN_model(
        utterance_representations_full, num_filters, vector_dimension, longest_utterance_length)

    #candidate_sum = W_slots + W_values # size [label_size, vector_dimension]

    w_candidates = tf.Variable(tf.random_normal([vector_dimension, vector_dimension]))
    b_candidates = tf.Variable(tf.zeros([vector_dimension]))

    # multiply to get: [label_size, vector_dimension]
    candidates_transform = tf.nn.sigmoid(tf.matmul(W_values, w_candidates) + b_candidates)

    # Next, multiply candidates [label_size, vector_dimension] each with the utterance representations [None, vector_dimension], to get [None, label_size, vector_dimension]
    # or utterance [None, vector_dimension] X [vector_dimension, label_size] to get [None, label_size]

    #h_utterance_representation_candidate_interaction = tf.Variable(tf.zeros([None, label_size, vector_dimension]))

    list_of_value_contributions = []

    # get interaction of utterance with each value:
    for value_idx in range(0, label_count):
        list_of_value_contributions.append(
            tf.multiply(h_utterance_representation, candidates_transform[value_idx, :]))

    # Stack per-value results, move the batch axis first, then flatten so that a single matmul
    # applies the same hidden layer to every (example, value) pair.
    h_utterance_representation_candidate_interaction = tf.reshape(
        tf.transpose(tf.stack(list_of_value_contributions), [1, 0, 2]), [-1, vector_dimension])
    # the same transform now runs across each value's vector, multiplying.

    w_joint_hidden_layer = tf.Variable(tf.random_normal([vector_dimension, hidden_units_1]))
    b_joint_hidden_layer = tf.Variable(tf.zeros([hidden_units_1]))

    # now multiply [None, label_size, vector_dimension] by [vector_dimension, hidden_units_1], to get [None, label_size, hidden_units_1]
    hidden_layer_joint = tf.nn.sigmoid(
        tf.reshape(
            tf.matmul(h_utterance_representation_candidate_interaction, w_joint_hidden_layer) +
            b_joint_hidden_layer, [-1, label_count, hidden_units_1]))
    hidden_layer_joint_with_dropout = tf.nn.dropout(hidden_layer_joint, keep_prob)

    # next initialise parameters that go into a softmax, i.e. mapping [None, label_size, hidden_units_1] -> [None, label_size]
    w_joint_presoftmax = tf.Variable(tf.random_normal([hidden_units_1, 1]))  # collapse to 1
    b_joint_presoftmax = tf.Variable(tf.zeros([1]))  # collapse to 1

    y_presoftmax = tf.reshape(
        tf.matmul(
            tf.reshape(hidden_layer_joint_with_dropout, [-1, hidden_units_1]),
            w_joint_presoftmax) + b_joint_presoftmax, [-1, label_count])

    # for now we do not implement this
    sysreq_contributions = []  # a list of contributions for each of the values
    confirm_contributions = []  # a list of contributions for each of the values

    # =================== NETWORK FOR SYSTEM REQUESTS ==========================

    # is the current slot offered
    system_act_candidate_interaction = tf.multiply(
        W_slots[0, :], system_act_slots)  # only multiply with slots for the requests.
    dot_product_sysreq = tf.reduce_mean(system_act_candidate_interaction, 1)

    #full_ones = tf.ones([tf.shape(dot_product_sysreq)[0], 1])
    #dot_product = tf.cast(tf.equal(dot_product_sysreq, full_ones), "float32")

    # scale the utterance representation by the per-example slot-match score:
    decision = tf.multiply(tf.expand_dims(dot_product_sysreq, 1), h_utterance_representation)

    sysreq_w_hidden_layer = tf.Variable(tf.random_normal([vector_dimension, hidden_units_1]))
    sysreq_b_hidden_layer = tf.Variable(tf.zeros([hidden_units_1]))

    # allow each value to learn to map different utterances to yes. Mainly dontcare.
    # The hidden layer weights are shared; a fresh output weight/bias pair is created per value.
    for value_idx in range(0, label_count):

        sysreq_hidden_layer_1 = tf.nn.sigmoid(
            tf.matmul(decision, sysreq_w_hidden_layer) + sysreq_b_hidden_layer)
        sysreq_hidden_layer_1_with_dropout = tf.nn.dropout(
            sysreq_hidden_layer_1, keep_prob)

        sysreq_w_softmax = tf.Variable(tf.random_normal([hidden_units_1, 1]))
        sysreq_b_softmax = tf.Variable(tf.zeros([1]))

        sysreq_contribution = tf.matmul(sysreq_hidden_layer_1_with_dropout,
                                        sysreq_w_softmax) + sysreq_b_softmax

        sysreq_contributions.append(sysreq_contribution)

    sysreq = tf.concat(sysreq_contributions, 1)  #, [-1, label_size])

    # =================== NETWORK FOR CONFIRMATIONS ==========================

    # here, we do want to tie across all values, as it will get a different signal depending on whether both things match.
    confirm_w1_hidden_layer = tf.Variable(tf.random_normal([vector_dimension, hidden_units_1]))
    confirm_b1_hidden_layer = tf.Variable(tf.zeros([hidden_units_1]))

    confirm_w1_softmax = tf.Variable(tf.random_normal([hidden_units_1, 1]))
    confirm_b1_softmax = tf.Variable(tf.zeros([1]))

    for value_idx in range(0, label_count):

        dot_product = tf.multiply(
            tf.reduce_mean(
                tf.multiply(W_slots[0, :], system_act_confirm_slots), 1),
            tf.reduce_mean(
                tf.multiply(W_values[value_idx, :], system_act_confirm_values),
                1))  # dot product: slot equality and value equality

        # turn the score into a hard 0/1 gate: 1.0 only where it equals exactly 1.
        full_ones = tf.ones(tf.shape(dot_product))
        dot_product = tf.cast(tf.equal(dot_product, full_ones), "float32")

        decision = tf.multiply(tf.expand_dims(dot_product, 1), h_utterance_representation)

        confirm_hidden_layer_1 = tf.nn.sigmoid(
            tf.matmul(decision, confirm_w1_hidden_layer) + confirm_b1_hidden_layer)
        confirm_hidden_layer_1_with_dropout = tf.nn.dropout(
            confirm_hidden_layer_1, keep_prob)

        confirm_contribution = tf.matmul(
            confirm_hidden_layer_1_with_dropout,
            confirm_w1_softmax) + confirm_b1_softmax
        confirm_contributions.append(confirm_contribution)

    sysconf = tf.concat(confirm_contributions, 1)

    if use_softmax:

        # pad every score matrix with a zero column for the extra NONE label:
        append_zeros_none = tf.zeros([tf.shape(y_presoftmax)[0], 1])
        y_presoftmax = tf.concat([y_presoftmax, append_zeros_none], 1)

        append_zeros = tf.zeros([tf.shape(y_presoftmax)[0], 1])
        sysreq = tf.concat([sysreq, append_zeros], 1)
        sysconf = tf.concat([sysconf, append_zeros], 1)

    y_presoftmax = y_presoftmax + sysconf + sysreq

    if use_delex_features:
        y_presoftmax = y_presoftmax + utterance_representations_delex

    # value-specific decoder (disabled: the condition below is always False):
    if value_specific_decoder and False:

        h_utterance_representation_for_full_softmax = define_CNN_model(
            utterance_representations_full, num_filters, vector_dimension,
            longest_utterance_length)

        h_utterance_dropout = tf.nn.dropout(
            h_utterance_representation_for_full_softmax, keep_prob)

        ss_w_hidden_layer = tf.Variable(tf.random_normal([vector_dimension, hidden_units_1]))
        ss_b_hidden_layer = tf.Variable(tf.zeros([hidden_units_1]))

        ss_hidden_layer_1 = tf.nn.relu(
            tf.matmul(h_utterance_dropout, ss_w_hidden_layer) + ss_b_hidden_layer)
        ss_hidden_layer_1_with_dropout = tf.nn.dropout(ss_hidden_layer_1, keep_prob)

        ss_w_softmax = tf.Variable(tf.random_normal([hidden_units_1, label_size]))
        ss_b_softmax = tf.Variable(tf.zeros([label_size]))

        ss_contribution = tf.matmul(ss_hidden_layer_1_with_dropout,
                                    ss_w_softmax) + ss_b_softmax

        y_presoftmax += ss_contribution

    # as we are returning always, can't be null
    update_coefficient = tf.constant(0.49)

    if use_softmax:

        # The learned belief-state update below is disabled by the literal False, so the
        # baseline branch further down is the one that actually runs.
        if False:  #rule based #if learn_belief_state_update:

            if value_specific_decoder:  # value-specific update

                update_coefficient = tf.constant(0.8)

                ss_W_memory = tf.Variable(tf.random_normal([label_size, label_size]))
                ss_W_current = tf.Variable(tf.random_normal([label_size, label_size]))

                y_combine = tf.matmul(y_past_state, ss_W_memory) + tf.matmul(
                    y_presoftmax, ss_W_current)

            else:  #constrained

                update_coefficient = tf.constant(0.7)

                # each update matrix has one shared diagonal scalar and one shared off-diagonal scalar:
                a_memory = tf.Variable(tf.random_normal([1, 1]))
                diag_memory = a_memory * tf.diag(tf.ones(label_size))

                b_memory = tf.Variable(tf.random_normal([1, 1]))
                non_diag_memory = tf.matrix_set_diag(
                    b_memory * tf.ones([label_size, label_size]),
                    tf.zeros(label_size))

                W_memory = diag_memory + non_diag_memory

                a_current = tf.Variable(tf.random_normal([1, 1]))
                diag_current = a_current * tf.diag(tf.ones(label_size))

                b_current = tf.Variable(tf.random_normal([1, 1]))
                non_diag_current = tf.matrix_set_diag(
                    b_current * tf.ones([label_size, label_size]),
                    tf.zeros(label_size))

                W_current = diag_current + non_diag_current

                y_combine = tf.matmul(y_past_state, W_memory) + tf.matmul(
                    y_presoftmax, W_current
                )  #+ tf.matmul(sysreq, W_current_req) + tf.matmul(sysconf, W_current_conf)

            y = tf.nn.softmax(y_combine)  # + y_ss_update_contrib)

        else:
            # This code runs the baseline experiments reported in Footnote 2 in the paper.
            update_coefficient = tf.Variable(
                0.5)  #this scales the contribution of the current turn.
            y_combine = update_coefficient * y_presoftmax + (
                1 - update_coefficient) * y_past_state
            y = tf.nn.softmax(y_combine)

    else:
        y = tf.nn.sigmoid(
            y_presoftmax
        )  # for requestables, we just have turn-level binary decisions

    # ======================== LOSS ================================================================
    # Softmax cross-entropy on the combined logits when use_softmax is set;
    # otherwise the tensor named cross_entropy is actually a sum of squared errors.

    if use_softmax:
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=y_combine, labels=y_)
    else:
        cross_entropy = tf.reduce_sum(tf.square(y - y_))

    # ============================= EVALUATION =====================================================

    if use_softmax:
        # NOTE(review): tf.argmax returns class indices here, not 0/1 indicators, so the sums and
        # products below operate on index values -- confirm that precision, recall and f_score are
        # meaningful in this branch. Accuracy (index equality) is unaffected.
        predictions = tf.cast(tf.argmax(y, 1), "float32")  # will have ones where positive
        true_predictions = tf.cast(tf.argmax(y_, 1), "float32")
        correct_prediction = tf.cast(tf.equal(predictions, true_predictions), "float")

        accuracy = tf.reduce_mean(correct_prediction)
        # this will count number of positives - they are marked with 1 in true_predictions
        num_positives = tf.reduce_sum(true_predictions)
        # positives are indicated with ones.
        classified_positives = tf.reduce_sum(predictions)
        # will have ones in all places where both are predicting positives
        true_positives = tf.multiply(predictions, true_predictions)
        # if indicators for positive of both are 1, then it is positive.
        num_true_positives = tf.reduce_sum(true_positives)

        # no guard against zero denominators in the three ratios below
        recall = num_true_positives / num_positives
        precision = num_true_positives / classified_positives
        f_score = (2 * recall * precision) / (recall + precision)

    else:
        predictions = tf.cast(tf.round(y), "float32")  # will have ones where positive
        true_predictions = tf.cast(tf.round(y_), "float32")
        correct_prediction = tf.cast(tf.equal(predictions, true_predictions), "float")

        num_positives = tf.reduce_sum(true_predictions)
        classified_positives = tf.reduce_sum(predictions)
        true_positives = tf.multiply(predictions, true_predictions)
        num_true_positives = tf.reduce_sum(true_positives)

        # no guard against zero denominators in the three ratios below
        recall = num_true_positives / num_positives
        precision = num_true_positives / classified_positives
        f_score = (2 * recall * precision) / (recall + precision)

        accuracy = tf.reduce_mean(correct_prediction)

    optimizer = tf.train.AdamOptimizer(0.001)
    train_step = optimizer.minimize(cross_entropy)

    return keep_prob, utterance_representations_full, utterance_representations_delex, \
        system_act_slots, system_act_confirm_slots, system_act_confirm_values, \
        y_, y_past_state, accuracy, f_score, precision, \
        recall, num_true_positives, num_positives, classified_positives, y, \
        predictions, true_predictions, correct_prediction, true_positives, train_step, update_coefficient
def iwae(p_z,
         p_x_given_z,
         q_z,
         observations,
         num_samples,
         cvs,
         contexts=None,
         antithetic=False):
    """Computes a gradient of the IWAE estimator.

    Args:
      p_z: The prior. Should be a callable that accepts the (optional)
        conditioning context and returns a tfp.distributions.Distribution
        which has the log_prob and sample methods implemented. The
        distribution should be over a [batch_size, latent_dim] space.
      p_x_given_z: The likelihood. Called as `p_x_given_z(z, tiled_contexts)`
        with `z` of shape [num_samples, batch_size, latent_size] and the
        contexts tiled along the sample axis (or None); should return a
        tfd.Distribution over a [num_samples, batch_size, data_dim] space.
      q_z: The proposal. Called as
        `q_z(observations, contexts, stop_gradient=...)`; should return a
        distribution over [batch_size, latent_dim]. With antithetic sampling
        the returned distribution must also expose `loc`.
      observations: A float Tensor of shape [batch_size, data_dim] containing
        the observations.
      num_samples: The number of samples for the IWAE estimator.
      cvs: Control variate variables, a 4-tuple (alpha, beta, gamma, delta)
        used by the "dreg-cv" estimator.
      contexts: A float Tensor of shape [batch_size, context_dim] containing
        the contexts. (Optionally, none)
      antithetic: Whether to use antithetic sampling.

    Returns:
      estimators: Dictionary mapping an estimator name to a tuple
        (objective, neg_model_loss, neg_inference_network_loss).

    Raises:
      ValueError: If `antithetic` is set and `num_samples` is an odd Python
        integer.
    """
    # Antithetic sampling draws num_samples // 2 samples and mirrors them, so
    # an odd num_samples would yield one sample fewer than the context tiling,
    # the log(num_samples) normaliser and the jackknife diagonal assume. Fail
    # early with a clear message instead of an obscure shape error later on.
    if antithetic and isinstance(num_samples, int) and num_samples % 2:
        raise ValueError(
            "Antithetic sampling requires an even num_samples, got %d." %
            num_samples)

    alpha, beta, gamma, delta = cvs
    batch_size = tf.shape(observations)[0]
    proposal = q_z(observations, contexts, stop_gradient=False)
    # [num_samples, batch_size, latent_size]

    # If antithetic sampling, draw half of the samples and use the antithetics
    # for the other half.
    if antithetic:
        z_pos = proposal.sample(sample_shape=[num_samples // 2])
        # Reflect each sample about the proposal location.
        z_neg = 2 * proposal.loc - z_pos
        z = tf.concat((z_pos, z_neg), axis=0)
    else:
        z = proposal.sample(sample_shape=[num_samples])

    tiled_contexts = None
    if contexts is not None:
        tiled_contexts = tf.tile(
            tf.expand_dims(contexts, 0), [num_samples, 1, 1])
    likelihood = p_x_given_z(z, tiled_contexts)

    # Before reduce_sum is [num_samples, batch_size, latent_dim].
    # Sum over the latent dim.
    log_q_z = tf.reduce_sum(proposal.log_prob(z), axis=-1)
    # Before reduce_sum is [num_samples, batch_size, latent_dim].
    # Sum over latent dim.
    prior = p_z(contexts)
    log_p_z = tf.reduce_sum(prior.log_prob(z), axis=-1)
    # Before reduce_sum is [num_samples, batch_size, data_dim]
    log_p_x_given_z = tf.reduce_sum(
        likelihood.log_prob(observations), axis=-1)

    log_weights = log_p_z + log_p_x_given_z - log_q_z
    log_sum_weight = tf.reduce_logsumexp(log_weights, axis=0)
    log_avg_weight = log_sum_weight - tf.log(tf.to_float(num_samples))
    # Self-normalised importance weights; no gradient flows through them.
    normalized_weights = tf.stop_gradient(tf.nn.softmax(log_weights, axis=0))

    if FLAGS.image_summary:
        # For every batch element, pick the output of the highest-weight
        # sample.
        best_index = tf.to_int32(tf.argmax(normalized_weights, axis=0))
        indices = tf.stack((best_index, tf.range(0, batch_size)), axis=-1)
        best_images = tf.gather_nd(likelihood.probs_parameter(), indices)

        if FLAGS.dataset == "struct_mnist":
            tf.summary.image(
                "bottom_half",
                tf.reshape(best_images, [batch_size, -1, 28, 1]))
        else:
            tf.summary.image(
                "output", tf.reshape(best_images, [batch_size, -1, 28, 1]))
        tf.summary.image(
            "input", tf.reshape(observations, [batch_size, -1, 28, 1]))

    # Compute gradient estimators
    model_loss = log_avg_weight
    estimators = {}

    estimators["iwae"] = (log_avg_weight, log_avg_weight, log_avg_weight)

    # log q(z) with the samples treated as constants.
    stopped_z_log_q_z = tf.reduce_sum(
        proposal.log_prob(tf.stop_gradient(z)), axis=-1)
    estimators["rws"] = (log_avg_weight, model_loss,
                         tf.reduce_sum(
                             normalized_weights * stopped_z_log_q_z, axis=0))

    # Doubly reparameterized
    stopped_proposal = q_z(observations, contexts, stop_gradient=True)
    stopped_log_q_z = tf.reduce_sum(stopped_proposal.log_prob(z), axis=-1)
    stopped_log_weights = log_p_z + log_p_x_given_z - stopped_log_q_z
    sq_normalized_weights = tf.square(normalized_weights)

    estimators["stl"] = (log_avg_weight, model_loss,
                         tf.reduce_sum(
                             normalized_weights * stopped_log_weights,
                             axis=0))
    estimators["dreg"] = (log_avg_weight, model_loss,
                          tf.reduce_sum(
                              sq_normalized_weights * stopped_log_weights,
                              axis=0))
    estimators["rws-dreg"] = (
        log_avg_weight, model_loss,
        tf.reduce_sum(
            (normalized_weights - sq_normalized_weights) *
            stopped_log_weights,
            axis=0))

    # Add normed versions
    normalized_sq_normalized_weights = (
        sq_normalized_weights / tf.reduce_sum(
            sq_normalized_weights, axis=0, keepdims=True))
    estimators["dreg-norm"] = (
        log_avg_weight, model_loss,
        tf.reduce_sum(
            normalized_sq_normalized_weights * stopped_log_weights, axis=0))

    rws_dregs_weights = normalized_weights - sq_normalized_weights
    normalized_rws_dregs_weights = rws_dregs_weights / tf.reduce_sum(
        rws_dregs_weights, axis=0, keepdims=True)
    estimators["rws-dreg-norm"] = (
        log_avg_weight, model_loss,
        tf.reduce_sum(
            normalized_rws_dregs_weights * stopped_log_weights, axis=0))

    estimators["dreg-alpha"] = (
        log_avg_weight, model_loss,
        (1 - FLAGS.alpha) * estimators["dreg"][-1] +
        FLAGS.alpha * estimators["rws-dreg"][-1])

    # Jackknife
    # Column j of the tiled matrix holds every log weight except the j-th,
    # which the -inf diagonal removes from the logsumexp.
    loo_log_weights = tf.tile(
        tf.expand_dims(tf.transpose(log_weights), -1), [1, 1, num_samples])
    loo_log_weights = tf.matrix_set_diag(
        loo_log_weights, -np.inf * tf.ones([batch_size, num_samples]))
    loo_log_avg_weight = tf.reduce_mean(
        tf.reduce_logsumexp(loo_log_weights, axis=1) -
        tf.log(tf.to_float(num_samples - 1)),
        axis=-1)
    jk_model_loss = (num_samples * log_avg_weight -
                     (num_samples - 1) * loo_log_avg_weight)

    estimators["jk"] = (jk_model_loss, jk_model_loss, jk_model_loss)

    # Compute JK w/ DReG for the inference network
    loo_normalized_weights = tf.reduce_mean(
        tf.square(tf.stop_gradient(tf.nn.softmax(loo_log_weights, axis=1))),
        axis=-1)
    estimators["jk-dreg"] = (
        jk_model_loss, jk_model_loss,
        num_samples * tf.reduce_sum(
            sq_normalized_weights * stopped_log_weights, axis=0) -
        (num_samples - 1) * tf.reduce_sum(
            tf.transpose(loo_normalized_weights) * stopped_log_weights,
            axis=0))

    # Compute control variates
    # Leave-one-out logsumexp of the log weights, built like the jackknife
    # matrix above but without the log(num_samples - 1) normalisation.
    loo_baseline = tf.expand_dims(tf.transpose(log_weights), -1)
    loo_baseline = tf.tile(loo_baseline, [1, 1, num_samples])
    loo_baseline = tf.matrix_set_diag(
        loo_baseline, -np.inf * tf.ones_like(tf.transpose(log_weights)))
    loo_baseline = tf.reduce_logsumexp(loo_baseline, axis=1)
    loo_baseline = tf.transpose(loo_baseline)

    learning_signal = tf.stop_gradient(tf.expand_dims(
        log_avg_weight, 0)) - (1 - gamma) * tf.stop_gradient(loo_baseline)
    vimco = tf.reduce_sum(learning_signal * stopped_z_log_q_z, axis=0)

    first_part = alpha * vimco + (1 - alpha) * tf.reduce_sum(
        normalized_weights * stopped_log_weights, axis=0)
    second_part = (
        (1 - beta) * (tf.reduce_sum(
            ((1 - delta) / tf.to_float(num_samples) - normalized_weights) *
            stopped_z_log_q_z,
            axis=0)) +
        beta * tf.reduce_sum(
            (sq_normalized_weights - normalized_weights) *
            stopped_log_weights,
            axis=0))
    estimators["dreg-cv"] = (log_avg_weight, model_loss,
                             first_part + second_part)

    return estimators