def add_noise(data, noise, dataset):
    """Corrupt ``data`` with the input noise described by ``noise``.

    :param data: tensor to corrupt; returned untouched when noise is disabled
    :param noise: mapping with a ``'noise_type'`` entry and, for the
        stochastic variants, a ``'prob'`` entry used as Bernoulli probability
    :param dataset: mapping whose ``'binary'`` entry picks the concrete noise
        when ``noise_type`` is ``'data'``
    :return: ``data`` itself, or a corrupted tensor with the same static shape
    :raises KeyError: if the noise type is not recognised
    """
    kind = noise['noise_type']
    if kind in ['None', 'none', None]:
        return data

    if kind == 'data':
        # 'data' defers the choice to the dataset: binary data gets bit
        # flips, everything else gets masked uniform noise.
        kind = 'bitflip' if dataset['binary'] else 'masked_uniform'

    with tf.name_scope('input_noise'):
        # Use static dimensions where they are known and fall back to the
        # dynamic shape for the ones that are not.
        dims = []
        for axis, dim in enumerate(data.get_shape()):
            dims.append(dim.value if dim.value is not None else tf.shape(data)[axis])
        shape = tf.stack(dims)

        if kind == 'bitflip':
            flip_dist = dist.Bernoulli(probs=noise['prob'], dtype=data.dtype)
            flips = flip_dist.sample(shape)
            # Arithmetic form of (data XOR flips) for 0/1 valued tensors.
            corrupted = data + flips - 2 * data * flips
        elif kind == 'masked_uniform':
            uniform_dist = dist.Uniform(low=0., high=1.)
            uniform_noise = uniform_dist.sample(shape)
            # Bernoulli mask selecting which entries are replaced by noise.
            mask_dist = dist.Bernoulli(probs=noise['prob'], dtype=data.dtype)
            mask = mask_dist.sample(shape)
            corrupted = mask * uniform_noise + (1 - mask) * data
        else:
            raise KeyError('Unknown noise_type "{}"'.format(kind))

        corrupted.set_shape(data.get_shape())
        return corrupted
def __init__(self, n_hidden, steps_bias=0., max_rel_logit_change=np.inf, max_logit_change=np.inf, **kwargs):
    """Store the predictor configuration and create the logit-change limits.

    :param n_hidden: stored as ``self._n_hidden``
    :param steps_bias: stored as ``self._steps_bias``
    :param max_rel_logit_change: float; maximum relative logit change since
        the previous time-step. ``np.inf`` leaves it as a plain float,
        any other value is wrapped in a non-trainable scalar variable
    :param max_logit_change: float; handled like ``max_rel_logit_change``;
        only one of the two may differ from ``np.inf``
    :param kwargs: forwarded to every ``tfd.Bernoulli`` built by
        ``self._bernoulli``
    :raises ValueError: if both limits are set
    """
    super(StepsPredictor, self).__init__()
    self._n_hidden = n_hidden
    self._steps_bias = steps_bias
    self._max_rel_logit_change = max_rel_logit_change
    self._bernoulli = lambda logits: tfd.Bernoulli(logits=logits, dtype=tf.float32, **kwargs)

    def _frozen_scalar(var_name, value):
        # Non-trainable scalar variable initialised to `value`.
        return tf.get_variable(var_name, shape=[],
                               initializer=tf.constant_initializer(value),
                               trainable=False)

    with self._enter_variable_scope():
        has_rel_limit = max_rel_logit_change != np.inf
        has_abs_limit = max_logit_change != np.inf
        if has_abs_limit and has_rel_limit:
            raise ValueError('Only one of max_logit_change and max_rel_logit_change can be used!')

        if has_rel_limit:
            max_rel_logit_change = _frozen_scalar('max_rel_logit_change', max_rel_logit_change)
        self._max_rel_logit_change = max_rel_logit_change

        if has_abs_limit:
            max_logit_change = _frozen_scalar('max_logit_change', max_logit_change)
        self._max_logit_change = max_logit_change
def output_function(self, state):
    """Sample one output row per batch element from the mixture head.

    ``state.h3`` is projected to mixture parameters, a full-covariance
    bivariate normal is built per mixture component, one component is drawn
    per batch row, and that component's sample is concatenated with a
    Bernoulli draw.

    :param state: object exposing the ``h3`` tensor fed to the dense layer
    :return: ``tf.concat([coords, float32(sampled_e)], axis=1)``
    """
    gmm_params = dense_layer(
        state.h3, self.output_units, scope='gmm',
        reuse=tf.compat.v1.AUTO_REUSE)
    pis, mus, sigmas, rhos, es = self._parse_parameters(gmm_params)

    # Means: the two halves become the last axis of the location tensor.
    mu_first, mu_second = tf.split(mus, 2, axis=1)
    locs = tf.stack([mu_first, mu_second], axis=2)

    # Covariance entries in row-major order of the 2x2 matrix.
    sigma_first, sigma_second = tf.split(sigmas, 2, axis=1)
    covar_entries = tf.stack(
        [
            tf.square(sigma_first),
            rhos * sigma_first * sigma_second,
            rhos * sigma_first * sigma_second,
            tf.square(sigma_second),
        ],
        axis=2)
    covariance = tf.reshape(
        covar_entries,
        (self.batch_size, self.num_output_mixture_components, 2, 2))

    coord_dist = tfd.MultivariateNormalFullCovariance(
        loc=locs, covariance_matrix=covariance)
    e_dist = tfd.Bernoulli(probs=es)
    component_dist = tfd.Categorical(probs=pis)

    # Sampling order is kept as in the original graph construction.
    sampled_e = e_dist.sample()
    sampled_coords = coord_dist.sample()
    sampled_idx = component_dist.sample()

    # Pick, for each batch row, the coordinates of its sampled component.
    row_and_component = tf.stack(
        [tf.range(self.batch_size), sampled_idx], axis=1)
    coords = tf.gather_nd(sampled_coords, row_and_component)
    return tf.concat([coords, tf.cast(sampled_e, tf.float32)], axis=1)
def construct_masked_inputs(self):
    """
    Sample a Bernoulli keep-mask and split ``self.batch_of_users`` with it.

    Subclasses are expected to override this if they need other inputs.

    Fields set on ``self``:
        mask                    The sampled 0/1 mask.
        network_input           ``batch_of_users * mask``.
        remaining_input         ``batch_of_users * (1 - mask)``.
        number_of_good_items    Row sums of ``batch_of_users``.
        number_of_unseen_items  Row sums of ``remaining_input``.
        number_of_seen_items    Row sums of ``network_input``.
    """
    users = self.batch_of_users
    keep_dist = ds.Bernoulli(probs=self.keep_prob_ph, dtype=tf.float32)
    keep_mask = keep_dist.sample(
        sample_shape=[self.batch_size, self.input_dim])
    # Complement of the mask: selects exactly what the network does not see.
    drop_mask = 1 - keep_mask

    visible = users * keep_mask
    hidden = users * drop_mask

    good_count = tf.reduce_sum(users, axis=-1)
    unseen_count = tf.reduce_sum(hidden, axis=-1)
    seen_count = tf.reduce_sum(visible, axis=-1)

    self.mask = keep_mask
    self.network_input = visible
    self.remaining_input = hidden
    self.number_of_good_items = good_count
    self.number_of_unseen_items = unseen_count
    self.number_of_seen_items = seen_count
def decode(self, prev_state, prev_input, timestep):
    """Run one decoding step and record its Bernoulli sample.

    Appends the sample and the attention scores to ``self.samples`` and
    ``self.mask_scores``, and refreshes ``self.mask`` with a one-hot row
    for the current ``timestep``.

    :param prev_state: recurrent state from the previous step
    :param prev_input: input for this step; its static first dimension is
        used as the batch size
    :param timestep: Python integer step index
    :return: ``(state, new_decoder_input)``
    """
    with tf.variable_scope("loop"):
        # Variables are created on the first step and shared afterwards.
        if timestep > 0:
            tf.get_variable_scope().reuse_variables()

        # Run the cell on a combination of the previous input and state.
        output, state = self.cell(prev_input, prev_state)

        # One index per batch row, all equal to the current timestep.
        batch_rows = prev_input.shape[0]
        position = tf.cast(tf.ones([batch_rows]) * timestep, tf.int32)

        # Update mask
        self.mask = tf.one_hot(position, self.seq_length)

        # Attention mechanism
        masked_scores = self.attention(self.encoder_output, output)

        # The scores parameterise a Bernoulli (passed as its first
        # positional argument) from which this step's row is sampled.
        step_dist = distr.Bernoulli(masked_scores)
        self.samples.append(step_dist.sample())
        self.mask_scores.append(masked_scores)

        if timestep == 0:
            self.first_city = position
            self.first_city_hot = tf.one_hot(self.first_city, self.seq_length)

        # Retrieve decoder's new input
        new_decoder_input = tf.gather(self.h, position)[0]
        return state, new_decoder_input
def make_decoder(z, x_shape=(1, 20, 1)):
    ''' Decoder: p(x|z), an Independent Bernoulli over reshaped logits. '''
    # NOTE(review): 20 equals the product of the default x_shape; presumably
    # it has to change together with x_shape -- confirm what make_nn's
    # second argument means.
    hidden_out = make_nn(z, 20)
    target_shape = tf.concat([[-1], x_shape], axis=0)
    bernoulli = tfd.Bernoulli(tf.reshape(hidden_out, target_shape))
    return tfd.Independent(bernoulli)
def _make_decoder(code, data_shape):
    """Build the decoder network and return the distribution over the data.

    Two 200-unit ReLU layers are followed by a linear layer whose output is
    reshaped to ``[-1] + data_shape`` and used as Bernoulli logits.

    :param code: latent code tensor fed to the first dense layer
    :param data_shape: list or tuple with the shape of one data item
    :return: ``tfd.Independent`` wrapping a ``tfd.Bernoulli`` with the last
        two batch dimensions reinterpreted as event dimensions
    """
    with tf.variable_scope('decoder'):
        x = code
        x = tf.layers.dense(x, 200, tf.nn.relu)
        x = tf.layers.dense(x, 200, tf.nn.relu)
        logit = tf.layers.dense(x, _prod(data_shape))
        # list(...) so that a tuple data_shape does not raise TypeError on
        # concatenation with the leading batch dimension.
        logit = tf.reshape(logit, [-1] + list(data_shape))
        # NOTE(review): the 2 assumes len(data_shape) == 2 -- confirm.
        return tfd.Independent(tfd.Bernoulli(logit), 2)
def sample(self, n=None):
    """Sample step counts from the per-step Bernoulli probabilities.

    A Bernoulli draw is taken for every step; the cumulative product along
    the last axis zeroes everything after the first 0, and the sum of what
    remains is the number of leading successes.

    :param n: sample shape forwarded to ``Bernoulli.sample``; ``None`` draws
        a single sample (empty sample shape)
    :return: tensor of counts with the last axis reduced away
    """
    if self._bernoulli is None:
        # The first positional argument of Bernoulli is `logits`; the stored
        # values are probabilities, so they must be passed by keyword.
        self._bernoulli = tfd.Bernoulli(probs=self._steps_probs)

    # `None` is not a valid sample shape, so map it to the empty shape.
    draws = self._bernoulli.sample(() if n is None else n)
    draws = tf.cumprod(draws, tf.rank(draws) - 1)
    return tf.reduce_sum(draws, -1)
def make_decoder(code, data_shape):
    """Build the decoder network and return the distribution over the data.

    Two ReLU layers of width ``hidden`` (defined outside this function) are
    followed by a linear layer whose output is reshaped to
    ``[-1] + data_shape`` and used as Bernoulli logits.

    :param code: latent code tensor fed to the first dense layer
    :param data_shape: list or tuple with the shape of one data item
    :return: ``tfd.Independent`` wrapping a ``tfd.Bernoulli`` with the last
        two batch dimensions reinterpreted as event dimensions
    """
    x = code
    x = tf.layers.dense(x, hidden, tf.nn.relu)
    x = tf.layers.dense(x, hidden, tf.nn.relu)
    logit = tf.layers.dense(x, np.prod(data_shape))
    # list(...) so that a tuple data_shape does not raise TypeError on
    # concatenation with the leading batch dimension.
    logit = tf.reshape(logit, [-1] + list(data_shape))
    # NOTE(review): the 2 assumes len(data_shape) == 2 -- confirm.
    return tfd.Independent(tfd.Bernoulli(logit), 2)
def decode(self, encoder_output):
    """Score every ordered node pair and sample one adjacency row per node.

    Pairwise logits are ``U . act(W_l x_i + W_r x_j)``. For each row ``i``
    the diagonal entry is pushed to a very negative value to rule out
    self-loops, then a Bernoulli sample, the masked scores and the entropy
    are appended to ``self.samples``, ``self.mask_scores`` and
    ``self.entropy``.

    :param encoder_output: tensor of size
        [batch_size, max_length, input_embed]; the batch dimension must be
        statically known
    :return: ``(self.samples, self.mask_scores, self.entropy)``
    :raises NotImplementedError: for an unknown ``self.decoder_activation``
    """
    # The scope name is kept verbatim because it is part of variable names.
    with tf.variable_scope('singe_layer_nn'):
        W_l = tf.get_variable('weights_left', [self.input_embed, self.decoder_hidden_dim],
                              initializer=self.initializer)
        W_r = tf.get_variable('weights_right', [self.input_embed, self.decoder_hidden_dim],
                              initializer=self.initializer)
        U = tf.get_variable('U', [self.decoder_hidden_dim], initializer=self.initializer)

    # Project the encoder output from [batch, var, input_embed] to
    # [batch, var, decoder_hidden_dim], once per side of the pair.
    dot_l = tf.einsum('ijk, kl->ijl', encoder_output, W_l)
    dot_r = tf.einsum('ijk, kl->ijl', encoder_output, W_r)

    # `axis` is the position at which the new size-1 dimension is inserted.
    exp_l = tf.expand_dims(dot_l, axis=2)  # [batch, var, 1, decoder_hid]
    exp_r = tf.expand_dims(dot_r, axis=1)  # [batch, 1, var, decoder_hid]
    tiled_l = tf.tile(exp_l, (1, 1, self.max_length, 1))
    tiled_r = tf.tile(exp_r, (1, self.max_length, 1, 1))

    if self.decoder_activation == 'tanh':    # Original implementation by paper
        final_sum = tf.nn.tanh(tiled_l + tiled_r)
    elif self.decoder_activation == 'relu':
        final_sum = tf.nn.relu(tiled_l + tiled_r)
    elif self.decoder_activation == 'none':    # Without activation function
        final_sum = tiled_l + tiled_r
    else:
        raise NotImplementedError('Current decoder activation is not implemented yet')

    # final_sum is of shape (batch_size, max_length, max_length, decoder_hidden_dim)
    logits = tf.einsum('ijkl, l->ijk', final_sum, U)

    if self.bias_initial_value is None:    # Randomly initialize the learnable bias
        self.logit_bias = tf.get_variable('logit_bias', [1])
    elif self.use_bias_constant:    # Constant bias
        self.logit_bias = tf.constant([self.bias_initial_value], tf.float32, name='logit_bias')
    else:    # Learnable bias with initial value
        if self.use_bias:  # only create the variable when the bias is used
            # dtype must be given by keyword: the second positional
            # parameter of tf.Variable is `trainable`, not the dtype.
            self.logit_bias = tf.Variable([self.bias_initial_value], dtype=tf.float32,
                                          name='logit_bias')

    if self.use_bias:    # Bias to control sparsity/density
        logits += self.logit_bias

    self.adj_prob = logits

    for i in range(self.max_length):
        position = tf.ones([encoder_output.shape[0]]) * i
        position = tf.cast(position, tf.int32)

        # Update mask
        self.mask = tf.one_hot(position, self.max_length)

        # Subtracting a huge constant on the diagonal avoids self-loops.
        masked_score = self.adj_prob[:, i, :] - 100000000. * self.mask
        # The scores are passed as the first positional argument of Bernoulli.
        prob = distr.Bernoulli(masked_score)

        sampled_arr = prob.sample()  # [batch_size, max_length] for one node

        self.samples.append(sampled_arr)
        self.mask_scores.append(masked_score)
        self.entropy.append(prob.entropy())

    return self.samples, self.mask_scores, self.entropy
def make_decoder(z, x_shape=(x_dim,)):
    ''' Decoder: p(x|z), an Independent Bernoulli with one event dimension. '''
    with tf.variable_scope("decoder"):
        hidden_out = make_nn(z, x_dim)
        print('decoder net', hidden_out)
        # Leading axis is the number of z samples; the batch axis is inferred.
        target_shape = tf.concat([[nb_z_samples, -1], x_shape], axis=0)
        logits = tf.reshape(hidden_out, target_shape)
        print('logits', logits)
        pixel_dist = tfd.Bernoulli(logits)
        return tfd.Independent(pixel_dist, reinterpreted_batch_ndims=1)
def generative_model(observations, samples, is_training, latent_layer_dims, nn_layers):
    """Build the top-down generative pass for reconstruction and generation.

    ``samples`` and ``latent_layer_dims`` are reversed before use. The
    reconstruction pass collects the Gaussian parameters of every layer and
    the Bernoulli log-likelihood of ``observations``. The generation pass
    starts from 16 random one-hot vectors of depth 10.

    :param observations: data tensor; its second static dimension is the
        output size of the Bernoulli layer
    :param samples: sequence of tensors fed to ``generator_net``
    :param is_training: forwarded to ``generator_net``
    :param latent_layer_dims: output sizes of the Gaussian layers
    :param nn_layers: ``nn_layers[0]`` and ``nn_layers[1]`` are forwarded to
        ``generator_net``
    :return: ``(probs, p_gen, p_x, mean_list, var_list)`` where ``probs``
        and ``p_gen`` come from the generation pass
    """
    samples = list(reversed(samples))
    latent_layer_dims = list(reversed(latent_layer_dims))

    mu, sigma_sq = generator_net(samples[0], is_training, nn_layers[0], nn_layers[1],
                                 latent_layer_dims[0], 'gaussian')
    mean_list = [mu]
    var_list = [sigma_sq]
    # Built but never returned; kept so the constructed graph is unchanged.
    p_lls = []

    # reconstruction of training samples
    for i in range(1, len(samples) - 1):
        mu, sigma_sq = generator_net(samples[i], is_training, nn_layers[0], nn_layers[1],
                                     latent_layer_dims[i], 'gaussian')
        # NOTE(review): the second positional argument of
        # MultivariateNormalDiag is the scale, yet the value is named
        # sigma_sq -- confirm what generator_net returns.
        p_lls.append(dist.MultivariateNormalDiag(mu, sigma_sq))
        mean_list.append(mu)
        var_list.append(sigma_sq)

    probs = generator_net(samples[-1], is_training, nn_layers[0], nn_layers[1],
                          observations.get_shape().as_list()[1], likelihood='bernoulli')
    p_x = bernoulli_log_likelihood(observations, probs)

    # generation of novel samples
    # maxval is exclusive, so 10 yields indices 0..9; the previous value 11
    # also produced index 10, which one_hot(depth=10) maps to all zeros.
    sample_gen = tf.random_uniform([16], maxval=10, dtype=tf.int32)
    sample_gen = tf.one_hot(sample_gen, 10)

    mu_gen, sigma_sq_gen = generator_net(sample_gen, is_training, nn_layers[0], nn_layers[1],
                                         latent_layer_dims[0], 'gaussian')
    gen_samples = [dist.MultivariateNormalDiag(mu_gen, sigma_sq_gen).sample()]

    for i in range(1, len(latent_layer_dims) - 1):
        # NOTE(review): this conditions on the training `samples[i]` rather
        # than on the previously generated sample -- confirm it is intended.
        mu, sigma_sq = generator_net(samples[i], is_training, nn_layers[0], nn_layers[1],
                                     latent_layer_dims[i], 'gaussian')
        gen_samples.append(dist.MultivariateNormalDiag(mu, sigma_sq).sample())

    probs = generator_net(gen_samples[-1], is_training, nn_layers[0], nn_layers[1],
                          observations.get_shape().as_list()[1], likelihood='bernoulli')
    p_gen = dist.Bernoulli(probs=probs).sample()

    return probs, p_gen, p_x, mean_list, var_list
def _build(self, timestep, previous_presence, *_):
    """Return a near-deterministic Bernoulli that depends only on the step.

    With ``self.discovery`` set, the logit is +88 at ``timestep == 0`` and
    -88 afterwards; otherwise the signs are swapped. The logits are
    broadcast to the shape of ``previous_presence``.

    :param timestep: scalar compared against 0
    :param previous_presence: tensor that only provides the output shape
    :return: ``tfd.Bernoulli`` with float32 samples
    """
    is_first = tf.cast(tf.equal(timestep, 0), tf.float32)

    if self.discovery:
        first_logit, later_logit = 88., -88.
    else:
        first_logit, later_logit = -88., 88.

    step_logit = first_logit * is_first + (1 - is_first) * later_logit
    logits = step_logit * tf.ones_like(previous_presence)
    return tfd.Bernoulli(logits=logits, dtype=tf.float32)
def bernoulli_generative_network(z, hidden_units, n_features):
    """Map latent codes to a Bernoulli distribution over the features.

    :param z: latent input tensor
    :param hidden_units: layer widths passed to ``slim.stack``
    :param n_features: width of the final linear (logit) layer
    :return: ``distributions.Bernoulli`` parameterised by the logits
    """
    # ReLU is the default activation for the stacked hidden layers.
    with slim.arg_scope([slim.fully_connected], activation_fn=tf.nn.relu):
        hidden = slim.stack(z, slim.fully_connected, hidden_units,
                            scope='decoder_network')
        # The output layer overrides the default and stays linear.
        logits = slim.fully_connected(hidden, n_features, activation_fn=None)
    return distributions.Bernoulli(logits=logits)
def _build(self, inputs, hvar_labels, n_samples=10, analytic_kl=True):
    """Build the model graph and store its loss terms on ``self``.

    Sets ``hvar_prior``, ``hvar_posterior``, ``latent_posterior``,
    ``output_distribution``, ``elbo``, ``importance_weighted_elbo``,
    ``hvar_sample``, ``hvar_cross_entropy``, ``hvar_labels``,
    ``distortion``, ``rate`` and ``hrate``. Nothing is returned.

    Args:
        inputs: data tensor; all dimensions after the first are treated as
            event dimensions of the output distribution.
        hvar_labels: used as the logits of the relaxed-categorical prior and
            as labels of the cross-entropy term.
        n_samples: number of posterior samples to draw.
        analytic_kl: use ``tfd.kl_divergence`` for the latent rate; this is
            only done when ``n_samples == 1``.
    """
    datum_shape = inputs.get_shape().as_list()[1:]
    enc_repr = self._encoder(inputs)

    self.hvar_prior = tfd.ExpRelaxedOneHotCategorical(
        temperature=self._temperature, logits=hvar_labels)
    self.hvar_posterior = tfd.ExpRelaxedOneHotCategorical(
        temperature=self._temperature, logits=self._hvar(enc_repr))
    # Reshape to a fully static [n_samples, batch..., event...] shape.
    hvar_sample_shape = [n_samples] + self.hvar_posterior.batch_shape.as_list() + self.hvar_posterior.event_shape.as_list()
    hvar_sample = tf.reshape(self.hvar_posterior.sample(n_samples), hvar_sample_shape)

    self.latent_posterior = self._latent_posterior_fn(
        self._loc(enc_repr), self._scale(enc_repr))
    latent_posterior_sample = self.latent_posterior.sample(n_samples)

    # The decoder sees both sampled variables concatenated on the last axis.
    joint_sample = tf.concat([hvar_sample, latent_posterior_sample], axis=-1)

    sample_decoder = snt.BatchApply(self._decoder)
    self.output_distribution = tfd.Independent(
        tfd.Bernoulli(logits=sample_decoder(joint_sample)),
        reinterpreted_batch_ndims=len(datum_shape))

    distortion = -self.output_distribution.log_prob(inputs)
    if analytic_kl and n_samples == 1:
        rate = tfd.kl_divergence(self.latent_posterior, self.latent_prior)
    else:
        # Sample-based estimate of the same quantity.
        rate = (self.latent_posterior.log_prob(latent_posterior_sample) - self.latent_prior.log_prob(latent_posterior_sample))
    hrate = self.hvar_posterior.log_prob(
        hvar_sample) - self.hvar_prior.log_prob(hvar_sample)
    # (Debugging tf.Print hooks for hrate used to live here.)
    elbo_local = -(rate + hrate + distortion)
    self.elbo = tf.reduce_mean(elbo_local)
    # log-mean-exp over the sample axis, then averaged.
    self.importance_weighted_elbo = tf.reduce_mean(
        tf.reduce_logsumexp(elbo_local, axis=0) - tf.log(tf.to_float(n_samples)))

    # Only the first of the n_samples draws is exposed; exp() maps the
    # ExpRelaxed sample out of log space.
    self.hvar_sample = tf.exp(tf.split(hvar_sample, n_samples)[0])
    self.hvar_cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=hvar_labels, logits=tf.split(hvar_sample, n_samples)[0])
    self.hvar_labels = hvar_labels
    self.distortion = distortion
    self.rate = rate
    self.hrate = hrate
def __init__(self, feature_ids, probs=None):
    """Create a trainable Bernoulli leaf over ``feature_ids``.

    :param feature_ids: sequence of feature ids, forwarded to the base
        class; its length sets the number of parameters when ``probs`` is
        not given
    :param probs: optional initial success probabilities; drawn uniformly
        from [0, 1) when omitted. Values of exactly 0 or 1 give infinite
        logits.

    ``self.probs`` ends up holding the trainable *logits* (log-odds) of the
    distribution, so the initial distribution has exactly the requested
    probabilities.
    """
    super().__init__(feature_ids)
    if probs is None:
        initial_probs = tf.random_uniform([len(feature_ids)], minval=0, maxval=1, dtype=spn_type)
    else:
        initial_probs = tf.constant(probs, dtype=spn_type)

    # Bernoulli(logits=...) applies a sigmoid, so the variable must store
    # log(p / (1 - p)). Storing log(p) would yield probability p / (1 + p).
    initial_logits = tf.log(initial_probs) - tf.log(1. - initial_probs)
    self.probs = tf.Variable(initial_logits, trainable=True, dtype=spn_type)
    self.distributions = dist.Bernoulli(logits=self.probs)
def __init__(self, region, args, name, given_params=None, num_dims=0):
    """Create a Bernoulli leaf whose logits come from the parameter provider.

    :param region: collection of variable ids covered by this leaf
    :param args: configuration object; ``num_gauss`` and ``param_provider``
        are read from it
    :param name: leaf name, also used (with a ``_p`` suffix) for the
        parameters
    :param given_params: accepted but not used here
    :param num_dims: stored as ``self.num_dims``
    """
    super().__init__(name)
    self.local_size = len(region)
    self.args = args
    self.scope = sorted(region)
    self.size = args.num_gauss
    self.num_dims = num_dims
    self.np_params = None

    provider = self.args.param_provider
    self.params = provider.grab_leaf_parameters(
        self.scope, args.num_gauss, name=name + "_p")
    # The provided parameters are interpreted as logits.
    self.dist = dists.Bernoulli(logits=self.params)
def __init__(self, region, args, name, given_params=None, p=-0.7):
    """Create a Bernoulli leaf with weight-decayed parameters.

    :param region: collection of variable ids covered by this leaf
    :param args: configuration object; ``num_univ_distros`` and
        ``gauss_param_l2`` are read from it
    :param name: leaf name, prefix of the parameter variable name
    :param given_params: forwarded as ``values`` to the variable factory
    :param p: forwarded as ``p`` to the variable factory
    """
    super().__init__(name)
    self.local_size = len(region)
    self.args = args
    self.scope = sorted(region)
    self.size = args.num_univ_distros

    param_shape = [1, self.local_size, self.size]
    # Despite the attribute name, the values are used as logits below.
    self.probs = bernoulli_variable_with_weight_decay(
        name + "_bernoulli_params",
        shape=param_shape,
        wd=args.gauss_param_l2,
        p=p,
        values=given_params,
    )
    self.dist = dists.Bernoulli(logits=self.probs)
def decode(self, encoder_output):
    """Score every ordered node pair bilinearly and sample adjacency rows.

    Pairwise logits are ``x_i^T W x_j``. For each row ``i`` the diagonal
    entry is pushed to a very negative value to rule out self-loops, then a
    Bernoulli sample, the masked scores and the entropy are appended to
    ``self.samples``, ``self.mask_scores`` and ``self.entropy``.

    :param encoder_output: tensor of size
        [batch_size, max_length, input_embed]; the batch dimension must be
        statically known
    :return: ``(self.samples, self.mask_scores, self.entropy)``
    """
    with tf.variable_scope('bilinear'):
        W = tf.get_variable('bilinear_weights', [self.input_embed, self.input_embed],
                            initializer=self.initializer)

    logits = tf.einsum('ijk, kn, imn->ijm', encoder_output, W, encoder_output)

    if self.bias_initial_value is None:    # Randomly initialize the learnable bias
        self.logit_bias = tf.get_variable('logit_bias', [1])
    elif self.use_bias_constant:    # Constant bias
        self.logit_bias = tf.constant([self.bias_initial_value], tf.float32, name='logit_bias')
    else:    # Learnable bias with initial value
        # dtype must be given by keyword: the second positional parameter
        # of tf.Variable is `trainable`, not the dtype.
        self.logit_bias = tf.Variable([self.bias_initial_value], dtype=tf.float32,
                                      name='logit_bias')

    if self.use_bias:    # Bias to control sparsity/density
        logits += self.logit_bias

    self.adj_prob = logits

    for i in range(self.max_length):
        position = tf.ones([encoder_output.shape[0]]) * i
        position = tf.cast(position, tf.int32)

        # Update mask
        self.mask = tf.one_hot(position, self.max_length)

        # Subtracting a huge constant on the diagonal avoids self-loops.
        masked_score = self.adj_prob[:, i, :] - 100000000. * self.mask
        # The scores are passed as the first positional argument of Bernoulli.
        prob = distr.Bernoulli(masked_score)

        sampled_arr = prob.sample()  # [batch_size, max_length] for one node

        self.samples.append(sampled_arr)
        self.mask_scores.append(masked_score)
        self.entropy.append(prob.entropy())

    return self.samples, self.mask_scores, self.entropy
def _build_model(self):
    """Build the VAE graph: placeholders, encoder/decoder, loss, summaries.

    Sets, among others, ``self.x``, ``self.noise``, ``self.p_z``,
    ``self.q_z``, ``self.x_hat``, ``self.p_x_z``, ``self.loss``,
    ``self.elbo``, ``self.train_op``, ``self.z``, ``self.z_pl``,
    ``self.sample`` and ``self.merged``.

    The decoder output ``x_hat`` is used as per-pixel probabilities: it is
    fed to ``tf.log`` in the reconstruction loss and logged as an image.
    """
    # input points
    self.x = tf.placeholder(tf.float32, shape=[None, int(np.prod(self.x_dims))], name="X")
    # The noise placeholder only provides the shape of the prior.
    self.noise = tf.placeholder(tf.float32, shape=[None, self.z_dim], name="noise")
    self.p_z = dbns.Normal(loc=tf.zeros_like(self.noise), scale=tf.ones_like(self.noise))

    # encoder: first z_dim columns are log-sigma, the remaining ones the mean
    z_params = self.encoder(self.x)
    z_mu = z_params[:, self.z_dim:]
    z_sigma = tf.exp(z_params[:, :self.z_dim])
    self.q_z = dbns.Normal(loc=z_mu, scale=z_sigma)

    # reparameterization trick
    z = z_mu + tf.multiply(z_sigma, self.p_z.sample())

    # decoder
    self.x_hat = self.decoder(z)
    # x_hat holds probabilities (see the loss below), so it must be passed
    # as `probs`; `logits=` would push it through another sigmoid.
    self.p_x_z = dbns.Bernoulli(probs=self.x_hat)

    # Bernoulli negative log-likelihood; 1e-8 guards against log(0)
    nll_loss = -tf.reduce_sum(self.x * tf.log(1e-8 + self.x_hat) +
                              (1 - self.x) * tf.log(1e-8 + 1 - self.x_hat), 1)
    # closed-form KL(q(z|x) || N(0, I))
    kl_loss = 0.5 * tf.reduce_sum(tf.square(z_mu) + tf.square(z_sigma) -
                                  tf.log(1e-8 + tf.square(z_sigma)) - 1, 1)
    self.loss = tf.reduce_mean(nll_loss + kl_loss)
    self.elbo = -1.0 * tf.reduce_mean(nll_loss + kl_loss)

    # in original paper, lr chosen from {0.01, 0.02, 0.1} depending on first few iters training performance
    optimizer = tf.train.AdagradOptimizer(learning_rate=self.lr)
    self.train_op = optimizer.minimize(self.loss)

    # for sampling
    self.z = self.encoder(self.x, trainable=False, reuse=True)
    self.z_pl = tf.placeholder(tf.float32, shape=[None, self.z_dim])
    self.sample = self.decoder(self.z_pl, trainable=False, reuse=True)

    # tensorboard summaries; list(...) lets x_dims be a tuple as well
    image_shape = [-1] + list(self.x_dims)
    x_img = tf.reshape(self.x, image_shape)
    tf.summary.image('data', x_img)
    xhat_img = tf.reshape(self.x_hat, image_shape)
    tf.summary.image('reconstruction', xhat_img)
    tf.summary.scalar('reconstruction_loss', tf.reduce_mean(nll_loss))
    tf.summary.scalar('kl_loss', tf.reduce_mean(kl_loss))
    tf.summary.scalar('loss', self.loss)
    tf.summary.scalar('elbo', self.elbo)
    self.merged = tf.summary.merge_all()
def _create_dropout_mask(self, keep_prob, shape, log=True, name="DropoutMask"):
    """Creates a dropout mask with values drawn from a Bernoulli distribution
    with parameter ``keep_prob``.

    Args:
        keep_prob (Tensor): A float ``Tensor`` indicating the probability of
            keeping an element active.
        shape (Tensor): A 1D ``Tensor`` specifying the shape of the mask.
        log (bool): If ``True``, return ``tf.log`` of the mask, so kept
            entries become 0 and dropped entries become ``-inf``.
        name (str): Name of the enclosing name scope.

    Returns:
        Tensor: The sampled mask, or its logarithm when ``log`` is set.
    """
    with tf.name_scope(name):
        keep_dist = tfd.Bernoulli(
            probs=keep_prob, dtype=conf.dtype, name="DropoutMaskBernoulli")
        mask = keep_dist.sample(sample_shape=shape)
        if log:
            return tf.log(mask)
        return mask
def add_noise(data, noise):
    """Corrupt ``data`` with bit-flip noise as described by ``noise``.

    :param data: tensor to corrupt; returned untouched when noise is disabled
    :param noise: mapping with a ``'noise_type'`` entry and, for
        ``'bitflip'``, a ``'prob'`` entry used as the flip probability
    :return: ``data`` itself, or a corrupted tensor with the same static shape
    :raises KeyError: if the noise type is not recognised
    """
    kind = noise['noise_type']
    if kind in ['None', 'none', None]:
        return data

    with tf.name_scope('input_noise'):
        # Use static dimensions where they are known and fall back to the
        # dynamic shape for the ones that are not.
        dims = []
        for axis, dim in enumerate(data.get_shape()):
            dims.append(dim.value if dim.value is not None else tf.shape(data)[axis])
        shape = tf.stack(dims)

        if kind != 'bitflip':
            raise KeyError('Unknown noise_type "{}"'.format(kind))

        flip_dist = dist.Bernoulli(probs=noise['prob'], dtype=data.dtype)
        flips = flip_dist.sample(shape)
        # Arithmetic form of (data XOR flips) for 0/1 valued tensors.
        corrupted = data + flips - 2 * data * flips

        corrupted.set_shape(data.get_shape())
        return corrupted
def construct_masked_inputs(self):
    """Sample a Bernoulli keep-mask for the first ``input_dim`` columns.

    Only ``batch_of_users[:, :input_dim]`` is masked; the remaining columns
    are appended unchanged to the masked part to form ``network_input``.

    Fields set on ``self``: ``mask``, ``network_input``,
    ``remaining_input``, ``number_of_good_items``,
    ``number_of_unseen_items`` and ``number_of_seen_items``.
    """
    items = self.batch_of_users[:, :self.input_dim]
    extra_columns = self.batch_of_users[:, self.input_dim:]

    keep_dist = ds.Bernoulli(probs=self.keep_prob_ph, dtype=tf.float32)
    keep_mask = keep_dist.sample(
        sample_shape=[self.batch_size, self.input_dim])
    # Complement of the mask: selects exactly what the network does not see.
    drop_mask = 1 - keep_mask

    visible = items * keep_mask
    hidden = items * drop_mask

    good_count = tf.reduce_sum(items, axis=-1)
    unseen_count = tf.reduce_sum(hidden, axis=-1)
    seen_count = tf.reduce_sum(visible, axis=-1)

    self.mask = keep_mask
    # masked input (input for actors)
    self.network_input = tf.concat([visible, extra_columns], 1)
    # reverse masked input
    self.remaining_input = hidden
    self.number_of_good_items = good_count      # feature H0
    self.number_of_unseen_items = unseen_count  # feature H1
    self.number_of_seen_items = seen_count
def _make_step_posterior(self, presence_prob, presence_logit):
    """Return the Bernoulli step posterior built from ``presence_logit``.

    ``presence_prob`` is accepted but not used; the trailing size-1 axis of
    ``presence_logit`` is squeezed away before it is used as logits.
    """
    # pylint: disable=unused-argument
    squeezed_logit = tf.squeeze(presence_logit, -1)
    return tfd.Bernoulli(logits=squeezed_logit)
def _test(self, probs, n):
    """Check that both Bernoulli implementations sample the same shape."""
    random_variable = Bernoulli(probs)
    reference = ds.Bernoulli(probs)
    rv_shape = random_variable.sample(n).shape
    reference_shape = reference.sample(n).shape
    self.assertEqual(rv_shape, reference_shape)
def decode(self, encoder_output):
    """Score every ordered node pair with a neural tensor network.

    Pairwise logits are ``U . act(x_i^T W x_j + W_l x_i + W_r x_j + B)``.
    For each row ``i`` the diagonal entry is pushed to a very negative value
    to rule out self-loops, then a Bernoulli sample, the masked scores and
    the entropy are appended to ``self.samples``, ``self.mask_scores`` and
    ``self.entropy``.

    :param encoder_output: tensor of size
        [batch_size, max_length, input_embed]; the batch dimension must be
        statically known
    :return: ``(self.samples, self.mask_scores, self.entropy)``
    :raises NotImplementedError: for an unknown ``self.decoder_activation``
    """
    with tf.variable_scope('ntn'):
        W = tf.get_variable(
            'bilinear_weights',
            [self.input_embed, self.input_embed, self.decoder_hidden_dim],
            initializer=self.initializer)
        W_l = tf.get_variable('weights_left', [self.input_embed, self.decoder_hidden_dim],
                              initializer=self.initializer)
        W_r = tf.get_variable('weights_right', [self.input_embed, self.decoder_hidden_dim],
                              initializer=self.initializer)
        U = tf.get_variable('U', [self.decoder_hidden_dim], initializer=self.initializer)
        B = tf.get_variable('bias', [self.decoder_hidden_dim], initializer=self.initializer)

    # Compute linear output with shape (batch_size, max_length, max_length, decoder_hidden_dim)
    dot_l = tf.einsum('ijk, kl->ijl', encoder_output, W_l)
    dot_r = tf.einsum('ijk, kl->ijl', encoder_output, W_r)
    tiled_l = tf.tile(tf.expand_dims(dot_l, axis=2), (1, 1, self.max_length, 1))
    tiled_r = tf.tile(tf.expand_dims(dot_r, axis=1), (1, self.max_length, 1, 1))
    linear_sum = tiled_l + tiled_r

    # Compute bilinear product with shape (batch_size, max_length, max_length, decoder_hidden_dim)
    bilinear_product = tf.einsum('ijk, knl, imn->ijml', encoder_output, W, encoder_output)

    if self.decoder_activation == 'tanh':    # Original implementation by paper
        final_sum = tf.nn.tanh(bilinear_product + linear_sum + B)
    elif self.decoder_activation == 'relu':
        final_sum = tf.nn.relu(bilinear_product + linear_sum + B)
    elif self.decoder_activation == 'none':    # Without activation function
        final_sum = bilinear_product + linear_sum + B
    else:
        raise NotImplementedError(
            'Current decoder activation is not implemented yet')

    logits = tf.einsum('ijkl, l->ijk', final_sum, U)

    if self.bias_initial_value is None:    # Randomly initialize the learnable bias
        self.logit_bias = tf.get_variable('logit_bias', [1])
    elif self.use_bias_constant:    # Constant bias
        self.logit_bias = tf.constant([self.bias_initial_value], tf.float32, name='logit_bias')
    else:    # Learnable bias with initial value
        # dtype must be given by keyword: the second positional parameter
        # of tf.Variable is `trainable`, not the dtype.
        self.logit_bias = tf.Variable([self.bias_initial_value], dtype=tf.float32,
                                      name='logit_bias')

    if self.use_bias:    # Bias to control sparsity/density
        logits += self.logit_bias

    self.adj_prob = logits

    for i in range(self.max_length):
        position = tf.ones([encoder_output.shape[0]]) * i
        position = tf.cast(position, tf.int32)

        # Update mask
        self.mask = tf.one_hot(position, self.max_length)

        # Subtracting a huge constant on the diagonal avoids self-loops.
        masked_score = self.adj_prob[:, i, :] - 100000000. * self.mask
        # The scores are passed as the first positional argument of Bernoulli.
        prob = distr.Bernoulli(masked_score)

        sampled_arr = prob.sample()  # [batch_size, max_length] for one node

        self.samples.append(sampled_arr)
        self.mask_scores.append(masked_score)
        self.entropy.append(prob.entropy())

    return self.samples, self.mask_scores, self.entropy
def _test(self, probs, n):
    """Check that both Bernoulli implementations agree on ``log_prob``.

    ``n`` values are sampled from the random variable and evaluated under
    both distributions; the resulting log-probabilities must be equal.
    """
    random_variable = Bernoulli(probs)
    reference = ds.Bernoulli(probs)
    draws = random_variable.sample(n).eval()
    rv_log_prob = random_variable.log_prob(draws).eval()
    reference_log_prob = reference.log_prob(draws).eval()
    self.assertAllEqual(rv_log_prob, reference_log_prob)
def _create_loss_optimizer(self):
    """Build the negative-ELBO cost and the Adam training op.

    Sets ``self.q_theta``, ``self.theta``, ``self.c``, ``self.c_flatten``,
    ``self.decay_theta_op``, ``self.cost`` and ``self.optimizer``. Several
    names used here (``temperature``, ``n_agents``, ``n_reliability*``,
    ``variance_q_c_given_M``, ``variance_p_c_given_ctilder``,
    ``M_nan_or_not``, ``gumbel_softmax``) are resolved outside this method.
    """
    all_variables = dict()
    all_variables['tao']=tf.Variable(temperature,name="temperature")

    # Categorical posterior over theta and its Gumbel-softmax relaxation.
    logits_theta=self._encoder_network_theta()
    self.q_theta = tf.nn.softmax(logits_theta)
    log_q_theta = tf.log(self.q_theta+1e-20)
    self.theta = gumbel_softmax(logits_theta,all_variables['tao'])

    # Reparameterised sample of c around the encoder output.
    logits_c_1=self._encoder_network_c()
    logits_c=tf.reshape(logits_c_1,[-1,n_reliability_d1,n_reliability_d2])
    eps=tf.random_normal((n_agents,n_reliability_d1,n_reliability_d2),0,1,dtype=tf.float32)
    # NOTE(review): the noise is scaled by a quantity named "variance", not
    # by its square root -- confirm this is intended.
    self.c=logits_c+tf.multiply(eps,variance_q_c_given_M)
    self.c_flatten=tf.reshape(self.c,[-1,n_reliability])

    logits_M_1 = self._decoder_network()
    logits_M= logits_M_1

    # Non-trainable weight of the theta KL term; running decay_theta_op
    # multiplies it by 0.9.
    decay_theta = tf.Variable(1, trainable=False, dtype=tf.float32)
    self.decay_theta_op = decay_theta.assign(decay_theta*0.9 )

    mean_p_prior_c=tf.tensordot(self.var_Variational_s,self.var_Variational_ctilder_mean,[1,0])
    p_M = ds.Bernoulli(logits=logits_M)

    # KL(q_theta || MV_distribution), summed over all entries.
    kl_theta_tmp = self.q_theta*(log_q_theta-tf.log(self.MV_distribution+1e-20))
    KL_theta = tf.reduce_sum(kl_theta_tmp)

    # NOTE(review): this differs from the textbook Gaussian KL (factor 2 on
    # the squared term, sampled c instead of the mean) -- confirm.
    kl_c_tmp = 0.5*(-1-tf.log(variance_q_c_given_M)+2*tf.square(self.c-mean_p_prior_c)/variance_p_c_given_ctilder+variance_q_c_given_M/variance_p_c_given_ctilder)
    KL_c = tf.reduce_sum(kl_c_tmp)

    # Log-likelihood weighted elementwise by M_nan_or_not, minus both KL terms.
    elbo=tf.reduce_sum(tf.multiply(p_M.log_prob(self.M_onehot),M_nan_or_not)) - decay_theta* KL_theta- KL_c
    self.cost=-elbo
    l2_loss=tf.losses.get_regularization_loss()
    self.cost=self.cost+l2_loss
    # Use ADAM optimizer
    self.optimizer = \
        tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)
def network_train():
    """Build a VAE on 28x28x1 inputs, train it and dump sample images.

    Everything is configured through ``FLAGS`` (latent_dim, hidden_size,
    n_samples, n_episodes, batch_size, data_dir, fig_dir). Every 1000th
    episode the ELBO is printed and data / posterior / prior images are
    written to ``FLAGS.fig_dir``. The ELBO curve is plotted at the end.
    """
    with tf.variable_scope('data'):
        x = tf.placeholder(tf.float32, [None, 28, 28, 1])

    with tf.name_scope('variational'):
        q_mu, q_sigma = Encoder(x, latent_dim=FLAGS.latent_dim, hidden_size=FLAGS.hidden_size)
        q_z = distributions.Normal(loc=q_mu, scale=q_sigma)
        # Gradients must be able to flow through q_z.sample().
        assert q_z.reparameterization_type == distributions.FULLY_REPARAMETERIZED

    # Decoder applied to a posterior sample (creates the 'model' variables).
    with tf.variable_scope('model'):
        p_xIz_logits = Decoder(q_z.sample(), hidden_size=FLAGS.hidden_size)
        p_xIz = distributions.Bernoulli(logits=p_xIz_logits)
        posterior_predictive_samples = p_xIz.sample()

    # Same decoder (variables reused) applied to samples from the prior.
    with tf.variable_scope('model', reuse=True):
        p_z = distributions.Normal(loc=np.zeros(FLAGS.latent_dim, dtype=np.float32), scale=np.ones(FLAGS.latent_dim, dtype=np.float32))
        p_z_sample = p_z.sample(FLAGS.n_samples)
        p_xIz_logits = Decoder(p_z_sample, hidden_size=FLAGS.hidden_size)
        prior_predictive = distributions.Bernoulli(logits=p_xIz_logits)
        prior_predictive_samples = prior_predictive.sample()

    # Same decoder applied to a user-supplied latent code; not used below.
    with tf.variable_scope('model', reuse=True):
        z_input = tf.placeholder(tf.float32, [None, FLAGS.latent_dim])
        p_xIz_logits = Decoder(z_input, hidden_size=FLAGS.hidden_size)
        prior_predictive_inp = distributions.Bernoulli(logits=p_xIz_logits)
        prior_predictive_inp_sample = prior_predictive_inp.sample()

    # ELBO summed over the batch; it is divided by the batch size when logged.
    kl = tf.reduce_sum(distributions.kl(q_z, p_z), 1)
    e_log_likelihood = tf.reduce_sum(p_xIz.log_prob(x), [1, 2, 3])
    elbo = tf.reduce_sum(e_log_likelihood - kl, 0)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01).minimize(-elbo)

    init_op = tf.global_variables_initializer()
    # NOTE(review): the session is never closed in this function.
    sess = tf.InteractiveSession()
    sess.run(init_op)

    mnist = read_data_sets(FLAGS.data_dir)
    print('Saving images to: %s' % FLAGS.fig_dir)
    plot_elbo = []

    for i in range(FLAGS.n_episodes):
        batch_x, _ = mnist.train.next_batch(FLAGS.batch_size)
        batch_x = batch_x.reshape(FLAGS.batch_size, 28, 28, 1)
        # Binarise the pixels to match the Bernoulli likelihood.
        batch_x = (batch_x > 0.5).astype(np.float32)
        sess.run(optimizer, {x: batch_x})
        # Separate run: the ELBO is evaluated after the parameter update.
        batch_elbo = sess.run(elbo, {x: batch_x})
        plot_elbo.append(batch_elbo / float(FLAGS.batch_size))

        if i % 1000 == 0:
            batch_elbo = sess.run(elbo, {x: batch_x})
            print('Episode: {0:d} ELBO: {1: .3f}'.format(
                i, batch_elbo / FLAGS.batch_size))
            batch_posterior_predictive_samples, batch_prior_predictive_samples = sess.run(
                [posterior_predictive_samples, prior_predictive_samples], {x: batch_x})
            # Indexing batch_x[k] requires n_samples <= batch_size.
            for k in range(FLAGS.n_samples):
                f_name = os.path.join(FLAGS.fig_dir, 'episode_%d_data_%d.jpg' % (i, k))
                imsave(f_name, batch_x[k, :, :, 0])
                f_name = os.path.join(FLAGS.fig_dir, 'episode_%d_posterior_%d.jpg' % (i, k))
                imsave(f_name, batch_posterior_predictive_samples[k, :, :, 0])
                f_name = os.path.join(FLAGS.fig_dir, 'episode_%d_prior_%d.jpg' % (i, k))
                imsave(f_name, batch_prior_predictive_samples[k, :, :, 0])

    plt.plot(range(len(plot_elbo)), plot_elbo)
    plt.show()
def p_y_xz(self, x, z_stacked, TD, mode):
    """Build the RNN decoder and return the GMM distribution over outputs.

    Three graph variants are built depending on ``mode``:

    * TRAIN with ``hps.sample_model_during_dec``: step-by-step unrolling in
      which each sampled output is randomly replaced by the ground truth.
    * TRAIN / EVAL otherwise: a single ``dynamic_rnn`` call fed with the
      ground-truth futures.
    * PREDICT: step-by-step unrolling feeding back the model's own samples.

    Returns ``y_dist``, and additionally the sampled future in PREDICT mode.
    """
    # x is [bs/nbs, 2*enc_rnn_dim]
    # z_stacked is [k, bs/nbs, N*K] (at EVAL or PREDICT time, k (=self.sample_ct) may be hps.k, K**N or sample_ct)
    # in this function, rnn decoder inputs are of the form: z + x + car1 + car2 (note: first 3 are "extras" to help with learning)
    ph = self.hps.prediction_horizon
    k, GMM_c, pred_dim = self.sample_ct, self.hps.GMM_components, self.pred_dim
    with tf.variable_scope("p_y_xz") as varscope:
        z = tf.reshape(z_stacked, [-1, self.latent.z_dim])  # [k;bs/nbs, z_dim]
        zx = tf.concat([z, tf.tile(x, [k, 1])], axis=1)  # [k;bs/nbs, z_dim + 2*enc_rnn_dim]

        cell = stacked_rnn_cell(self.hps.rnn_cell, self.hps.rnn_cell_kwargs, self.hps.dec_rnn_dim, self.hps.rnn_io_dropout_keep_prob, mode)
        initial_state = project_to_RNN_initial_state(cell, zx)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if self.hps.sample_model_during_dec and mode == tf.estimator.ModeKeys.TRAIN:
                input_ = tf.concat(
                    [zx, tf.tile(TD["joint_present"], [k, 1])], axis=1
                )  # [k;bs, N*K + 2*enc_rnn_dim + pred_dim+state_dim]
                state = initial_state
                with tf.variable_scope("rnn") as rnnscope:
                    log_pis, mus, log_sigmas, corrs = [], [], [], []
                    for j in range(ph):
                        # Variables are created at j == 0 and reused afterwards.
                        if j > 0:
                            rnnscope.reuse_variables()
                        output, state = cell(input_, state)
                        log_pi_t, mu_t, log_sigma_t, corr_t = project_to_GMM_params(
                            output, GMM_c, pred_dim, self.hps.dec_GMM_proj_MLP_dims)
                        y_t = GMM2D(log_pi_t, mu_t, log_sigma_t, corr_t, self.hps.log_sigma_min, self.hps.log_sigma_max).sample()  # [k;bs, pred_dim]
                        # Per-row coin flip: 1 keeps the model sample, 0
                        # substitutes the ground-truth car2 future.
                        mask = distributions.Bernoulli(
                            probs=self.dec_sample_model_prob, dtype=tf.float32).sample(
                                (tf.shape(y_t)[0], 1))  # maybe tf.shape
                        y_t = mask * y_t + (1 - mask) * tf.tile(
                            TD["car2_future"][:, j, :], [k, 1])
                        log_pis.append(log_pi_t)
                        mus.append(mu_t)
                        log_sigmas.append(log_sigma_t)
                        corrs.append(corr_t)
                        car_inputs = tf.concat(
                            [
                                tf.tile(TD["car1_future"][:, j, :], [k, 1]), y_t
                            ],
                            axis=1)  # [k;bs, state_dim + pred_dim]
                        input_ = tf.concat(
                            [zx, car_inputs], axis=1
                        )  # [k;bs, N*K + 2*enc_rnn_dim + state_dim + pred_dim]
                    log_pis = tf.stack(log_pis, axis=1)  # [k;bs, ph, GMM_c]
                    mus = tf.stack(mus, axis=1)  # [k;bs, ph, GMM_c*pred_dim]
                    log_sigmas = tf.stack(
                        log_sigmas, axis=1)  # [k;bs, ph, GMM_c*pred_dim]
                    corrs = tf.stack(corrs, axis=1)  # [k;bs, ph, GMM_c]
            else:
                zx_with_time_dim = tf.expand_dims(
                    zx, 1)  # [k;bs/nbs, 1, N*K + 2*enc_rnn_dim]
                zx_time_tiled = tf.tile(
                    zx_with_time_dim,
                    [1, ph, 1])  # [k;bs/nbs, ph, N*K + 2*enc_rnn_dim]
                # Present state at step 0, then the first ph-1 future steps.
                car_inputs = tf.concat(
                    [  # [bs/nbs, ph, 2*state_dim]
                        tf.expand_dims(
                            TD["joint_present"],
                            1),  # [bs/nbs, 1, state_dim+pred_dim]
                        tf.concat(
                            [
                                TD["car1_future"][:, :ph - 1, :],
                                TD["car2_future"][:, :ph - 1, :]
                            ],
                            axis=2)  # [bs/nbs, ph-1, state_dim+pred_dim]
                    ],
                    axis=1)
                inputs = tf.concat(
                    [zx_time_tiled, tf.tile(car_inputs, [k, 1, 1])], axis=2
                )  # [k;bs/nbs, ph, N*K + 2*enc_rnn_dim + pred_dim + state_dim]
                outputs, _ = tf.nn.dynamic_rnn(
                    cell,
                    inputs,
                    initial_state=
                    initial_state,  # [k;bs/nbs, ph, dec_rnn_dim]
                    time_major=False,
                    dtype=tf.float32,
                    scope="rnn")
                with tf.variable_scope(
                        "rnn"):  # required to match PREDICT mode below
                    log_pis, mus, log_sigmas, corrs = project_to_GMM_params(
                        outputs, GMM_c, pred_dim,
                        self.hps.dec_GMM_proj_MLP_dims)

            tf.summary.histogram("GMM_log_pis", log_pis)
            tf.summary.histogram("GMM_log_sigmas", log_sigmas)
            tf.summary.histogram("GMM_corrs", corrs)

        elif mode == tf.estimator.ModeKeys.PREDICT:
            input_ = tf.concat(
                [zx, tf.tile(TD["joint_present"], [k, 1])],
                axis=1)  # [k;bs, N*K + 2*enc_rnn_dim + pred_dim+state_dim]
            state = initial_state
            with tf.variable_scope("rnn") as rnnscope:
                log_pis, mus, log_sigmas, corrs, y = [], [], [], [], []
                for j in range(ph):
                    # Variables are created at j == 0 and reused afterwards.
                    if j > 0:
                        rnnscope.reuse_variables()
                    output, state = cell(input_, state)
                    log_pi_t, mu_t, log_sigma_t, corr_t = project_to_GMM_params(
                        output, GMM_c, pred_dim,
                        self.hps.dec_GMM_proj_MLP_dims)
                    y_t = GMM2D(log_pi_t, mu_t, log_sigma_t, corr_t,
                                self.hps.log_sigma_min,
                                self.hps.log_sigma_max).sample()  # [k;bs, pred_dim]
                    log_pis.append(log_pi_t)
                    mus.append(mu_t)
                    log_sigmas.append(log_sigma_t)
                    corrs.append(corr_t)
                    y.append(y_t)
                    # The sampled output is fed back as the next car2 input.
                    car_inputs = tf.concat(
                        [tf.tile(TD["car1_future"][:, j, :], [k, 1]), y_t],
                        axis=1)  # [k;bs, state_dim + pred_dim]
                    input_ = tf.concat(
                        [zx, car_inputs], axis=1
                    )  # [k;bs, N*K + 2*enc_rnn_dim + state_dim + pred_dim]
                log_pis = tf.stack(log_pis, axis=1)  # [k;bs, ph, GMM_c]
                mus = tf.stack(mus, axis=1)  # [k;bs, ph, GMM_c*pred_dim]
                log_sigmas = tf.stack(log_sigmas, axis=1)  # [k;bs, ph, GMM_c*pred_dim]
                corrs = tf.stack(corrs, axis=1)  # [k;bs, ph, GMM_c]
                car2_sampled_future = tf.reshape(
                    tf.stack(y, axis=1),
                    [k, -1, ph, pred_dim])  # [k, bs, ph, pred_dim]

        # Split the flattened [k;bs] axis back into separate k and bs axes.
        y_dist = GMM2D(
            tf.reshape(log_pis, [k, -1, ph, GMM_c]),
            tf.reshape(mus, [k, -1, ph, GMM_c * pred_dim]),
            tf.reshape(log_sigmas, [k, -1, ph, GMM_c * pred_dim]),
            tf.reshape(corrs, [k, -1, ph, GMM_c]), self.hps.log_sigma_min,
            self.hps.log_sigma_max)

        if mode == tf.estimator.ModeKeys.PREDICT:
            return y_dist, car2_sampled_future
        else:
            return y_dist