def testMakeValueSetterWorksWithPartialAssignment(self):
    """Checks `make_value_setter` when only some of the variables are pinned.

    Two cases are exercised on a two-variable Normal chain ("loc" -> "x"):
    pinning only "loc" and inspecting the distribution of "x", and pinning
    only "x" and evaluating the log joint as a function of "loc".
    """

    def gaussian_chain():
        mean_rv = ed.Normal(loc=0., scale=1., name="loc")
        return ed.Normal(loc=mean_rv, scale=0.5, name="x")

    latent_mean = 3.
    observation = 4.

    # Case 1: pin the latent only. The traced "x" must then be centred on
    # the pinned value.
    latent_setter = ed.make_value_setter(loc=latent_mean)
    with ed.interception(latent_setter):
        predictive_rv = gaussian_chain()
    predictive_mean = self.evaluate(predictive_rv.distribution.mean())
    self.assertAllEqual(predictive_mean, latent_mean)

    # Case 2: pin the observation only, leaving "loc" as the free argument
    # of the log-joint function.
    def observed_model():
        with ed.interception(ed.make_value_setter(x=observation)):
            gaussian_chain()

    log_joint_of_latents = ed.make_log_joint_fn(observed_model)

    prior_term = tfd.Normal(0., 1.).log_prob(latent_mean)
    likelihood_term = tfd.Normal(latent_mean, 0.5).log_prob(observation)
    expected = self.evaluate(prior_term + likelihood_term)
    actual = self.evaluate(log_joint_of_latents(loc=latent_mean))
    self.assertEqual(expected, actual)
def testMakeValueSetterSetsValues(self):
    """Checks that every variable named in `make_value_setter` takes its value."""

    def gaussian_chain():
        mean_rv = ed.Normal(loc=0., scale=1., name="loc")
        obs_rv = ed.Normal(loc=mean_rv, scale=0.5, name="x")
        return mean_rv, obs_rv

    # Keys are the random-variable names used inside the model.
    pinned = {"loc": 3., "x": 4.}
    with ed.interception(ed.make_value_setter(**pinned)):
        traced_rvs = gaussian_chain()

    self.assertAllEqual(self.evaluate(traced_rvs),
                        (pinned["loc"], pinned["x"]))
def PAC2VI(dataSource=tf.keras.datasets.fashion_mnist, NPixels=14, algorithm=0,
           PARTICLES=20, batch_size=100, num_epochs=50, num_hidden_units=20):
    r"""Run the self-supervised classification experiment.

    Trains an MLP with a Categorical data model using MAP, Variational,
    PAC^2-Variational or PAC^2_T-Variational learning. Each binarized image
    is split in two along its first image axis; the first part is the input
    and the remainder is the target.

    Args:
      dataSource: The data set used in the evaluation. Must expose
        `load_data()` and `__name__`; sources whose name contains 'cifar' are
        converted to grayscale first.
      NPixels: The size of the images: NPixels \times NPixels.
        NOTE(review): the split keeps rows `0:NPixels/2` and the network
        expects `2 * (NPixels/2)**2` inputs, so for 28x28 images the shapes
        only appear to line up with NPixels=28 -- confirm the default of 14.
      algorithm: Integer indicating the algorithm to be run.
        0- MAP Learning
        1- Variational Learning
        2- PAC^2-Variational Learning
        3- PAC^2_T-Variational Learning
      PARTICLES: Number of Monte-Carlo samples used to compute the posterior
        prediction distribution.
      batch_size: Size of the batch.
      num_epochs: Number of epochs (the loop runs `num_epochs + 1` passes).
      num_hidden_units: Number of hidden units in the MLP.

    Returns:
      NLL: The sum over the test batches of
        `logsumexp_p(log p(y | x, theta_p)) - log(PARTICLES)`. As computed
        here the value is not negated, despite its name.

    Raises:
      ValueError: If `algorithm` is not one of 0, 1, 2, 3.
    """
    # Previously an unknown algorithm only failed late, with `elbo` unbound.
    if algorithm not in (0, 1, 2, 3):
        raise ValueError(
            "algorithm must be one of 0, 1, 2, 3; got {!r}".format(algorithm))

    np.random.seed(1)
    tf.set_random_seed(1)

    sess = tf.Session()

    (x_train, y_train), (x_test, y_test) = dataSource.load_data()

    if (dataSource.__name__.__contains__('cifar')):
        x_train = sess.run(
            tf.cast(tf.squeeze(tf.image.rgb_to_grayscale(x_train)),
                    dtype=tf.float32))
        x_test = sess.run(
            tf.cast(tf.squeeze(tf.image.rgb_to_grayscale(x_test)),
                    dtype=tf.float32))

    # Binarize the images.
    x_train = (x_train < 128).astype(np.int32)
    x_test = (x_test < 128).astype(np.int32)

    # `np.int` was removed in NumPy 1.24; the builtin truncates identically.
    NPixels = int(NPixels / 2)

    # The labels loaded above are discarded: the target is the part of each
    # image that is not used as input.
    y_train = x_train[:, NPixels:]
    x_train = x_train[:, 0:NPixels]
    y_test = x_test[:, NPixels:]
    x_test = x_test[:, 0:NPixels]

    # From here on NPixels is the flattened input/target size.
    NPixels = NPixels * NPixels * 2

    N = x_train.shape[0]
    M = batch_size

    x_batch = tf.placeholder(dtype=tf.float32, name="x_batch",
                             shape=[None, NPixels])
    y_batch = tf.placeholder(dtype=tf.int32, name="y_batch",
                             shape=[None, NPixels])

    def model(NHIDDEN, x):
        """One-hidden-layer MLP with standard Normal priors on all weights."""
        W = ed.Normal(loc=tf.zeros([NPixels, NHIDDEN]), scale=1., name="W")
        b = ed.Normal(loc=tf.zeros([1, NHIDDEN]), scale=1., name="b")
        W_out = ed.Normal(loc=tf.zeros([NHIDDEN, 2 * NPixels]), scale=1.,
                          name="W_out")
        b_out = ed.Normal(loc=tf.zeros([1, 2 * NPixels]), scale=1.,
                          name="b_out")
        hidden_layer = tf.nn.relu(tf.matmul(x, W) + b)
        out = tf.matmul(hidden_layer, W_out) + b_out
        # Two logits per output pixel.
        y = ed.Categorical(
            logits=tf.reshape(out, [tf.shape(x_batch)[0], NPixels, 2]),
            name="y")
        return W, b, W_out, b_out, x, y

    def qmodel(NHIDDEN):
        """Normal variational family; returns two RVs per parameter.

        The paired RVs (`q*` and `q*_`) share the same loc/scale tensors. For
        algorithm 0 (MAP) the scales are fixed to a tiny constant.
        """
        W_loc = tf.Variable(
            tf.random_normal([NPixels, NHIDDEN], 0.0, 0.1, dtype=tf.float32))
        b_loc = tf.Variable(
            tf.random_normal([1, NHIDDEN], 0.0, 0.1, dtype=tf.float32))
        if algorithm == 0:
            W_scale = 0.000001
            b_scale = 0.000001
        else:
            W_scale = tf.nn.softplus(
                tf.Variable(
                    tf.random_normal([NPixels, NHIDDEN], -3., stddev=0.1,
                                     dtype=tf.float32)))
            b_scale = tf.nn.softplus(
                tf.Variable(
                    tf.random_normal([1, NHIDDEN], -3., stddev=0.1,
                                     dtype=tf.float32)))

        qW = ed.Normal(W_loc, scale=W_scale, name="W")
        qW_ = ed.Normal(W_loc, scale=W_scale, name="W")
        qb = ed.Normal(b_loc, scale=b_scale, name="b")
        qb_ = ed.Normal(b_loc, scale=b_scale, name="b")

        W_out_loc = tf.Variable(
            tf.random_normal([NHIDDEN, 2 * NPixels], 0.0, 0.1,
                             dtype=tf.float32))
        b_out_loc = tf.Variable(
            tf.random_normal([1, 2 * NPixels], 0.0, 0.1, dtype=tf.float32))
        if algorithm == 0:
            W_out_scale = 0.000001
            b_out_scale = 0.000001
        else:
            W_out_scale = tf.nn.softplus(
                tf.Variable(
                    tf.random_normal([NHIDDEN, 2 * NPixels], -3., stddev=0.1,
                                     dtype=tf.float32)))
            b_out_scale = tf.nn.softplus(
                tf.Variable(
                    tf.random_normal([1, 2 * NPixels], -3., stddev=0.1,
                                     dtype=tf.float32)))

        qW_out = ed.Normal(W_out_loc, scale=W_out_scale, name="W_out")
        qb_out = ed.Normal(b_out_loc, scale=b_out_scale, name="b_out")
        qW_out_ = ed.Normal(W_out_loc, scale=W_out_scale, name="W_out")
        qb_out_ = ed.Normal(b_out_loc, scale=b_out_scale, name="b_out")

        return qW, qW_, qb, qb_, qW_out, qW_out_, qb_out, qb_out_

    # The un-intercepted model call is kept so graph construction order is
    # unchanged; only `x` (the input placeholder) is used below.
    W, b, W_out, b_out, x, y = model(num_hidden_units, x_batch)

    qW, qW_, qb, qb_, qW_out, qW_out_, qb_out, qb_out_ = qmodel(
        num_hidden_units)

    # Two copies of the model, each driven by one of the paired q RVs.
    with ed.interception(
            ed.make_value_setter(W=qW, b=qb, W_out=qW_out, b_out=qb_out)):
        pW, pb, pW_out, pb_out, px, py = model(num_hidden_units, x)

    with ed.interception(
            ed.make_value_setter(W=qW_, b=qb_, W_out=qW_out_,
                                 b_out=qb_out_)):
        pW_, pb_, pW_out_, pb_out_, px_, py_ = model(num_hidden_units, x)

    # Per-example log-likelihoods (summed over pixels), kept as columns.
    pylogprob = tf.expand_dims(
        tf.reduce_sum(py.distribution.log_prob(y_batch), axis=1), 1)
    py_logprob = tf.expand_dims(
        tf.reduce_sum(py_.distribution.log_prob(y_batch), axis=1), 1)

    logmax = tf.stop_gradient(
        tf.math.maximum(pylogprob, py_logprob) + 0.1)
    logmean_logmax = tf.math.reduce_logsumexp(
        tf.concat([pylogprob - logmax, py_logprob - logmax], 1),
        axis=1) - tf.log(2.)
    alpha = tf.expand_dims(logmean_logmax, 1)

    if (algorithm == 3):
        hmax = 2 * tf.stop_gradient(
            alpha / tf.math.pow(1 - tf.math.exp(alpha), 2) +
            tf.math.pow(tf.math.exp(alpha) * (1 - tf.math.exp(alpha)), -1))
    else:
        hmax = 1.

    var = 0.5 * (
        tf.reduce_mean(tf.exp(2 * pylogprob - 2 * logmax) * hmax) -
        tf.reduce_mean(tf.exp(pylogprob + py_logprob - 2 * logmax) * hmax))

    datalikelihood = tf.reduce_mean(pylogprob)

    logprior = tf.reduce_sum(pW.distribution.log_prob(pW.value)) + \
        tf.reduce_sum(pb.distribution.log_prob(pb.value)) + \
        tf.reduce_sum(pW_out.distribution.log_prob(pW_out.value)) + \
        tf.reduce_sum(pb_out.distribution.log_prob(pb_out.value))

    entropy = tf.reduce_sum(qW.distribution.log_prob(qW.value)) + \
        tf.reduce_sum(qb.distribution.log_prob(qb.value)) + \
        tf.reduce_sum(qW_out.distribution.log_prob(qW_out.value)) + \
        tf.reduce_sum(qb_out.distribution.log_prob(qb_out.value))
    entropy = -entropy

    KL = (-entropy - logprior) / N

    if (algorithm == 2 or algorithm == 3):
        elbo = datalikelihood + var - KL
    elif algorithm == 1:
        elbo = datalikelihood - KL
    elif algorithm == 0:
        elbo = datalikelihood + logprior / N

    verbose = True
    optimizer = tf.train.AdamOptimizer(0.001)
    t = []
    train = optimizer.minimize(-elbo)
    init = tf.global_variables_initializer()
    sess.run(init)

    # Diagnostics printed during training, in print order. Built once so no
    # new graph ops are created every time they are printed.
    diagnostics = [
        ("data", datalikelihood),
        ("var", var),
        ("KL", KL),
        ("energy", logprior),
        ("entropy", entropy),
        ("hmax", tf.reduce_mean(hmax)),
        ("alpha", tf.reduce_mean(alpha)),
        ("logmax", tf.reduce_mean(logmax)),
    ]

    num_batches = N // M
    for i in range(num_epochs + 1):
        # Reshuffle inputs and targets with the same permutation.
        perm = np.random.permutation(N)
        x_train = np.take(x_train, perm, axis=0)
        y_train = np.take(y_train, perm, axis=0)

        x_batches = np.array_split(x_train, num_batches)
        y_batches = np.array_split(y_train, num_batches)

        for j in range(num_batches):
            batch_x = np.reshape(
                x_batches[j], [x_batches[j].shape[0], -1]).astype(np.float32)
            batch_y = np.reshape(
                y_batches[j], [y_batches[j].shape[0], -1]).astype(np.float32)
            feed = {x_batch: batch_x, y_batch: batch_y}

            value, _ = sess.run([elbo, train], feed_dict=feed)
            t.append(-value)
            if verbose:
                print(".", end="", flush=True)
                if i % 50 == 0 and j % 1000 == 0:
                    print("\nEpoch: " + str(i))
                    str_elbo = str(t[-1])
                    print("\n" + str(j) + " epochs\t" + str_elbo, end="",
                          flush=True)
                    # Each quantity is evaluated in its own run, as before.
                    for label, tensor in diagnostics:
                        print("\n" + str(j) + " " + label + "\t" +
                              str(sess.run(tensor, feed_dict=feed)),
                              end="", flush=True)

    # Evaluation on the test split.
    M = 1000
    N = x_test.shape[0]
    num_batches = N // M
    x_batches = np.array_split(x_test, num_batches)
    y_batches = np.array_split(y_test, num_batches)

    NLL = 0
    for j in range(num_batches):
        batch_x = np.reshape(
            x_batches[j], [x_batches[j].shape[0], -1]).astype(np.float32)
        batch_y = np.reshape(
            y_batches[j], [y_batches[j].shape[0], -1]).astype(np.float32)
        feed = {x_batch: batch_x, y_batch: batch_y}

        # One column of per-example log-likelihoods per Monte-Carlo run.
        y_pred_list = []
        for _ in range(PARTICLES):
            y_pred_list.append(sess.run(pylogprob, feed_dict=feed))
        y_preds = np.concatenate(y_pred_list, axis=1)

        score = tf.reduce_sum(
            tf.math.reduce_logsumexp(y_preds, axis=1) -
            tf.log(np.float32(PARTICLES)))
        score = sess.run(score)
        NLL = NLL + score
        if verbose:
            print(".", end="", flush=True)
            str_elbo = str(score)
            print("\n" + str(j) + " epochs\t" + str_elbo, end="", flush=True)

    print("\nNLL: " + str(NLL))

    return NLL
def model_with_observed_x():
    """Runs `normal_with_unknown_mean` with the variable "x" set to `x_value`.

    Both `normal_with_unknown_mean` and `x_value` come from the enclosing
    scope; nothing is returned.
    """
    pin_observation = ed.make_value_setter(x=x_value)
    with ed.interception(pin_observation):
        normal_with_unknown_mean()
t.append(sess.run([elbo])) w_mean_inferred = sess.run(qw_mean) w_stddv_inferred = sess.run(qw_stddv) z_mean_inferred = sess.run(qz_mean) z_stddv_inferred = sess.run(qz_stddv) print("Inferred axes:") print(w_mean_inferred) print("Standard Deviation:") print(w_stddv_inferred) plt.plot(range(1, num_epochs, 5), t) plt.show() with ed.interception(ed.make_value_setter(w=w_mean_inferred, z=z_mean_inferred)): generate = probabilistic_matrix_factorization( data_dim=N, latent_dim=D, num_datapoints=M, stddv_datapoints=stddv_datapoints) with tf.Session() as sess: x_generated, _ = sess.run(generate) plt.scatter(data[:, 0], data[:, 1], color='blue', alpha=0.1, label='Actual data') plt.scatter(x_generated[0, :],
def main(argv):
    """Trains the grammar model on samples from a synthetic grammar.

    Recreates `FLAGS.model_dir`, enables eager execution, prints five sample
    strings from the synthetic data distribution, then runs
    `FLAGS.max_steps` optimisation steps on the negative ELBO. Every 500
    steps it prints progress and saves a checkpoint.
    """
    del argv  # unused

    # Start from an empty model directory.
    model_dir_exists = tf.io.gfile.exists(FLAGS.model_dir)
    if model_dir_exists:
        tf.compat.v1.logging.warning(
            "Warning: deleting old log directory at {}".format(
                FLAGS.model_dir))
        tf.io.gfile.rmtree(FLAGS.model_dir)
    tf.io.gfile.makedirs(FLAGS.model_dir)
    tf.compat.v1.enable_eager_execution()

    grammar = SmilesGrammar()
    synthetic_data_distribution = ProbabilisticGrammar(
        grammar=grammar,
        latent_size=FLAGS.latent_size,
        num_units=FLAGS.num_units)

    print("Random examples from synthetic data distribution:")
    for _ in range(5):
        sample = synthetic_data_distribution()
        print(grammar.convert_to_string(sample))

    probabilistic_grammar = ProbabilisticGrammar(
        grammar=grammar,
        latent_size=FLAGS.latent_size,
        num_units=FLAGS.num_units)
    probabilistic_grammar_variational = ProbabilisticGrammarVariational(
        latent_size=FLAGS.latent_size)
    checkpoint = tf.train.Checkpoint(
        synthetic_data_distribution=synthetic_data_distribution,
        probabilistic_grammar=probabilistic_grammar,
        probabilistic_grammar_variational=probabilistic_grammar_variational)
    global_step = tf.compat.v1.train.get_or_create_global_step()
    optimizer = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate)
    writer = tf.compat.v2.summary.create_file_writer(FLAGS.model_dir)
    writer.set_as_default()

    def should_record_summaries():
        # Summaries are written only when the global step is a multiple of 500.
        return tf.math.equal(0, global_step % 500)

    start_time = time.time()
    for step in range(FLAGS.max_steps):
        productions = synthetic_data_distribution()
        with tf.GradientTape() as grad_tape:
            # Trace a draw from the amortized variational distribution.
            with ed.tape() as variational_trace:
                _ = probabilistic_grammar_variational(productions)

            # Pin the model's latent code to the variational draw and each
            # production to the corresponding slice of the data.
            pinned = {
                "latent_code": variational_trace["latent_code_posterior"],
            }
            for t, production in enumerate(tf.unstack(productions, axis=1)):
                pinned["production_" + str(t)] = production

            with ed.tape() as model_trace:
                with ed.interception(ed.make_value_setter(**pinned)):
                    _ = probabilistic_grammar()

            # ELBO averaged over the batch and over the number of productions
            # so that it stays on the same scale across batches (the per-datum
            # ELBO itself sums over time steps).
            log_likelihood = sum(
                (rv.distribution.log_prob(rv.value)
                 for name, rv in model_trace.items()
                 if name.startswith("production")),
                0.)

            kl = tfp.distributions.kl_divergence(
                variational_trace["latent_code_posterior"].distribution,
                model_trace["latent_code"].distribution)

            timesteps = tf.cast(productions.shape[1], dtype=tf.float32)
            elbo = tf.reduce_mean(
                input_tensor=log_likelihood - kl) / timesteps
            loss = -elbo

            with tf.compat.v2.summary.record_if(should_record_summaries):
                mean_log_likelihood = tf.reduce_mean(
                    input_tensor=log_likelihood) / timesteps
                tf.compat.v2.summary.scalar(
                    "log_likelihood", mean_log_likelihood, step=global_step)
                mean_kl = tf.reduce_mean(input_tensor=kl) / timesteps
                tf.compat.v2.summary.scalar("kl", mean_kl, step=global_step)
                tf.compat.v2.summary.scalar("elbo", elbo, step=global_step)

        trainable = (probabilistic_grammar.variables +
                     probabilistic_grammar_variational.variables)
        gradients = grad_tape.gradient(loss, trainable)
        optimizer.apply_gradients(zip(gradients, trainable), global_step)

        if step % 500 != 0:
            continue
        elapsed = time.time() - start_time
        print("Step: {:>3d} Loss: {:.3f} ({:.3f} sec)".format(
            step, loss, elapsed))
        checkpoint.save(file_prefix=FLAGS.model_dir)
def model_fn(features, labels, mode, params, config):
    """Builds the model function for use in an Estimator.

    Arguments:
      features: The input features for the Estimator. `features.shape[1]` is
        used as the number of words.
      labels: The labels, unused here.
      mode: Signifies whether it is train or test or predict.
      params: Some hyperparameters as a dictionary. The keys read here are
        "num_topics", "prior_initial_value", "activation", "layer_sizes",
        "learning_rate", "prior_burn_in_steps" and "vocabulary".
      config: The RunConfig, unused here.

    Returns:
      EstimatorSpec: A tf.estimator.EstimatorSpec instance.
    """
    del labels, config

    # Set up the model's learnable parameters.
    logit_concentration = tf.compat.v1.get_variable(
        "logit_concentration",
        shape=[1, params["num_topics"]],
        initializer=tf.compat.v1.initializers.constant(
            _softplus_inverse(params["prior_initial_value"])))
    concentration = _clip_dirichlet_parameters(
        tf.nn.softplus(logit_concentration))

    num_words = features.shape[1]
    topics_words_logits = tf.compat.v1.get_variable(
        "topics_words_logits",
        shape=[params["num_topics"], num_words],
        initializer=tf.compat.v1.glorot_normal_initializer())
    topics_words = tf.nn.softmax(topics_words_logits, axis=-1)

    # Compute expected log-likelihood. First, sample from the variational
    # distribution; second, compute the log-likelihood given the sample.
    lda_variational, encoder_net = make_lda_variational(
        params["activation"],
        params["num_topics"],
        params["layer_sizes"])

    with ed.tape() as variational_tape:
        _ = lda_variational(features)

    with ed.tape() as model_tape:
        with ed.interception(
                ed.make_value_setter(
                    topics=variational_tape["topics_posterior"])):
            posterior_predictive = latent_dirichlet_allocation(
                concentration, topics_words)

    log_likelihood = posterior_predictive.distribution.log_prob(features)
    tf.compat.v1.summary.scalar(
        "log_likelihood", tf.reduce_mean(input_tensor=log_likelihood))

    # Compute the KL-divergence between two Dirichlets analytically.
    # The sampled KL does not work well for "sparse" distributions
    # (see Appendix D of [2]).
    kl = variational_tape["topics_posterior"].distribution.kl_divergence(
        model_tape["topics"].distribution)
    tf.compat.v1.summary.scalar("kl", tf.reduce_mean(input_tensor=kl))

    # Ensure that the KL is non-negative (up to a very small slack).
    # Negative KL can happen due to numerical instability.
    with tf.control_dependencies(
            [tf.compat.v1.assert_greater(kl, -1e-3, message="kl")]):
        kl = tf.identity(kl)

    elbo = log_likelihood - kl
    avg_elbo = tf.reduce_mean(input_tensor=elbo)
    tf.compat.v1.summary.scalar("elbo", avg_elbo)
    loss = -avg_elbo

    # Perform variational inference by minimizing the -ELBO.
    global_step = tf.compat.v1.train.get_or_create_global_step()
    optimizer = tf.compat.v1.train.AdamOptimizer(params["learning_rate"])

    # This implements the "burn-in" for prior parameters (see Appendix D of
    # [2]). For the first prior_burn_in_steps steps they are fixed, and then
    # trained jointly with the other parameters.
    grads_and_vars = optimizer.compute_gradients(loss)
    # Select by object identity: `!=` on TF variables turns into an
    # element-wise tensor comparison when tensor equality is enabled, which
    # cannot be used as a filter condition.
    grads_and_vars_except_prior = [
        grad_and_var for grad_and_var in grads_and_vars
        if grad_and_var[1] is not logit_concentration]

    def train_op_except_prior():
        return optimizer.apply_gradients(
            grads_and_vars_except_prior,
            global_step=global_step)

    def train_op_all():
        return optimizer.apply_gradients(
            grads_and_vars,
            global_step=global_step)

    train_op = tf.cond(
        pred=global_step < params["prior_burn_in_steps"],
        true_fn=train_op_except_prior,
        false_fn=train_op_all)

    # The perplexity is an exponent of the average negative ELBO per word.
    # Here the ratio is taken over the whole batch: total negative ELBO
    # divided by the total of `features`.
    log_perplexity = -tf.reduce_sum(elbo) / tf.reduce_sum(features)
    (log_perplexity_tensor, log_perplexity_update) = (
        tf.compat.v1.metrics.mean(log_perplexity))
    perplexity_tensor = tf.exp(log_perplexity_tensor)

    # Obtain the topics summary. Implemented as a py_func for simplicity.
    topics = tf.compat.v1.py_func(
        functools.partial(get_topics_strings,
                          vocabulary=params["vocabulary"]),
        [topics_words, concentration], tf.string, stateful=False)
    tf.compat.v1.summary.text("topics", topics)

    # Clipped encoder output, exposed as the prediction.
    var_concentration = _clip_dirichlet_parameters(encoder_net(features))

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metric_ops={
            "elbo": tf.compat.v1.metrics.mean(elbo),
            "log_likelihood": tf.compat.v1.metrics.mean(log_likelihood),
            "kl": tf.compat.v1.metrics.mean(kl),
            "perplexity": (perplexity_tensor, log_perplexity_update),
            "topics": (topics, tf.no_op()),
        },
        predictions={'topics_posterior_params': var_concentration}
    )
def main(argv):
    """Trains the deep exponential family model and reports progress.

    Recreates `FLAGS.model_dir`, loads (or fabricates, when
    `FLAGS.fake_data` is set) a bag-of-words matrix, builds the ELBO and
    runs `FLAGS.max_steps` Adam steps on its negative. Every 500 steps it
    prints the loss, writes summaries, prints a perplexity bound and the top
    words of up to ten topics.

    Raises:
      NotImplementedError: If `FLAGS.layer_sizes` does not have exactly three
        entries.
    """
    del argv  # unused

    FLAGS.layer_sizes = [int(layer_size) for layer_size in FLAGS.layer_sizes]
    if len(FLAGS.layer_sizes) != 3:
        raise NotImplementedError(
            "Specifying fewer or more than 3 layers is not "
            "currently available.")

    # Start from an empty model directory.
    if tf.io.gfile.exists(FLAGS.model_dir):
        tf.compat.v1.logging.warning(
            "Warning: deleting old log directory at {}".format(
                FLAGS.model_dir))
        tf.io.gfile.rmtree(FLAGS.model_dir)
    tf.io.gfile.makedirs(FLAGS.model_dir)

    if FLAGS.fake_data:
        bag_of_words = np.random.poisson(1., size=[10, 25])
        words = [str(i) for i in range(25)]
    else:
        bag_of_words, words = load_nips2011_papers(FLAGS.data_dir)

    total_count = np.sum(bag_of_words)
    bag_of_words = tf.cast(bag_of_words, dtype=tf.float32)
    data_size, feature_size = bag_of_words.shape

    # Compute expected log-likelihood. First, sample from the variational
    # distribution; second, compute the log-likelihood given the sample.
    qw2, qw1, qw0, qz2, qz1, qz0 = deep_exponential_family_variational(
        data_size,
        feature_size,
        FLAGS.layer_sizes)

    with ed.tape() as model_tape:
        with ed.interception(
                ed.make_value_setter(w2=qw2, w1=qw1, w0=qw0,
                                     z2=qz2, z1=qz1, z0=qz0)):
            posterior_predictive = deep_exponential_family(
                data_size,
                feature_size,
                FLAGS.layer_sizes,
                FLAGS.shape)

    log_likelihood = posterior_predictive.distribution.log_prob(bag_of_words)
    log_likelihood = tf.reduce_sum(input_tensor=log_likelihood)
    tf.compat.v1.summary.scalar("log_likelihood", log_likelihood)

    # Compute analytic KL-divergence between variational and prior
    # distributions.
    kl = 0.
    for rv_name, variational_rv in [("z0", qz0), ("z1", qz1), ("z2", qz2),
                                    ("w0", qw0), ("w1", qw1), ("w2", qw2)]:
        kl += tf.reduce_sum(
            input_tensor=variational_rv.distribution.kl_divergence(
                model_tape[rv_name].distribution))

    tf.compat.v1.summary.scalar("kl", kl)

    elbo = log_likelihood - kl
    tf.compat.v1.summary.scalar("elbo", elbo)
    optimizer = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate)
    train_op = optimizer.minimize(-elbo)

    summary = tf.compat.v1.summary.merge_all()

    # The session and the summary writer are released even if training
    # raises; previously neither was ever closed.
    with tf.compat.v1.Session() as sess:
        summary_writer = tf.compat.v1.summary.FileWriter(
            FLAGS.model_dir, sess.graph)
        try:
            sess.run(tf.compat.v1.global_variables_initializer())
            for step in range(FLAGS.max_steps):
                # Timed per step: the reported duration covers this step only.
                start_time = time.time()
                _, elbo_value = sess.run([train_op, elbo])
                if step % 500 != 0:
                    continue

                duration = time.time() - start_time
                # The minimized objective is -ELBO, so that is what is
                # reported under "Loss" (the ELBO itself was printed before).
                negative_elbo = -elbo_value
                print("Step: {:>3d} Loss: {:.3f} ({:.3f} sec)".format(
                    step, negative_elbo, duration))
                summary_str = sess.run(summary)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

                # Compute perplexity of the full data set. The model's
                # negative log-likelihood of data is upper bounded by the
                # variational objective.
                negative_log_likelihood = negative_elbo
                perplexity = np.exp(negative_log_likelihood / total_count)
                print("Negative log-likelihood <= {:0.3f}".format(
                    negative_log_likelihood))
                print("Perplexity <= {:0.3f}".format(perplexity))

                # Print top 10 words for first 10 topics.
                qw0_values = sess.run(qw0)
                for k in range(min(10, FLAGS.layer_sizes[-1])):
                    top_words_idx = qw0_values[k, :].argsort()[-10:][::-1]
                    top_words = " ".join(
                        [words[i] for i in top_words_idx])
                    print("Topic {}: {}".format(k, top_words))
        finally:
            summary_writer.close()
stddv_datapoints=stddv_datapoints, w=w, z=z, x=x_train) energy = -target(w, z) optimizer = tf.train.AdamOptimizer(learning_rate=0.05) train = optimizer.minimize(energy) init = tf.global_variables_initializer() t = [] num_epochs = 200 with tf.Session() as sess: sess.run(init) for i in range(num_epochs): sess.run(train) if i % 5 == 0: cE, cw, cz = sess.run([energy, w, z]) t.append(cE) w_inferred_map = sess.run(w) z_inferred_map = sess.run(z) with ed.interception(ed.make_value_setter(w=w_inferred_map, z=z_inferred_map)): generate = probabilistic_pca(data_dim=data_dim, latent_dim=latent_dim, num_datapoints=num_datapoints, stddv_datapoints=stddv_datapoints) with tf.Session() as sess: x_generated, _ = sess.run(generate) plt.imshow(x_generated) plt.show()