def to_noncentered(centered_state): set_values = ed_transforms.make_value_setter(*centered_state) with ed.tape() as noncentered_tape: with ed.interception(ed_transforms.ncp): with ed.interception(set_values): model(*model_args) return [tf.identity(v) for v in list(noncentered_tape.values())[:-1]]
def _sanity_check_conversion(self, model, model_args, observed, to_cp, to_ncp, make_to_cp): with ed.tape() as model_tape: model(*model_args) model_tape_ = self.evaluate(model_tape) example_params = list(model_tape_.values())[:-1] # Test that `make_to_cp`, when given the centered parameterization as the # source, generates the identity fn. param_names = [ p for v in model_tape_.keys() for p in (v + '_a', v + '_b') ] centered_parameterization = {p: 1. for p in param_names} identity_cp = make_to_cp(**centered_parameterization) example_params_copy = identity_cp(example_params) c1_ = self.evaluate(example_params_copy) c2_ = self.evaluate(example_params_copy) self.assertAllClose(c1_, c2_) self.assertAllClose(c1_, example_params) # Test that `to_ncp` and `to_cp` are deterministic and consistent ncp_params = to_ncp(example_params) cp_params = to_cp(ncp_params) ncp_params_, cp_params_ = self.evaluate((ncp_params, cp_params)) ncp_params2_, cp_params2_ = self.evaluate((ncp_params, cp_params)) # Test determinism self.assertAllClose(ncp_params_, ncp_params2_) self.assertAllClose(cp_params_, cp_params2_) # Test round-trip consistency: self.assertAllClose(cp_params_, example_params)
def loop_body(i): # trace the model to draw a single joint sample with ed.tape() as model_tape: model(*model_args) # pfor works with Tensors only, so extract RV values values = collections.OrderedDict( (k, rv.value) for k, rv in model_tape.items()) return values
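# A minimal sketch of how `loop_body` might be driven: it is written for TF's
# parallel-for (pfor) machinery, which is why the RandomVariables are unpacked
# into plain Tensors. The import path below is the TF 1.x internal pfor module
# and the batch size is illustrative; `model` and `model_args` are assumed to be
# in scope as above.
from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_ops

num_joint_samples = 8
joint_samples = pfor_ops.pfor(loop_body, num_joint_samples)
# Each entry of `joint_samples` now carries a leading dimension of size
# `num_joint_samples`, one row per independent draw from the joint model.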
def to_centered(uncentered_state): set_values = ed_transforms.make_value_setter(*uncentered_state) with ed.interception(set_values): with ed.interception(parametrisation): with ed.tape() as centered_tape: model(*model_args) return [tf.identity(v) for v in list(centered_tape.values())[:-1]]
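# A minimal sketch of the round-trip property that the sanity-check test above
# verifies for a matched to_centered/to_noncentered pair. It assumes `model`,
# `model_args`, `ed`, `tf`, and `np` are in scope, and that the last tape entry
# is the observed variable (as the [:-1] slicing assumes).
with ed.tape() as tape:
  model(*model_args)
centered_sample = [tf.identity(v) for v in list(tape.values())[:-1]]
roundtrip = to_centered(to_noncentered(centered_sample))

with tf.Session() as sess:
  original, recovered = sess.run((centered_sample, roundtrip))
  for orig_val, rec_val in zip(original, recovered):
    # The CP -> NCP -> CP round trip should reproduce the original draw.
    np.testing.assert_allclose(orig_val, rec_val, rtol=1e-5)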
def _build_model(self): with contextmanager.randvar_registry.init(self.graph): with contextmanager.layer_registry.init(): # use edward2 model tape to capture RandomVariable declarations with ed.tape() as model_tape: self.builder() # store the losses from the layers built through layers.sequential.Sequential # NOTE: this must be done inside the layer_registry context, where the sequentials are stored self.layer_losses = contextmanager.layer_registry.get_losses() # get variables from parameters var_parameters = contextmanager.randvar_registry.get_var_parameters() # wrap captured edward2 RVs into inferpy RVs model_vars = OrderedDict() for k, v in model_tape.items(): registered_rv = contextmanager.randvar_registry.get_variable(k) if registered_rv is None: # an edward2 RandomVariable. Create an InferPy RandomVariable and assign the variable directly. # The args and kwargs used to build the ed random variable are unknown, so use None. model_vars[k] = RandomVariable(v, name=k, is_datamodel=False, ed_cls=None, var_args=None, var_kwargs=None, sample_shape=()) else: model_vars[k] = registered_rv return model_vars, var_parameters
def run_parametrised_hmc(model_config, interceptor, num_samples=2000, burnin=1000, num_leapfrog_steps=4, num_adaptation_steps=500, num_optimization_steps=2000): """Given a (centred) model, this function transforms it based on the provided interceptor, and runs HMC on the reparameterised model. """ def model_ncp(*params): with ed.interception(interceptor): return model_config.model(*params) log_joint_noncentered = ed.make_log_joint_fn(model_ncp) with ed.tape() as model_tape: _ = model_ncp(*model_config.model_args) param_shapes = collections.OrderedDict() target_ncp_kwargs = {} for param in model_tape.keys(): if param not in model_config.observed_data.keys(): param_shapes[param] = model_tape[param].shape else: target_ncp_kwargs[param] = model_config.observed_data[param] def target_ncp(*param_args): i = 0 for param in model_tape.keys(): if param not in model_config.observed_data.keys(): target_ncp_kwargs[param] = param_args[i] i = i + 1 return log_joint_noncentered(*model_config.model_args, **target_ncp_kwargs) stepsize_kwargs = {'num_leapfrog_steps': num_leapfrog_steps} stepsize_kwargs['num_optimization_steps'] = num_optimization_steps for key in model_config.observed_data: stepsize_kwargs[key] = model_config.observed_data[key] (step_size_init_ncp, stepsize_elbo_ncp, vi_time) = util.approximate_mcmc_step_size(model_ncp, *model_config.model_args, **stepsize_kwargs) results = _run_hmc( target_ncp, param_shapes, step_size_init=step_size_init_ncp, transform=model_config.to_centered, num_samples=num_samples, burnin=burnin, num_adaptation_steps=num_adaptation_steps, num_leapfrog_steps=num_leapfrog_steps) results['elbo'] = stepsize_elbo_ncp results['vi_time'] = vi_time return results
def make_cvip_graph(model_config, parameterisation_type='exp', tied_pparams=False): """ Constructs the cVIP graph of the given model. Resets the default TF graph. """ tf.reset_default_graph() results = collections.OrderedDict() (learnable_parameters, learnable_parametrisation, _) = ed_transforms.make_learnable_parametrisation( tau=1., parameterisation_type=parameterisation_type, tied_pparams=tied_pparams) def model_vip(*params): with ed.interception(learnable_parametrisation): return model_config.model(*params) if model_config.bijectors_fn is not None: model_vip = ed_transforms.transform_with_bijectors( model_vip, model_config.bijectors_fn) log_joint_vip = ed.make_log_joint_fn(model_vip) # log_joint_fn with ed.tape() as model_tape: _ = model_vip(*model_config.model_args) target_vip_kwargs = {} for param in model_tape.keys(): if param in model_config.observed_data.keys(): target_vip_kwargs[param] = model_config.observed_data[param] def target_vip(*param_args): # latent_log_joint_fn i = 0 for param in model_tape.keys(): if param not in model_config.observed_data.keys(): target_vip_kwargs[param] = param_args[i] i = i + 1 return log_joint_vip(*model_config.model_args, **target_vip_kwargs) #full_kwargs = collections.OrderedDict(model_config.observed_data.items()) #full_kwargs['parameterisation'] = collections.OrderedDict() #for k in learnable_parameters.keys(): # full_kwargs['parameterisation'][k] = learnable_parameters[k] elbo, variational_parameters = util.get_mean_field_elbo( model_vip, target_vip, num_mc_samples=FLAGS.num_mc_samples, model_args=model_config.model_args, model_obs_kwargs=model_config.observed_data, vi_kwargs={'parameterisation': learnable_parameters}) #vi_kwargs=full_kwargs return target_vip, model_vip, elbo, variational_parameters, learnable_parameters
def testTapeNoName(self): def model(): x = ed.Normal(loc=0., scale=1., name="x") y = ed.Normal(loc=x, scale=1.) return x + y with ed.tape() as model_tape: _ = model() self.assertEqual(list(six.iterkeys(model_tape)), ["x"])
def to_noncentered(centered_state): set_values = ed_transforms.make_value_setter(*centered_state) with ed.tape() as noncentered_tape: with ed.interception(ed_transforms.ncp): with ed.interception(set_values): model(*model_args) param_vals = [ tf.identity(v) for k, v in noncentered_tape.items() if k not in observed_data.keys() ] return param_vals
def run_centered_hmc(model_config, num_samples=2000, burnin=1000, num_leapfrog_steps=4, num_adaptation_steps=500, num_optimization_steps=2000): """Runs HMC on the provided (centred) model.""" tf.compat.v1.reset_default_graph() log_joint_centered = ed.make_log_joint_fn(model_config.model) with ed.tape() as model_tape: _ = model_config.model(*model_config.model_args) param_shapes = collections.OrderedDict() target_cp_kwargs = {} for param in model_tape.keys(): if param not in model_config.observed_data.keys(): param_shapes[param] = model_tape[param].shape else: target_cp_kwargs[param] = model_config.observed_data[param] def target_cp(*param_args): i = 0 for param in model_tape.keys(): if param not in model_config.observed_data.keys(): target_cp_kwargs[param] = param_args[i] i = i + 1 return log_joint_centered(*model_config.model_args, **target_cp_kwargs) stepsize_kwargs = {'num_leapfrog_steps': num_leapfrog_steps} stepsize_kwargs['num_optimization_steps'] = num_optimization_steps for key in model_config.observed_data: stepsize_kwargs[key] = model_config.observed_data[key] (step_size_init_cp, stepsize_elbo_cp, vi_time) = util.approximate_mcmc_step_size(model_config.model, *model_config.model_args, **stepsize_kwargs) results = _run_hmc( target_cp, param_shapes, step_size_init=step_size_init_cp, num_samples=num_samples, burnin=burnin, num_adaptation_steps=num_adaptation_steps, num_leapfrog_steps=num_leapfrog_steps) results['elbo'] = stepsize_elbo_cp results['vi_time'] = vi_time return results
def testTape(self): def model(): x = ed.Normal(loc=0., scale=1., name="x") y = ed.Normal(loc=x, scale=1., name="y") return x + y with ed.tape() as model_tape: output = model() expected_value, actual_value = self.evaluate([ model_tape["x"] + model_tape["y"], output]) self.assertEqual(list(six.iterkeys(model_tape)), ["x", "y"]) self.assertEqual(expected_value, actual_value)
def make_dvip_graph(model_config, reparam, parameterisation_type='exp'): """ Constructs the dVIP graph of the given model, where `reparam` is a cVIP reparameterisation obtained previously. Resets the default TF graph. """ tf.reset_default_graph() results = collections.OrderedDict() _, insightful_parametrisation, _ = ed_transforms.make_learnable_parametrisation( learnable_parameters=reparam, parameterisation_type=parameterisation_type) def model_vip(*params): with ed.interception(insightful_parametrisation): return model_config.model(*params) if model_config.bijectors_fn is not None: model_vip = ed_transforms.transform_with_bijectors( model_vip, model_config.bijectors_fn) log_joint_vip = ed.make_log_joint_fn(model_vip) # log_joint_fn with ed.tape() as model_tape: _ = model_vip(*model_config.model_args) target_vip_kwargs = {} for param in model_tape.keys(): if param in model_config.observed_data.keys(): target_vip_kwargs[param] = model_config.observed_data[param] def target_vip(*param_args): # latent_log_joint_fn i = 0 for param in model_tape.keys(): if param not in model_config.observed_data.keys(): target_vip_kwargs[param] = param_args[i] i = i + 1 return log_joint_vip(*model_config.model_args, **target_vip_kwargs) elbo, variational_parameters = util.get_mean_field_elbo( model_vip, target_vip, num_mc_samples=FLAGS.num_mc_samples, model_args=model_config.model_args, model_obs_kwargs=model_config.observed_data, vi_kwargs={'parameterisation': reparam}) return target_vip, model_vip, elbo, variational_parameters, None
def testTapeInnerForwarding(self): def double(f, *args, **kwargs): return 2. * ed.interceptable(f)(*args, **kwargs) def model(): x = ed.Normal(loc=0., scale=1., name="x") y = ed.Normal(loc=x, scale=1., name="y") return x + y with ed.interception(double): with ed.tape() as model_tape: output = model() expected_value, actual_value = self.evaluate([ model_tape["x"] + model_tape["y"], output]) self.assertEqual(list(six.iterkeys(model_tape)), ["x", "y"]) self.assertEqual(expected_value, actual_value)
def make_ncp_graph(model_config): """ Constructs the NCP graph of the given model. Resets the default TF graph. """ tf.reset_default_graph() interceptor = ed_transforms.ncp def model_ncp(*params): with ed.interception(interceptor): return model_config.model(*params) if model_config.bijectors_fn is not None: model_ncp = ed_transforms.transform_with_bijectors( model_ncp, model_config.bijectors_fn) log_joint_noncentered = ed.make_log_joint_fn(model_ncp) with ed.tape() as model_tape: _ = model_ncp(*model_config.model_args) target_ncp_kwargs = {} for param in model_tape.keys(): if param in model_config.observed_data.keys(): target_ncp_kwargs[param] = model_config.observed_data[param] def target_ncp(*param_args): i = 0 for param in model_tape.keys(): if param not in model_config.observed_data.keys(): target_ncp_kwargs[param] = param_args[i] i = i + 1 return log_joint_noncentered(*model_config.model_args, **target_ncp_kwargs) elbo, variational_parameters = util.get_mean_field_elbo( model_config.model, target_ncp, num_mc_samples=FLAGS.num_mc_samples, model_args=model_config.model_args, model_obs_kwargs=model_config.observed_data, vi_kwargs=None) return target_ncp, model_ncp, elbo, variational_parameters, None
def _build_model(self): # get the global variables defined before building the model _before_global_variables = tf.global_variables() with contextmanager.randvar_registry.init(self.graph): # use edward2 model tape to capture RandomVariable declarations with ed.tape() as model_tape: self.builder() # get variables from parameters var_parameters = contextmanager.randvar_registry.get_var_parameters() # wrap captured edward2 RVs into inferpy RVs model_vars = OrderedDict() for k, v in model_tape.items(): registered_rv = contextmanager.randvar_registry.get_variable(k) if registered_rv is None: # an edward2 RandomVariable. Create an InferPy RandomVariable and assign the variable directly. # The args and kwargs used to build the ed random variable are unknown, so use None. model_vars[k] = RandomVariable(v, name=k, is_datamodel=False, ed_cls=None, var_args=None, var_kwargs=None, sample_shape=()) else: model_vars[k] = registered_rv # get the global variables defined after building the model _after_global_variables = tf.global_variables() # compute the new global variables defined when building the model created_vars = [ v for v in _after_global_variables if v not in _before_global_variables ] util.get_session().run(tf.variables_initializer(created_vars)) return model_vars, var_parameters
def make_cp_graph(model_config): """ Constructs the CP graph of the given model. Resets the default TF graph. """ tf.reset_default_graph() log_joint_centered = ed.make_log_joint_fn(model_config.model) with ed.tape() as model_tape: _ = model_config.model(*model_config.model_args) param_shapes = collections.OrderedDict() target_cp_kwargs = {} for param in model_tape.keys(): if param not in model_config.observed_data.keys(): param_shapes[param] = model_tape[param].shape else: target_cp_kwargs[param] = model_config.observed_data[param] def target_cp(*param_args): i = 0 for param in model_tape.keys(): if param not in model_config.observed_data.keys(): target_cp_kwargs[param] = param_args[i] i = i + 1 return log_joint_centered(*model_config.model_args, **target_cp_kwargs) elbo, variational_parameters = util.get_mean_field_elbo( model_config.model, target_cp, num_mc_samples=FLAGS.num_mc_samples, model_args=model_config.model_args, model_obs_kwargs=model_config.observed_data, vi_kwargs=None) return target_cp, model_config.model, elbo, variational_parameters, None
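# A minimal sketch of how one of the graph builders above could be driven,
# mirroring the TF 1.x optimisation loops used elsewhere in this section. The
# learning rate and step count are illustrative; `model_config` is assumed.
target_cp, model_cp, elbo, variational_parameters, _ = make_cp_graph(model_config)
train = tf.train.AdamOptimizer(learning_rate=0.01).minimize(-elbo)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for step in range(2000):
    _, elbo_val = sess.run([train, elbo])
# After training, `variational_parameters` holds the fitted mean-field
# parameters and `target_cp` can serve as the target_log_prob_fn of an HMC
# kernel.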
def _make_likelihood(rv_dict, model): """Produces optimizable tensor for model likelihood. Args: rv_dict: (dict of RandomVariable) Dictionary of random variables representing variational family for each model parameter. model: (Model) A model that contains definition, likelihood and training labels. Returns: log_likelihood: (tf.Tensor) A likelihood tensor with registered gradient with respect to VI parameters. outcome_rv: (ed.RandomVariable) A random variable representing model's predictive distribution. model_tape: (ContextManager) A ContextManager recording the model variables in model graph. """ with ed.tape() as model_tape: with ed.interception(model_util.make_value_setter(**rv_dict)): outcome_rv = model.definition() log_likelihood = model.likelihood(outcome_rv, model.outcome_obs) return log_likelihood, outcome_rv, model_tape
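# A minimal sketch of how the pieces returned by `_make_likelihood` might be
# combined into an ELBO, assuming `rv_dict` maps each model parameter name to a
# variational ed.RandomVariable and `model` follows the interface in the
# docstring above.
log_likelihood, outcome_rv, model_tape = _make_likelihood(rv_dict, model)

kl = 0.
for name, variational_rv in rv_dict.items():
  # Analytic KL between each variational factor and its prior from the tape.
  kl += tf.reduce_sum(
      variational_rv.distribution.kl_divergence(model_tape[name].distribution))

elbo = tf.reduce_sum(log_likelihood) - kl
train_op = tf.train.AdamOptimizer(learning_rate=0.01).minimize(-elbo)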
return set_values DATA_SIZE = 100 FEATURE_SIZE = 41 UNITS = [23, 7, 2] SHAPE = 0.1 x, w2, w1, w0, z2, z1, z0 = deep_exponential_family(DATA_SIZE, FEATURE_SIZE, UNITS, SHAPE) qw2, qw1, qw0, qz2, qz1, qz0 = deep_exponential_family_variational( w2, w1, w0, z2, z1, z0) # x_sample = np.random.poisson(5., size=[DATA_SIZE, FEATURE_SIZE]) # generate synthetic training data whose size matches the model x_sample = tf.placeholder(tf.float32, shape=[DATA_SIZE, FEATURE_SIZE]) # a placeholder can be used instead with ed.tape() as model_tape: with ed.interception( make_value_setter(w2=qw2, w1=qw1, w0=qw0, z2=qz2, z1=qz1, z0=qz0)): # replace the priors with the variational posteriors to build the posterior predictive posterior_predictive, _, _, _, _, _, _ = deep_exponential_family( DATA_SIZE, FEATURE_SIZE, UNITS, SHAPE) log_likelihood = posterior_predictive.distribution.log_prob(x_sample) print(log_likelihood) # log_likelihood is the log-likelihood evaluated on x_sample # define the loss using the variational (ELBO) objective kl = 0. for rv_name, variational_rv in [("z0", qz0), ("z1", qz1), ("z2", qz2), ("w0", qw0), ("w1", qw1), ("w2", qw2)]: # rv_name is the name of the prior RV # variational_rv is the corresponding variational posterior kl += tf.reduce_sum(
def model_fn(features, labels, mode, params, config): """Builds the model function for use in an Estimator. Arguments: features: The input features for the Estimator. labels: The labels, unused here. mode: Signifies whether it is train or test or predict. params: Some hyperparameters as a dictionary. config: The RunConfig, unused here. Returns: EstimatorSpec: A tf.estimator.EstimatorSpec instance. """ del labels, config # Set up the model's learnable parameters. logit_concentration = tf.get_variable( "logit_concentration", shape=[1, params["num_topics"]], initializer=tf.constant_initializer( _softplus_inverse(params["prior_initial_value"]))) concentration = _clip_dirichlet_parameters( tf.nn.softplus(logit_concentration)) num_words = features.shape[1] topics_words_logits = tf.get_variable( "topics_words_logits", shape=[params["num_topics"], num_words], initializer=tf.glorot_normal_initializer()) topics_words = tf.nn.softmax(topics_words_logits, axis=-1) # Compute expected log-likelihood. First, sample from the variational # distribution; second, compute the log-likelihood given the sample. lda_variational = make_lda_variational( params["activation"], params["num_topics"], params["layer_sizes"]) with ed.tape() as variational_tape: _ = lda_variational(features) with ed.tape() as model_tape: with ed.interception( make_value_setter(topics=variational_tape["topics_posterior"])): posterior_predictive = latent_dirichlet_allocation(concentration, topics_words) log_likelihood = posterior_predictive.distribution.log_prob(features) tf.summary.scalar("log_likelihood", tf.reduce_mean(log_likelihood)) # Compute the KL-divergence between two Dirichlets analytically. # The sampled KL does not work well for "sparse" distributions # (see Appendix D of [2]). kl = variational_tape["topics_posterior"].distribution.kl_divergence( model_tape["topics"].distribution) tf.summary.scalar("kl", tf.reduce_mean(kl)) # Ensure that the KL is non-negative (up to a very small slack). # Negative KL can happen due to numerical instability. with tf.control_dependencies([tf.assert_greater(kl, -1e-3, message="kl")]): kl = tf.identity(kl) elbo = log_likelihood - kl avg_elbo = tf.reduce_mean(elbo) tf.summary.scalar("elbo", avg_elbo) loss = -avg_elbo # Perform variational inference by minimizing the -ELBO. global_step = tf.train.get_or_create_global_step() optimizer = tf.train.AdamOptimizer(params["learning_rate"]) # This implements the "burn-in" for prior parameters (see Appendix D of [2]). # For the first prior_burn_in_steps steps they are fixed, and then trained # jointly with the other parameters. grads_and_vars = optimizer.compute_gradients(loss) grads_and_vars_except_prior = [ x for x in grads_and_vars if x[1] != logit_concentration] def train_op_except_prior(): return optimizer.apply_gradients( grads_and_vars_except_prior, global_step=global_step) def train_op_all(): return optimizer.apply_gradients( grads_and_vars, global_step=global_step) train_op = tf.cond( global_step < params["prior_burn_in_steps"], true_fn=train_op_except_prior, false_fn=train_op_all) # The perplexity is an exponent of the average negative ELBO per word. words_per_document = tf.reduce_sum(features, axis=1) log_perplexity = -elbo / words_per_document tf.summary.scalar("perplexity", tf.exp(tf.reduce_mean(log_perplexity))) (log_perplexity_tensor, log_perplexity_update) = tf.metrics.mean( log_perplexity) perplexity_tensor = tf.exp(log_perplexity_tensor) # Obtain the topics summary. Implemented as a py_func for simplicity. 
topics = tf.py_func( functools.partial(get_topics_strings, vocabulary=params["vocabulary"]), [topics_words, concentration], tf.string, stateful=False) tf.summary.text("topics", topics) return tf.estimator.EstimatorSpec( mode=mode, loss=loss, train_op=train_op, eval_metric_ops={ "elbo": tf.metrics.mean(elbo), "log_likelihood": tf.metrics.mean(log_likelihood), "kl": tf.metrics.mean(kl), "perplexity": (perplexity_tensor, log_perplexity_update), "topics": (topics, tf.no_op()), }, )
def main(argv): del argv # unused if tf.gfile.Exists(FLAGS.model_dir): tf.logging.warning("Warning: deleting old log directory at {}".format( FLAGS.model_dir)) tf.gfile.DeleteRecursively(FLAGS.model_dir) tf.gfile.MakeDirs(FLAGS.model_dir) tf.enable_eager_execution() grammar = SmilesGrammar() synthetic_data_distribution = ProbabilisticGrammar( grammar=grammar, latent_size=FLAGS.latent_size, num_units=FLAGS.num_units) print("Random examples from synthetic data distribution:") for _ in range(5): productions = synthetic_data_distribution() string = grammar.convert_to_string(productions) print(string) probabilistic_grammar = ProbabilisticGrammar(grammar=grammar, latent_size=FLAGS.latent_size, num_units=FLAGS.num_units) probabilistic_grammar_variational = ProbabilisticGrammarVariational( latent_size=FLAGS.latent_size) checkpoint = tf.train.Checkpoint( synthetic_data_distribution=synthetic_data_distribution, probabilistic_grammar=probabilistic_grammar, probabilistic_grammar_variational=probabilistic_grammar_variational) global_step = tf.train.get_or_create_global_step() optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate) writer = tf.contrib.summary.create_file_writer(FLAGS.model_dir) writer.set_as_default() start_time = time.time() for step in range(FLAGS.max_steps): productions = synthetic_data_distribution() with tf.GradientTape() as tape: # Sample from amortized variational distribution and record its trace. with ed.tape() as variational_tape: _ = probabilistic_grammar_variational(productions) # Set model trace to take on the data's values and the sample from the # variational distribution. values = {"latent_code": variational_tape["latent_code_posterior"]} values.update({ "production_" + str(t): production for t, production in enumerate(tf.unstack(productions, axis=1)) }) with ed.tape() as model_tape: with ed.interception(make_value_setter(**values)): _ = probabilistic_grammar() # Compute the ELBO given the variational sample, averaged over the batch # size and the number of time steps (number of productions). Although the # ELBO per data point sums over time steps, we average in order to have a # value that remains on the same scale across batches. log_likelihood = 0. for name, rv in six.iteritems(model_tape): if name.startswith("production"): log_likelihood += rv.distribution.log_prob(rv.value) kl = tfp.distributions.kl_divergence( variational_tape["latent_code_posterior"].distribution, model_tape["latent_code"].distribution) timesteps = tf.to_float(productions.shape[1]) elbo = tf.reduce_mean(log_likelihood - kl) / timesteps loss = -elbo with tf.contrib.summary.record_summaries_every_n_global_steps(500): tf.contrib.summary.scalar( "log_likelihood", tf.reduce_mean(log_likelihood) / timesteps) tf.contrib.summary.scalar("kl", tf.reduce_mean(kl) / timesteps) tf.contrib.summary.scalar("elbo", elbo) variables = (probabilistic_grammar.variables + probabilistic_grammar_variational.variables) grads = tape.gradient(loss, variables) grads_and_vars = zip(grads, variables) optimizer.apply_gradients(grads_and_vars, global_step) if step % 500 == 0: duration = time.time() - start_time print("Step: {:>3d} Loss: {:.3f} ({:.3f} sec)".format( step, loss, duration)) checkpoint.save(file_prefix=FLAGS.model_dir)
def model_fn(features, labels, mode, params, config): """Builds the model function for use in an Estimator. Arguments: features: The input features for the Estimator. labels: The labels, unused here. mode: Signifies whether it is train or test or predict. params: Some hyperparameters as a dictionary. config: The RunConfig, unused here. Returns: EstimatorSpec: A tf.estimator.EstimatorSpec instance. """ del labels, config # Set up the model's learnable parameters. logit_concentration = tf.get_variable( "logit_concentration", shape=[1, params["num_topics"]], initializer=tf.constant_initializer( _softplus_inverse(params["prior_initial_value"]))) concentration = _clip_dirichlet_parameters( tf.nn.softplus(logit_concentration)) num_words = features.shape[1] topics_words_logits = tf.get_variable( "topics_words_logits", shape=[params["num_topics"], num_words], initializer=tf.glorot_normal_initializer()) topics_words = tf.nn.softmax(topics_words_logits, axis=-1) # Compute expected log-likelihood. First, sample from the variational # distribution; second, compute the log-likelihood given the sample. lda_variational = make_lda_variational(params["activation"], params["num_topics"], params["layer_sizes"]) with ed.tape() as variational_tape: _ = lda_variational(features) with ed.tape() as model_tape: with ed.interception( make_value_setter( topics=variational_tape["topics_posterior"])): posterior_predictive = latent_dirichlet_allocation( concentration, topics_words) log_likelihood = posterior_predictive.distribution.log_prob(features) tf.summary.scalar("log_likelihood", tf.reduce_mean(log_likelihood)) # Compute the KL-divergence between two Dirichlets analytically. # The sampled KL does not work well for "sparse" distributions # (see Appendix D of [2]). kl = variational_tape["topics_posterior"].distribution.kl_divergence( model_tape["topics"].distribution) tf.summary.scalar("kl", tf.reduce_mean(kl)) # Ensure that the KL is non-negative (up to a very small slack). # Negative KL can happen due to numerical instability. with tf.control_dependencies([tf.assert_greater(kl, -1e-3, message="kl")]): kl = tf.identity(kl) elbo = log_likelihood - kl avg_elbo = tf.reduce_mean(elbo) tf.summary.scalar("elbo", avg_elbo) loss = -avg_elbo # Perform variational inference by minimizing the -ELBO. global_step = tf.train.get_or_create_global_step() optimizer = tf.train.AdamOptimizer(params["learning_rate"]) # This implements the "burn-in" for prior parameters (see Appendix D of [2]). # For the first prior_burn_in_steps steps they are fixed, and then trained # jointly with the other parameters. grads_and_vars = optimizer.compute_gradients(loss) grads_and_vars_except_prior = [ x for x in grads_and_vars if x[1] != logit_concentration ] def train_op_except_prior(): return optimizer.apply_gradients(grads_and_vars_except_prior, global_step=global_step) def train_op_all(): return optimizer.apply_gradients(grads_and_vars, global_step=global_step) train_op = tf.cond(global_step < params["prior_burn_in_steps"], true_fn=train_op_except_prior, false_fn=train_op_all) # The perplexity is an exponent of the average negative ELBO per word. words_per_document = tf.reduce_sum(features, axis=1) log_perplexity = -elbo / words_per_document tf.summary.scalar("perplexity", tf.exp(tf.reduce_mean(log_perplexity))) (log_perplexity_tensor, log_perplexity_update) = tf.metrics.mean(log_perplexity) perplexity_tensor = tf.exp(log_perplexity_tensor) # Obtain the topics summary. Implemented as a py_func for simplicity. 
topics = tf.py_func(functools.partial(get_topics_strings, vocabulary=params["vocabulary"]), [topics_words, concentration], tf.string, stateful=False) tf.summary.text("topics", topics) return tf.estimator.EstimatorSpec( mode=mode, loss=loss, train_op=train_op, eval_metric_ops={ "elbo": tf.metrics.mean(elbo), "log_likelihood": tf.metrics.mean(log_likelihood), "kl": tf.metrics.mean(kl), "perplexity": (perplexity_tensor, log_perplexity_update), "topics": (topics, tf.no_op()), }, )
def main(argv): del argv # unused FLAGS.layer_sizes = [int(layer_size) for layer_size in FLAGS.layer_sizes] if len(FLAGS.layer_sizes) != 3: raise NotImplementedError("Specifying fewer or more than 3 layers is not " "currently available.") if tf.gfile.Exists(FLAGS.model_dir): tf.logging.warning( "Warning: deleting old log directory at {}".format(FLAGS.model_dir)) tf.gfile.DeleteRecursively(FLAGS.model_dir) tf.gfile.MakeDirs(FLAGS.model_dir) if FLAGS.fake_data: bag_of_words = np.random.poisson(1., size=[10, 25]) words = [str(i) for i in range(25)] else: bag_of_words, words = load_nips2011_papers(FLAGS.data_dir) total_count = np.sum(bag_of_words) bag_of_words = tf.to_float(bag_of_words) data_size, feature_size = bag_of_words.shape # Compute expected log-likelihood. First, sample from the variational # distribution; second, compute the log-likelihood given the sample. qw2, qw1, qw0, qz2, qz1, qz0 = deep_exponential_family_variational( data_size, feature_size, FLAGS.layer_sizes) with ed.tape() as model_tape: with ed.interception(make_value_setter(w2=qw2, w1=qw1, w0=qw0, z2=qz2, z1=qz1, z0=qz0)): posterior_predictive = deep_exponential_family(data_size, feature_size, FLAGS.layer_sizes, FLAGS.shape) log_likelihood = posterior_predictive.distribution.log_prob(bag_of_words) log_likelihood = tf.reduce_sum(log_likelihood) tf.summary.scalar("log_likelihood", log_likelihood) # Compute analytic KL-divergence between variational and prior distributions. kl = 0. for rv_name, variational_rv in [("z0", qz0), ("z1", qz1), ("z2", qz2), ("w0", qw0), ("w1", qw1), ("w2", qw2)]: kl += tf.reduce_sum(variational_rv.distribution.kl_divergence( model_tape[rv_name].distribution)) tf.summary.scalar("kl", kl) elbo = log_likelihood - kl tf.summary.scalar("elbo", elbo) optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate) train_op = optimizer.minimize(-elbo) sess = tf.Session() summary = tf.summary.merge_all() summary_writer = tf.summary.FileWriter(FLAGS.model_dir, sess.graph) start_time = time.time() sess.run(tf.global_variables_initializer()) for step in range(FLAGS.max_steps): start_time = time.time() _, elbo_value = sess.run([train_op, elbo]) if step % 500 == 0: duration = time.time() - start_time print("Step: {:>3d} Loss: {:.3f} ({:.3f} sec)".format( step, elbo_value, duration)) summary_str = sess.run(summary) summary_writer.add_summary(summary_str, step) summary_writer.flush() # Compute perplexity of the full data set. The model's negative # log-likelihood of data is upper bounded by the variational objective. negative_log_likelihood = -elbo_value perplexity = np.exp(negative_log_likelihood / total_count) print("Negative log-likelihood <= {:0.3f}".format( negative_log_likelihood)) print("Perplexity <= {:0.3f}".format(perplexity)) # Print top 10 words for first 10 topics. qw0_values = sess.run(qw0) for k in range(min(10, FLAGS.layer_sizes[-1])): top_words_idx = qw0_values[k, :].argsort()[-10:][::-1] top_words = " ".join([words[i] for i in top_words_idx]) print("Topic {}: {}".format(k, top_words))
def run_vip_hmc_continuous(model_config, num_samples=2000, burnin=1000, use_iaf_posterior=False, num_leapfrog_steps=4, num_adaptation_steps=500, num_optimization_steps=2000, num_mc_samples=32, tau=1., do_sample=True, description='', experiments_dir=''): tf.reset_default_graph() if use_iaf_posterior: # IAF posterior doesn't give us stddevs for step sizes for HMC (we could # extract them by sampling but I haven't implemented that), and we mostly # care about it for ELBOs anyway. do_sample = False init_val_loc = tf.placeholder('float', shape=()) init_val_scale = tf.placeholder('float', shape=()) (learnable_parameters, learnable_parametrisation, _) = ed_transforms.make_learnable_parametrisation( init_val_loc=init_val_loc, init_val_scale=init_val_scale, tau=tau) def model_vip(*params): with ed.interception(learnable_parametrisation): return model_config.model(*params) log_joint_vip = ed.make_log_joint_fn(model_vip) with ed.tape() as model_tape: _ = model_vip(*model_config.model_args) param_shapes = collections.OrderedDict() target_vip_kwargs = {} for param in model_tape.keys(): if param not in model_config.observed_data.keys(): param_shapes[param] = model_tape[param].shape else: target_vip_kwargs[param] = model_config.observed_data[param] def target_vip(*param_args): i = 0 for param in model_tape.keys(): if param not in model_config.observed_data.keys(): target_vip_kwargs[param] = param_args[i] i = i + 1 return log_joint_vip(*model_config.model_args, **target_vip_kwargs) full_kwargs = collections.OrderedDict(model_config.observed_data.items()) full_kwargs['parameterisation'] = collections.OrderedDict() for k in learnable_parameters.keys(): full_kwargs['parameterisation'][k] = learnable_parameters[k] if use_iaf_posterior: elbo = util.get_iaf_elbo( target_vip, num_mc_samples=num_mc_samples, param_shapes=param_shapes) variational_parameters = {} else: elbo, variational_parameters = util.get_mean_field_elbo( model_vip, target_vip, num_mc_samples=num_mc_samples, model_args=model_config.model_args, vi_kwargs=full_kwargs) vip_step_size_approx = util.get_approximate_step_size( variational_parameters, num_leapfrog_steps) ############################################################################## best_elbo = None model_dir = os.path.join(experiments_dir, str(description + '_' + model_config.model.__name__)) if not tf.gfile.Exists(model_dir): tf.gfile.MakeDirs(model_dir) saver = tf.train.Saver() dir_save = os.path.join(model_dir, 'saved_params_{}'.format(gen_id())) if not tf.gfile.Exists(dir_save): tf.gfile.MakeDirs(dir_save) best_lr = None best_init_loc = None best_init_scale = None learning_rate_ph = tf.placeholder(shape=[], dtype=tf.float32) learning_rate = tf.Variable(learning_rate_ph, trainable=False) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) train = optimizer.minimize(-elbo) init = tf.global_variables_initializer() learning_rates = [0.003, 0.01, 0.01, 0.1, 0.003, 0.01] if use_iaf_posterior: learning_rates = [3e-5, 1e-4, 3e-4, 1e-4] start_time = time.time() for learning_rate_val in learning_rates: for init_loc in [0.]: #, 10., -10.]: for init_scale in [init_loc]: timeline = [] with tf.Session() as sess: init.run(feed_dict={init_val_loc: init_loc, init_val_scale: init_scale, learning_rate_ph: learning_rate_val}) this_timeline = [] for i in range(num_optimization_steps): _, e = sess.run([train, elbo]) if np.isnan(e): util.print('got NaN in ELBO optimization, stopping...') break this_timeline.append(e) this_elbo = np.mean(this_timeline[-100:]) info_str = ('finished cVIP 
optimization with elbo {} vs ' 'best ELBO {}'.format(this_elbo, best_elbo)) util.print(info_str) if best_elbo is None or best_elbo < this_elbo: best_elbo = this_elbo timeline = this_timeline vals = sess.run(list(learnable_parameters.values())) learned_reparam = collections.OrderedDict( zip(learnable_parameters.keys(), vals)) vals = sess.run(list(variational_parameters.values())) learned_variational_params = collections.OrderedDict( zip(variational_parameters.keys(), vals)) util.print('learned params {}'.format(learned_reparam)) util.print('learned variational params {}'.format( learned_variational_params)) _ = saver.save(sess, dir_save) best_lr = learning_rate best_init_loc = init_loc best_init_scale = init_scale vi_time = time.time() - start_time util.print('BEST: LR={}, init={}, {}'.format(best_lr, best_init_loc, best_init_scale)) util.print('ELBO: {}'.format(best_elbo)) to_centered = model_config.make_to_centered(**learned_reparam) results = collections.OrderedDict() results['elbo'] = best_elbo with tf.Session() as sess: saver.restore(sess, dir_save) results['vp'] = learned_variational_params if do_sample: vip_step_size_init = sess.run(vip_step_size_approx) vip_step_size = [tf.get_variable( name='step_size_vip'+str(i), initializer=np.array(vip_step_size_init[i], dtype=np.float32), use_resource=True, # For TFE compatibility. trainable=False) for i in range(len(vip_step_size_init))] kernel_vip = mcmc.HamiltonianMonteCarlo( target_log_prob_fn=target_vip, step_size=vip_step_size, num_leapfrog_steps=num_leapfrog_steps, step_size_update_fn=mcmc.make_simple_step_size_update_policy( num_adaptation_steps=num_adaptation_steps, target_rate=0.85)) states, kernel_results_vip = mcmc.sample_chain( num_results=num_samples, num_burnin_steps=burnin, current_state=[ tf.zeros(param_shapes[param]) for param in param_shapes.keys() ], kernel=kernel_vip, num_steps_between_results=1) states_vip = transform_mcmc_states(states, to_centered) init_again = tf.global_variables_initializer() init_again.run(feed_dict={ init_val_loc: best_init_loc, init_val_scale: best_init_scale, learning_rate_ph: 1.0}) # learning rate doesn't matter for HMC. ess_vip = tfp.mcmc.effective_sample_size(states_vip) start_time = time.time() samples, is_accepted, ess, ss_vip, log_accept_ratio = sess.run( (states_vip, kernel_results_vip.is_accepted, ess_vip, kernel_results_vip.extra.step_size_assign, kernel_results_vip.log_accept_ratio)) sampling_time = time.time() - start_time results['samples'] = collections.OrderedDict() results['is_accepted'] = is_accepted results['acceptance_rate'] = np.sum(is_accepted) * 100. / float( num_samples) results['ess'] = ess results['sampling_time'] = sampling_time results['log_accept_ratio'] = log_accept_ratio results['step_size'] = [s[0] for s in ss_vip] i = 0 for param in param_shapes.keys(): results['samples'][param] = samples[i] i = i + 1 # end if results['parameterisation'] = collections.OrderedDict() i = 0 for param in param_shapes.keys(): name_a = param[:-5] + 'a' name_b = param[:-5] + 'b' try: results['parameterisation'][name_a] = learned_reparam[name_a] results['parameterisation'][name_b] = learned_reparam[name_b] except KeyError: continue i = i + 1 results['elbo_timeline'] = timeline results['vi_time'] = vi_time results['init_pos'] = best_init_loc return results
def run_interleaved_hmc(model_config, num_samples=2000, step_size_cp=0.1, step_size_ncp=0.1, burnin=1000, num_leapfrog_steps=4): """Given a (centred) model, this function transforms it to a fully non-centred one, and uses both models to run interleaved HMC. """ tf.reset_default_graph() log_joint_centered = ed.make_log_joint_fn(model_config.model) with ed.tape() as model_tape_cp: _ = model_config.model(*model_config.model_args) param_shapes = collections.OrderedDict() target_cp_kwargs = {} for param in model_tape_cp.keys(): if param not in model_config.observed_data.keys(): param_shapes[param] = model_tape_cp[param].shape else: target_cp_kwargs[param] = model_config.observed_data[param] def target_cp(*param_args): i = 0 for param in model_tape_cp.keys(): if param not in model_config.observed_data.keys(): target_cp_kwargs[param] = param_args[i] i = i + 1 return log_joint_centered(*model_config.model_args, **target_cp_kwargs) def model_noncentered(*params): with ed.interception(ed_transforms.ncp): return model_config.model(*params) log_joint_noncentered = ed.make_log_joint_fn(model_noncentered) with ed.tape() as model_tape_ncp: _ = model_noncentered(*model_config.model_args) param_shapes = collections.OrderedDict() target_ncp_kwargs = {} for param in model_tape_ncp.keys(): if param not in model_config.observed_data.keys(): param_shapes[param] = model_tape_ncp[param].shape else: target_ncp_kwargs[param] = model_config.observed_data[param] def target_ncp(*param_args): i = 0 for param in model_tape_ncp.keys(): if param not in model_config.observed_data.keys(): target_ncp_kwargs[param] = param_args[i] i = i + 1 return log_joint_noncentered(*model_config.model_args, **target_ncp_kwargs) return _run_hmc_interleaved(target_cp, target_ncp, param_shapes, to_centered=model_config.to_centered, to_noncentered=model_config.to_noncentered, num_samples=num_samples, step_size_cp=step_size_cp, step_size_ncp=step_size_ncp, burnin=burnin, num_leapfrog_steps=num_leapfrog_steps)
def run_interleaved_hmc(model_config, results_dir, file_path): filename_cp = 'CP.json' filename_ncp = 'NCP.json' file_path_cp = os.path.join(results_dir, filename_cp) file_path_ncp = os.path.join(results_dir, filename_ncp) with ed.tape() as model_tape: model_config.model(*model_config.model_args) param_names = [ k for k in list(model_tape.keys()) if k not in model_config.observed_data ] if tf.io.gfile.exists(file_path_cp) and tf.io.gfile.exists(file_path_ncp): with tf.io.gfile.GFile(file_path_cp, 'r') as f: prev_results = json.load(f) initial_step_size_cp = prev_results['initial_step_size'] num_leapfrog_steps_cp = get_best_num_leapfrog_steps_from_tuning_runs( prev_results['tuning_runs']) learned_variational_params_cp = prev_results[ 'learned_variational_params'] with tf.io.gfile.GFile(file_path_ncp, 'r') as f: prev_results = json.load(f) initial_step_size_ncp = prev_results['initial_step_size'] num_leapfrog_steps_ncp = get_best_num_leapfrog_steps_from_tuning_runs( prev_results['tuning_runs']) else: raise Exception('Run VI first to find initial step sizes, and HMC ' 'first to find num_leapfrog_steps.') initial_states_cp = util.variational_inits_from_params( learned_variational_params_cp, param_names=param_names, num_inits=FLAGS.num_chains).values() best_ess_min = 0 best_num_ls = None results = () for num_ls in set([num_leapfrog_steps_ncp, num_leapfrog_steps_cp]): FLAGS.num_leapfrog_steps = num_ls + num_ls util.print('\nNumber of leapfrog steps is set to {}.\n'.format( FLAGS.num_leapfrog_steps)) (ess_min, sem_min, acceptance_rate_cp, acceptance_rate_ncp, mcmc_time, samples, normalized_ess_final) = run_interleaved_hmc_with_leapfrog_steps( model_config=model_config, results_dir=results_dir, num_leapfrog_steps_cp=num_ls, num_leapfrog_steps_ncp=num_ls, initial_step_size_cp=initial_step_size_cp, initial_step_size_ncp=initial_step_size_ncp, initial_states_cp=initial_states_cp) if ess_min.item() > best_ess_min: best_ess_min = ess_min.item() best_num_ls = num_ls results = (ess_min, sem_min, acceptance_rate_cp, acceptance_rate_ncp, mcmc_time, samples, normalized_ess_final) (ess_min, sem_min, acceptance_rate_cp, acceptance_rate_ncp, mcmc_time, samples, normalized_ess_final) = results FLAGS.num_leapfrog_steps = best_num_ls + best_num_ls save_hmc_results(file_path=file_path, initial_step_size_ncp=initial_step_size_ncp, initial_step_size_cp=initial_step_size_cp, num_leapfrog_steps=best_num_ls, ess_min=ess_min.item(), sem_min=sem_min.item(), acceptance_rate_cp=acceptance_rate_cp.item(), acceptance_rate_ncp=acceptance_rate_ncp.item(), mcmc_time_sec=mcmc_time) save_ess(file_path_base=file_path[:-5], samples=samples, param_names=param_names, normalized_ess_final=normalized_ess_final, num_chains_to_save=FLAGS.num_chains_to_save)
def run_hmc(model_config, results_dir, file_path, tuning=False): if tf.io.gfile.exists(file_path): with tf.io.gfile.GFile(file_path, 'r') as f: prev_results = json.load(f) else: raise Exception('Run VI first to find initial step sizes') with ed.tape() as model_tape: model_config.model(*model_config.model_args) param_names = [ k for k in list(model_tape.keys()) if k not in model_config.observed_data ] initial_step_size = prev_results['initial_step_size'] initial_states = util.variational_inits_from_params( prev_results['learned_variational_params'], param_names=param_names, num_inits=FLAGS.num_chains).values() if tuning: if not FLAGS.num_leapfrog_steps: raise ValueError( 'You must specify the number of leapfrog steps for a ' 'tuning run.') for existing_run in prev_results.get('tuning_runs', []): if existing_run['num_leapfrog_steps'] == FLAGS.num_leapfrog_steps: print( 'A tuning run already exists for HMC with {} leapfrog steps, ' 'skipping. ({})'.format(FLAGS.num_leapfrog_steps, existing_run)) return if not FLAGS.num_leapfrog_steps: FLAGS.num_leapfrog_steps = get_best_num_leapfrog_steps_from_tuning_runs( prev_results['tuning_runs']) util.print('\nNumber of leapfrog steps is set to {}.\n'.format( FLAGS.num_leapfrog_steps)) if FLAGS.count_in_leapfrog_steps: FLAGS.num_samples = int(FLAGS.num_samples / float(FLAGS.num_leapfrog_steps)) FLAGS.num_burnin_steps = int(FLAGS.num_burnin_steps / float(FLAGS.num_leapfrog_steps)) FLAGS.num_adaptation_steps = int(FLAGS.num_adaptation_steps / float(FLAGS.num_leapfrog_steps)) (target, _, elbo, variational_parameters, learnable_parameters, actual_reparam) = create_target_graph(model_config, results_dir) (states_orig, kernel_results, states, ess) = inference.hmc(target, model_config, initial_step_size, initial_states=initial_states, reparam=(actual_reparam if actual_reparam is not None else learned_reparam)) init = tf.compat.v1.global_variables_initializer() with tf.compat.v1.Session() as sess: #sess = tf_debug.LocalCLIDebugWrapperSession( # sess, dump_root="/usr/local/google/tmp/tfdbg") init.run() start_time = time.time() samples, is_accepted, ess_final, samples_orig = sess.run( (states, kernel_results.inner_results.is_accepted, ess, states_orig)) mcmc_time = time.time() - start_time normalized_ess_final = [] for ess_ in ess_final: # report effective samples per 1000 gradient evals normalized_ess_final.append( 1000 * ess_ / (FLAGS.num_samples * FLAGS.num_leapfrog_steps)) del ess_final ess_min, sem_min = util.get_min_ess(normalized_ess_final) util.print('ESS per 1000 gradients: {} +/- {}'.format(ess_min, sem_min)) acceptance_rate = (np.sum(is_accepted) * 100. / float(FLAGS.num_samples * FLAGS.num_chains)) if tuning: save_hmc_results(file_path=file_path, tuning_runs={ 'num_leapfrog_steps': FLAGS.num_leapfrog_steps, 'ess_min': ess_min.item(), 'sem_min': sem_min.item(), 'acceptance_rate': acceptance_rate.item(), 'mcmc_time': mcmc_time, 'num_samples': FLAGS.num_samples, 'num_burnin_steps': FLAGS.num_burnin_steps }) else: save_hmc_results(file_path=file_path, ess_min=ess_min.item(), sem_min=sem_min.item(), acceptance_rate=acceptance_rate.item(), mcmc_time_sec=mcmc_time) save_ess(file_path_base=file_path[:-5], samples=samples, param_names=param_names, normalized_ess_final=normalized_ess_final, num_chains_to_save=FLAGS.num_chains_to_save)
def main(argv): del argv # unused FLAGS.layer_sizes = [int(layer_size) for layer_size in FLAGS.layer_sizes] if len(FLAGS.layer_sizes) != 3: raise NotImplementedError( "Specifying fewer or more than 3 layers is not " "currently available.") if tf.io.gfile.exists(FLAGS.model_dir): tf.compat.v1.logging.warning( "Warning: deleting old log directory at {}".format( FLAGS.model_dir)) tf.io.gfile.rmtree(FLAGS.model_dir) tf.io.gfile.makedirs(FLAGS.model_dir) if FLAGS.fake_data: bag_of_words = np.random.poisson(1., size=[10, 25]) words = [str(i) for i in range(25)] else: bag_of_words, words = load_nips2011_papers(FLAGS.data_dir) total_count = np.sum(bag_of_words) bag_of_words = tf.cast(bag_of_words, dtype=tf.float32) data_size, feature_size = bag_of_words.shape # Compute expected log-likelihood. First, sample from the variational # distribution; second, compute the log-likelihood given the sample. qw2, qw1, qw0, qz2, qz1, qz0 = deep_exponential_family_variational( data_size, feature_size, FLAGS.layer_sizes) with ed.tape() as model_tape: with ed.interception( make_value_setter(w2=qw2, w1=qw1, w0=qw0, z2=qz2, z1=qz1, z0=qz0)): posterior_predictive = deep_exponential_family( data_size, feature_size, FLAGS.layer_sizes, FLAGS.shape) log_likelihood = posterior_predictive.distribution.log_prob(bag_of_words) log_likelihood = tf.reduce_sum(input_tensor=log_likelihood) tf.compat.v1.summary.scalar("log_likelihood", log_likelihood) # Compute analytic KL-divergence between variational and prior distributions. kl = 0. for rv_name, variational_rv in [("z0", qz0), ("z1", qz1), ("z2", qz2), ("w0", qw0), ("w1", qw1), ("w2", qw2)]: kl += tf.reduce_sum(input_tensor=variational_rv.distribution. kl_divergence(model_tape[rv_name].distribution)) tf.compat.v1.summary.scalar("kl", kl) elbo = log_likelihood - kl tf.compat.v1.summary.scalar("elbo", elbo) optimizer = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate) train_op = optimizer.minimize(-elbo) sess = tf.compat.v1.Session() summary = tf.compat.v1.summary.merge_all() summary_writer = tf.compat.v1.summary.FileWriter(FLAGS.model_dir, sess.graph) start_time = time.time() sess.run(tf.compat.v1.global_variables_initializer()) for step in range(FLAGS.max_steps): start_time = time.time() _, elbo_value = sess.run([train_op, elbo]) if step % 500 == 0: duration = time.time() - start_time print("Step: {:>3d} Loss: {:.3f} ({:.3f} sec)".format( step, elbo_value, duration)) summary_str = sess.run(summary) summary_writer.add_summary(summary_str, step) summary_writer.flush() # Compute perplexity of the full data set. The model's negative # log-likelihood of data is upper bounded by the variational objective. negative_log_likelihood = -elbo_value perplexity = np.exp(negative_log_likelihood / total_count) print("Negative log-likelihood <= {:0.3f}".format( negative_log_likelihood)) print("Perplexity <= {:0.3f}".format(perplexity)) # Print top 10 words for first 10 topics. qw0_values = sess.run(qw0) for k in range(min(10, FLAGS.layer_sizes[-1])): top_words_idx = qw0_values[k, :].argsort()[-10:][::-1] top_words = " ".join([words[i] for i in top_words_idx]) print("Topic {}: {}".format(k, top_words))