def analyze(f, title="Plot"):
    """Fit a two-group switched-likelihood VGP to data from ``f`` and plot it.

    Args:
        f: Zero-argument callable returning ``(X, Y, groups)`` arrays;
           ``groups`` assigns each row of ``Y`` to one of the two Gaussian
           likelihoods below.
        title: Plot title, also used as the stem of the saved .png filenames.

    Returns:
        The trained :class:`gpflow.models.VGP` model (previously ``None``;
        callers that ignored the return value are unaffected).
    """
    X, Y, groups = f()
    # The switched likelihood reads the group index from the extra Y column.
    Y_data = np.hstack([Y, groups])
    likelihood = gpflow.likelihoods.SwitchedLikelihood([
        gpflow.likelihoods.Gaussian(variance=1.0),
        gpflow.likelihoods.Gaussian(variance=1.0),
    ])

    # Model construction (notice that num_latent_gps is 1).
    kernel = gpflow.kernels.Matern52(lengthscales=0.5)
    model = gpflow.models.VGP(
        (X, Y_data), kernel=kernel, likelihood=likelihood, num_latent_gps=1
    )

    # Plot of the raw data.
    fig, ax = plt.subplots(1, 1, figsize=(12, 6))
    _ = ax.plot(X, Y_data, "kx")
    plt.xlabel("Minutes")
    plt.ylabel("Value")
    plt.title(title)
    plt.savefig(title + '.png')

    # Optimize only the variational parameters with natural gradients
    # (hyperparameters are left at their initial values here).
    natgrad = NaturalGradient(gamma=1.0)
    for _ in range(ci_niter(1000)):
        natgrad.minimize(model.training_loss, [(model.q_mu, model.q_sqrt)])

    # Plot of the fitted GP: posterior mean with a +/- 2 sigma band.
    xx = np.linspace(0, 30, 200)[:, None]
    mu, var = model.predict_f(xx)
    plt.figure(figsize=(12, 6))
    plt.plot(xx, mu, "C0")
    plt.plot(xx, mu + 2 * np.sqrt(var), "C0", lw=0.5)
    plt.plot(xx, mu - 2 * np.sqrt(var), "C0", lw=0.5)
    plt.plot(X, Y, "C1x", mew=2)
    plt.xlabel("Minutes")
    plt.ylabel("Value")
    plt.title(title)
    plt.savefig(title + ' GP model.png')

    print_summary(model)
    return model
def optimize(self):
    """Run 100 alternating NatGrad/Adam steps on ``self.model``.

    The variational parameters ``(q_mu, q_sqrt)`` are frozen so Adam does
    not touch them; they are updated only by the natural-gradient
    optimizer, while Adam updates the remaining trainable variables.
    """
    model = self.model
    # Keep Adam's hands off the variational distribution.
    for param in (model.q_mu, model.q_sqrt):
        set_trainable(param, False)
    var_pairs = [(model.q_mu, model.q_sqrt)]
    hyper_opt = tf.optimizers.Adam(1e-3)
    nat_opt = NaturalGradient(gamma=0.1)
    for _ in range(100):
        nat_opt.minimize(self.model.training_loss, var_list=var_pairs)
        hyper_opt.minimize(self.model.training_loss,
                           var_list=self.model.trainable_variables)
def __init__(self, variational_model: bool = True, do_monitor: bool = False):
    """Set up the optimizers and (optionally) a fresh TensorBoard log dir.

    Args:
        variational_model: If True, use Adam for hyperparameters plus
            NaturalGradient for the variational parameters; otherwise use
            a single Scipy optimizer.
        do_monitor: If True, log training to ``train_log/fit``, wiping any
            previous ``train_log`` directory first.
    """
    import shutil  # local import: only needed when monitoring is enabled

    self.var = variational_model
    self.do_monitor = do_monitor
    if do_monitor:
        self.monitor_path = "train_log/fit"
        # Portable, shell-free replacement for os.system("rm -rf train_log").
        shutil.rmtree("train_log", ignore_errors=True)
    if variational_model:
        self.opt = tf.optimizers.Adam()
        self.opt_var = NaturalGradient(gamma=0.1)
    else:
        self.opt = Scipy()
def assert_gpr_vs_vgp(
    m1: gpflow.models.BayesianModel,
    m2: gpflow.models.BayesianModel,
    gamma: float = 1.0,
    maxiter: int = 1,
    xi_transform: Optional[gpflow.optimizers.natgrad.XiTransform] = None,
):
    """Check that NatGrad steps on the VGP ``m2`` reproduce the GPR ``m1``.

    The two training losses must differ before optimization and agree to
    ``atol=1e-4`` after ``maxiter`` natural-gradient steps.
    """
    assert maxiter >= 1

    loss_gpr_start = m1.training_loss()
    loss_vgp_start = m2.training_loss()
    assert_different(loss_vgp_start, loss_gpr_start)

    # NatGrad accepts an optional xi transform as a third tuple entry.
    var_group = (m2.q_mu, m2.q_sqrt)
    if xi_transform is not None:
        var_group += (xi_transform, )

    optimizer = NaturalGradient(gamma)

    @tf.function
    def step():
        optimizer.minimize(m2.training_loss, var_list=[var_group])

    for _ in range(maxiter):
        step()

    loss_gpr_end = m1.training_loss()
    loss_vgp_end = m2.training_loss()
    np.testing.assert_allclose(loss_gpr_end, loss_vgp_end, atol=1e-4)
def assert_gpr_vs_vgp(
    m1: tf.Module,
    m2: tf.Module,
    gamma: float = 1.0,
    maxiter: int = 1,
    xi_transform: Optional[gpflow.optimizers.natgrad.XiTransform] = None):
    """Check that NatGrad steps on ``m2`` recover ``m1`` (legacy GPflow API).

    Log-likelihoods must differ before optimization and agree to
    ``atol=1e-4`` after ``maxiter`` natural-gradient steps.
    """
    assert maxiter >= 1

    ll_vgp_start = m2.log_likelihood()
    ll_gpr_start = m1.log_likelihood()
    assert ll_vgp_start != ll_gpr_start

    @tf.function(autograph=False)
    def objective() -> tf.Tensor:
        # NatGrad minimizes, so negate the marginal likelihood.
        return -m2.log_marginal_likelihood()

    # NatGrad accepts an optional xi transform as a third tuple entry.
    var_group = (m2.q_mu, m2.q_sqrt)
    if xi_transform is not None:
        var_group += (xi_transform, )

    optimizer = NaturalGradient(gamma)

    @tf.function(autograph=False)
    def step():
        optimizer.minimize(objective, var_list=[var_group])

    for _ in range(maxiter):
        step()

    ll_vgp_end = m2.log_likelihood()
    ll_gpr_end = m1.log_likelihood()
    np.testing.assert_allclose(ll_gpr_end, ll_vgp_end, atol=1e-4)
def analyze(f, title="Plot", rawplot=True, modelplot=True, summary=True):
    """Fit a two-group switched-likelihood VGP to data from ``f``.

    Args:
        f: Zero-argument callable returning ``(X, Y, groups)``; ``groups``
           selects which of the two Gaussian likelihoods each row of ``Y``
           belongs to.
        title: Plot title and filename stem for the saved .png files.
        rawplot: If True, save a scatter plot of the raw data.
        modelplot: If True, save the posterior mean +/- 2 sigma plot.
        summary: If True, print a parameter summary of the fitted model.

    Returns:
        The trained :class:`gpflow.models.VGP` model.
    """
    # Obtain randomly generated data; the switched likelihood reads the
    # group index from the extra column stacked onto Y.
    X, Y, groups = f()
    Y_data = np.hstack([Y, groups])

    # Model construction (notice that num_latent_gps is 1).
    likelihood = gpflow.likelihoods.SwitchedLikelihood(
        [gpflow.likelihoods.Gaussian(variance=1.0), gpflow.likelihoods.Gaussian(variance=1.0)]
    )
    kernel = gpflow.kernels.Matern52(lengthscales=0.5)
    model = gpflow.models.VGP((X, Y_data), kernel=kernel, likelihood=likelihood, num_latent_gps=1)

    # Optimize only the variational parameters with natural gradients.
    natgrad = NaturalGradient(gamma=1.0)
    for _ in range(ci_niter(1000)):
        natgrad.minimize(model.training_loss, [(model.q_mu, model.q_sqrt)])

    # Plot of the raw data.
    if rawplot:
        fig, ax = plt.subplots(1, 1, figsize=(12, 6))
        _ = ax.plot(X, Y_data, "kx")
        plt.xlabel("Minutes")
        plt.ylabel("Value")
        plt.title(title)
        plt.savefig(title + '.png')

    # Plot of GP model: posterior mean with a +/- 2 sigma band.
    if modelplot:
        xx = np.linspace(0, 30, 200)[:, None]
        mu, var = model.predict_f(xx)
        plt.figure(figsize=(12, 6))
        plt.plot(xx, mu, "C0")
        plt.plot(xx, mu + 2 * np.sqrt(var), "C0", lw=0.5)
        plt.plot(xx, mu - 2 * np.sqrt(var), "C0", lw=0.5)
        plt.plot(X, Y, "C1x", mew=2)
        plt.xlabel("Minutes")
        plt.ylabel("Value")
        plt.title(title)
        plt.savefig(title + ' GP model.png')

    if summary:
        print_summary(model)
    return model
def assert_sgpr_vs_svgp(
    m1: gpflow.models.BayesianModel,
    m2: gpflow.models.BayesianModel,
):
    """One full NatGrad step on the SVGP ``m2`` must match the SGPR ``m1``.

    Training losses must differ beforehand and agree to ``atol=1e-4``
    after a single natural-gradient step with ``gamma=1``.
    """
    data = m1.data

    loss_sgpr_start = m1.training_loss()
    loss_svgp_start = m2.training_loss(data)
    assert_different(loss_svgp_start, loss_sgpr_start)

    # A single full-step natural gradient (gamma=1) is exact here.
    optimizer = NaturalGradient(1.0)
    optimizer.minimize(
        m2.training_loss_closure(data),
        var_list=[(m2.q_mu, m2.q_sqrt)],
    )

    loss_sgpr_end = m1.training_loss()
    loss_svgp_end = m2.training_loss(data)
    np.testing.assert_allclose(loss_sgpr_end, loss_svgp_end, atol=1e-4)
def assert_sgpr_vs_svgp(m1: tf.Module, m2: tf.Module):
    """One full NatGrad step on ``m2`` must match ``m1`` (legacy GPflow API).

    Log-likelihoods must differ beforehand and agree to ``atol=1e-4``
    after a single natural-gradient step with ``gamma=1``.
    """
    data = m1.data
    x, y = data[0], data[1]

    ll_sgpr_start = m1.log_likelihood()
    ll_svgp_start = m2.log_likelihood(x, y)
    assert ll_svgp_start != ll_sgpr_start

    @tf.function(autograph=False)
    def objective() -> tf.Tensor:
        # NatGrad minimizes, so negate the marginal likelihood.
        return -m2.log_marginal_likelihood(x, y)

    optimizer = NaturalGradient(1.)
    optimizer.minimize(objective, var_list=[(m2.q_mu, m2.q_sqrt)])

    ll_sgpr_end = m1.log_likelihood()
    ll_svgp_end = m2.log_likelihood(x, y)
    np.testing.assert_allclose(ll_sgpr_end, ll_svgp_end, atol=1e-4)
def train_natgrad_adam(model, approx=False, num_iterations=2000, log_freq=10):
    """Train ``model`` with alternating NatGrad/Adam steps until the ELBO converges.

    Args:
        model: A GPflow-style model exposing ``q_mu``/``q_sqrt`` (and, when
            ``approx`` is True, ``q_mu_s``/``q_sqrt_s``), ``training_loss``,
            ``trainable_variables`` and ``elbo()``.
        approx: If True, also optimize the secondary variational parameters
            ``q_mu_s``/``q_sqrt_s`` with natural gradients.
        num_iterations: Maximum number of optimization steps.
        log_freq: Print progress every ``log_freq`` steps.

    Returns:
        List of ELBO values, one per completed optimization step.
    """
    natgrad_opt = NaturalGradient(gamma=1.0)
    adam_opt = tf.optimizers.Adam(learning_rate=0.01)
    # NOTE(review): zip(q_mu, q_sqrt) pairs elements of q_mu with elements of
    # q_sqrt — this assumes they are lists/sequences of parameters (e.g. a
    # multi-layer model). If they were single Parameters this would zip their
    # rows instead; confirm against the model class.
    variational_params = list(zip(model.q_mu, model.q_sqrt))
    # Freeze the variational parameters so Adam leaves them to NatGrad.
    gpflow.set_trainable(model.q_mu, False)
    gpflow.set_trainable(model.q_sqrt, False)
    if approx:
        variational_params.append((model.q_mu_s, model.q_sqrt_s))
        gpflow.set_trainable(model.q_mu_s, False)
        gpflow.set_trainable(model.q_sqrt_s, False)

    @tf.function
    def optimization_step():
        # NatGrad updates the (frozen-for-Adam) variational parameters,
        # Adam updates everything still trainable (hyperparameters).
        natgrad_opt.minimize(model.training_loss, var_list=variational_params)
        adam_opt.minimize(model.training_loss, var_list=model.trainable_variables)
        #return (model.elbo(), model.Fq)
        return model.elbo()

    log_elbo = []
    #log_Fq = []
    # log_predY = []
    tol = 1e-4  # absolute ELBO change below which training stops
    print('initial elbo {:.4f}'.format(model.elbo()))
    for step in range(num_iterations):
        start_time = time.time()
        #elbo, Fq = optimization_step()
        elbo = optimization_step()
        log_elbo.append(elbo)
        #log_Fq.append(Fq.numpy())
        # log_predY.append(pred_Y.numpy())
        # log_elbo[-2] is the previous step's ELBO (current one was just appended).
        if step > 0 and np.abs(elbo - log_elbo[-2]) < tol:
            print('converge at iteration {} elbo {:.4f}'.format(step+1, elbo))
            break
        if (step + 1) % log_freq == 0:
            print('iteration {} elbo {:.4f}, took {:.4f}s'.format(step+1, elbo, time.time()-start_time))
    #return (log_elbo, log_Fq)
    return log_elbo
likelihood=gpflow.likelihoods.Gaussian()) # %% [markdown] # The log marginal likelihood lower bound (evidence lower bound or ELBO) of the approximate GP model is: # %% vgp.elbo().numpy() # %% [markdown] # Obviously, our initial guess for the variational distribution is not correct, which results in a lower bound to the likelihood of the exact GPR model. We can optimize the variational parameters in order to get a tighter bound. # %% [markdown] # In fact, we only need to take **one step** in the natural gradient direction to recover the exact posterior: # %% natgrad_opt = NaturalGradient(gamma=1.0) variational_params = [(vgp.q_mu, vgp.q_sqrt)] natgrad_opt.minimize(vgp.training_loss, var_list=variational_params) # %% [markdown] # The ELBO of the approximate GP model after a single NatGrad step: # %% vgp.elbo().numpy() # %% [markdown] # ### Optimize both variational parameters and kernel hyperparameters together # # In the Gaussian likelihood case we can iterate between an Adam update for the hyperparameters and a NatGrad update for the variational parameters. That way, we achieve optimization of hyperparameters as if the model were a GPR. # %% [markdown]
def __init__(self, xin, yin, nInput, nOutput, xlb, xub, seed=None,
             batch_size=50, inducing_fraction=0.2, min_inducing=100,
             gp_lengthscale_bounds=(1e-6, 100.0), gp_likelihood_sigma=1.0e-4,
             natgrad_gamma=0.1, adam_lr=0.01, n_iter=30000,
             min_elbo_pct_change=1.0, num_latent_gps=None, logger=None):
    """Build and train a shared-inducing-variable SVGP Matern52 surrogate.

    Inputs are min-max normalized to ``[0, 1]`` using ``xlb``/``xub``;
    outputs are standardized per column. The SVGP is then optimized with
    alternating NatGrad (variational parameters) and Adam (hyperparameters)
    steps on minibatches until the mean ELBO percent change over the last
    100 logged values drops below ``min_elbo_pct_change``.

    Args:
        xin: Input samples, shape (N, nInput).
        yin: Output samples, shape (N,) or (N, nOutput).
        nInput, nOutput: Input/output dimensionality.
        xlb, xub: Per-dimension lower/upper input bounds for normalization.
        seed: NOTE(review): accepted but never used in this constructor.
        batch_size: Minibatch size for the tf.data pipeline.
        inducing_fraction: Fraction of N used as inducing points (all points
            are used when that count falls below ``min_inducing``).
        min_inducing: Minimum inducing-point count before subsampling kicks in.
        gp_lengthscale_bounds: (low, high) bounds for the kernel lengthscales.
        gp_likelihood_sigma: Initial Gaussian likelihood variance.
        natgrad_gamma: NatGrad step size.
        adam_lr: Adam learning rate.
        n_iter: Maximum number of optimization iterations.
        min_elbo_pct_change: Convergence threshold (percent).
        num_latent_gps: Number of latent GPs; defaults to ``nOutput``.
        logger: Optional logger for progress messages.

    Raises:
        RuntimeError: If GPflow is not installed.
    """
    if not _has_gpflow:
        raise RuntimeError(
            'SIV_Matern requires the GPflow library to be installed.')
    self.nInput = nInput
    self.nOutput = nOutput
    self.xlb = xlb
    self.xub = xub
    # Guard against zero-width dimensions so normalization never divides by 0.
    self.xrng = np.where(np.isclose(xub - xlb, 0., rtol=1e-6, atol=1e-6), 1.,
                         xub - xlb)
    self.logger = logger
    N = xin.shape[0]
    D = xin.shape[1]  # NOTE(review): assigned but never used below
    # Min-max normalize the inputs to [0, 1] per dimension.
    xn = np.zeros_like(xin)
    for i in range(N):
        xn[i, :] = (xin[i, :] - self.xlb) / self.xrng
    if nOutput == 1:
        yin = yin.reshape((yin.shape[0], 1))
    if num_latent_gps is None:
        num_latent_gps = nOutput
    # Per-output mean/std, kept for de-standardizing predictions later.
    self.y_train_mean = np.asarray(
        [np.mean(yin[:, i]) for i in range(yin.shape[1])], dtype=np.float32)
    self.y_train_std = np.asarray([
        handle_zeros_in_scale(np.std(yin[:, i], axis=0), copy=False)
        for i in range(yin.shape[1])
    ], dtype=np.float32)
    # Remove mean and make unit variance
    yn = np.column_stack(
        tuple((yin[:, i] - self.y_train_mean[i]) / self.y_train_std[i]
              for i in range(yin.shape[1])))
    adam_opt = tf.optimizers.Adam(adam_lr)
    natgrad_opt = NaturalGradient(gamma=natgrad_gamma)
    autotune = tf.data.experimental.AUTOTUNE
    if logger is not None:
        logger.info(f"SIV_Matern: creating regressor for output...")
        for i in range(nOutput):
            logger.info(
                f"SIV_Matern: y_{i+1} range is {(np.min(yin[:,i]), np.max(yin[:,i]))}"
            )
    data = (np.asarray(xn, dtype=np.float64), yn.astype(np.float64))
    # Choose the inducing set: all points when the requested fraction is
    # too small, otherwise a random subsample of size M.
    M = int(round(inducing_fraction * N))
    if M < min_inducing:
        Z = xn.copy()
    else:
        Z = xn[np.random.choice(N, size=M, replace=False), :].copy(
        )  # Initialize inducing locations to M random inputs
    iv = gpflow.inducing_variables.SharedIndependentInducingVariables(
        gpflow.inducing_variables.InducingPoints(Z))
    kernel = gpflow.kernels.Matern52()
    gp_kernel = gpflow.kernels.SharedIndependent(kernel, output_dim=nOutput)
    gp_likelihood = gpflow.likelihoods.Gaussian(
        variance=gp_likelihood_sigma)
    gp_model = gpflow.models.SVGP(inducing_variable=iv,
                                  kernel=gp_kernel,
                                  likelihood=gp_likelihood,
                                  num_data=N,
                                  num_latent_gps=num_latent_gps)
    # Constrain the (per-dimension) lengthscales to the given bounds.
    gp_model.kernel.kernel.lengthscales = bounded_parameter(
        np.asarray([gp_lengthscale_bounds[0]] * nInput, dtype=np.float64),
        np.asarray([gp_lengthscale_bounds[1]] * nInput, dtype=np.float64),
        np.ones(nInput, dtype=np.float64),
        trainable=True,
        name='lengthscales')
    # Freeze what Adam must not touch: NatGrad owns q_mu/q_sqrt, and the
    # inducing locations are kept fixed.
    gpflow.set_trainable(gp_model.q_mu, False)
    gpflow.set_trainable(gp_model.q_sqrt, False)
    gpflow.set_trainable(gp_model.inducing_variable, False)
    if logger is not None:
        logger.info(f"SIV_Matern: optimizing regressor...")
    variational_params = [(gp_model.q_mu, gp_model.q_sqrt)]
    # Infinite shuffled minibatch stream; the loss closure pulls from it.
    data_minibatch = (tf.data.Dataset.from_tensor_slices(data).prefetch(
        autotune).repeat().shuffle(N).batch(batch_size))
    data_minibatch_it = iter(data_minibatch)
    svgp_natgrad_loss = gp_model.training_loss_closure(data_minibatch_it,
                                                       compile=True)

    @tf.function
    def optim_step():
        # NatGrad step on the variational distribution, then Adam step on
        # the remaining trainable variables (kernel/likelihood parameters).
        natgrad_opt.minimize(svgp_natgrad_loss, var_list=variational_params)
        adam_opt.minimize(svgp_natgrad_loss,
                          var_list=gp_model.trainable_variables)

    iterations = ci_niter(n_iter)
    elbo_log = []
    # Convolving with [1, -1] gives successive differences of the ELBO log.
    diff_kernel = np.array([1, -1])
    for it in range(iterations):
        optim_step()
        # ELBO is sampled every 10 iterations (negated minibatch loss).
        if (it % 10 == 0):
            likelihood = -svgp_natgrad_loss().numpy()
            elbo_log.append(likelihood)
            # NOTE(review): logger is used unguarded below, unlike earlier in
            # this method — this crashes when logger=None; confirm intended.
            if (it % 1000 == 0):
                logger.info(
                    f"SIV_Matern: iteration {it} likelihood: {likelihood:.04f}"
                )
            # Convergence test: mean percent change of the last 100 logged
            # ELBO values, only once 2000 iterations have elapsed.
            if it >= 2000:
                elbo_change = np.convolve(elbo_log, diff_kernel, 'same')[1:]
                elbo_pct_change = (elbo_change / np.abs(elbo_log[1:])) * 100
                mean_elbo_pct_change = np.mean(elbo_pct_change[-100:])
                if (it % 1000 == 0):
                    logger.info(
                        f"SIV_Matern: iteration {it} mean elbo pct change: {mean_elbo_pct_change:.04f}"
                    )
                if mean_elbo_pct_change < min_elbo_pct_change:
                    logger.info(
                        f"SIV_Matern: likelihood change at iteration {it+1} is less than {min_elbo_pct_change} percent"
                    )
                    break
    print_summary(gp_model)
    # Store the trained surrogate model.
    self.sm = gp_model
def __init__(self, xin, yin, nInput, nOutput, xlb, xub, seed=None,
             gp_lengthscale_bounds=(1e-6, 100.0), gp_likelihood_sigma=1.0e-4,
             natgrad_gamma=1.0, adam_lr=0.01, n_iter=3000,
             min_elbo_pct_change=0.1, logger=None):
    """Build and train one VGP Matern52 surrogate per output dimension.

    Inputs are min-max normalized to ``[0, 1]`` using ``xlb``/``xub``;
    outputs are standardized per column. For each output a separate VGP is
    fitted with alternating NatGrad/Adam steps until the mean ELBO percent
    change over the last 100 iterations drops below ``min_elbo_pct_change``.

    Args:
        xin: Input samples, shape (N, nInput).
        yin: Output samples, shape (N,) or (N, nOutput).
        nInput, nOutput: Input/output dimensionality.
        xlb, xub: Per-dimension lower/upper input bounds for normalization.
        seed: NOTE(review): accepted but never used in this constructor.
        gp_lengthscale_bounds: (low, high) bounds for the kernel lengthscales.
        gp_likelihood_sigma: Initial Gaussian likelihood variance.
        natgrad_gamma: NatGrad step size.
        adam_lr: Adam learning rate.
        n_iter: Maximum number of optimization iterations per output.
        min_elbo_pct_change: Convergence threshold (percent).
        logger: Optional logger for progress messages.

    Raises:
        RuntimeError: If GPflow is not installed.
    """
    if not _has_gpflow:
        raise RuntimeError(
            'VGP_Matern requires the GPflow library to be installed.')
    self.nInput = nInput
    self.nOutput = nOutput
    self.xlb = xlb
    self.xub = xub
    # Guard against zero-width dimensions so normalization never divides by 0.
    self.xrng = np.where(np.isclose(xub - xlb, 0., rtol=1e-6, atol=1e-6), 1.,
                         xub - xlb)
    self.logger = logger
    N = xin.shape[0]
    # Min-max normalize the inputs to [0, 1] per dimension.
    xn = np.zeros_like(xin)
    for i in range(N):
        xn[i, :] = (xin[i, :] - self.xlb) / self.xrng
    if nOutput == 1:
        yin = yin.reshape((yin.shape[0], 1))
    # Per-output mean/std, kept for de-standardizing predictions later.
    self.y_train_mean = np.asarray(
        [np.mean(yin[:, i]) for i in range(yin.shape[1])], dtype=np.float32)
    self.y_train_std = np.asarray([
        handle_zeros_in_scale(np.std(yin[:, i], axis=0), copy=False)
        for i in range(yin.shape[1])
    ], dtype=np.float32)
    # Remove mean and make unit variance
    yn = np.column_stack(
        tuple((yin[:, i] - self.y_train_mean[i]) / self.y_train_std[i]
              for i in range(yin.shape[1])))
    # NOTE(review): both optimizers are shared across all per-output models;
    # Adam therefore carries moment state from one output's training into the
    # next — confirm this is intended.
    adam_opt = tf.optimizers.Adam(adam_lr)
    natgrad_opt = NaturalGradient(gamma=natgrad_gamma)
    smlist = []
    for i in range(nOutput):
        if logger is not None:
            logger.info(
                f"VGP_Matern: creating regressor for output {i+1} of {nOutput}..."
            )
            logger.info(
                f"VGP_Matern: y_{i} range is {(np.min(yin[:,i]), np.max(yin[:,i]))}..."
            )
        gp_kernel = gpflow.kernels.Matern52()
        gp_likelihood = gpflow.likelihoods.Gaussian(
            variance=gp_likelihood_sigma)
        # One single-output VGP per output column.
        gp_model = gpflow.models.VGP(
            data=(np.asarray(xn, dtype=np.float64), yn[:, i].reshape(
                (-1, 1)).astype(np.float64)),
            kernel=gp_kernel,
            likelihood=gp_likelihood,
        )
        # Constrain the (per-dimension) lengthscales to the given bounds.
        gp_model.kernel.lengthscales = bounded_parameter(
            np.asarray([gp_lengthscale_bounds[0]] * nInput, dtype=np.float64),
            np.asarray([gp_lengthscale_bounds[1]] * nInput, dtype=np.float64),
            np.ones(nInput, dtype=np.float64),
            trainable=True,
            name='lengthscales')
        # Freeze the variational parameters so Adam leaves them to NatGrad.
        gpflow.set_trainable(gp_model.q_mu, False)
        gpflow.set_trainable(gp_model.q_sqrt, False)
        if logger is not None:
            logger.info(
                f"VGP_Matern: optimizing regressor for output {i+1} of {nOutput}..."
            )
        variational_params = [(gp_model.q_mu, gp_model.q_sqrt)]
        iterations = ci_niter(n_iter)
        elbo_log = []
        # Convolving with [1, -1] gives successive differences of the ELBO log.
        diff_kernel = np.array([1, -1])

        @tf.function
        def optim_step():
            # NatGrad step on the variational distribution, then Adam step
            # on the remaining trainable variables (hyperparameters).
            natgrad_opt.minimize(gp_model.training_loss,
                                 var_list=variational_params)
            adam_opt.minimize(gp_model.training_loss,
                              var_list=gp_model.trainable_variables)

        for it in range(iterations):
            optim_step()
            likelihood = gp_model.elbo()
            # NOTE(review): logger is used unguarded below, unlike earlier in
            # this method — this crashes when logger=None; confirm intended.
            if (it % 100 == 0):
                logger.info(
                    f"VGP_Matern: iteration {it} likelihood: {likelihood:.04f}"
                )
            elbo_log.append(likelihood)
            # Convergence test: mean percent change of the last 100 ELBO
            # values, only once 200 iterations have elapsed.
            if it >= 200:
                elbo_change = np.convolve(elbo_log, diff_kernel, 'same')[1:]
                elbo_pct_change = (elbo_change / np.abs(elbo_log[1:])) * 100
                mean_elbo_pct_change = np.mean(elbo_pct_change[-100:])
                if mean_elbo_pct_change < min_elbo_pct_change:
                    logger.info(
                        f"VGP_Matern: likelihood change at iteration {it+1} is less than {min_elbo_pct_change} percent"
                    )
                    break
        print_summary(gp_model)
        #assert(opt_log.success)
        smlist.append(gp_model)
    # One trained surrogate per output dimension.
    self.smlist = smlist
class Trainer():
    """Trains a GPflow SVGP-style model.

    Two modes, chosen at construction time:
      * variational: Adam on hyperparameters + NaturalGradient on
        (q_mu, q_sqrt), iterating over minibatches;
      * non-variational: a single Scipy optimization over all trainable
        variables.
    Optionally logs training to TensorBoard via gpflow's Monitor tasks.
    """

    def __init__(self, variational_model: bool = True, do_monitor: bool = False):
        """Set up the optimizers and (optionally) a fresh TensorBoard log dir.

        Args:
            variational_model: If True, use Adam + NaturalGradient;
                otherwise a single Scipy optimizer.
            do_monitor: If True, log to ``train_log/fit``, wiping any
                previous ``train_log`` directory first.
        """
        import shutil  # local import: only needed when monitoring is enabled

        self.var = variational_model
        self.do_monitor = do_monitor
        if do_monitor:
            self.monitor_path = "train_log/fit"
            # Portable, shell-free replacement for os.system("rm -rf train_log").
            shutil.rmtree("train_log", ignore_errors=True)
        if variational_model:
            self.opt = tf.optimizers.Adam()
            self.opt_var = NaturalGradient(gamma=0.1)
        else:
            self.opt = Scipy()

    def run(self, model, dataset, epoch: int = 10):
        """Optimize ``model`` on ``dataset`` for ``epoch`` passes.

        Args:
            model: GPflow model exposing ``inducing_variable``, ``q_mu``,
                ``q_sqrt``, ``training_loss_closure`` and
                ``trainable_variables``.
            dataset: A batched tf.data.Dataset of training data.
            epoch: Number of passes over the dataset (variational mode).
        """
        num_iter = len(dataset) * epoch
        # Freeze what the generic optimizers must not touch: the inducing
        # points, and the variational parameters (NatGrad owns those).
        set_trainable(model.inducing_variable, False)
        set_trainable(model.q_mu, False)
        set_trainable(model.q_sqrt, False)
        if self.do_monitor:
            self.create_monitor(model)
        if self.var:
            train_iter = iter(dataset)
            training_loss = model.training_loss_closure(train_iter, compile=True)
            for step in tf.range(num_iter):
                self.optimization_step(model, training_loss)
                # BUG FIX: ``self.monitor`` only exists when monitoring was
                # requested; the previous unconditional call raised
                # AttributeError for do_monitor=False.
                if self.do_monitor:
                    self.monitor(step)
        else:
            # NOTE(review): training_loss_closure is given the unbatched
            # dataset object directly — confirm this matches the gpflow
            # version in use (it usually expects a data tuple).
            data = dataset.unbatch()
            self.opt.minimize(model.training_loss_closure(data),
                              variables=model.trainable_variables,
                              options={
                                  "disp": True,
                                  "maxiter": 1e3
                              })

    @tf.function
    def optimization_step(self, model, loss):
        """One Adam step on the hyperparameters plus one NatGrad step."""
        # BUG FIX: tf.optimizers.Adam.minimize takes ``var_list``; the
        # previous ``par_list=`` keyword raised a TypeError.
        self.opt.minimize(loss, var_list=model.trainable_variables)
        # BUG FIX: NaturalGradient expects a list of (q_mu, q_sqrt) PAIRS,
        # matching the usage everywhere else in this codebase; the previous
        # flat [q_mu, q_sqrt] list was unpacked incorrectly.
        self.opt_var.minimize(loss, var_list=[(model.q_mu, model.q_sqrt)])

    def create_monitor(self, model):
        """Create the TensorBoard monitor (logs model state every 5 steps)."""
        model_task = ModelToTensorBoard(self.monitor_path, model)
        self.monitor = Monitor(MonitorTaskGroup([model_task]), period=5)
NoiseVar = 2 * np.exp(-((X - 2) ** 2) / 4) + 0.3 # Noise variances Y = F + np.random.randn(N, 1) * np.sqrt(NoiseVar) # Noisy data return X, Y, NoiseVar #known noise X, Y, NoiseVar = generate_data() Y_data = np.hstack([Y, NoiseVar]) likelihood = HeteroskedasticGaussian() kernel = gpflow.kernels.Matern52(lengthscales=0.5) model = gpflow.models.VGP((X, Y_data), kernel=kernel, likelihood=likelihood, num_latent_gps=1) natgrad = NaturalGradient(gamma=1.0) adam = tf.optimizers.Adam() set_trainable(model.q_mu, False) set_trainable(model.q_sqrt, False) for _ in range(ci_niter(1000)): natgrad.minimize(model.training_loss, [(model.q_mu, model.q_sqrt)]) adam.minimize(model.training_loss, model.trainable_variables) for _ in range(ci_niter(1000)): natgrad.minimize(model.training_loss, [(model.q_mu, model.q_sqrt)]) adam.minimize(model.training_loss, model.trainable_variables) fig, ax = plt.subplots(1, 1, figsize=(12, 6)) _ = ax.errorbar(
def fit(self, X, Y, Xval, Yval):
    """Build and train an SVGP on (X, Y), reporting validation error.

    Builds the likelihood from ``self.do_classif``/``self.num_classes``
    (Bernoulli for binary, Softmax for multiclass, Gaussian for
    regression), then runs ``self.num_iter`` steps of Adam (and NatGrad on
    the variational parameters when ``self.natgrad_lr > 0``) on a
    generator-backed tf.data pipeline.

    Args:
        X, Y: Training inputs/targets (Y one-hot for multiclass, +/-1 for
            binary — presumably; TODO confirm against callers).
        Xval, Yval: Validation split, scored with ``self.err_fn`` every
            ``self.error_every`` steps.

    Returns:
        self, with ``self.model`` set to the trained SVGP.

    Raises:
        NotImplementedError: For an unsupported ``self.var_dist`` value.
    """
    N = X.shape[0]
    # q_diag selects a diagonal vs full covariance for the variational
    # distribution.
    if self.var_dist == "diag":
        q_diag = True
    elif self.var_dist == "full":
        q_diag = False
    else:
        raise NotImplementedError(
            "GPFlow cannot implement %s variational distribution" %
            (self.var_dist))
    if self.do_classif:
        if self.num_classes == 2:
            likelihood = gpflow.likelihoods.Bernoulli()
            num_latent = 1
        else:
            # Softmax better than Robustmax (apparently per the gpflow slack)
            #likelihood = gpflow.likelihoods.MultiClass(self.num_classes, invlink=invlink)  # Multiclass likelihood
            likelihood = gpflow.likelihoods.Softmax(self.num_classes)
            num_latent = self.num_classes
            # Y must be 1D for the multiclass model to actually work.
            Y = np.argmax(Y, 1).reshape((-1, 1)).astype(int)
    else:
        num_latent = 1
        likelihood = gpflow.likelihoods.Gaussian()
    self.model = SVGP(kernel=self.kernel,
                      likelihood=likelihood,
                      inducing_variable=self.Z,
                      num_data=N,
                      num_latent_gps=num_latent,
                      whiten=False,
                      q_diag=q_diag)
    # Setup training
    if not self.train_hyperparams:
        set_trainable(self.model.inducing_variable.Z, False)
        set_trainable(self.kernel.lengthscales, False)
        set_trainable(self.kernel.variance, False)
    # When NatGrad is used it owns q_mu/q_sqrt, so freeze them for Adam.
    if self.natgrad_lr > 0:
        set_trainable(self.model.q_mu, False)
        set_trainable(self.model.q_sqrt, False)
        variational_params = [(self.model.q_mu, self.model.q_sqrt)]
    # Create the optimizers
    adam_opt = tf.optimizers.Adam(self.lr)
    if self.natgrad_lr > 0:
        natgrad_opt = NaturalGradient(gamma=self.natgrad_lr)
    # Print
    gpflow.utilities.print_summary(self.model)
    print("", flush=True)
    # Giacomo: If shuffle buffer is too large it will run OOM
    # Map binary labels from {-1, +1} to {0, 1} for the Bernoulli likelihood.
    if self.num_classes == 2:
        Y = (Y + 1) / 2
        Yval = (Yval + 1) / 2
    generator = partial(data_generator, X, Y)
    #train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)) \
    # NOTE(review): the generator already yields size-batch_size chunks, so
    # the pipeline batches with batch(1) — confirm against data_generator.
    train_dataset = tf.data.Dataset.from_generator(generator,
                                                   args=(self.batch_size, ),
                                                   output_types=(tf.float32, tf.float32)) \
        .prefetch(self.batch_size * 10) \
        .repeat() \
        .shuffle(min(N // self.batch_size, 1_000_000 // self.batch_size)) \
        .batch(1)
    train_iter = iter(train_dataset)
    loss = self.model.training_loss_closure(train_iter)
    t_elapsed = 0
    for step in range(self.num_iter):
        t_s = time.time()
        # NatGrad step on the variational distribution (if enabled), then
        # Adam on everything still trainable.
        if self.natgrad_lr > 0:
            natgrad_opt.minimize(loss, var_list=variational_params)
        adam_opt.minimize(loss, var_list=self.model.trainable_variables)
        t_elapsed += time.time() - t_s
        if step % 700 == 0:
            print("Step %d -- Elapsed %.2fs" % (step, t_elapsed), flush=True)
        # Periodic validation scoring.
        if (step + 1) % self.error_every == 0:
            preds = self.predict(Xval)
            val_err, err_name = self.err_fn(Yval, preds)
            print(
                f"Step {step + 1} - {t_elapsed:7.2f}s Elapsed - "
                f"Validation {err_name} {val_err:7.5f}", flush=True)
    # Final validation score after the last step.
    preds = self.predict(Xval)
    val_err, err_name = self.err_fn(Yval, preds)
    print(
        f"Finished optimization - {t_elapsed:7.2f}s Elapsed - "
        f"Validation {err_name} {val_err:7.5f}", flush=True)
    print("Final model is ")
    gpflow.utilities.print_summary(self.model)
    print("", flush=True)
    return self