def initialise_using_kernel_fun(
    kernel_fun: Callable[[tf.Tensor, tf.Tensor], tf.Tensor],
    Z: tf.Tensor,
    initial_mu: Optional[tf.Tensor] = None,
) -> InducingPointGPSpecification:

    initial_L = get_initial_values_from_kernel(Z, kernel_fun)

    if initial_mu is None:
        initial_mu = tf.zeros(Z.shape[0])

    return InducingPointGPSpecification(
        mu=initial_mu, L_elts=initial_L, kernel_fun=kernel_fun, Z=Z)
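# Example usage (a sketch, not part of the original module): build a toy
# kernel with the expected (X1, X2) -> covariance-matrix signature and
# initialise a specification from 20 random inducing points. The toy_rbf
# helper is hypothetical and exists only for this illustration.
def _example_initialise_using_kernel_fun():
    def toy_rbf(x1, x2):
        # Squared-exponential kernel with unit lengthscale and unit variance
        sq_dists = tf.reduce_sum(
            (x1[:, None, :] - x2[None, :, :]) ** 2, axis=-1)
        return tf.exp(-0.5 * sq_dists)

    Z = tf.constant(np.random.randn(20, 2), dtype=tf.float32)
    return initialise_using_kernel_fun(toy_rbf, Z)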
def initialise_covariance_entries(kernel_creation_fun, flat_kernel_params,
                                  start_z):

    init_kernels = kernel_creation_fun(
        tf.constant(flat_kernel_params, dtype=DTYPE))

    start_cov_elts = list()

    for cur_kernel_fun in init_kernels:
        # Get the initial values
        cur_vals = get_initial_values_from_kernel(
            tf.constant(start_z, dtype=DTYPE), cur_kernel_fun, lo_tri=True)
        start_cov_elts.append(cur_vals)

    start_cov_elts = tf.stack(start_cov_elts, axis=0).numpy()

    return start_cov_elts
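# Example usage (a sketch): kernel_creation_fun is assumed to map a flat
# parameter tensor to an iterable of kernel callables. The make_kernels
# factory below is hypothetical, reusing matern_kernel_32 from this module
# purely for illustration.
def _example_initialise_covariance_entries():
    def make_kernels(flat_params):
        # Two kernels, each taking three lengthscales from the flat vector
        lscales = tf.reshape(flat_params, (2, 3))
        return [partial(matern_kernel_32, alpha=1.0, lengthscales=ls,
                        jitter=JITTER) for ls in lscales]

    start_z = np.random.randn(10, 3)
    flat_params = np.ones(6)
    return initialise_covariance_entries(make_kernels, flat_params, start_z)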
def get_mogp_initial_values(n_cov, n_latent, n_inducing, n_species, Z):
    # Z: per-latent inducing inputs, shape (n_latent, n_inducing, n_cov)

    ms = tf.zeros((n_latent, n_inducing))

    start_lscales = tf.random.uniform(
        (n_latent, n_cov), minval=np.sqrt(2), maxval=np.sqrt(4))

    # start_alphas = tf.random.uniform((n_latent,), minval=0.1, maxval=1.)
    start_alphas = tf.ones((n_latent,)) * tf.sqrt(0.1)

    start_kerns = [
        partial(matern_kernel_32, alpha=alpha, lengthscales=lscale,
                jitter=JITTER)
        for alpha, lscale in zip(start_alphas, start_lscales**2)
    ]

    w_means = tf.random.normal((n_latent, n_species), stddev=0.01)
    w_vars = tf.ones((n_latent, n_species))

    init_ls = [
        tf.constant(get_initial_values_from_kernel(cur_z, cur_kern))
        for cur_z, cur_kern in zip(Z, start_kerns)
    ]

    return (ms, start_lscales, start_alphas, start_kerns, w_means, w_vars,
            init_ls)
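# Example usage (a sketch, not part of the original module): draw starting
# values for a small model. Z holds one set of inducing inputs per latent
# process, and n_species sets the width of the loading matrix W.
def _example_get_mogp_initial_values():
    Z = tf.random.normal((3, 20, 5))  # (n_latent, n_inducing, n_cov)
    (ms, start_lscales, start_alphas, start_kerns, w_means, w_vars,
     init_ls) = get_mogp_initial_values(
         n_cov=5, n_latent=3, n_inducing=20, n_species=4, Z=Z)
    return ms, init_ls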
def fit(X: np.ndarray,
        y: np.ndarray,
        n_inducing: int = 100,
        n_latent: int = 10,
        kernel: str = 'matern_3/2',
        random_seed: int = 2):

    # TODO: This is copied from the mogp_classifier.
    # Maybe instead make it a function of some sort?
    np.random.seed(random_seed)

    # Note that input _must_ be scaled. Some way to enforce that?
    kernel_fun = kern_lookup[kernel]

    n_cov = X.shape[1]
    n_out = y.shape[1]

    # Set initial values. Lengthscales are stored as square roots and squared
    # inside the objective, so the starting kernels use the unsquared values.
    start_lengthscales = np.random.uniform(2., 4., size=(n_latent, n_cov))

    Z = find_starting_z(X, n_inducing)
    Z = np.tile(Z, (n_latent, 1, 1))

    start_kernel_funs = get_kernel_funs(kernel_fun, start_lengthscales)

    init_Ls = np.stack([
        get_initial_values_from_kernel(cur_z, cur_kernel_fun)
        for cur_z, cur_kernel_fun in zip(Z, start_kernel_funs)
    ])

    init_ms = np.zeros((n_latent, n_inducing))

    start_prior_cov = np.eye(n_latent)
    start_prior_mean = np.zeros(n_latent)

    start_prior_cov_elts = corr_mogp.get_initial_w_elements(
        start_prior_mean, start_prior_cov, n_out)
    start_w_cov_elts = rep_vector(start_prior_cov_elts, n_out)

    init_w_means = np.random.randn(n_out, n_latent)

    start_theta = {
        'mu': init_ms,
        'L_elts': init_Ls,
        'w_means': init_w_means,
        'w_cov_elts': start_w_cov_elts,
        'lengthscales': np.sqrt(start_lengthscales),
        'w_prior_cov_elts': start_prior_cov_elts,
        'w_prior_mean': start_prior_mean,
        'Z': Z
    }

    flat_start_theta, summary = flatten_and_summarise_tf(**start_theta)

    X_tf = tf.constant(X.astype(np.float32))
    y_tf = tf.constant(y.astype(np.float32))

    def extract_cov_matrices(theta):

        w_covs = create_pos_def_mat_from_elts_batch(
            theta['w_cov_elts'], n_latent, n_out, jitter=JITTER)
        Ls = mogp.create_ls(theta['L_elts'], n_inducing, n_latent)
        w_prior_cov = create_pos_def_mat_from_elts(
            theta['w_prior_cov_elts'], n_latent, jitter=JITTER)

        return w_covs, Ls, w_prior_cov

    def calculate_objective(theta):

        w_covs, Ls, w_prior_cov = extract_cov_matrices(theta)

        # Diagnostics: current prior correlations and effective lengthscales
        print(np.round(covar_to_corr(w_prior_cov.numpy()), 2))
        print(np.round(theta['lengthscales'].numpy()**2, 2))

        kernel_funs = get_kernel_funs(kernel_fun, theta['lengthscales']**2)

        cur_objective = corr_mogp.compute_default_objective(
            X_tf, y_tf, theta['Z'], theta['mu'], Ls, theta['w_means'],
            w_covs, kernel_funs, bernoulli_probit_lik,
            theta['w_prior_mean'], w_prior_cov)

        # Add prior on the squared (i.e. effective) lengthscales
        lscale_prior = tfp.distributions.Gamma(3, 1 / 3).log_prob(
            theta['lengthscales']**2)

        return cur_objective + tf.reduce_sum(lscale_prior)

    def to_minimize(flat_theta):

        flat_theta = tf.constant(flat_theta)
        flat_theta = tf.cast(flat_theta, tf.float32)

        with tf.GradientTape() as tape:

            tape.watch(flat_theta)

            theta = reconstruct_tf(flat_theta, summary)
            objective = -calculate_objective(theta)

        grad = tape.gradient(objective, flat_theta)

        print(objective, np.linalg.norm(grad.numpy()))

        return (objective.numpy().astype(np.float64),
                grad.numpy().astype(np.float64))

    result = minimize(to_minimize, flat_start_theta, jac=True,
                      method='L-BFGS-B')

    final_theta = reconstruct_tf(result.x.astype(np.float32), summary)

    w_covs, Ls, w_prior_cov = extract_cov_matrices(final_theta)

    return CorrelatedMOGPResult(
        Ls=Ls,
        mu=final_theta['mu'].numpy(),
        kernel=kernel,
        lengthscales=final_theta['lengthscales'].numpy()**2,
        w_means=final_theta['w_means'].numpy(),
        w_cov=w_covs.numpy(),
        Z=final_theta['Z'].numpy(),
        w_prior_means=final_theta['w_prior_mean'].numpy(),
        w_prior_cov=w_prior_cov.numpy())
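# Example usage (a sketch): fit the correlated MOGP on standardised inputs
# and binary outputs. The data here are random placeholders; X is
# standardised first because, as noted above, inputs must be scaled.
def _example_fit_correlated():
    rng = np.random.RandomState(0)
    X = rng.randn(200, 4)
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    y = (rng.rand(200, 6) > 0.5).astype(np.float64)  # assumed 0/1 labels
    return fit(X, y, n_inducing=20, n_latent=4)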
def fit(
    X: np.ndarray,
    y: np.ndarray,
    n_inducing: int = 100,
    n_latent: int = 10,
    kernel: str = "matern_3/2",
    # Gamma priors (note tfp uses the (concentration, rate) parameterisation):
    kernel_lengthscale_prior: Tuple[float, float] = (3, 1 / 3),
    bias_variance_prior: Tuple[float, float] = (3 / 2, 3 / 2),
    w_variance_prior: Tuple[float, float] = (3 / 2, 3 / 2),
    # Normal priors
    w_mean_prior: Tuple[float, float] = (0, 1),
    bias_mean_prior: Tuple[float, float] = (0, 1),
    random_seed: int = 2,
    test_run: bool = False,
    total_kernel_variance: float = 6.0,
    verbose: bool = False,
) -> MOGPResult:

    np.random.seed(random_seed)

    # Note that input _must_ be scaled. Some way to enforce that?
    kernel_fun = kern_lookup[kernel]

    n_cov = X.shape[1]
    n_out = y.shape[1]

    # Set initial values
    start_lengthscales = np.random.uniform(
        2.0, 4.0, size=(n_latent, n_cov)).astype(np.float32)

    Z = find_starting_z(X, n_inducing)
    Z = np.tile(Z, (n_latent, 1, 1))
    Z = Z.astype(np.float32)

    start_kernel_funs = get_kernel_funs(
        kernel_fun,
        tf.constant(start_lengthscales),
        total_variance=tf.constant(total_kernel_variance),
    )

    init_Ls = np.stack([
        get_initial_values_from_kernel(tf.constant(cur_z), cur_kernel_fun)
        for cur_z, cur_kernel_fun in zip(Z, start_kernel_funs)
    ])

    init_ms = np.zeros((n_latent, n_inducing))

    w_prior_var_init = np.ones((n_latent, 1))
    w_prior_mean_init = np.zeros((n_latent, 1))

    start_intercept_means = np.zeros(n_out)
    start_intercept_var = np.ones(n_out)

    intercept_prior_var_init = np.array(0.4)

    init_theta = {
        "L_elts": init_Ls,
        "mu": init_ms,
        "w_prior_var": w_prior_var_init,
        "w_prior_mean": w_prior_mean_init,
        "intercept_means": start_intercept_means,
        "intercept_vars": start_intercept_var,
        "intercept_prior_var": intercept_prior_var_init,
        "intercept_prior_mean": np.array(0.0),
        "w_means": np.random.randn(n_latent, n_out) * 0.01,
        "w_vars": np.ones((n_latent, n_out)),
        "lscales": np.sqrt(start_lengthscales),
        "Z": Z,
    }

    # Make same type
    init_theta = {
        k: tf.constant(v.astype(np.float32)) for k, v in init_theta.items()
    }

    flat_theta, summary = flatten_and_summarise_tf(**init_theta)

    X = tf.constant(X.astype(np.float32))
    y = tf.constant(y.astype(np.float32))

    lscale_prior = tfp.distributions.Gamma(*kernel_lengthscale_prior)
    bias_var_prior = tfp.distributions.Gamma(*bias_variance_prior)
    w_var_prior = tfp.distributions.Gamma(*w_variance_prior)
    w_m_prior = tfp.distributions.Normal(*w_mean_prior)
    bias_m_prior = tfp.distributions.Normal(*bias_mean_prior)

    # TODO: Think about priors for W?
    def to_minimize_with_grad(x):

        with tf.GradientTape() as tape:

            x_tf = tf.constant(x)
            x_tf = tf.cast(x_tf, tf.float32)

            tape.watch(x_tf)

            theta = reconstruct_tf(x_tf, summary)

            # Square the important parameters
            (lscales, w_prior_var, intercept_vars, intercept_prior_var,
             w_vars) = (
                theta["lscales"] ** 2,
                theta["w_prior_var"] ** 2,
                theta["intercept_vars"] ** 2,
                theta["intercept_prior_var"] ** 2,
                theta["w_vars"] ** 2,
            )

            if verbose:
                print(lscales)
                print(intercept_prior_var)
                print(w_prior_var)
                print(theta["w_prior_mean"])
                print(theta["intercept_prior_mean"])

            Ls = create_ls(theta["L_elts"], n_inducing, n_latent)

            kern_funs = get_kernel_funs(
                kernel_fun,
                lscales,
                total_variance=tf.constant(total_kernel_variance,
                                           dtype=tf.float32),
            )

            kl = compute_kl_term(
                theta["mu"],
                Ls,
                kern_funs,
                theta["Z"],
                theta["w_means"],
                w_vars,
                theta["w_prior_mean"],
                w_prior_var,
                theta["intercept_means"],
                intercept_vars,
                theta["intercept_prior_mean"],
                intercept_prior_var,
            )

            lik = compute_likelihood_term(
                X,
                y,
                theta["Z"],
                theta["mu"],
                Ls,
                kern_funs,
                theta["w_means"],
                w_vars,
                theta["intercept_means"],
                intercept_vars,
            )

            objective = -(lik - kl)

            # Subtract the log prior contributions (we are minimising)
            objective = objective - (
                tf.reduce_sum(lscale_prior.log_prob(lscales))
                + bias_var_prior.log_prob(intercept_prior_var)
                + tf.reduce_sum(w_var_prior.log_prob(w_prior_var))
                + bias_m_prior.log_prob(theta["intercept_prior_mean"])
                + tf.reduce_sum(w_m_prior.log_prob(theta["w_prior_mean"]))
            )

        grad = tape.gradient(objective, x_tf)

        if verbose:
            print(objective, np.linalg.norm(grad.numpy()))

        return (objective.numpy().astype(np.float64),
                grad.numpy().astype(np.float64))

    if test_run:
        additional_args = {"tol": 1}
    else:
        additional_args = {}

    result = minimize(
        to_minimize_with_grad,
        flat_theta,
        jac=True,
        method="L-BFGS-B",
        **additional_args
    )

    final_theta = reconstruct_tf(result.x, summary)
    final_theta = {k: tf.cast(v, tf.float32) for k, v in final_theta.items()}

    # Build the results
    fit_result = MOGPResult(
        L_elts=final_theta["L_elts"],
        mu=final_theta["mu"],
        kernel=kernel,
        lengthscales=final_theta["lscales"] ** 2,
        intercept_means=final_theta["intercept_means"],
        intercept_vars=final_theta["intercept_vars"] ** 2,
        w_means=final_theta["w_means"],
        w_vars=final_theta["w_vars"] ** 2,
        Z=final_theta["Z"],
        w_prior_means=final_theta["w_prior_mean"],
        w_prior_vars=final_theta["w_prior_var"] ** 2,
        intercept_prior_mean=final_theta["intercept_prior_mean"],
        intercept_prior_var=final_theta["intercept_prior_var"] ** 2,
        total_kernel_variance=tf.constant(total_kernel_variance, tf.float32),
    )

    return fit_result
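# Example usage (a sketch): a quick smoke test of the fit above on random
# binary data. test_run=True passes tol=1 to L-BFGS-B so the optimiser stops
# early, which is handy for checking shapes before a real run.
def _example_fit_mogp():
    rng = np.random.RandomState(0)
    X = rng.randn(100, 3)
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    y = (rng.rand(100, 5) > 0.5).astype(np.float64)  # assumed 0/1 labels
    return fit(X, y, n_inducing=10, n_latent=4, test_run=True)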
def fit(
    X: np.ndarray,
    y: np.ndarray,
    n_inducing: int = 100,
    kernel: str = "matern_3/2",
    # Gamma priors (note tfp uses the (concentration, rate) parameterisation):
    kernel_variance_prior: Tuple[float, float] = (3 / 2, 3 / 2),
    kernel_lengthscale_prior: Tuple[float, float] = (3, 1 / 3),
    bias_variance_prior: Tuple[float, float] = (3 / 2, 3 / 2),
    random_seed: int = 2,
    verbose: bool = False,
) -> SOGPResult:

    np.random.seed(random_seed)

    assert kernel in [
        "matern_3/2",
        "matern_1/2",
        "rbf",
    ], "Only these three kernels are currently supported!"

    # Note that input _must_ be scaled. Some way to enforce that?
    kernel_fun = kern_lookup[kernel]

    n_cov = X.shape[1]

    # Set initial values
    start_alpha = np.array(1.0, dtype=np.float32)
    start_lengthscales = np.random.uniform(
        2.0, 4.0, size=n_cov).astype(np.float32)
    start_bias_sd = np.array(1.0, dtype=np.float32)

    Z = find_starting_z(X, n_inducing).astype(np.float32)

    start_kernel_fun = get_kernel_fun(kernel_fun, start_alpha,
                                      start_lengthscales, start_bias_sd)

    init_L = get_initial_values_from_kernel(Z, start_kernel_fun)
    init_mu = np.zeros(n_inducing, dtype=np.float32)

    init_theta = {
        "L_elts": init_L,
        "mu": init_mu,
        "alpha": start_alpha,
        "lscales": np.sqrt(start_lengthscales),
        "Z": Z,
        "bias_sd": start_bias_sd,
    }

    flat_theta, summary = flatten_and_summarise_tf(**init_theta)

    X = tf.constant(X.astype(np.float32))
    y = tf.constant(y.astype(np.float32))

    lscale_prior = tfp.distributions.Gamma(*kernel_lengthscale_prior)
    kernel_var_prior = tfp.distributions.Gamma(*kernel_variance_prior)
    bias_var_prior = tfp.distributions.Gamma(*bias_variance_prior)

    def to_minimize_with_grad(x):

        with tf.GradientTape() as tape:

            x_tf = tf.constant(x)
            x_tf = tf.cast(x_tf, tf.float32)

            tape.watch(x_tf)

            theta = reconstruct_tf(x_tf, summary)

            # Square the parameters constrained to be positive
            alpha, lscales, bias_sd = (
                theta["alpha"]**2,
                theta["lscales"]**2,
                theta["bias_sd"]**2,
            )

            L_cov = lo_tri_from_elements(theta["L_elts"], n_inducing)

            kern_fun = get_kernel_fun(kernel_fun, alpha, lscales, bias_sd)

            objective = -compute_objective(X, y, theta["mu"], L_cov,
                                           theta["Z"], bernoulli_probit_lik,
                                           kern_fun)

            # Subtract log priors; the variance priors act on alpha^2 and
            # bias_sd^2, i.e. on the variances rather than the sds.
            objective = objective - (
                tf.reduce_sum(lscale_prior.log_prob(lscales))
                + kernel_var_prior.log_prob(alpha**2)
                + bias_var_prior.log_prob(bias_sd**2))

        grad = tape.gradient(objective, x_tf)

        if verbose:
            print(objective, np.linalg.norm(grad.numpy()))

        return (objective.numpy().astype(np.float64),
                grad.numpy().astype(np.float64))

    result = minimize(to_minimize_with_grad, flat_theta, jac=True,
                      method="L-BFGS-B")

    final_theta = reconstruct_tf(result.x, summary)
    final_theta = {
        k: v.numpy().astype(np.float32) for k, v in final_theta.items()
    }

    # Build the results
    fit_result = SOGPResult(
        L_elts=final_theta["L_elts"],
        mu=final_theta["mu"],
        kernel=kernel,
        lengthscales=final_theta["lscales"]**2,
        alpha=final_theta["alpha"]**2,
        bias_sd=final_theta["bias_sd"]**2,
        Z=final_theta["Z"],
    )

    return fit_result
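# Example usage (a sketch): single-output fit on random binary labels with
# the default Matern 3/2 kernel. y is one-dimensional here since the model
# is single-output; as above, X should be standardised first.
def _example_fit_sogp():
    rng = np.random.RandomState(0)
    X = rng.randn(150, 2)
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    y = (rng.rand(150) > 0.5).astype(np.float64)  # assumed 0/1 labels
    return fit(X, y, n_inducing=15, verbose=True)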