def test_optimize(self):
    with defer_build():
        input_layer = InputLayer(input_dim=1, output_dim=1, num_inducing=self.M,
                                 kernel=RBF(1) + White(1), multitask=True)
        output_layer = OutputLayer(input_dim=1, output_dim=1, num_inducing=self.M,
                                   kernel=RBF(1) + White(1), multitask=True)
        seq = MultitaskSequential([input_layer, output_layer])
        model = MultitaskDSDGP(X=self.X, Y=self.Y, Z=self.Z, layers=seq,
                               likelihood=SwitchedLikelihood([Gaussian(), Gaussian()]),
                               num_latent=1)
    model.compile()

    before = model.compute_log_likelihood()
    opt = gpflow.train.AdamOptimizer(0.01)
    opt.minimize(model, maxiter=100)
    after = model.compute_log_likelihood()
    self.assertGreaterEqual(after, before)

def test_latent_kernels():
    kernel_list = [SquaredExponential(), White(), White() + Linear()]
    multioutput_kernel_list = [
        SharedIndependent(SquaredExponential(), 3),
        SeparateIndependent(kernel_list),
        LinearCoregionalization(kernel_list, np.random.random((5, 3))),
    ]
    assert len(multioutput_kernel_list[0].latent_kernels) == 1
    assert multioutput_kernel_list[1].latent_kernels == tuple(kernel_list)
    assert multioutput_kernel_list[2].latent_kernels == tuple(kernel_list)

def make_DGP(L, D_problem, D_hidden, X, Y, Z):
    kernels = []
    # first layer
    kernels.append(RBF(D_problem, lengthscales=0.2, variance=1.)
                   + White(D_problem, variance=1e-5))
    # hidden layers
    for l in range(L - 1):
        k = RBF(D_hidden, lengthscales=0.2, variance=1.) + White(D_hidden, variance=1e-5)
        kernels.append(k)

    m_dgp = DGP(X, Y, Z, kernels, Gaussian(), num_samples=10)

    # initialise the inner layers to be near-deterministic
    for layer in m_dgp.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    return m_dgp

def make_dgp(X, Y, Z, L):
    D = X.shape[1]
    Y_mean, Y_std = np.average(Y), np.std(Y)

    # the layer shapes are defined by the kernel dims, so here all hidden layers are D dimensional
    kernels = []
    for l in range(L):
        kernels.append(RBF(D, lengthscales=1., variance=1.))

    # between-layer noise (doesn't actually make much difference, but we include it anyway);
    # reassign into the list so the Sum kernel actually replaces the plain RBF
    for i, kernel in enumerate(kernels[:-1]):
        kernels[i] = kernel + White(D, variance=1e-5)

    mb = 10000 if X.shape[0] > 10000 else None
    model = DGP(X, Y, Z, kernels, Gaussian(), num_samples=1, minibatch_size=mb)

    # same final-layer inits we used for the single-layer model
    model.layers[-1].kern.variance = Y_std ** 2
    model.likelihood.variance = Y_std * 0.1
    model.layers[-1].mean_function = Constant(Y_mean)
    model.layers[-1].mean_function.fixed = True

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    return model

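Below is a minimal usage sketch for make_dgp, assuming a GPflow 1.x environment and the Doubly-Stochastic DGP package that provides the DGP class used above; the synthetic data, number of inducing points, and optimizer settings are illustrative placeholders only.

# Hypothetical usage sketch (not from the original source): build and train a
# two-layer DGP with make_dgp, using the same GPflow 1.x training API that the
# other snippets in this collection rely on.
import numpy as np
import gpflow
from scipy.cluster.vq import kmeans2

rng = np.random.RandomState(0)
X = rng.randn(500, 4)
Y = np.sin(X[:, :1]) + 0.1 * rng.randn(500, 1)
Z = kmeans2(X, 50, minit='points')[0]      # inducing inputs via k-means, as in the scripts below

model = make_dgp(X, Y, Z, L=2)             # two-layer DGP from the helper above
gpflow.train.AdamOptimizer(0.01).minimize(model, maxiter=5000)
mean, var = model.predict_y(X[:10], 100)   # 100 samples -> arrays of shape [S, N, D]
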
def residual_kernel(K_Y: np.ndarray, K_X: np.ndarray,
                    use_expectation=True, with_gp=True,
                    sigma_squared=1e-3, return_learned_K_X=False):
    """Kernel matrix of the residual of Y given X, based on their kernel matrices, Y = f(X)."""
    import gpflow
    from gpflow.kernels import White, Linear
    from gpflow.models import GPR

    K_Y, K_X = centering(K_Y), centering(K_X)
    T = len(K_Y)

    if with_gp:
        eig_Ky, eiy = truncated_eigen(*eigdec(K_Y, min(100, T // 4)))
        eig_Kx, eix = truncated_eigen(*eigdec(K_X, min(100, T // 4)))

        X = eix @ diag(sqrt(eig_Kx))  # X @ X.T is close to K_X
        Y = eiy @ diag(sqrt(eig_Ky))
        n_feats = X.shape[1]

        linear = Linear(n_feats, ARD=True)
        white = White(n_feats)
        gp_model = GPR(X, Y, linear + white)
        gpflow.train.ScipyOptimizer().minimize(gp_model)

        K_X = linear.compute_K_symm(X)
        sigma_squared = white.variance.value

    P = pdinv(np.eye(T) + K_X / sigma_squared)  # == I - K @ inv(K + Sigma) in Zhang et al. 2011
    if use_expectation:
        # Flaxman et al. 2016. Gaussian Processes for Independence Tests with Non-iid Data in Causal Inference.
        RK = (K_X + P @ K_Y) @ P
    else:
        # Zhang et al. 2011. Kernel-based Conditional Independence Test and Application in Causal Discovery.
        RK = P @ K_Y @ P

    if return_learned_K_X:
        return RK, K_X
    else:
        return RK

def regression_distance_k(Kx: np.ndarray, Ky: np.ndarray):
    warnings.warn('not tested yet!')
    import gpflow
    from gpflow.kernels import White, Linear
    from gpflow.models import GPR

    T = len(Kx)
    eig_Ky, eiy = truncated_eigen(*eigdec(Ky, min(100, T // 4)))
    eig_Kx, eix = truncated_eigen(*eigdec(Kx, min(100, T // 4)))

    X = eix @ diag(sqrt(eig_Kx))  # X @ X.T is close to K_X
    Y = eiy @ diag(sqrt(eig_Ky))
    n_feats = X.shape[1]

    linear = Linear(n_feats, ARD=True)
    white = White(n_feats)
    gp_model = GPR(X, Y, linear + white)
    gpflow.train.ScipyOptimizer().minimize(gp_model)

    Kx = linear.compute_K_symm(X)
    sigma_squared = white.variance.value

    P = Kx @ pdinv(Kx + sigma_squared * np.eye(T))
    M = P @ Ky @ P
    O = np.ones((T, 1))
    N = O @ (np.diag(M)[:, None]).T  # broadcast diag(M) into a row, as in regression_distance below
    D = np.sqrt(N + N.T - 2 * M)
    return D

def temporal_kernel(self):
    kernel = White(variance=self.model_config['noise_inner'])
    m_inds = list(range(self.m))

    # Initialise a non-linear kernel over the inputs
    if self.model_config['input_nonlinear']:
        scales = ([self.model_config['scale']] * self.m
                  if self.model_config['scale_tie']
                  else self.model_config['scale'])
        if self.model_config['rq']:
            kernel += RationalQuadratic(active_dims=m_inds, variance=1.0,
                                        lengthscales=scales, alpha=1e-2)
        else:
            kernel += SquaredExponential(active_dims=m_inds, variance=1.0,
                                         lengthscales=scales)

    # Add a periodic kernel over the inputs
    if self.model_config['per']:
        scales = [self.model_config['per_scale']] * self.m
        periods = [self.model_config['per_period']] * self.m
        base_kernel = SquaredExponential(active_dims=m_inds, variance=1.0,
                                         lengthscales=scales)
        kernel += Periodic(base_kernel, period=periods)

    # Add a linear kernel over the inputs
    if self.model_config['input_linear']:
        variances = [self.model_config['input_linear_scale']] * self.m
        kernel += LinearKernel(active_dims=m_inds, variance=variances)

    return kernel

def prepare(self):
    N = 100
    M = 10
    rng = np.random.RandomState(42)
    X = rng.randn(N, 2)
    Y = rng.randn(N, 1)
    Z = rng.randn(M, 2)
    X_ind = rng.randint(0, 2, (N, 1))
    Z_ind = rng.randint(0, 2, (M, 1))
    X = np.hstack([X, X_ind])
    Y = np.hstack([Y, X_ind])
    Z = np.hstack([Z, Z_ind])

    Xs = rng.randn(M, 2)
    Xs_ind = rng.randint(0, 2, (M, 1))
    Xs = np.hstack([Xs, Xs_ind])

    with defer_build():
        lik = SwitchedLikelihood([Gaussian(), Gaussian()])
        input_layer = InputLayer(input_dim=2, output_dim=1, num_inducing=M,
                                 kernel=RBF(2) + White(2),
                                 mean_function=Linear(A=np.ones((3, 1))),
                                 multitask=True)
        output_layer = OutputLayer(input_dim=1, output_dim=1, num_inducing=M,
                                   kernel=RBF(1) + White(1),
                                   multitask=True)
        seq = MultitaskSequential([input_layer, output_layer])
        model = MultitaskDSDGP(X=X, Y=Y, Z=Z, layers=seq,
                               likelihood=lik, num_latent=1)
    model.compile()
    return model, Xs

def __init__(self, X, Y, inducing_points, final_inducing_points,
             hidden_units, units, share_inducing_inputs=True):
    Model.__init__(self)

    assert X.shape[0] == Y.shape[0]
    self.num_data, D_X = X.shape
    self.D_Y = 1
    self.num_samples = 100

    kernels = []
    for l in range(hidden_units + 1):
        ks = []
        D = units if l > 0 else D_X
        if l < hidden_units:
            for w in range(units):
                ks.append(RBF(D, lengthscales=1., variance=1.)
                          + White(D, variance=1e-5))
        else:
            ks.append(RBF(D, lengthscales=1., variance=1.))
        kernels.append(ks)

    self.dims_in = [D_X] + [units] * hidden_units
    self.dims_out = [units] * hidden_units + [1]
    q_mus, q_sqrts, Zs, mean_functions = init_layers(
        X, self.dims_in, self.dims_out, inducing_points,
        final_inducing_points, share_inducing_inputs)

    layers = []
    for q_mu, q_sqrt, Z, mean_function, kernel in zip(
            q_mus, q_sqrts, Zs, mean_functions, kernels):
        layers.append(Layer(kernel, q_mu, q_sqrt, Z, mean_function))
    self.layers = ParamList(layers)

    # fix the inner-layer mean functions
    for layer in self.layers[:-1]:
        layer.mean_function.fixed = True

    self.likelihood = Gaussian()

    minibatch_size = 10000 if X.shape[0] > 10000 else None
    if minibatch_size is not None:
        self.X = MinibatchData(X, minibatch_size)
        self.Y = MinibatchData(Y, minibatch_size)
    else:
        self.X = DataHolder(X)
        self.Y = DataHolder(Y)

def make_mf_dgp(cls, X, Y, Z, add_linear=True, minibatch_size=None):
    """
    Convenience constructor: builds an MF-DGP model from training data and inducing point locations.

    :param X: list of training inputs, one array per fidelity
    :param Y: list of training targets, one array per fidelity
    :param Z: list of inducing point locations, one array per fidelity
    :param add_linear: whether to add a linear kernel over the previous layer's output
    :return: an MF-DGP model
    """
    n_fidelities = len(X)

    Din = X[0].shape[1]
    Dout = Y[0].shape[1]

    kernels = [RBF(Din, active_dims=list(range(Din)), variance=1., lengthscales=1, ARD=True)]
    for l in range(1, n_fidelities):
        D = Din + Dout
        D_range = list(range(D))
        k_corr = RBF(Din, active_dims=D_range[:Din], lengthscales=1, variance=1.0, ARD=True)
        k_prev = RBF(Dout, active_dims=D_range[Din:], variance=1., lengthscales=1.0)
        k_in = RBF(Din, active_dims=D_range[:Din], variance=1., lengthscales=1, ARD=True)
        if add_linear:
            k_l = k_corr * (k_prev + Linear(Dout, active_dims=D_range[Din:], variance=1.)) + k_in
        else:
            k_l = k_corr * k_prev + k_in
        kernels.append(k_l)

    # A White noise kernel is currently expected by MF-DGP at all layers except the last.
    # In cases where no noise is desired, it should be set to 0 and fixed, as follows:
    #     white = White(1, variance=0.)
    #     white.variance.trainable = False
    #     kernels[i] += white
    for i, kernel in enumerate(kernels[:-1]):
        kernels[i] += White(1, variance=1e-6)

    num_data = 0
    for i in range(len(X)):
        _log.info('\nData at Fidelity {}'.format(i + 1))
        _log.info('X - {}'.format(X[i].shape))
        _log.info('Y - {}'.format(Y[i].shape))
        _log.info('Z - {}'.format(Z[i].shape))
        num_data += X[i].shape[0]

    layers = init_layers_mf(Y, Z, kernels, num_outputs=Dout)

    model = DGP_Base(X, Y, Gaussian(), layers, num_samples=10, minibatch_size=minibatch_size)

    return model

def test_constructor(self):
    input_layer = InputLayer(input_dim=1, output_dim=1, num_inducing=self.M,
                             kernel=RBF(1) + White(1))
    output_layer = OutputLayer(input_dim=1, output_dim=1, num_inducing=self.M,
                               kernel=RBF(1) + White(1))
    seq = Sequential([input_layer, output_layer])
    try:
        model = DSDGP(X=self.X, Y=self.Y, Z=self.Z,
                      layers=seq, likelihood=Gaussian())
    except Exception as e:
        print(e)
        self.fail('DSDGP constructor fails')

def make_deep_GP(num_layers, X, Y, Z):
    kernels = []
    layer_sizes = []
    for l in range(num_layers):
        kernel = RBF(lengthscale=0.2, variance=1.0) + White(variance=1e-5)
        kernels.append(kernel)
        layer_sizes.append(1)

    dgp = DeepGP(X, Y, Z, kernels, layer_sizes, Gaussian(), num_samples=100)

    # initialise the hidden layers to be near-deterministic
    for layer in dgp.layers[:-1]:
        layer.q_sqrt.assign(layer.q_sqrt * 1e-5)

    return dgp

def prepare(self):
    N = 100
    M = 10
    rng = np.random.RandomState(42)
    X = rng.randn(N, 2)
    Y = rng.randn(N, 1)
    Z = rng.randn(M, 2)
    Xs = rng.randn(M, 2)

    lik = Gaussian()
    input_layer = InputLayer(input_dim=2, output_dim=1, num_inducing=M,
                             kernel=RBF(2) + White(2),
                             mean_function=Linear(A=np.ones((2, 1))))
    output_layer = OutputLayer(input_dim=1, output_dim=1, num_inducing=M,
                               kernel=RBF(1) + White(1))
    seq = Sequential([input_layer, output_layer])
    model = DSDGP(X=X, Y=Y, Z=Z, layers=seq, likelihood=lik)
    model.compile()
    return model, Xs

def compute_residual_eig(Y: np.ndarray, Kx: np.ndarray) -> np.ndarray:
    """Residual of Y given X, based on Kx, a kernel matrix of X."""
    assert len(Y) == len(Kx)

    eig_Kx, eix = truncated_eigen(*eigdec(Kx, min(100, len(Kx) // 4)))
    phi_X = eix @ np.diag(np.sqrt(eig_Kx))  # phi_X @ phi_X.T is close to Kx
    n_feats = phi_X.shape[1]

    linear_kernel = Linear(n_feats, ARD=True)
    gp_model = GPR(phi_X, Y, linear_kernel + White(n_feats))
    gp_model.optimize()

    new_Kx = linear_kernel.compute_K_symm(phi_X)
    sigma_squared = gp_model.kern.white.variance.value[0]

    return (pdinv(np.eye(len(Kx)) + new_Kx / sigma_squared) @ Y).squeeze()

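A short, hypothetical call sketch for compute_residual_eig follows; it assumes the project-local helpers used above (eigdec, truncated_eigen, pdinv) and a compatible GPflow version are importable, and the RBF kernel matrix is only an illustrative choice.

# Hypothetical usage sketch (not from the original source): remove the part of Y
# that a GP on features of Kx can explain, leaving the residual of Y given X.
import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.RandomState(0)
X = rng.randn(200, 2)
Y = X @ rng.randn(2, 1) + 0.1 * rng.randn(200, 1)

# median-heuristic RBF kernel matrix of X (illustrative choice of Kx)
sq_dists = squareform(pdist(X, 'sqeuclidean'))
Kx = np.exp(-sq_dists / np.median(sq_dists[sq_dists > 0]))

residual = compute_residual_eig(Y, Kx)   # residual of Y after regressing on X
print(residual.shape)                    # (200,)
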
def residual_kernel_matrix_kernel_real(Kx, Z, num_eig, ARD=True):
    """K_X|Z"""
    assert len(Kx) == len(Z)
    assert num_eig <= len(Kx)

    T = len(Kx)
    D = Z.shape[1]
    I = eye(T)

    eig_Kx, eix = truncated_eigen(*eigdec(Kx, num_eig))

    rbf = RBF(D, ARD=ARD)
    white = White(D)
    gp_model = GPR(Z, 2 * sqrt(T) * eix @ diag(sqrt(eig_Kx)) / sqrt(eig_Kx[0]), rbf + white)
    gpflow.train.ScipyOptimizer().minimize(gp_model)

    sigma_squared = white.variance.value
    Kz_x = rbf.compute_K_symm(Z)

    P = I - Kz_x @ pdinv(Kz_x + sigma_squared * I)
    return P @ Kx @ P.T

def regression_distance(Y: np.ndarray, Z: np.ndarray, ard=True):
    """d(z, z') = |f(z) - f(z')| where Y = f(Z) + noise and f ~ GP"""
    import gpflow
    from gpflow.kernels import White, RBF
    from gpflow.models import GPR

    n, dims = Z.shape
    rbf = RBF(dims, ARD=ard)
    rbf_white = rbf + White(dims)
    gp_model = GPR(Z, Y, rbf_white)
    gpflow.train.ScipyOptimizer().minimize(gp_model)

    Kz_y = rbf.compute_K_symm(Z)
    Ry = pdinv(rbf_white.compute_K_symm(Z))
    Fy = Y.T @ Ry @ Kz_y  # F(z)

    M = Fy.T @ Fy
    O = np.ones((n, 1))
    N = O @ (np.diag(M)[:, None]).T
    D = np.sqrt(N + N.T - 2 * M)
    return D, Kz_y

def _kernels_generator(self):
    def _determine_indicies(m, pi, markov):
        # Build in the Markov structure: juggle with the indices of the outputs.
        p_last = pi - 1  # Index of the last output that is given as input.
        p_start = 0 if markov is None else max(p_last - (markov - 1), 0)
        p_num = p_last - p_start + 1

        # Determine the indices corresponding to the outputs and inputs.
        m_inds = list(range(m))
        p_inds = list(range(m + p_start, m + p_last + 1))

        return m_inds, p_inds, p_num

    kernels = []
    for pi in range(self.num_outputs):
        m_inds, p_inds, p_num = _determine_indicies(self.m, pi, self.model_config['markov'])

        # Construct the inner-layers noise kernel
        kernel = White(variance=self.model_config['noise_inner'])

        # Initialise a non-linear kernel over the inputs
        scales = ([self.model_config['scale']] * self.m
                  if self.model_config['scale_tie']
                  else self.model_config['scale'])
        if self.model_config['rq']:
            kernel += RationalQuadratic(active_dims=m_inds, variance=1.0,
                                        lengthscales=scales, alpha=1e-2)
        else:
            kernel += SquaredExponential(active_dims=m_inds, variance=1.0,
                                         lengthscales=scales)

        # Add a periodic kernel over the inputs
        if self.model_config['per']:
            scales = [self.model_config['per_scale']] * self.m
            periods = [self.model_config['per_period']] * self.m
            base_kernel = SquaredExponential(active_dims=m_inds, variance=1.0,
                                             lengthscales=scales)
            kernel += Periodic(base_kernel, period=periods)

        # Add a linear kernel over the inputs
        if self.model_config['input_linear']:
            variances = [self.model_config['input_linear_scale']] * self.m
            kernel += LinearKernel(active_dims=m_inds, variance=variances)

        # Add a linear kernel over the outputs
        if self.model_config['linear'] and pi > 0:
            variances = [self.model_config['linear_scale']] * p_num
            kernel += LinearKernel(active_dims=p_inds, variance=variances)

        # Add a non-linear kernel over the outputs
        if self.model_config['nonlinear'] and pi > 0:
            if self.model_config['nonlinear_dependent']:
                active_dims = m_inds + p_inds
                scales = [self.model_config['scale']] * self.m
                scales.extend([self.model_config['nonlinear_scale']] * p_num)
            else:
                active_dims = p_inds
                scales = [self.model_config['nonlinear_scale']] * p_num
            if self.model_config['rq']:
                kernel += RationalQuadratic(active_dims=active_dims, variance=1.0,
                                            lengthscales=scales, alpha=1e-2)
            else:
                kernel += SquaredExponential(active_dims=active_dims, variance=1.0,
                                             lengthscales=scales)

        kernels.append(kernel)
    return kernels

def prepare_model(name, X, y, Z, num_samples_train=5, minibatch=None, M=30, small_architecture=True):
    """
    Initialise three-layer deep GPs with different architectures, variational families, and inference methods.

    name can be one of {'fc', 'star', 'mf', 'fc_sampled'} and gives the fully-coupled,
    stripes-and-arrow, or mean-field DGP with analytical marginalisation of the inducing
    outputs, or the fully-coupled DGP with marginalisation by Monte Carlo sampling, respectively.

    The variational parameters are initialised as described in e.g.
    https://github.com/ICL-SML/Doubly-Stochastic-DGP/blob/master/demos/demo_regression_UCI.ipynb
    which makes training more effective in the beginning.
    """
    # Prepare the kernels (3 layers):
    # use RBF kernels in all layers and, additionally, white noise kernels in all but the last layer;
    # disable training of the RBF variance in the intermediate layers.
    # If small_architecture=True there are 2 GPs in each hidden layer, otherwise 5.
    dim_X = X.shape[1]
    k = RBF(dim_X, ARD=True, lengthscales=1)
    k.variance.set_trainable(False)
    k += White(dim_X, variance=1e-3)
    Ks = [k]
    if small_architecture:
        k = RBF(2, ARD=True, lengthscales=1)
        k.variance.set_trainable(False)
        k += White(2, variance=1e-3)
        Ks += [k, RBF(2, ARD=True, lengthscales=1)]
    else:
        k = RBF(5, ARD=True, lengthscales=1)
        k.variance.set_trainable(False)
        k += White(5, variance=1e-3)
        Ks += [k, RBF(5, ARD=True, lengthscales=1)]

    assert name in ['fc', 'star', 'mf', 'fc_sampled'], 'Unknown name of dgp model used'
    if name == 'fc':
        # fully-coupled
        model = Full_DGP(X, y, Z.copy(), Ks.copy(), Gaussian(0.01),
                         minibatch_size=minibatch, num_samples=num_samples_train)
    elif name == 'star':
        # stripes-and-arrow
        model = Fast_Approx_Full_DGP(X, y, Z.copy(), Ks.copy(), Gaussian(0.01),
                                     stripes=True, arrow=True,
                                     minibatch_size=minibatch, num_samples=num_samples_train)
    elif name == 'mf':
        # mean-field
        model = Mean_Field_DGP(X, y, Z.copy(), Ks.copy(), Gaussian(0.01),
                               minibatch_size=minibatch, num_samples=num_samples_train)
    elif name == 'fc_sampled':
        # fully-coupled with marginalisation by Monte Carlo sampling
        model = Full_DGP_Sampled(X, y, Z.copy(), Ks.copy(), Gaussian(0.01),
                                 minibatch_size=minibatch, num_samples=num_samples_train)

    if name in ['fc', 'fc_sampled']:
        # start the inner layers almost deterministically
        # (this is done by default for the mf and star DGPs)
        SM_prior = model.layers.S_M_sqrt.value
        SM_det = block_diag(SM_prior[0, :-M, :-M] * 1e-5, SM_prior[0, -M:, -M:])
        model.layers.S_M_sqrt = [SM_det]

    return model

def default_gp_kernel(X: np.ndarray):
    from gpflow.kernels import White, RBF

    _, n_feats = X.shape
    return RBF(n_feats, ARD=True) + White(n_feats)

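As a usage note, the kernel returned by default_gp_kernel can be dropped straight into a GP regression model; the sketch below assumes the same GPflow 1.x API (GPR(X, Y, kern), gpflow.train.ScipyOptimizer) that the neighbouring snippets use, with purely illustrative data.

# Hypothetical usage sketch (not from the original source): fit a GPR model with
# the ARD RBF + White kernel produced by default_gp_kernel.
import numpy as np
import gpflow
from gpflow.models import GPR

rng = np.random.RandomState(0)
X = rng.randn(100, 3)
Y = np.sin(X[:, :1]) + 0.1 * rng.randn(100, 1)

kernel = default_gp_kernel(X)                 # ARD RBF + White over all 3 input features
model = GPR(X, Y, kernel)
gpflow.train.ScipyOptimizer().minimize(model)
mean, var = model.predict_y(X[:5])            # posterior predictive mean and variance
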
def run_gp_optim(target_column: str, split_perc: float, imputation: str, featureset: str):
    """
    Run the whole GPR optimization loop
    :param target_column: target variable for predictions
    :param split_perc: percentage of samples to use for the train set
    :param imputation: imputation method for missing values
    :param featureset: featureset to use
    """
    config = configparser.ConfigParser()
    config.read('Configs/dataset_specific_config.ini')

    # get optimization run parameters
    base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
        TrainHelper.get_optimization_run_parameters(config=config, target_column=target_column,
                                                    split_perc=split_perc)
    # load datasets
    datasets = TrainHelper.load_datasets(config=config, target_column=target_column)

    # prepare the parameter grid
    kernels = []
    base_kernels = [SquaredExponential(), Matern52(), White(), RationalQuadratic(), Polynomial()]
    for kern in list(base_kernels):
        if isinstance(kern, IsotropicStationary):
            base_kernels.append(Periodic(kern, period=seasonal_periods))
    TrainHelper.extend_kernel_combinations(kernels=kernels, base_kernels=base_kernels)
    param_grid = {
        'dataset': datasets,
        'imputation': [imputation],
        'featureset': [featureset],
        'dim_reduction': ['None', 'pca'],
        'kernel': kernels,
        'mean_function': [None, gpflow.mean_functions.Constant()],
        'noise_variance': [0.01, 1, 10, 100],
        'optimizer': [gpflow.optimizers.Scipy()],
        'standardize_x': [False, True],
        'standardize_y': [False, True],
        'osa': [True]
    }
    # random sample from the parameter grid
    params_lst = TrainHelper.random_sample_parameter_grid(param_grid=param_grid, sample_share=0.2)

    doc_results = None
    best_rmse = 5000000.0
    best_mape = 5000000.0
    best_smape = 5000000.0
    dataset_last_name = 'Dummy'
    imputation_last = 'Dummy'
    dim_reduction_last = 'Dummy'
    featureset_last = 'Dummy'

    for i in tqdm(range(len(params_lst))):
        warnings.simplefilter('ignore')
        dataset = params_lst[i]['dataset']
        imputation = params_lst[i]['imputation']
        featureset = params_lst[i]['featureset']
        dim_reduction = None if params_lst[i]['dim_reduction'] == 'None' else params_lst[i]['dim_reduction']
        # deepcopy to prevent impact of previous optimizations
        kernel = gpflow.utilities.deepcopy(params_lst[i]['kernel'])
        mean_fct = gpflow.utilities.deepcopy(params_lst[i]['mean_function'])
        noise_var = params_lst[i]['noise_variance']
        optimizer = gpflow.utilities.deepcopy(params_lst[i]['optimizer'])
        stand_x = params_lst[i]['standardize_x']
        stand_y = params_lst[i]['standardize_y']
        one_step_ahead = params_lst[i]['osa']

        # dim_reduction is only done without NaNs
        if imputation is None and dim_reduction is not None:
            continue
        # dim_reduction does not make sense for few features
        if featureset == 'none' and dim_reduction is not None:
            continue

        if not ((dataset.name == dataset_last_name) and (imputation == imputation_last)
                and (dim_reduction == dim_reduction_last) and (featureset == featureset_last)):
            if resample_weekly and 'weekly' not in dataset.name:
                dataset.name = dataset.name + '_weekly'
            print(dataset.name + ' ' + str('None' if imputation is None else imputation) + ' '
                  + str('None' if dim_reduction is None else dim_reduction) + ' '
                  + featureset + ' ' + target_column)
            train_test_list = TrainHelper.get_ready_train_test_lst(
                dataset=dataset, config=config, init_train_len=init_train_len,
                test_len=test_len, split_perc=split_perc, imputation=imputation,
                target_column=target_column, dimensionality_reduction=dim_reduction,
                featureset=featureset)
            if dataset.name != dataset_last_name:
                best_rmse = 5000000.0
                best_mape = 5000000.0
                best_smape = 5000000.0
            dataset_last_name = dataset.name
            imputation_last = imputation
            dim_reduction_last = dim_reduction
            featureset_last = featureset

        kernel_string, mean_fct_string, optimizer_string = get_docresults_strings(
            kernel=kernel, mean_function=mean_fct, optimizer=optimizer)
        sum_dict = None
        try:
            for train, test in train_test_list:
                model = ModelsGPR.GaussianProcessRegressionGPFlow(
                    target_column=target_column, seasonal_periods=seasonal_periods,
                    kernel=kernel, mean_function=mean_fct, noise_variance=noise_var,
                    optimizer=optimizer, standardize_x=stand_x, standardize_y=stand_y,
                    one_step_ahead=one_step_ahead)
                cross_val_dict = model.train(train=train, cross_val_call=False)
                eval_dict = model.evaluate(train=train, test=test)
                eval_dict.update(cross_val_dict)
                if sum_dict is None:
                    sum_dict = eval_dict
                else:
                    for k, v in eval_dict.items():
                        sum_dict[k] += v
            evaluation_dict = {k: v / len(train_test_list) for k, v in sum_dict.items()}
            params_dict = {
                'dataset': dataset.name,
                'featureset': featureset,
                'imputation': str('None' if imputation is None else imputation),
                'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                'init_train_len': init_train_len,
                'test_len': test_len,
                'split_perc': split_perc,
                'kernel': kernel_string,
                'mean_function': mean_fct_string,
                'noise_variance': noise_var,
                'optimizer': optimizer_string,
                'standardize_x': stand_x,
                'standardize_y': stand_y,
                'one_step_ahead': one_step_ahead,
                'optim_mod_params': model.model.parameters
            }
            save_dict = params_dict.copy()
            save_dict.update(evaluation_dict)
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)
            best_rmse, best_mape, best_smape = TrainHelper.print_best_vals(
                evaluation_dict=evaluation_dict, best_rmse=best_rmse, best_mape=best_mape,
                best_smape=best_smape, run_number=i)
        except KeyboardInterrupt:
            print('Got interrupted')
            break
        except Exception as exc:
            # print(exc)
            params_dict = {
                'dataset': 'Failure',
                'featureset': featureset,
                'imputation': str('None' if imputation is None else imputation),
                'dim_reduction': str('None' if dim_reduction is None else dim_reduction),
                'init_train_len': init_train_len,
                'test_len': test_len,
                'split_perc': split_perc,
                'kernel': kernel_string,
                'mean_function': mean_fct_string,
                'noise_variance': noise_var,
                'optimizer': optimizer_string,
                'standardize_x': stand_x,
                'standardize_y': stand_y,
                'one_step_ahead': one_step_ahead,
                'optim_mod_params': 'failed'
            }
            save_dict = params_dict.copy()
            save_dict.update(TrainHelper.get_failure_eval_dict())
            if doc_results is None:
                doc_results = pd.DataFrame(columns=save_dict.keys())
            doc_results = doc_results.append(save_dict, ignore_index=True)

    TrainHelper.save_csv_results(doc_results=doc_results,
                                 save_dir=base_dir + 'OptimResults/',
                                 company_model_desc='gpr',
                                 target_column=target_column,
                                 seasonal_periods=seasonal_periods,
                                 datasets=datasets,
                                 featuresets=param_grid['featureset'],
                                 imputations=param_grid['imputation'],
                                 split_perc=split_perc)
    print('Optimization Done. Saved Results.')

def main(args):
    datasets = Datasets(data_path=args.data_path)

    # prepare output files
    outname1 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_' \
        + str(args.num_inducing) + '.rmse'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_' \
        + str(args.num_inducing) + '.nll'
    outfile2 = open(outname2, 'w')
    outname3 = args.results_dir + args.dataset + '_' + str(args.num_layers) + '_' \
        + str(args.num_inducing) + '.time'
    outfile3 = open(outname3, 'w')

    # =========================================================================
    # CROSS-VALIDATION LOOP
    # =========================================================================
    running_err = 0
    running_loss = 0
    running_time = 0
    test_errs = np.zeros(args.splits)
    test_nlls = np.zeros(args.splits)
    test_times = np.zeros(args.splits)

    for i in range(args.splits):
        # =====================================================================
        # MODEL CONSTRUCTION
        # =====================================================================
        print('Split: {}'.format(i))
        print('Getting dataset...')
        # get dataset
        data = datasets.all_datasets[args.dataset].get_data(i, normalize=args.normalize_data)
        X, Y, Xs, Ys, Y_std = [data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']]

        # inducing points via k-means
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.M if args.M < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat() \
            .prefetch(X.shape[0] // 2) \
            .shuffle(buffer_size=(X.shape[0] // 2)) \
            .batch(batch_size)

        print('Setting up DGP model...')
        kernels = []
        dims = []
        hidden_dim = X.shape[1] if X.shape[1] < args.max_dim else args.max_dim
        for l in range(args.num_layers):
            dim = X.shape[1] if l == 0 else hidden_dim
            dims.append(dim)
            if args.ard:
                # SE kernel with a lengthscale per dimension
                kernels.append(SquaredExponential(lengthscale=[1.] * dim) + White(variance=1e-5))
            else:
                # SE kernel with a single lengthscale
                kernels.append(SquaredExponential(lengthscale=1.) + White(variance=1e-5))
        # output dim
        dims.append(Y.shape[1])

        dgp_model = DGP(X, Y, Z, dims, kernels, Gaussian(variance=0.05),
                        num_samples=args.num_samples, num_data=X.shape[0])

        # initialise the inner layers almost deterministically
        for layer in dgp_model.layers[:-1]:
            layer.q_sqrt = Parameter(layer.q_sqrt.value() * 1e-5, transform=triangular())

        # =====================================================================
        # TRAINING
        # =====================================================================
        optimiser = tf.optimizers.Adam(args.learning_rate)

        print('Training DGP model...')
        t0 = time.time()
        # training loop
        monitored_training_loop(dgp_model, train_dataset, optimiser=optimiser,
                                logdir=args.log_dir, iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()

        # =====================================================================
        # TESTING
        # =====================================================================
        test_times[i] = t1 - t0
        print('Time taken to train: {}'.format(t1 - t0))
        outfile3.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile3.flush()
        os.fsync(outfile3.fileno())
        running_time += t1 - t0

        # minibatch test predictions
        means, vars = [], []
        test_batch_size = args.test_batch_size
        if len(Xs) > test_batch_size:
            for mb in range(-(-len(Xs) // test_batch_size)):
                m, v = dgp_model.predict_y(Xs[mb * test_batch_size:(mb + 1) * test_batch_size, :],
                                           num_samples=args.test_samples)
                means.append(m)
                vars.append(v)
        else:
            m, v = dgp_model.predict_y(Xs, num_samples=args.test_samples)
            means.append(m)
            vars.append(v)

        mean_SND = np.concatenate(means, 1)  # [S, N, D]
        var_SND = np.concatenate(vars, 1)    # [S, N, D]
        mean_ND = np.mean(mean_SND, 0)       # [N, D]

        # rmse
        test_err = np.mean(Y_std * np.mean((Ys - mean_ND) ** 2.0) ** 0.5)
        test_errs[i] = test_err
        print('Average RMSE: {}'.format(test_err))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_err))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_err += test_err

        # nll
        test_nll = np.mean(
            logsumexp(norm.logpdf(Ys * Y_std, mean_SND * Y_std, var_SND ** 0.5 * Y_std),
                      0, b=1 / float(args.test_samples)))
        test_nlls[i] = test_nll
        print('Average test log likelihood: {}'.format(test_nll))
        outfile2.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_err / args.splits))
    outfile1.write('Standard deviation: {}\n'.format(np.std(test_errs)))
    outfile2.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Standard deviation: {}\n'.format(np.std(test_nlls)))
    outfile3.write('Average: {}\n'.format(running_time / args.splits))
    outfile3.write('Standard deviation: {}\n'.format(np.std(test_times)))
    outfile1.close()
    outfile2.close()
    outfile3.close()

def get_test_error(i, dataset, alpha, learning_rate=0.001, iterations=20000, white=True,
                   normalized=True, num_inducing=100, beta=None, gamma=None, div_weights=None):

    # STEP (1): Read in the data via the helpful 'Dataset' object
    data = datasets.all_datasets[dataset].get_data(seed=0, split=i, prop=0.9)
    X_train, Y_train, X_test, Y_test, Y_std = [
        data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
    ]
    print('N: {}, D: {}, Ns: {}, Y_std: {}'.format(X_train.shape[0], X_train.shape[1],
                                                   X_test.shape[0], Y_std))

    Z = kmeans2(X_train, num_inducing, minit='points')[0]

    # dimensionality of X
    D = X_train.shape[1]

    # the layer shapes are defined by the kernel dims, so here all hidden layers are D dimensional
    kernels = []
    for l in range(L):
        kernels.append(RBF(D))

    # between-layer noise (doesn't actually make much difference, but we include it anyway);
    # reassign into the list so the Sum kernel actually replaces the plain RBF
    for i_k, kernel in enumerate(kernels[:-1]):
        kernels[i_k] = kernel + White(D, variance=1e-5)

    mb = 1000 if X_train.shape[0] > 1000 else None

    # get the likelihood model (possibly a robust one)
    if gamma is None and beta is None:
        # standard likelihood
        lklh = Gaussian()
    elif beta is not None and gamma is None:
        # beta-divergence robustified likelihood
        lklh = betaDivGaussian(beta)
    elif gamma is not None and beta is None:
        # gamma-divergence robustified likelihood
        lklh = gammaDivGaussian(gamma)
    else:
        print("ERROR! You have specified both beta and gamma. Either specify "
              + "both as None (for standard Gaussian likelihood) or one of them "
              + "as None (to use the other)")
        sys.exit()

    # STEP (2): Call 'DGP' for split i, which together with ADAM is responsible for the inference
    model = DGP(X_train, Y_train, Z, kernels, lklh, num_samples=K, minibatch_size=mb,
                alpha=alpha, white=white, div_weights=div_weights)

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    # build functions for evaluating test errors
    S = 100

    def batch_assess(model, assess_model, X, Y):
        n_batches = max(int(X.shape[0] / 1000.), 1)
        lik, sq_diff = [], []
        for X_batch, Y_batch in zip(np.array_split(X, n_batches), np.array_split(Y, n_batches)):
            l, sq = assess_model(model, X_batch, Y_batch)
            lik.append(l)
            sq_diff.append(sq)
        lik = np.concatenate(lik, 0)
        sq_diff = np.array(np.concatenate(sq_diff, 0), dtype=float)
        return np.average(lik), np.average(sq_diff) ** 0.5

    def assess_single_layer(model, X_batch, Y_batch):
        m, v = model.predict_y(X_batch)
        lik = np.sum(norm.logpdf(Y_batch * Y_std, loc=m * Y_std, scale=Y_std * v ** 0.5), 1)
        sq_diff = Y_std ** 2 * ((m - Y_batch) ** 2)
        return lik, sq_diff

    def assess_sampled(model, X_batch, Y_batch):
        m, v = model.predict_y(X_batch, S)
        S_lik = np.sum(norm.logpdf(Y_batch * Y_std, loc=m * Y_std, scale=Y_std * v ** 0.5), 2)
        lik = logsumexp(S_lik, 0, b=1 / float(S))
        mean = np.average(m, 0)
        sq_diff = Y_std ** 2 * ((mean - Y_batch) ** 2)
        return lik, sq_diff

    # get start time
    start_time = time.time()

    # fit to the training set via ADAM
    np.random.seed(1)
    AdamOptimizer(learning_rate).minimize(model, maxiter=iterations)

    # get running time
    running_time = time.time() - start_time

    s = 'time: {:.4f}, lik: {:.4f}, rmse: {:.4f}'

    # STEP (3): Extract and return test performance metrics to 'main'
    lik, rmse = batch_assess(model, assess_sampled, X_test, Y_test)
    print(s.format(running_time, lik, rmse))

    return -lik, rmse, running_time

X, Y, Xs, Ys, Y_std = [data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']]

print('############################ {} L={} split={}'.format(dataset_name, L, split))
print('N: {}, D: {}, Ns: {}'.format(X.shape[0], X.shape[1], Xs.shape[0]))

Z = kmeans2(X, 100, minit='points')[0]

D = X.shape[1]
kernels = []
for l in range(L):
    kernels.append(RBF(D))
# between-layer noise; reassign into the list so the Sum kernel replaces the plain RBF
for i, kernel in enumerate(kernels[:-1]):
    kernels[i] = kernel + White(D, variance=2e-6)

mb = minibatch_size if X.shape[0] > minibatch_size else None
model = DGP(X, Y, Z, kernels, Gaussian(), num_samples=1, minibatch_size=mb)

# start the inner layers almost deterministically
for layer in model.layers[:-1]:
    layer.q_sqrt = layer.q_sqrt.value * 1e-5

model.likelihood.variance = 0.05

global_step = tf.Variable(0, trainable=False, name="global_step")
model.enquire_session().run(global_step.initializer)

s = "{}/{}_L{}_split{}".format(results_path, dataset_name, L, split)
fw = tf.summary.FileWriter(os.path.join(s), model.enquire_session().graph)

def main(args):
    datasets = Datasets(data_path=args.data_path)

    # prepare output files
    outname1 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_' \
        + str(args.num_inducing) + '.nll'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_' \
        + str(args.num_inducing) + '.time'
    outfile2 = open(outname2, 'w')

    running_loss = 0
    running_time = 0

    for i in range(args.splits):
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(i)
        X, Y, Xs, Ys, Y_std = [data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']]

        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.M if args.M < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat() \
            .prefetch(X.shape[0] // 2) \
            .shuffle(buffer_size=(X.shape[0] // 2)) \
            .batch(batch_size)

        print('Setting up DGP model...')
        kernels = []
        for l in range(args.num_layers):
            kernels.append(SquaredExponential() + White(variance=1e-5))

        dgp_model = DGP(X.shape[1], kernels, Gaussian(variance=0.05), Z,
                        num_outputs=Y.shape[1], num_samples=args.num_samples,
                        num_data=X.shape[0])

        # initialise the inner layers almost deterministically
        for layer in dgp_model.layers[:-1]:
            layer.q_sqrt = Parameter(layer.q_sqrt.value() * 1e-5, transform=triangular())

        optimiser = tf.optimizers.Adam(args.learning_rate)

        def optimisation_step(model, X, Y):
            with tf.GradientTape() as tape:
                tape.watch(model.trainable_variables)
                obj = -model.elbo(X, Y, full_cov=False)
            grad = tape.gradient(obj, model.trainable_variables)
            optimiser.apply_gradients(zip(grad, model.trainable_variables))

        def monitored_training_loop(model, train_dataset, logdir, iterations, logging_iter_freq):
            # TODO: use tensorboard to log trainables and performance
            tf_optimisation_step = tf.function(optimisation_step)
            batches = iter(train_dataset)

            for j in range(iterations):
                X, Y = next(batches)
                tf_optimisation_step(model, X, Y)

                iter_id = j + 1
                if iter_id % logging_iter_freq == 0:
                    tf.print(f'Epoch {iter_id}: ELBO (batch) {model.elbo(X, Y)}')

        print('Training DGP model...')
        t0 = time.time()
        monitored_training_loop(dgp_model, train_dataset, logdir=args.log_dir,
                                iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()
        print('Time taken to train: {}'.format(t1 - t0))
        outfile2.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_time += t1 - t0

        m, v = dgp_model.predict_y(Xs, num_samples=args.test_samples)
        test_nll = np.mean(
            logsumexp(norm.logpdf(Ys * Y_std, m * Y_std, v ** 0.5 * Y_std),
                      0, b=1 / float(args.test_samples)))
        print('Average test log likelihood: {}'.format(test_nll))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Average: {}\n'.format(running_time / args.splits))
    outfile1.close()
    outfile2.close()

def main(args):
    datasets = Datasets(data_path=args.data_path)

    # prepare output files
    outname1 = '../svgp_ard_tmp/svgp_ard_' + args.dataset + '_' + str(args.num_inducing) + '.rmse'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = '../svgp_ard_tmp/svgp_ard_' + args.dataset + '_' + str(args.num_inducing) + '.nll'
    outfile2 = open(outname2, 'w')
    outname3 = '../svgp_ard_tmp/svgp_ard_' + args.dataset + '_' + str(args.num_inducing) + '.time'
    outfile3 = open(outname3, 'w')

    # =========================================================================
    # CROSS-VALIDATION LOOP
    # =========================================================================
    running_err = 0
    running_loss = 0
    running_time = 0
    test_errs = np.zeros(args.splits)
    test_nlls = np.zeros(args.splits)
    test_times = np.zeros(args.splits)

    for i in range(args.splits):
        # =====================================================================
        # MODEL CONSTRUCTION
        # =====================================================================
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(i, normalize=args.normalize_data)
        X, Y, Xs, Ys, Y_std = [data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']]

        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.batch_size if args.batch_size < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat() \
            .prefetch(X.shape[0] // 2) \
            .shuffle(buffer_size=(X.shape[0] // 2)) \
            .batch(batch_size)

        print('Setting up SVGP model...')
        if args.ard:
            # SE kernel with a lengthscale per dimension
            kernel = SquaredExponential(lengthscale=[1.] * X.shape[1]) + White(variance=1e-5)
        else:
            # SE kernel with a single lengthscale
            kernel = SquaredExponential(lengthscale=1.) + White(variance=1e-5)
        likelihood = Gaussian(variance=0.05)
        model = gpflow.models.SVGP(kernel=kernel, likelihood=likelihood, inducing_variable=Z)

        # =====================================================================
        # TRAINING
        # =====================================================================
        print('Training SVGP model...')
        optimiser = tf.optimizers.Adam(args.learning_rate)
        t0 = time.time()
        monitored_training_loop(model, train_dataset, optimiser=optimiser,
                                logdir=args.log_dir, iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()

        # =====================================================================
        # TESTING
        # =====================================================================
        test_times[i] = t1 - t0
        print('Time taken to train: {}'.format(t1 - t0))
        outfile3.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile3.flush()
        os.fsync(outfile3.fileno())
        running_time += t1 - t0

        # minibatch test predictions
        means, vars = [], []
        test_batch_size = args.test_batch_size
        if len(Xs) > test_batch_size:
            for mb in range(-(-len(Xs) // test_batch_size)):
                m, v = model.predict_y(Xs[mb * test_batch_size:(mb + 1) * test_batch_size, :])
                means.append(m)
                vars.append(v)
        else:
            m, v = model.predict_y(Xs)
            means.append(m)
            vars.append(v)

        mean_ND = np.concatenate(means, 0)  # [N, D]
        var_ND = np.concatenate(vars, 0)    # [N, D]

        # rmse
        test_err = np.mean(Y_std * np.mean((Ys - mean_ND) ** 2.0) ** 0.5)
        test_errs[i] = test_err
        print('Average RMSE: {}'.format(test_err))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_err))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_err += test_err

        # nll
        test_nll = np.mean(norm.logpdf(Ys * Y_std, mean_ND * Y_std, var_ND ** 0.5 * Y_std))
        test_nlls[i] = test_nll
        print('Average test log likelihood: {}'.format(test_nll))
        outfile2.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_err / args.splits))
    outfile1.write('Standard deviation: {}\n'.format(np.std(test_errs)))
    outfile2.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Standard deviation: {}\n'.format(np.std(test_nlls)))
    outfile3.write('Average: {}\n'.format(running_time / args.splits))
    outfile3.write('Standard deviation: {}\n'.format(np.std(test_times)))
    outfile1.close()
    outfile2.close()
    outfile3.close()