def _brute_fit(self, data_points, target_values, max_iter=None):
    """
    Optimizes covariance hyper-parameters
    :param data_points: an array of data points
    :param target_values: target values' vector
    :param max_iter: maximum number of iterations (unlimited if None)
    :return: GPRes object with the iteration-wise parameter and time history
    """
    if not (isinstance(data_points, np.ndarray) and isinstance(target_values, np.ndarray)):
        raise TypeError("The operands must be of type numpy array")

    def loc_fun(w):
        # The oracle returns the log-likelihood and its gradient; negate both because the
        # wrapper minimizes.
        loss, grad = self._oracle(data_points, target_values, w)
        return -loss, -grad

    bnds = self.covariance_obj.get_bounds()
    if max_iter is None:
        max_iter = np.inf
    res, w_list, time_list = minimize_wrapper(loc_fun, self.covariance_obj.get_params(),
                                              method='L-BFGS-B', mydisp=False, bounds=bnds,
                                              options={'gtol': 1e-8, 'ftol': 0,
                                                       'maxiter': max_iter})
    optimal_params = res.x
    self.covariance_obj.set_params(optimal_params)
    return GPRes(deepcopy(w_list), time_lst=deepcopy(time_list))
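# _brute_fit relies on an external minimize_wrapper helper that is not shown in this section.
# The sketch below is only a guess at the pattern such a wrapper could follow: wrapping
# scipy.optimize.minimize around a (value, gradient) oracle and recording the iterate and the
# elapsed time at every iteration via a callback. Its name and signature are illustrative and
# may differ from the actual project helper.
import time
from scipy.optimize import minimize

def _minimize_with_history(fun, w0, bounds=None, options=None):
    """Run L-BFGS-B on a (value, gradient) oracle; return (result, iterate_history, time_history)."""
    w_history, t_history = [np.copy(w0)], [0.]
    start = time.time()

    def callback(w):
        # Called by scipy after every iteration with the current point.
        w_history.append(np.copy(w))
        t_history.append(time.time() - start)

    res = minimize(fun, w0, method='L-BFGS-B', jac=True, bounds=bounds,
                   options=options, callback=callback)
    return res, w_history, t_history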
def _svi_fit(self, data_points, target_values, num_inputs=0, inputs=None, optimizer_options={}):
    """
    A method for optimizing hyper-parameters (for fixed inducing points), based on stochastic
    variational inference
    :param data_points: training set objects
    :param target_values: training set answers
    :param inputs: inducing inputs
    :param num_inputs: number of inducing points to generate. If inducing points are provided,
    this parameter is ignored
    :param optimizer_options: options dictionary for the chosen optimizer (may also carry the
    optimizer-specific keys 'nat_mult', 'mydisp' and 'print_freq')
    :return: GPRes object with the iteration-wise parameter and time history
    """
    # If no inducing inputs are provided, we use K-Means cluster centers as inducing inputs
    if inputs is None:
        means = KMeans(n_clusters=num_inputs)
        means.fit(data_points.T)
        inputs = means.cluster_centers_.T
        # inputs = np.load("inputs.npy")

    # Initializing required variables
    y = target_values
    m = num_inputs
    n = y.size

    # Initializing variational (normal) distribution parameters
    mu = np.zeros((m, 1))
    sigma_n = self.covariance_obj.get_params()[-1]
    theta = self.covariance_obj.get_params()

    if self.parametrization == 'natural':
        # Start from the analytically optimal q(u) = N(mu, sigma) for the current hyper-parameters
        cov_fun = self.covariance_obj.covariance_function
        K_mn = cov_fun(inputs, data_points)
        K_mm = cov_fun(inputs, inputs)
        K_mm_inv = np.linalg.inv(K_mm)
        sigma_inv = K_mm_inv.dot(K_mn.dot(K_mn.T.dot(K_mm_inv))) / sigma_n**2 + K_mm_inv
        sigma = np.linalg.inv(sigma_inv)
        mu = sigma.dot(K_mm_inv.dot(K_mn.dot(y))) / sigma_n**2

        # Canonical (natural) parameters: eta_1 = sigma^{-1} mu, eta_2 = -sigma^{-1} / 2
        eta_1 = sigma_inv.dot(mu)
        eta_2 = -sigma_inv / 2
        param_vec = self._svi_get_parameter_vector(theta, eta_1, eta_2)

    elif self.parametrization == 'cholesky':
        # sigma_L = np.eye(m)
        # mu = np.random.multivariate_normal(mean=np.zeros_like(mu)[:, 0], cov=np.eye(mu.size)*5)
        # sigma_L = np.eye(m)  # Cholesky factor of sigma
        cov_fun = self.covariance_obj.covariance_function
        K_mn = cov_fun(inputs, data_points)
        K_mm = cov_fun(inputs, inputs)
        K_mm_inv = np.linalg.inv(K_mm)
        sigma = np.linalg.inv(K_mm_inv.dot(K_mn.dot(K_mn.T.dot(K_mm_inv))) / sigma_n**2 + K_mm_inv)
        mu = sigma.dot(K_mm_inv.dot(K_mn.dot(y))) / sigma_n**2
        # p = np.random.normal(size=(m, 1))
        # sigma = p.dot(p.T) + np.eye(m) * 1e-4
        sigma_L = np.linalg.cholesky(sigma)
        param_vec = self._svi_get_parameter_vector(theta, mu, sigma_L)

    else:
        raise ValueError('Unknown parametrization: ' + str(self.parametrization))

    bnds = self._svi_get_bounds(m)

    if self.parametrization == 'natural':
        # Optional multiplier for the gradient w.r.t. the natural parameters
        nat_mult = 1.
        if optimizer_options is not None:
            if 'nat_mult' in optimizer_options:
                nat_mult = optimizer_options['nat_mult']
                del optimizer_options['nat_mult']
        print(nat_mult)

        def stoch_fun(x, i):
            # Mini-batch anti-gradient of the ELBO for stochastic gradient descent
            grad = -self._svi_elbo_batch_approx_oracle(data_points, target_values, inputs,
                                                       parameter_vec=x, indices=i)[1]
            grad[self.covariance_obj.get_params().size:] *= nat_mult
            return grad

        def adadelta_fun(x, train_points, train_targets):
            _, grad = self._svi_elbo_batch_approx_oracle(train_points, train_targets, inputs,
                                                         parameter_vec=x,
                                                         indices=range(train_targets.size), N=n)
            grad[self.covariance_obj.get_params().size:] *= nat_mult
            return -grad

        # indices = list(range(20))
        # # mu += np.random.randn(mu.size).reshape(mu.shape)*2
        # beta_1 = mu
        # beta_2 = mu.dot(mu.T) + sigma
        # param_vec = self._svi_get_parameter_vector(theta, beta_1, beta_2)
        #
        # def test_fun(x):
        #     fun, grad = self._svi_elbo_batch_approx_oracle(data_points, target_values, inputs,
        #                                                    parameter_vec=x, indices=indices)
        #     return -fun, -grad
        # check_gradient(test_fun, param_vec, print_diff=True, delta=1e-9)  # , indices=[3,4,5,6,7]
        # exit(0)

        if self.optimizer == 'SG':
            res, w_list, time_list = stochastic_gradient_descent(oracle=stoch_fun, n=n, point=param_vec,
                                                                 bounds=bnds, options=optimizer_options)
        elif self.optimizer == 'AdaDelta':
            res, w_list, time_list = climin_wrapper(oracle=adadelta_fun, w0=param_vec,
                                                    train_points=data_points, train_targets=target_values,
                                                    options=optimizer_options, method='AdaDelta')
        elif self.optimizer == 'climinSG':
            res, w_list, time_list = climin_wrapper(oracle=adadelta_fun, w0=param_vec,
                                                    train_points=data_points, train_targets=target_values,
                                                    options=optimizer_options, method='SG')
        else:
            raise ValueError('Unknown optimizer: ' + self.optimizer)

        # Recover (mu, sigma) from the optimized natural parameters
        theta, eta_1, eta_2 = self._svi_get_parameters(res)
        sigma_inv = -2 * eta_2
        sigma = np.linalg.inv(sigma_inv)
        mu = sigma.dot(eta_1)

    elif self.parametrization == 'cholesky':
        def fun(x):
            # Full-dataset ELBO and gradient for deterministic optimizers
            loss, grad = self._svi_elbo_batch_approx_oracle(data_points, target_values, inputs,
                                                            parameter_vec=x, indices=list(range(n)))
            return -loss, -grad

        def sag_oracle(x, i):
            loss, grad = self._svi_elbo_batch_approx_oracle(data_points, target_values, inputs,
                                                            parameter_vec=x, indices=i)
            return -loss, -grad

        def adadelta_fun(x, train_points, train_targets):
            _, grad = self._svi_elbo_batch_approx_oracle(train_points, train_targets, inputs,
                                                         parameter_vec=x,
                                                         indices=range(train_targets.size), N=n)
            return -grad

        def stoch_fun(x, i):
            return -self._svi_elbo_batch_approx_oracle(data_points, target_values, inputs,
                                                       parameter_vec=x, indices=i)[1]

        if self.optimizer == 'AdaDelta':
            res, w_list, time_list = climin_wrapper(oracle=adadelta_fun, w0=param_vec,
                                                    train_points=data_points, train_targets=target_values,
                                                    options=optimizer_options, method='AdaDelta')
        elif self.optimizer == 'climinSG':
            res, w_list, time_list = climin_wrapper(oracle=adadelta_fun, w0=param_vec,
                                                    train_points=data_points, train_targets=target_values,
                                                    options=optimizer_options, method='SG')
        elif self.optimizer == 'SG':
            res, w_list, time_list = stochastic_gradient_descent(oracle=stoch_fun, n=n, point=param_vec,
                                                                 bounds=bnds, options=optimizer_options)
        elif self.optimizer == 'SAG':
            res, w_list, time_list = stochastic_average_gradient(oracle=sag_oracle, n=n, point=param_vec,
                                                                 bounds=bnds, options=optimizer_options)
        elif self.optimizer == 'FG':
            res, w_list, time_list = gradient_descent(oracle=fun, point=param_vec, bounds=bnds,
                                                      options=optimizer_options)
        elif self.optimizer == 'L-BFGS-B':
            mydisp = False
            print_freq = 1
            if optimizer_options is not None:
                if 'mydisp' in optimizer_options:
                    mydisp = optimizer_options['mydisp']
                    del optimizer_options['mydisp']
                if 'print_freq' in optimizer_options:
                    print_freq = optimizer_options['print_freq']
                    del optimizer_options['print_freq']
            res, w_list, time_list = minimize_wrapper(fun, param_vec, method='L-BFGS-B', mydisp=mydisp,
                                                      print_freq=print_freq, bounds=bnds, jac=True,
                                                      options=optimizer_options)
            res = res['x']
        else:
            raise ValueError('Wrong optimizer for svi method: ' + self.optimizer)

        theta, mu, sigma_L = self._svi_get_parameters(res)
        sigma = sigma_L.dot(sigma_L.T)

    self.covariance_obj.set_params(theta)
    self.inducing_inputs = (inputs, mu, sigma)
    return GPRes(deepcopy(w_list), time_lst=deepcopy(time_list))
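# For reference, a standalone sketch of the variational initialization used in _svi_fit above.
# Given the kernel blocks K_mm, K_mn, the noise standard deviation sigma_n and the targets y,
# the analytically optimal Gaussian q(u) = N(mu, Sigma) for fixed hyper-parameters is
#     Sigma = (K_mm^{-1} K_mn K_nm K_mm^{-1} / sigma_n^2 + K_mm^{-1})^{-1}
#     mu    = Sigma K_mm^{-1} K_mn y / sigma_n^2
# The helper below only restates those formulas; its name and signature are illustrative and
# not part of this module.
def _initial_variational_params(K_mm, K_mn, y, sigma_n):
    """Return (mu, Sigma) of the optimal q(u) for fixed kernel hyper-parameters."""
    K_mm_inv = np.linalg.inv(K_mm)
    sigma_inv = K_mm_inv.dot(K_mn).dot(K_mn.T).dot(K_mm_inv) / sigma_n**2 + K_mm_inv
    sigma = np.linalg.inv(sigma_inv)
    mu = sigma.dot(K_mm_inv).dot(K_mn).dot(y) / sigma_n**2
    return mu, sigma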
def _vi_means_fit(self, data_points, target_values, num_inputs, inputs=None, optimizer_options={}):
    """
    A procedure fitting hyper-parameters and inducing points for both the 'means' and the 'vi' methods.
    :param data_points: data points
    :param target_values: target values at data points
    :param num_inputs: number of inducing inputs to be found
    :param optimizer_options: options dictionary for the chosen optimizer
    :return: lists of iteration-wise values of hyper-parameters, times, function values for
    evaluating the optimization
    """
    if not (isinstance(data_points, np.ndarray) and isinstance(target_values, np.ndarray)):
        raise TypeError("The operands must be of type numpy array")

    dim = data_points.shape[0]
    param_len = self.covariance_obj.get_params().size

    def _vi_loc_fun(w):
        # w holds the hyper-parameters followed by the flattened inducing points
        ind_points = (w[param_len:]).reshape((dim, num_inputs))  # has to be rewritten for multidimensional case
        loss, grad = self._vi_means_oracle(data_points, target_values, w[:param_len], ind_points)
        return -loss, -grad

    def _means_loc_fun(w):
        loss, grad = self._vi_means_oracle(data_points, target_values, w, inputs)
        return -loss, -grad

    np.random.seed(15)
    if self.method == 'vi':
        # Inducing points are optimized jointly with the hyper-parameters
        inputs = data_points[:, :num_inputs] + np.random.normal(0, 0.1, (dim, num_inputs))
        loc_fun = _vi_loc_fun
        w0 = np.concatenate((self.covariance_obj.get_params(), inputs.ravel()))
        bnds = tuple(list(self.covariance_obj.get_bounds()) + [(1e-2, 1)] * num_inputs * dim)
    if self.method == 'means':
        # Inducing points are fixed to K-Means cluster centers; only hyper-parameters are optimized
        if inputs is None:
            inputs = self._k_means_cluster_centers(data_points, num_inputs)
        loc_fun = _means_loc_fun
        w0 = self.covariance_obj.get_params()
        bnds = self.covariance_obj.get_bounds()

    if self.optimizer == 'L-BFGS-B':
        mydisp = False
        options = copy.deepcopy(optimizer_options)
        if optimizer_options is not None:
            if 'mydisp' in optimizer_options:
                mydisp = optimizer_options['mydisp']
                del options['mydisp']
        res, w_list, time_list = minimize_wrapper(loc_fun, w0, method='L-BFGS-B', mydisp=mydisp,
                                                  bounds=bnds, options=options)
        res = res.x
    elif self.optimizer == 'Projected Newton':
        res, w_list, time_list = projected_newton(loc_fun, w0, bounds=bnds, options=optimizer_options)
    else:
        raise ValueError('Wrong optimizer for vi/means method: ' + self.optimizer)

    if self.method == 'vi':
        optimal_params = res[:-num_inputs * dim]
        inducing_points = res[-num_inputs * dim:]
        inducing_points = inducing_points.reshape((dim, num_inputs))
    if self.method == 'means':
        optimal_params = res
        inducing_points = inputs

    self.covariance_obj.set_params(optimal_params)
    mu, Sigma = self._vi_get_optimal_meancov(optimal_params, inducing_points, data_points, target_values)
    self.inducing_inputs = (inducing_points, mu, Sigma)
    return GPRes(deepcopy(w_list), time_lst=deepcopy(time_list))
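# The 'means' branch of _vi_means_fit calls self._k_means_cluster_centers, which is defined
# elsewhere in this class. Below is a minimal sketch of that kind of initialization, mirroring
# the inline KMeans usage in _svi_fit; the function name and signature are illustrative only.
from sklearn.cluster import KMeans

def _k_means_inducing_points(data_points, num_inputs):
    """Return num_inputs cluster centers with the same (dim, num_points) layout as data_points."""
    km = KMeans(n_clusters=num_inputs)
    km.fit(data_points.T)            # samples are stored as columns, so transpose to (n, dim)
    return km.cluster_centers_.T     # back to (dim, num_inputs)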