def test_kernels(self):
    from GPy.kern import RBF, Linear, MLP, Bias, White
    Q = self.Z.shape[1]
    kernels = [RBF(Q, ARD=True), Linear(Q, ARD=True), MLP(Q, ARD=True),
               RBF(Q, ARD=True) + Linear(Q, ARD=True) + Bias(Q) + White(Q),
               RBF(Q, ARD=True) + Bias(Q) + White(Q),
               Linear(Q, ARD=True) + Bias(Q) + White(Q)]
    for k in kernels:
        k.randomize()
        self._test_kernel_param(k)
        self._test_Z(k)
        self._test_qX(k)
        self._test_kernel_param(k, psi2n=True)
        self._test_Z(k, psi2n=True)
        self._test_qX(k, psi2n=True)
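# Standalone sketch (not part of the test class above): building one of the
# summed kernels the test exercises and evaluating it on random inputs.
# Assumes GPy is installed; Q=3 and the 10-point input are arbitrary choices
# made for illustration only.
import numpy as np
from GPy.kern import RBF, Linear, Bias, White

Q = 3
k = RBF(Q, ARD=True) + Linear(Q, ARD=True) + Bias(Q) + White(Q)
k.randomize()                     # randomize all kernel parameters, as the test does
X = np.random.rand(10, Q)
K = k.K(X)                        # 10x10 covariance matrix under the additive kernel
print(k.parameter_names())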
def _create_kernel(self, V):
    self._kerns = [RBF(1, ARD=True, active_dims=[i])
                   for i in range(self.n_dims)]
    self._kernf = Fixed(self.n_dims, tdot(V))
    self._kernb = Bias(self.n_dims)
    self.kernel = np.sum(self._kerns) + self._kernf + self._kernb
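# Illustration of the additive structure _create_kernel builds, assuming GPy:
# np.sum over a list of Kern objects reduces with Kern.__add__, so the result
# is a single additive kernel with one ARD RBF per input dimension plus the
# fixed and bias terms. n_dims=2 and the placeholder V are made up here.
import numpy as np
from GPy.kern import RBF, Fixed, Bias
from GPy.util.linalg import tdot

n_dims = 2
V = np.random.rand(5, 1)                     # stand-in per-observation score variances
kerns = [RBF(1, ARD=True, active_dims=[i]) for i in range(n_dims)]
kernel = np.sum(kerns) + Fixed(n_dims, tdot(V)) + Bias(n_dims)
print(kernel)                                # an Add kernel combining all parts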
# Imports assumed by this class; the optional-import guard mirrors the
# `if not GPRegression` / `if not minimize` checks in suggest() below.
# BaseStrategy and RandomSearch are provided by the surrounding strategies
# module.
import numpy as np
try:
    from GPy.kern import RBF, Fixed, Bias
    from GPy.util.linalg import tdot
    from GPy.models import GPRegression
    from scipy.optimize import minimize
    from scipy.stats import norm
except ImportError:
    GPRegression = None
    minimize = None


class GP(BaseStrategy):
    short_name = 'gp'

    def __init__(self, acquisition=None, seed=None, seeds=1, max_feval=5E4,
                 max_iter=1E5):
        self.seed = seed
        self.seeds = seeds
        self.max_feval = max_feval
        self.max_iter = max_iter
        self.model = None
        self.n_dims = None
        self.kernel = None
        self._kerns = None
        self._kernf = None
        self._kernb = None
        if acquisition is None:
            acquisition = {'name': 'osprey', 'params': {}}
        self.acquisition_function = acquisition
        self._acquisition_function = None
        self._set_acquisition()

    def _create_kernel(self, V):
        self._kerns = [RBF(1, ARD=True, active_dims=[i])
                       for i in range(self.n_dims)]
        self._kernf = Fixed(self.n_dims, tdot(V))
        self._kernb = Bias(self.n_dims)
        self.kernel = np.sum(self._kerns) + self._kernf + self._kernb

    def _fit_model(self, X, Y):
        model = GPRegression(X, Y, self.kernel)
        model.optimize(messages=False, max_f_eval=self.max_feval)
        self.model = model

    def _get_random_point(self):
        return np.array([np.random.uniform(low=0., high=1.)
                         for i in range(self.n_dims)])

    def _is_var_positive(self, var):
        if np.any(var < 0):
            # RuntimeError may be overkill
            raise RuntimeError('Negative variance predicted from '
                               'regression model.')
        else:
            return True

    def _ei(self, x, y_mean, y_var):
        y_std = np.sqrt(y_var)
        y_best = self.model.Y.max(axis=0)
        z = (y_mean - y_best) / y_std
        result = y_std * (z * norm.cdf(z) + norm.pdf(z))
        return result

    def _ucb(self, x, y_mean, y_var, kappa=1.0):
        result = y_mean + kappa * np.sqrt(y_var)
        return result

    def _osprey(self, x, y_mean, y_var):
        return (y_mean + y_var).flatten()

    def _optimize(self, init=None):
        # TODO start minimization from a range of points and take minimum
        if init is None:
            init = self._get_random_point()

        def z(x):
            # TODO make spread of points around x and take mean value.
            x = x.copy().reshape(-1, self.n_dims)
            y_mean, y_var = self.model.predict(
                x, kern=(np.sum(self._kerns).copy() + self._kernb.copy()))
            # This code is for the debug/testing phase only.
            # Ideally we should test for negative variance regardless of the
            # acquisition function. However, we want to recover the original
            # functionality of Osprey, hence the conditional block.
            # TODO remove this.
            if self.acquisition_function['name'] == 'osprey':
                af = self._acquisition_function(x, y_mean=y_mean, y_var=y_var)
            elif self.acquisition_function['name'] in ['ei', 'ucb']:
                # y_var = np.abs(y_var)
                if self._is_var_positive(y_var):
                    af = self._acquisition_function(x, y_mean=y_mean,
                                                    y_var=y_var)
            return (-1) * af

        res = minimize(z, init, bounds=self.n_dims * [(0., 1.)],
                       options={'maxiter': self.max_iter, 'disp': 0})
        return res.x

    def _set_acquisition(self):
        if isinstance(self.acquisition_function, list):
            raise RuntimeError('Must specify only one acquisition function')
        if sorted(self.acquisition_function.keys()) != ['name', 'params']:
            raise RuntimeError('strategy/params/acquisition must contain keys '
                               '"name" and "params"')
        if self.acquisition_function['name'] not in ['ei', 'ucb', 'osprey']:
            raise RuntimeError('strategy/params/acquisition name must be one '
                               'of "ei", "ucb", "osprey"')
        f = getattr(self, '_' + self.acquisition_function['name'])

        def g(x, y_mean, y_var):
            return f(x, y_mean, y_var, **self.acquisition_function['params'])

        self._acquisition_function = g

    def _get_data(self, history, searchspace):
        X = []
        Y = []
        V = []
        ignore = []
        for param_dict, scores, status in history:
            # transform points into the GP domain. This involves bringing
            # int and enum variables to floating point, etc.
            if status == 'FAILED':
                # not sure how to deal with these yet
                continue
            point = searchspace.point_to_gp(param_dict)
            if status == 'SUCCEEDED':
                X.append(point)
                Y.append(np.mean(scores))
                V.append(np.var(scores))
            elif status == 'PENDING':
                ignore.append(point)
            else:
                raise RuntimeError('unrecognized status: %s' % status)
        return (np.array(X).reshape(-1, self.n_dims),
                np.array(Y).reshape(-1, 1),
                np.array(V).reshape(-1, 1),
                np.array(ignore).reshape(-1, self.n_dims))

    def _from_gp(self, result, searchspace):
        # Note that GP only deals with float-valued variables, so we have
        # a transform step on either side, where int and enum valued variables
        # are transformed before calling gp, and then the result suggested by
        # GP needs to be reverse-transformed.
        out = {}
        for gpvalue, var in zip(result, searchspace):
            out[var.name] = var.point_from_gp(float(gpvalue))
        return out

    def _is_within(self, point, X, tol=1E-2):
        # True if `point` lies within `tol` (Euclidean) of any row of X.
        return bool(np.any(np.sqrt(((point - X) ** 2).sum(axis=1)) <= tol))

    def suggest(self, history, searchspace, max_tries=5):
        if not GPRegression:
            raise ImportError('No module named GPy')
        if not minimize:
            raise ImportError('No module named SciPy')
        if len(history) < self.seeds:
            return RandomSearch().suggest(history, searchspace)
        self.n_dims = searchspace.n_dims

        X, Y, V, ignore = self._get_data(history, searchspace)
        # TODO make _create_kernel accept optional args.
        self._create_kernel(V)
        self._fit_model(X, Y)

        suggestion = self._optimize()
        if suggestion in ignore or self._is_within(suggestion, X):
            return RandomSearch().suggest(history, searchspace)
        return self._from_gp(suggestion, searchspace)
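# Worked check of the expected-improvement formula used by _ei above
# (maximization form): EI = sigma * (z * Phi(z) + phi(z)) with
# z = (mu - y_best) / sigma. The numbers are made up for illustration.
import numpy as np
from scipy.stats import norm

mu, sigma, y_best = 0.8, 0.1, 0.75
z = (mu - y_best) / sigma                      # z = 0.5
ei = sigma * (z * norm.cdf(z) + norm.pdf(z))
print(round(float(ei), 4))                     # ~0.0698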
class GP(BaseStrategy):
    short_name = 'gp'

    def __init__(self, seed=None, seeds=1, max_feval=5E4, max_iter=1E5):
        self.seed = seed
        self.seeds = seeds
        self.max_feval = max_feval
        self.max_iter = max_iter
        self.model = None
        self.n_dims = None
        self.kernel = None
        self._kerns = None
        self._kernf = None
        self._kernb = None

    def _create_kernel(self, V):
        self._kerns = [RBF(1, ARD=True, active_dims=[i])
                       for i in range(self.n_dims)]
        self._kernf = Fixed(self.n_dims, tdot(V))
        self._kernb = Bias(self.n_dims)
        self.kernel = np.sum(self._kerns) + self._kernf + self._kernb

    def _fit_model(self, X, Y):
        model = GPRegression(X, Y, self.kernel)
        model.optimize(messages=False, max_f_eval=self.max_feval)
        self.model = model

    def _get_random_point(self):
        return np.array([np.random.uniform(low=0., high=1.)
                         for i in range(self.n_dims)])

    def _optimize(self, init=None):
        if init is None:
            init = self._get_random_point()

        def z(x):
            y = x.copy().reshape(-1, self.n_dims)
            s, v = self.model.predict(y, kern=(np.sum(self._kerns).copy() +
                                               self._kernb.copy()))
            return -(s + v).flatten()

        return minimize(z, init, bounds=self.n_dims * [(0., 1.)],
                        options={'maxiter': self.max_iter, 'disp': 0}).x

    def _get_data(self, history, searchspace):
        X = []
        Y = []
        V = []
        ignore = []
        for param_dict, scores, status in history:
            # transform points into the GP domain. This involves bringing
            # int and enum variables to floating point, etc.
            if status == 'FAILED':
                # not sure how to deal with these yet
                continue
            point = searchspace.point_to_gp(param_dict)
            if status == 'SUCCEEDED':
                X.append(point)
                Y.append(np.mean(scores))
                V.append(np.var(scores))
            elif status == 'PENDING':
                ignore.append(point)
            else:
                raise RuntimeError('unrecognized status: %s' % status)
        return (np.array(X).reshape(-1, self.n_dims),
                np.array(Y).reshape(-1, 1),
                np.array(V).reshape(-1, 1),
                np.array(ignore).reshape(-1, self.n_dims))

    def _from_gp(self, result, searchspace):
        # Note that GP only deals with float-valued variables, so we have
        # a transform step on either side, where int and enum valued variables
        # are transformed before calling gp, and then the result suggested by
        # GP needs to be reverse-transformed.
        out = {}
        for gpvalue, var in zip(result, searchspace):
            out[var.name] = var.point_from_gp(float(gpvalue))
        return out

    def _is_within(self, point, X, tol=1E-2):
        # True if `point` lies within `tol` (Euclidean) of any row of X.
        return bool(np.any(np.sqrt(((point - X) ** 2).sum(axis=1)) <= tol))

    def suggest(self, history, searchspace, max_tries=5):
        if not GPRegression:
            raise ImportError('No module named GPy')
        if not minimize:
            raise ImportError('No module named SciPy')
        if len(history) < self.seeds:
            return RandomSearch().suggest(history, searchspace)
        self.n_dims = searchspace.n_dims

        X, Y, V, ignore = self._get_data(history, searchspace)
        self._create_kernel(V)
        self._fit_model(X, Y)

        suggestion = self._optimize()
        if suggestion in ignore or self._is_within(suggestion, X):
            return RandomSearch().suggest(history, searchspace)
        return self._from_gp(suggestion, searchspace)
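# Minimal sketch of the inner acquisition optimization in _optimize above:
# minimize the negative of (posterior mean + variance) over the unit box.
# The quadratic stand-in for self.model.predict is hypothetical.
import numpy as np
from scipy.optimize import minimize

def fake_predict(x):                  # stand-in for self.model.predict
    mean = -np.sum((x - 0.3) ** 2)    # maximized at x = [0.3, 0.3]
    var = 0.01
    return mean, var

def z(x):
    s, v = fake_predict(x)
    return -(s + v)

res = minimize(z, np.array([0.9, 0.9]), bounds=2 * [(0., 1.)])
print(res.x)                          # approximately [0.3, 0.3]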
class GP(BaseStrategy):
    short_name = 'gp'

    def __init__(self, seeds=1, max_feval=5E4, max_iter=1E5):
        self.seeds = seeds
        self.max_feval = max_feval
        self.max_iter = max_iter
        self.model = None
        self.n_dims = None
        self.kernel = None
        self._kerns = None
        self._kernf = None
        self._kernb = None

    def _create_kernel(self, V):
        self._kerns = [RBF(1, ARD=True, active_dims=[i])
                       for i in range(self.n_dims)]
        self._kernf = Fixed(self.n_dims, tdot(V))
        self._kernb = Bias(self.n_dims)
        self.kernel = np.sum(self._kerns) + self._kernf + self._kernb

    def _fit_model(self, X, Y):
        model = GPRegression(X, Y, self.kernel)
        model.optimize(messages=False, max_f_eval=self.max_feval)
        self.model = model

    def _get_random_point(self):
        return np.array([np.random.uniform(low=0., high=1.)
                         for i in range(self.n_dims)])

    def _optimize(self, init=None):
        if init is None:
            init = self._get_random_point()

        def z(x):
            y = x.copy().reshape(-1, self.n_dims)
            s, v = self.model.predict(y, kern=(np.sum(self._kerns).copy() +
                                               self._kernb.copy()))
            return -(s + v).flatten()

        return minimize(z, init, bounds=self.n_dims * [(0., 1.)],
                        options={'maxiter': self.max_iter, 'disp': 0}).x

    def _get_data(self, history, searchspace):
        X = []
        Y = []
        V = []
        ignore = []
        for param_dict, scores, status in history:
            # transform points into the GP domain. This involves bringing
            # int and enum variables to floating point, etc.
            if status == 'FAILED':
                # not sure how to deal with these yet
                continue
            point = searchspace.point_to_gp(param_dict)
            if status == 'SUCCEEDED':
                X.append(point)
                Y.append(np.mean(scores))
                V.append(np.var(scores))
            elif status == 'PENDING':
                ignore.append(point)
            else:
                raise RuntimeError('unrecognized status: %s' % status)
        return (np.array(X).reshape(-1, self.n_dims),
                np.array(Y).reshape(-1, 1),
                np.array(V).reshape(-1, 1),
                np.array(ignore).reshape(-1, self.n_dims))

    def _from_gp(self, result, searchspace):
        # Note that GP only deals with float-valued variables, so we have
        # a transform step on either side, where int and enum valued variables
        # are transformed before calling gp, and then the result suggested by
        # GP needs to be reverse-transformed.
        out = {}
        for gpvalue, var in zip(result, searchspace):
            out[var.name] = var.point_from_gp(float(gpvalue))
        return out

    def _is_within(self, point, X, tol=1E-2):
        # True if `point` lies within `tol` (Euclidean) of any row of X.
        return bool(np.any(np.sqrt(((point - X) ** 2).sum(axis=1)) <= tol))

    def suggest(self, history, searchspace, max_tries=5):
        if not GPRegression:
            raise ImportError('No module named GPy')
        if not minimize:
            raise ImportError('No module named SciPy')
        if len(history) < self.seeds:
            return RandomSearch().suggest(history, searchspace)
        self.n_dims = searchspace.n_dims

        X, Y, V, ignore = self._get_data(history, searchspace)
        self._create_kernel(V)
        self._fit_model(X, Y)

        suggestion = self._optimize()
        if suggestion in ignore or self._is_within(suggestion, X):
            return RandomSearch().suggest(history, searchspace)
        return self._from_gp(suggestion, searchspace)
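# Hedged example of the history format suggest()/_get_data consume: each entry
# is a (param_dict, scores, status) triple with status in {'SUCCEEDED',
# 'PENDING', 'FAILED'}; scores is an iterable of cross-validation scores that
# gets reduced with np.mean / np.var. Parameter names and values are made up.
history = [
    ({'alpha': 0.1}, [0.81, 0.79, 0.83], 'SUCCEEDED'),
    ({'alpha': 0.5}, [0.62, 0.60, 0.64], 'SUCCEEDED'),
    ({'alpha': 0.9}, None, 'PENDING'),   # pending points are only 'ignored'
]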
def gp_on_fold(feature_sets, train, test, y, y_all, learn_options):
    # `WeightedDegree` is the project's custom string kernel, assumed to be
    # defined elsewhere in this module; the remaining kernels follow GPy.
    sequences = np.array([str(x) for x in
                          y_all.index.get_level_values(0).tolist()])
    kern = WeightedDegree(1, sequences, d=learn_options["kernel degree"],
                          active_dims=[0])
    X = np.arange(len(train))[:, None]

    current_dim = 1
    if "gc_count" in feature_sets:
        kern += RBF(1, active_dims=[current_dim], name="GC_rbf")
        X = np.concatenate((X, feature_sets["gc_count"].values), axis=1)
        current_dim += 1
        if X.shape[1] != current_dim:
            raise AssertionError("incorrect number of columns")

    if "drug" in feature_sets:
        Q = feature_sets["drug"].values.shape[1]
        kern += Linear(Q, active_dims=range(current_dim, current_dim + Q),
                       name="drug_lin")
        X = np.concatenate((X, feature_sets["drug"].values), axis=1)
        current_dim += Q
        if X.shape[1] != current_dim:
            raise AssertionError("incorrect number of columns")

    if "gene effect" in feature_sets:
        Q = feature_sets["gene effect"].values.shape[1]
        kern += Linear(Q, active_dims=range(current_dim, current_dim + Q),
                       name="gene_lin")
        X = np.concatenate((X, feature_sets["gene effect"].values), axis=1)
        current_dim += Q
        if X.shape[1] != current_dim:
            raise AssertionError("incorrect number of columns")

    if "Percent Peptide" in feature_sets:
        Q = feature_sets["Percent Peptide"].values.shape[1]
        kern += RBF(Q, active_dims=range(current_dim, current_dim + Q),
                    name="percent_pept")
        X = np.concatenate((X, feature_sets["Percent Peptide"].values), axis=1)
        current_dim += Q
        if X.shape[1] != current_dim:
            raise AssertionError("incorrect number of columns")

    if "Nucleotide cut position" in feature_sets:
        Q = feature_sets["Nucleotide cut position"].values.shape[1]
        kern += RBF(Q, active_dims=range(current_dim, current_dim + Q),
                    name="nucleo_cut")
        X = np.concatenate((X, feature_sets["Nucleotide cut position"].values),
                           axis=1)
        current_dim += Q
        if X.shape[1] != current_dim:
            raise AssertionError("incorrect number of columns")

    if "Strand effect" in feature_sets:
        Q = feature_sets["Strand effect"].values.shape[1]
        kern += Linear(Q, active_dims=range(current_dim, current_dim + Q),
                       name="strand")
        X = np.concatenate((X, feature_sets["Strand effect"].values), axis=1)
        current_dim += Q
        if X.shape[1] != current_dim:
            raise AssertionError("incorrect number of columns")

    if "NGGX" in feature_sets:
        Q = feature_sets["NGGX"].values.shape[1]
        kern += Linear(Q, active_dims=range(current_dim, current_dim + Q),
                       name="NGGX")
        X = np.concatenate((X, feature_sets["NGGX"].values), axis=1)
        current_dim += Q
        if X.shape[1] != current_dim:
            raise AssertionError("incorrect number of columns")

    if "TM" in feature_sets:
        Q = feature_sets["TM"].values.shape[1]
        kern += RBF(Q, ARD=True,
                    active_dims=range(current_dim, current_dim + Q), name="TM")
        X = np.concatenate((X, feature_sets["TM"].values), axis=1)
        current_dim += Q
        if X.shape[1] != current_dim:
            raise AssertionError("incorrect number of columns")

    if "gene features" in feature_sets:
        Q = feature_sets["gene features"].values.shape[1]
        kern += Linear(Q, ARD=True,
                       active_dims=range(current_dim, current_dim + Q),
                       name="genefeat")
        X = np.concatenate((X, feature_sets["gene features"].values), axis=1)
        current_dim += Q
        if X.shape[1] != current_dim:
            raise AssertionError("incorrect number of columns")

    kern += Bias(X.shape[1])

    if learn_options["warpedGP"]:
        m = WarpedGP(X[train], y[train], kernel=kern)
    else:
        m = GPRegression(X[train], y[train], kernel=kern)
    m.optimize_restarts(3)
    y_pred, _ = m.predict(X[test])
    # TODO add offset such that low scores are around 0 (not -4 or so)
    return y_pred, m[:]
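# Hedged sketch of the active_dims pattern gp_on_fold relies on: each feature
# block gets its own kernel restricted to its columns of the stacked design
# matrix, and the blocks are summed. Feature values are random placeholders;
# kernel and model names follow the GPy API.
import numpy as np
from GPy.kern import RBF, Linear, Bias
from GPy.models import GPRegression

n = 50
gc = np.random.rand(n, 1)                  # a scalar feature, like gc_count
drug = np.random.rand(n, 3)                # a 3-column block, like drug
X = np.concatenate((gc, drug), axis=1)
y = np.random.rand(n, 1)

kern = RBF(1, active_dims=[0], name="gc_rbf")
kern += Linear(3, active_dims=[1, 2, 3], name="drug_lin")
kern += Bias(X.shape[1])

m = GPRegression(X, y, kernel=kern)
m.optimize_restarts(1)
y_pred, y_var = m.predict(X)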