def fit(
    self,
    X,
    Y,
    sampler="variational",
    tune=500,
    draws=500,
    vi_params=None,
    **kwargs,
):
    """
    Fit a generalized nested logit model on the provided set of queries X and
    choices Y of those objects. The provided queries and corresponding
    preferences are of a fixed size (numpy arrays). For learning this network
    the categorical cross entropy loss function for each object
    :math:`x_i \\in Q` is defined as:

    .. math::

        C_{i} = -y(i)\\log(P_i) \\enspace,

    where :math:`y` is ground-truth discrete choice vector of the objects in
    the given query set :math:`Q`. The value :math:`y(i) = 1` if object
    :math:`x_i` is chosen else :math:`y(i) = 0`.

    Parameters
    ----------
    X : numpy array (n_instances, n_objects, n_features)
        Feature vectors of the objects
    Y : numpy array (n_instances, n_objects)
        Choices for given objects in the query
    sampler : {'variational', 'metropolis', 'nuts'}, string
        The sampler used to estimate the posterior mean and mass matrix
        from the trace

        * **variational** : Run inference methods to estimate posterior
          mean and diagonal mass matrix
        * **metropolis** : Use the MAP as starting point and
          Metropolis-Hastings sampler
        * **nuts** : Use the No-U-Turn sampler
    vi_params : dict, optional
        The parameters for the **variational** inference method. When
        omitted, ``{"n": 20000, "method": "advi", "callbacks":
        [CheckParametersConvergence()]}`` is created freshly for this call.
    draws : int
        The number of samples to draw. Defaults to 500. The number of tuned
        samples are discarded by default
    tune : int
        Number of iterations to tune, defaults to 500. Ignored when using
        'SMC'. Samplers adjust the step sizes, scalings or similar during
        tuning. Tuning samples will be drawn in addition to the number
        specified in the `draws` argument, and will be discarded unless
        `discard_tuned_samples` is set to False.
    **kwargs :
        Keyword arguments for the fit function of :meth:`pymc3.fit` or
        :meth:`pymc3.sample`

    Returns
    -------
    self
        The fitted estimator, to allow call chaining.
    """
    if vi_params is None:
        # Build the default per call. A dict literal in the signature would
        # be a single object shared by every call (and every instance), so
        # any in-place change made while fitting would leak into later fits.
        vi_params = {
            "n": 20000,
            "method": "advi",
            "callbacks": [CheckParametersConvergence()],
        }
    self._pre_fit()
    _n_instances, self.n_objects_fit_, self.n_object_features_fit_ = X.shape
    if self.n_nests is None:
        # TODO this looks like a bug to me, but it was already done this way
        # before (moved out of __init__). The `n_objects` summand probably
        # should be removed.
        self.n_nests = self.n_objects_fit_ + int(self.n_objects_fit_ / 2)
    self.construct_model(X, Y)
    fit_pymc3_model(self, sampler, draws, tune, vi_params, **kwargs)
    return self
def fit_advi(self, n=3, method='advi', n_type='restart'):
    r"""Approximate the posterior with ADVI, once per initialisation.

    ADVI maximises the likelihood of the data while minimising the
    KL-divergence of the posterior to the prior.

    :param n: number of independent initialisations
    :param method: kept for a potential future use of SVGD or MCMC; the
        training call below always passes ``method='advi'``.
    :param n_type: type of repeated initialisation:
        'restart' to pick different initial value,
        'cv' for molecular cross-validation - splits counts into n datasets,
        for now, only n=2 is implemented
        'bootstrap' for fitting the model to multiple downsampled datasets.
        Run `mod.bootstrap_data()` to generate variants of data
    :return: nothing is returned; ``self.mean_field`` is filled with one
        entry per initialisation, keyed ``'init_1'``, ``'init_2'``, ...
    :raises ValueError: if ``n_type`` is not one of the three accepted values
    """
    accepted_types = ['restart', 'cv', 'bootstrap']
    if not np.isin(n_type, accepted_types):
        raise ValueError(
            "n_type should be one of ['restart', 'cv', 'bootstrap']")

    # Start from empty result containers on every call.
    self.mean_field = {}
    self.samples = {}
    self.node_samples = {}
    self.n_type = n_type

    # Prepare the alternative datasets required by the chosen strategy.
    if np.isin(n_type, ['bootstrap']):
        if self.X_data_sample is None:
            self.bootstrap_data(n=n)
    elif np.isin(n_type, ['cv']):
        # cv data added to self.X_data_sample
        self.generate_cv_data(n=n)

    # `n_type` does not change below, so decide once whether the data
    # tensor has to be swapped for a resampled variant.
    swap_data = np.isin(n_type, ['cv', 'bootstrap'])
    init_names = [f'init_{idx + 1}' for idx in np.arange(n)]

    with self.model:
        for i, name in enumerate(init_names):
            more_replacements = {}
            if swap_data:
                # molecular cross-validation / bootstrap: train on the
                # i-th alternative dataset instead of self.x_data
                resampled = self.X_data_sample[i].astype(self.data_type)
                more_replacements = {self.x_data: resampled}

            # train the model
            approximation = pm.fit(
                self.n_iter,
                method='advi',
                callbacks=[CheckParametersConvergence()],
                obj_optimizer=pm.adam(learning_rate=self.learning_rate),
                total_grad_norm_constraint=self.total_grad_norm_constraint,
                more_replacements=more_replacements)
            self.mean_field[name] = approximation

            # plot training history (log10 of the loss from step 15000 on)
            if self.verbose:
                history = self.mean_field[name].hist[15000:]
                print(plt.plot(np.log10(history)))
def test_choice_function_fixed(trivial_choice_problem, name):
    """Fit one choice learner on the trivial problem and check its metrics.

    The learner registered under ``name`` is built from its stored
    parameters, fitted, and its predictions are compared against the
    expected metric values. Afterwards the tunable parameters are set and
    verified.
    """
    # Seed everything before any model is created.
    tf.set_random_seed(0)
    os.environ["KERAS_BACKEND"] = "tensorflow"
    np.random.seed(123)

    x, y = trivial_choice_problem
    spec = choice_functions[name]
    choice_function = spec[0]
    params, accuracies = spec[1], spec[2]
    # The learner needs the problem dimensions; they are written into the
    # registered parameter dict itself.
    params["n_objects"], params["n_object_features"] = tuple(x.shape[1:])
    learner = choice_function(**params)

    # Choose the fit arguments that match the learner family.
    if name == GLM_CHOICE:
        fit_kwargs = {
            "vi_params": {
                "n": 100,
                "method": "advi",
                "callbacks": [CheckParametersConvergence()],
            }
        }
    elif "linear" in name:
        fit_kwargs = {"epochs": 10, "validation_split": 0, "verbose": False}
    else:
        fit_kwargs = {"epochs": 100, "validation_split": 0, "verbose": False}
    learner.fit(x, y, **fit_kwargs)

    s_pred = learner.predict_scores(x)
    y_pred = learner.predict_for_scores(s_pred)
    y_pred_2 = learner.predict(x)
    rtol, atol = 1e-2, 5e-2

    # Both prediction paths have to agree with each other.
    disagreement = subset_01_loss(y_pred, y_pred_2)
    assert np.isclose(0.0, disagreement, rtol=rtol, atol=atol, equal_nan=False)

    for key, value in accuracies.items():
        metric = choice_metrics[key]
        # Some metrics score the discrete predictions, the rest the scores.
        prediction = y_pred if metric in metrics_on_predictions else s_pred
        pred_loss = metric(y, prediction)
        assert np.isclose(value, pred_loss, rtol=rtol, atol=atol, equal_nan=False)

    params = {
        "n_hidden": 20,
        "n_units": 20,
        "n_hidden_set_units": 2,
        "n_hidden_set_layers": 10,
        "n_hidden_joint_units": 2,
        "n_hidden_joint_layers": 10,
        "reg_strength": 1e-3,
        "learning_rate": 1e-1,
        "batch_size": 32,
        "alpha": 0.5,
        "l1_ratio": 0.7,
        "tol": 1e-2,
        "C": 10,
        "n_mixtures": 10,
        "n_nests": 5,
        "regularization": "l2",
    }
    learner.set_tunable_parameters(**params)
    check_params_tunable(learner, params, rtol, atol)
def fit_pymc3_model(self, sampler, draws, tune, vi_params, **kwargs):
    """Run inference on ``self.model`` and store the resulting trace.

    Parameters
    ----------
    self : object
        Learner exposing ``model`` (used as a context manager). The results
        are written to ``self.trace_`` and, for the variational sampler,
        ``self.trace_vi_``.
    sampler : str
        ``"variational"`` runs a short ``pm.sample`` to obtain a start point,
        then ``pm.fit(**vi_params)`` and samples ``draws`` points from the
        approximation. ``"metropolis"`` starts a Metropolis sampler at the
        MAP estimate. Any other value uses NUTS.
    draws : int
        Number of samples to draw.
    tune : int
        Number of tuning iterations (not used by the variational branch).
    vi_params : dict
        Keyword arguments for ``pm.fit``. The dict is updated in place: the
        ``"start"`` key is set and every ``CheckParametersConvergence``
        entry of ``"callbacks"`` is replaced by a copy using
        ``diff="absolute"``.
    **kwargs
        Forwarded to ``pm.sample`` by the metropolis and NUTS branches.
    """
    callbacks = vi_params.get("callbacks", [])
    for i, c in enumerate(callbacks):
        if isinstance(c, CheckParametersConvergence):
            # Work on a copy of the callback state. Popping from
            # ``c.__dict__`` directly would strip attributes from the
            # object the caller handed in.
            params = dict(vars(c))
            params.pop("_diff")
            params.pop("prev")
            params.pop("ord")
            # Whatever attributes remain are passed back as constructor
            # keyword arguments, with the difference mode forced.
            params["diff"] = "absolute"
            callbacks[i] = CheckParametersConvergence(**params)
    if sampler == "variational":
        # Reset first so the fallback check below never reads a missing
        # attribute or a stale trace left over from an earlier fit.
        self.trace_ = None
        with self.model:
            try:
                self.trace_ = pm.sample(chains=2, cores=8, tune=5, draws=5)
                vi_params["start"] = self.trace_[-1]
                self.trace_vi_ = pm.fit(**vi_params)
                self.trace_ = self.trace_vi_.sample(draws=draws)
            except Exception as e:
                # Best effort: log the failure and let the fallback decide.
                message = getattr(e, "message", e)
                logger.error(message)
                self.trace_vi_ = None
        if self.trace_vi_ is None and self.trace_ is None:
            with self.model:
                # NOTE(review): the message mentions Metropolis and `draws`,
                # but the call below uses NUTS with 20 draws.
                logger.info(
                    "Error in vi ADVI sampler using Metropolis sampler with draws {}"
                    .format(draws))
                # Store the fallback under the same attribute as every
                # other branch (it used to be written to ``self.trace``).
                self.trace_ = pm.sample(chains=1,
                                        cores=4,
                                        tune=20,
                                        draws=20,
                                        step=pm.NUTS())
    elif sampler == "metropolis":
        with self.model:
            start = pm.find_MAP()
            self.trace_ = pm.sample(
                chains=2,
                cores=8,
                tune=tune,
                draws=draws,
                **kwargs,
                step=pm.Metropolis(),
                start=start,
            )
    else:
        with self.model:
            self.trace_ = pm.sample(chains=2,
                                    cores=8,
                                    tune=tune,
                                    draws=draws,
                                    **kwargs,
                                    step=pm.NUTS())
def inference_with_model(model):
    """Run ADVI on ``model`` and draw samples from the fitted approximation.

    The iteration count is read from the module-level ``vi_params["n"]`` and
    the number of draws from ``sampler_params["draws"]``.

    Returns
    -------
    tuple
        ``(advi, vi_trace, mean_field, tracker)``: the ADVI inference
        object, the sampled trace, the fitted approximation and the tracker
        callback that recorded the approximation's mean and std.
    """
    with model:
        advi = pm.ADVI()
        # Record the approximation's mean and std while fitting.
        tracker = pm.callbacks.Tracker(
            mean=advi.approx.mean.eval,
            std=advi.approx.std.eval,
        )
        fit_callbacks = [CheckParametersConvergence(), tracker]
        mean_field = advi.fit(n=vi_params["n"], callbacks=fit_callbacks)
        n_draws = sampler_params["draws"]
        vi_trace = mean_field.sample(draws=n_draws)
    return advi, vi_trace, mean_field, tracker
def fit_pymc3_model(self, sampler, draws, tune, vi_params, **kwargs):
    """Run inference on ``self.model`` and store the resulting trace.

    Parameters
    ----------
    self : object
        Learner exposing ``model`` (used as a context manager) and
        ``logger``. The results are written to ``self.trace`` and, for the
        variational sampler, ``self.trace_vi``.
    sampler : str
        ``'variational'`` runs a short ``pm.sample`` to obtain a start point,
        then ``pm.fit(**vi_params)`` and samples ``draws`` points from the
        approximation. ``'metropolis'`` starts a Metropolis sampler at the
        MAP estimate. Any other value uses NUTS.
    draws : int
        Number of samples to draw.
    tune : int
        Number of tuning iterations (not used by the variational branch).
    vi_params : dict
        Keyword arguments for ``pm.fit``. The dict is updated in place: the
        ``'start'`` key is set and every ``CheckParametersConvergence``
        entry of ``'callbacks'`` is replaced by a copy using
        ``diff='absolute'``.
    **kwargs
        Forwarded to ``pm.sample`` by the metropolis and NUTS branches.
    """
    callbacks = vi_params.get('callbacks', [])
    for i, c in enumerate(callbacks):
        if isinstance(c, CheckParametersConvergence):
            # Work on a copy of the callback state. Popping from
            # ``c.__dict__`` directly would strip attributes from the
            # object the caller handed in.
            params = dict(vars(c))
            params.pop('_diff')
            params.pop('prev')
            params.pop('ord')
            # Whatever attributes remain are passed back as constructor
            # keyword arguments, with the difference mode forced.
            params['diff'] = 'absolute'
            callbacks[i] = CheckParametersConvergence(**params)
    if sampler == 'variational':
        # Reset first so the fallback check below never reads a missing
        # attribute or a stale trace left over from an earlier fit.
        self.trace = None
        with self.model:
            try:
                self.trace = pm.sample(chains=2, cores=8, tune=5, draws=5)
                vi_params['start'] = self.trace[-1]
                self.trace_vi = pm.fit(**vi_params)
                self.trace = self.trace_vi.sample(draws=draws)
            except Exception as e:
                # Best effort: log the failure and let the fallback decide.
                message = getattr(e, 'message', e)
                self.logger.error(message)
                self.trace_vi = None
        if self.trace_vi is None and self.trace is None:
            with self.model:
                # NOTE(review): the message mentions Metropolis and `draws`,
                # but the call below uses NUTS with 20 draws.
                self.logger.info(
                    "Error in vi ADVI sampler using Metropolis sampler with draws {}"
                    .format(draws))
                self.trace = pm.sample(chains=1,
                                       cores=4,
                                       tune=20,
                                       draws=20,
                                       step=pm.NUTS())
    elif sampler == 'metropolis':
        with self.model:
            start = pm.find_MAP()
            self.trace = pm.sample(chains=2,
                                   cores=8,
                                   tune=tune,
                                   draws=draws,
                                   **kwargs,
                                   step=pm.Metropolis(),
                                   start=start)
    else:
        with self.model:
            self.trace = pm.sample(chains=2,
                                   cores=8,
                                   tune=tune,
                                   draws=draws,
                                   **kwargs,
                                   step=pm.NUTS())
def FitMyModel(Y, train, predictor):
    """Fit a latent GP plus a Bernoulli model with ADVI and sample ``f_star``.

    Parameters
    ----------
    Y : object exposing ``.values``
        Observed outcomes used for the Bernoulli likelihood ``yy``.
    train : table-like
        Training data. The columns ``Longitude``, ``Latitude``,
        ``DistanceToRoadMex_mean``, ``WorldPopLatam2010_mean`` and ``vegid``
        feed the GP prior; ``MaxTemperature_mean`` feeds the Bernoulli logit.
    predictor : mapping
        ``predictor['clean']`` must provide the columns ``Longitude``,
        ``Latitude``, ``DistanceToRoadMex``, ``WorldPopLatam2010`` and
        ``vegid`` at which the GP is conditioned.

    Returns
    -------
    The result of ``pm.sample_ppc`` for ``f_star`` (100 samples).
    """
    # NOTE(review): the source layout was flattened; the model context below
    # is assumed to wrap the whole body - confirm against the original file.
    with pm.Model() as model:
        ## [R | Y]
        tau = pm.HalfNormal('tau', sd=10)
        sigma = pm.HalfNormal('sigma', sd=10)
        phi = pm.Uniform('phi', 0, 15)
        Tau = pm.gp.cov.Constant(tau)
        # Matern 3/2 kernel on the first two input columns, scaled by sigma,
        # plus a constant covariance term.
        cov = (sigma * pm.gp.cov.Matern32(2, phi, active_dims=[0, 1])) + Tau
        ## Parameters for linear predictor
        #b0 = pm.Normal('b0',mu=0,sd=10)
        b = pm.Normal('b', mu=0, sd=10, shape=3)
        # NOTE(review): `mf` is built but never handed to the GP below.
        mf = pm.gp.mean.Linear(coeffs=[b])
        ## The latent function
        gp = pm.gp.Latent(cov_func=cov)
        f = gp.prior("latent_field",
                     X=train[[
                         'Longitude', 'Latitude', 'DistanceToRoadMex_mean',
                         'WorldPopLatam2010_mean', 'vegid'
                     ]].values,
                     reparameterize=False)
        ## Other model M2
        beta_y = pm.Normal("betay", mu=0, sd=10, shape=2)
        theta = beta_y[0] + beta_y[1] * train.MaxTemperature_mean.values
        # NOTE(review): with `y_obs` commented out, the latent field `f` is
        # not linked to any observed variable in the visible code.
        yy = pm.Bernoulli("yy", logit_p=theta, observed=Y.values)
        #y_obs = pm.Bernoulli('y_obs',logit_p=(f*yy),observed=Y.values)
        # `trace` first holds the fitted approximation, then its samples.
        trace = pm.fit(method='advi',
                       callbacks=[CheckParametersConvergence()],
                       n=15000)
        #trace = pm.sample(10)
        trace = trace.sample(draws=5000)
        f_star = gp.conditional(
            "f_star", predictor['clean'][[
                'Longitude', 'Latitude', 'DistanceToRoadMex',
                'WorldPopLatam2010', 'vegid'
            ]].values)
        pred_samples = pm.sample_ppc(trace, vars=[f_star], samples=100)
        return pred_samples
def _fit(self,
         X,
         Y,
         sampler='variational',
         tune=500,
         draws=500,
         vi_params=None,
         **kwargs):
    """Construct the model for ``(X, Y)`` and run the requested sampler.

    Parameters
    ----------
    X, Y :
        Training data, passed unchanged to ``self.construct_model``.
    sampler : str
        Name of the sampler, passed on to ``fit_pymc3_model``.
    tune : int
        Number of tuning iterations, passed on to ``fit_pymc3_model``.
    draws : int
        Number of samples to draw, passed on to ``fit_pymc3_model``.
    vi_params : dict, optional
        Parameters for variational inference. When omitted,
        ``{"n": 20000, "method": "advi", "callbacks":
        [CheckParametersConvergence()]}`` is created freshly for this call.
    **kwargs
        Extra keyword arguments, passed on to ``fit_pymc3_model``.
    """
    if vi_params is None:
        # Build the default per call. A dict literal in the signature would
        # be a single object shared by every call, so any in-place change
        # made while fitting would leak into later fits.
        vi_params = {
            "n": 20000,
            "method": "advi",
            "callbacks": [CheckParametersConvergence()]
        }
    self.construct_model(X, Y)
    fit_pymc3_model(self, sampler, draws, tune, vi_params, **kwargs)
def test_choice_function_fixed(trivial_choice_problem, name):
    """Fit one choice learner on the trivial problem and check its metrics."""
    np.random.seed(123)
    # Pytorch does not guarantee full reproducibility in different settings
    # [1]. If that ever makes this suite flaky, raise the tolerance: these
    # are only "sanity checks" on small data sets and the exact values do
    # not mean much here.
    # [1] https://pytorch.org/docs/stable/notes/randomness.html
    torch.manual_seed(123)
    # Trade off performance for better reproducibility.
    torch.use_deterministic_algorithms(True)

    x, y = trivial_choice_problem
    spec = choice_functions[name]
    choice_function = spec[0]
    params, accuracies = spec[1], spec[2]
    learner = choice_function(**params)

    # Only the GLM learner takes variational-inference settings.
    fit_kwargs = {}
    if name == GLM_CHOICE:
        fit_kwargs = {
            "vi_params": {
                "n": 100,
                "method": "advi",
                "callbacks": [CheckParametersConvergence()],
            }
        }
    learner.fit(x, y, **fit_kwargs)

    s_pred = learner.predict_scores(x)
    y_pred = learner.predict_for_scores(s_pred)
    y_pred_2 = learner.predict(x)
    rtol, atol = 1e-2, 5e-2

    # Both prediction paths have to agree with each other.
    disagreement = subset_01_loss(y_pred, y_pred_2)
    assert np.isclose(0.0, disagreement, rtol=rtol, atol=atol, equal_nan=False)

    for key, value in accuracies.items():
        metric = choice_metrics[key]
        # Some metrics score the discrete predictions, the rest the scores.
        prediction = y_pred if metric in metrics_on_predictions else s_pred
        pred_loss = metric(y, prediction)
        assert np.isclose(value, pred_loss, rtol=rtol, atol=atol, equal_nan=False)
def _fit(
    self,
    X,
    Y,
    sampler="variational",
    tune=500,
    draws=500,
    vi_params=None,
    **kwargs,
):
    """Record the data dimensions, construct the model and run the sampler.

    Parameters
    ----------
    X : array with a three-element ``shape``
        Training inputs. The second and third shape entries are stored as
        ``self.n_objects_fit_`` and ``self.n_object_features_fit_``.
    Y :
        Training targets, passed unchanged to ``self.construct_model``.
    sampler : str
        Name of the sampler, passed on to ``fit_pymc3_model``.
    tune : int
        Number of tuning iterations, passed on to ``fit_pymc3_model``.
    draws : int
        Number of samples to draw, passed on to ``fit_pymc3_model``.
    vi_params : dict, optional
        Parameters for variational inference. When omitted,
        ``{"n": 20000, "method": "advi", "callbacks":
        [CheckParametersConvergence()]}`` is created freshly for this call.
    **kwargs
        Extra keyword arguments, passed on to ``fit_pymc3_model``.
    """
    if vi_params is None:
        # Build the default per call. A dict literal in the signature would
        # be a single object shared by every call, so any in-place change
        # made while fitting would leak into later fits.
        vi_params = {
            "n": 20000,
            "method": "advi",
            "callbacks": [CheckParametersConvergence()],
        }
    _n_instances, self.n_objects_fit_, self.n_object_features_fit_ = X.shape
    self.construct_model(X, Y)
    fit_pymc3_model(self, sampler, draws, tune, vi_params, **kwargs)
def test_discrete_choice_function_fixed(trivial_discrete_choice_problem, name):
    """Fit one discrete choice learner on the trivial problem and check it."""
    np.random.seed(123)
    # There are some caveats with pytorch reproducibility. See the comment on
    # the corresponding line of `test_choice_functions.py` for details.
    torch.manual_seed(123)
    torch.use_deterministic_algorithms(True)

    x, y = trivial_discrete_choice_problem
    spec = discrete_choice_functions[name]
    choice_function = spec[0]
    params, accuracies = spec[1], spec[2]
    learner = choice_function(**params)

    # The listed model families take variational-inference settings.
    fit_kwargs = {}
    if name in [MNL, NLM, GEV, PCL, MLM]:
        fit_kwargs = {
            "vi_params": {
                "n": 100,
                "method": "advi",
                "callbacks": [CheckParametersConvergence()],
            }
        }
    learner.fit(x, y, **fit_kwargs)

    s_pred = learner.predict_scores(x)
    y_pred = learner.predict_for_scores(s_pred)
    y_pred_2 = learner.predict(x)
    rtol, atol = 1e-2, 5e-2

    # Both prediction paths have to agree with each other.
    disagreement = subset_01_loss(y_pred, y_pred_2)
    assert np.isclose(0.0, disagreement, rtol=rtol, atol=atol, equal_nan=False)

    for key, value in accuracies.items():
        metric = metrics[key]
        # Some metrics score the discrete predictions, the rest the scores.
        prediction = y_pred if metric in metrics_on_predictions else s_pred
        pred_loss = metric(y, prediction)
        assert np.isclose(value, pred_loss, rtol=rtol, atol=atol, equal_nan=False)
def test_choice_function_fixed(trivial_choice_problem, name):
    """Fit one choice learner on the trivial problem and check its metrics."""
    # Seed everything before any model is created.
    tf.set_random_seed(0)
    os.environ["KERAS_BACKEND"] = "tensorflow"
    np.random.seed(123)

    x, y = trivial_choice_problem
    spec = choice_functions[name]
    choice_function = spec[0]
    params, accuracies = spec[1], spec[2]
    learner = choice_function(**params)

    # Choose the fit arguments that match the learner family.
    if name == GLM_CHOICE:
        fit_kwargs = {
            "vi_params": {
                "n": 100,
                "method": "advi",
                "callbacks": [CheckParametersConvergence()],
            }
        }
    elif "linear" in name:
        fit_kwargs = {"epochs": 10, "validation_split": 0, "verbose": False}
    else:
        fit_kwargs = {"epochs": 100, "validation_split": 0, "verbose": False}
    learner.fit(x, y, **fit_kwargs)

    s_pred = learner.predict_scores(x)
    y_pred = learner.predict_for_scores(s_pred)
    y_pred_2 = learner.predict(x)
    rtol, atol = 1e-2, 5e-2

    # Both prediction paths have to agree with each other.
    disagreement = subset_01_loss(y_pred, y_pred_2)
    assert np.isclose(0.0, disagreement, rtol=rtol, atol=atol, equal_nan=False)

    for key, value in accuracies.items():
        metric = choice_metrics[key]
        # Some metrics score the discrete predictions, the rest the scores.
        prediction = y_pred if metric in metrics_on_predictions else s_pred
        pred_loss = metric(y, prediction)
        assert np.isclose(value, pred_loss, rtol=rtol, atol=atol, equal_nan=False)
def fit_advi_iterative(self, n=3, method='advi', n_type='restart',
                       n_iter=None, learning_rate=None,
                       reducing_lr=False, progressbar=True,
                       scale_cost_to_minibatch=True):
    """Approximate the posterior with one ``pm.ADVI()`` object per initialisation.

    Using the ADVI objects directly (rather than ``pm.fit``) keeps them in
    ``self.advi`` so that training can be continued later via the `refine`
    method. ADVI maximises the likelihood of the data while minimising the
    KL-divergence of the posterior to the prior (ELBO loss).

    Parameters
    ----------
    n :
        number of independent initialisations (Default value = 3)
    method :
        'advi'; kept for a potential future use of SVGD, MCMC or custom
        methods. It is not read by this function. (Default value = 'advi')
    n_type :
        type of repeated initialisation:

        * **'restart'** to pick different initial value,
        * **'cv'** for molecular cross-validation - splits counts into n
          datasets, for now, only n=2 is implemented
        * **'bootstrap'** for fitting the model to multiple downsampled
          datasets. Run `mod.bootstrap_data()` to generate variants of data
          (Default value = 'restart')
    n_iter :
        number of iterations, supersedes self.n_iter specified when creating
        model instance. (Default value = None)
    learning_rate :
        learning rate, supersedes self.learning_rate specified when creating
        model instance. (Default value = None)
    reducing_lr :
        boolean, use decaying learning rate? (Default value = False)
    progressbar :
        boolean, show progress bar? (Default value = True)
    scale_cost_to_minibatch :
        when using training in minibatches, scale cost function
        appropriately? See discussion
        https://discourse.pymc.io/t/effects-of-scale-cost-to-minibatch/1429
        to understand the effects. (Default value = True)

    Returns
    -------
    None
        ``self.mean_field`` receives the fitted approximation and
        ``self.advi`` the ADVI object of each initialisation, keyed
        ``'init_1'``, ``'init_2'``, ...
    """
    self.n_type = n_type
    self.scale_cost_to_minibatch = scale_cost_to_minibatch

    # Explicit arguments take precedence over the instance settings.
    n_iter = self.n_iter if n_iter is None else n_iter
    learning_rate = self.learning_rate if learning_rate is None else learning_rate

    ### Initialise optimiser ###
    if not reducing_lr:
        optimiser = pm.adam(learning_rate=learning_rate)
        callbacks = [CheckParametersConvergence()]
    else:
        # The learning rate lives in a shared variable so that the callback
        # below can lower it while training runs.
        shared_rate = theano.shared(np.array(learning_rate).astype(self.data_type))

        def reduce_rate(a, h, i):
            decayed = learning_rate / ((i / self.n_obs) + 1) ** .7
            shared_rate.set_value(np.array(decayed).astype(self.data_type))

        optimiser = pm.adam(learning_rate=shared_rate)
        callbacks = [reduce_rate, CheckParametersConvergence()]

    # Prepare the alternative datasets required by the chosen strategy.
    if np.isin(n_type, ['bootstrap']):
        if self.X_data_sample is None:
            self.bootstrap_data(n=n)
    elif np.isin(n_type, ['cv']):
        # cv data added to self.X_data_sample
        self.generate_cv_data()

    # `n_type` is fixed from here on, so decide once whether each
    # initialisation trains on its own resampled dataset.
    resampled = np.isin(n_type, ['cv', 'bootstrap'])
    init_names = [f'init_{idx + 1}' for idx in np.arange(n)]

    for i, name in enumerate(init_names):
        # NOTE(review): the whole iteration is kept inside the model
        # context; confirm against the original (unflattened) layout.
        with self.model:
            self.advi[name] = pm.ADVI()

            if self.minibatch_size is not None:
                # minibatch main data - expression matrix; for 'cv' and
                # 'bootstrap' the i-th alternative dataset is used
                full_data = self.X_data_sample[i] if resampled else self.X_data
                self.x_data_minibatch = pm.Minibatch(
                    full_data.astype(self.data_type),
                    batch_size=[self.minibatch_size, None],
                    random_seed=self.minibatch_seed[i])
                more_replacements = {self.x_data: self.x_data_minibatch}
                # any other data inputs are minibatched with the same seed
                if self.extra_data is not None:
                    for key, values in self.extra_data.items():
                        more_replacements[self.extra_data_tt[key]] = pm.Minibatch(
                            values.astype(self.data_type),
                            batch_size=[self.minibatch_size, None],
                            random_seed=self.minibatch_seed[i])
            elif resampled:
                # using all data of the i-th alternative dataset
                more_replacements = {
                    self.x_data: self.X_data_sample[i].astype(self.data_type)
                }
                # any other data inputs are added in full
                if self.extra_data is not None:
                    for key, values in self.extra_data.items():
                        more_replacements[self.extra_data_tt[key]] = \
                            values.astype(self.data_type)
            else:
                more_replacements = {}

            self.advi[name].scale_cost_to_minibatch = scale_cost_to_minibatch

            # train the model
            self.mean_field[name] = self.advi[name].fit(
                n_iter,
                callbacks=callbacks,
                obj_optimizer=optimiser,
                total_grad_norm_constraint=self.total_grad_norm_constraint,
                progressbar=progressbar,
                more_replacements=more_replacements)

            # plot training history (log10 of the loss from step 15000 on)
            if self.verbose:
                history = self.mean_field[name].hist[15000:]
                print(plt.plot(np.log10(history)))
def fit_advi_refine(self, n_iter=10000, learning_rate=None,
                    progressbar=True, reducing_lr=False):
    """Continue ADVI training of every object stored in ``self.advi``.

    Intended to be called after `.fit_advi_iterative()`.

    Parameters
    ----------
    n_iter :
        number of additional iterations (Default value = 10000)
    learning_rate :
        same as in `.fit_advi_iterative()` (Default value = None)
    progressbar :
        same as in `.fit_advi_iterative()` (Default value = True)
    reducing_lr :
        same as in `.fit_advi_iterative()` (Default value = False)

    Returns
    -------
    None
        ``self.mean_field`` is updated in place with the refined
        approximations and ``self.n_iter`` is increased by ``n_iter``.
    """
    # Keep track of the total number of iterations trained so far.
    self.n_iter = self.n_iter + n_iter

    learning_rate = self.learning_rate if learning_rate is None else learning_rate

    ### Initialise optimiser ###
    if not reducing_lr:
        optimiser = pm.adam(learning_rate=learning_rate)
        callbacks = [CheckParametersConvergence()]
    else:
        # The learning rate lives in a shared variable so that the callback
        # below can lower it while training runs.
        shared_rate = theano.shared(np.array(learning_rate).astype(self.data_type))

        def reduce_rate(a, h, i):
            decayed = learning_rate / ((i / self.n_obs) + 1) ** .7
            shared_rate.set_value(np.array(decayed).astype(self.data_type))

        optimiser = pm.adam(learning_rate=shared_rate)
        callbacks = [reduce_rate, CheckParametersConvergence()]

    for i, name in enumerate(self.advi.keys()):
        # 'cv' and 'bootstrap' runs train on the i-th alternative dataset
        resampled = np.isin(self.n_type, ['cv', 'bootstrap'])

        if self.minibatch_size is not None:
            # minibatch main data - expression matrix
            full_data = self.X_data_sample[i] if resampled else self.X_data
            self.x_data_minibatch = pm.Minibatch(
                full_data.astype(self.data_type),
                batch_size=[self.minibatch_size, None],
                random_seed=self.minibatch_seed[i])
            more_replacements = {self.x_data: self.x_data_minibatch}
            # any other data inputs are minibatched with the same seed
            if self.extra_data is not None:
                for key, values in self.extra_data.items():
                    more_replacements[self.extra_data_tt[key]] = pm.Minibatch(
                        values.astype(self.data_type),
                        batch_size=[self.minibatch_size, None],
                        random_seed=self.minibatch_seed[i])
        elif resampled:
            # using all data of the i-th alternative dataset
            more_replacements = {
                self.x_data: self.X_data_sample[i].astype(self.data_type)
            }
            # any other data inputs are added in full
            if self.extra_data is not None:
                for key, values in self.extra_data.items():
                    more_replacements[self.extra_data_tt[key]] = \
                        values.astype(self.data_type)
        else:
            more_replacements = {}

        with self.model:
            # train for more iterations & export trained model by
            # overwriting the initial mean field object
            self.mean_field[name] = self.advi[name].fit(
                n_iter,
                callbacks=callbacks,
                obj_optimizer=optimiser,
                total_grad_norm_constraint=self.total_grad_norm_constraint,
                progressbar=progressbar,
                more_replacements=more_replacements)

            # plot training history (log10 of the loss from step 15000 on)
            if self.verbose:
                history = self.mean_field[name].hist[15000:]
                print(plt.plot(np.log10(history)))
def fitbayesianmodel(bayesian_model,
                     ytrain,
                     method=1,
                     n_=3000,
                     MAP=True,
                     chains=1,
                     jobs=1,
                     star='rrlyr',
                     classifier='RL',
                     PCA=False):
    """Sample from ``bayesian_model`` with the sampler selected by ``method``.

    Parameters
    ----------
    bayesian_model :
        Model object usable as ``with bayesian_model as model``.
    ytrain :
        Not read by this function.
    method : int
        4 = Slice, 5 = HamiltonianMC, 6 = default ``pm.sample``,
        7 = Metropolis (also saves a trace plot), 8 = NUTS with a scaling
        estimated from warm-up runs. Other values are not handled here.
    n_ : int
        Number of samples requested from ``pm.sample``.
    MAP :
        Not read by this function.
    chains, jobs : int
        Printed, and passed to ``pm.sample`` as ``chain`` / ``njobs``.
    star, classifier, PCA :
        Only used to build the file name of the Metropolis trace plot.

    Returns
    -------
    tuple or None
        ``(trace, model, 0)`` for methods 4-8, ``None`` for any other
        ``method``.
    """
    print('chains: ', chains)
    print('jobs: ', jobs)
    # Third element of every returned tuple. Binding it once up front
    # guarantees it exists in each branch (the HamiltonianMC branch used to
    # return an unbound local) and avoids shadowing the builtin ``map``.
    map_estimate = 0
    if method == 4:
        print('------- Slice Sampling--------')
        with bayesian_model as model:
            step = pm.Slice()
            trace = pm.sample(n_, step=step, njobs=jobs)
        return trace, model, map_estimate
    if method == 5:
        print('------- HamiltonianMC--------')
        with bayesian_model as model:
            step = pm.HamiltonianMC()
            trace = pm.sample(n_,
                              chain=chains,
                              tune=2000,
                              njobs=jobs,
                              step=step,
                              init=None)
        return trace, model, map_estimate
    if method == 6:
        print('------- Default--------')
        with bayesian_model as model:
            trace = pm.sample(n_,
                              chain=chains,
                              njobs=jobs,
                              callbacks=[CheckParametersConvergence()])
        return trace, model, map_estimate
    if method == 7:
        print('------- Metropolis--------')
        with bayesian_model as model:
            step = pm.Metropolis()
            trace = pm.sample(n_,
                              step=step,
                              chain=chains,
                              njobs=jobs,
                              callbacks=[CheckParametersConvergence()],
                              tune=1000,
                              step_size=100)
            # Save the trace plot next to the script, then clear the figure.
            pm.traceplot(trace)
            name = star + '_' + classifier + '_PCA_' + str(PCA) + '2.png'
            plt.savefig(name)
            plt.clf()
        return trace, model, map_estimate
    if method == 8:
        print('------- NUTS--------')
        with bayesian_model as model:
            # Warm-up runs: estimate per-dimension standard deviations that
            # are used below to scale the final NUTS step.
            stds = np.ones(model.ndim)
            for _ in range(5):
                args = {'is_cov': True}
                trace = pm.sample(500,
                                  tune=1000,
                                  chains=1,
                                  init='advi+adapt_diag_grad',
                                  nuts_kwargs=args)
                samples = [model.dict_to_array(p) for p in trace]
                stds = np.array(samples).std(axis=0)
            for i in range(1):
                step = pm.NUTS(scaling=stds**2,
                               is_cov=True,
                               target_accept=0.8)
                # Start from a point of the warm-up trace so that the
                # ``start=start`` argument below is always bound.
                start = trace[-10 * i]
                trace_ = pm.sample(n_,
                                   cores=4,
                                   step=step,
                                   tune=1000,
                                   chain=chains,
                                   njobs=1,
                                   init='advi+adapt_diag_grad',
                                   start=start,
                                   callbacks=[CheckParametersConvergence()])
            trace = trace_
        return trace, model, map_estimate
    # Any other `method` value is not handled by this function.
    return None
btc[daypart, cool_temp_cluster_idx] * cooling_temp + bth[daypart, heat_temp_cluster_idx] * heating_temp # Model error: sigma = pm.Exponential("sigma", 1.0) y = pm.Normal("y", mu, sigma=sigma, observed=log_electricity[train_index], dims='obs_id') # Fitting without sampling with partial_pooling: approx = pm.fit(n=50000, method='fullrank_advi', callbacks=[CheckParametersConvergence(tolerance=0.01)]) partial_pooling_trace = approx.sample(1000) partial_pooling_idata = az.from_pymc3(partial_pooling_trace) # Sampling from the posterior setting test data to check the predictions on unseen data with partial_pooling: pm.set_data({ "profile_cluster_idx": clusters[test_index], "heat_temp_cluster_idx": heat_clusters[test_index], "cool_temp_cluster_idx": cool_clusters[test_index], "daypart": dayparts[test_index], "fs_sin_1": daypart_fs_sin_1[test_index], "fs_sin_2": daypart_fs_sin_2[test_index], "fs_sin_3": daypart_fs_sin_3[test_index],
# Print summaries and traceplots for the means, σ's and probabilities.

# Number of iterations for ADVI fit
num_iters: int = 50000

# Fit the model using ADVI
# Tried to fit using FullRankADVI as well; results were horrible
try:
    # Reuse a previously stored fit when the variable table has one.
    advi = vartbl['advi']
    print('Loaded ADVI fit for Gaussian Mixture Model.')
except KeyError:
    # Only a missing table entry triggers a new fit; any other error
    # (including Ctrl-C) is no longer swallowed by a bare ``except``.
    print('Running ADVI fit for Gaussian Mixture Model...')
    advi = pm.ADVI(model=model)
    advi.fit(n=num_iters,
             obj_optimizer=pm.adam(),
             callbacks=[CheckParametersConvergence()])
    vartbl['advi'] = advi
    save_vartbl(vartbl, fname)


def plot_elbo(elbo, plot_step, title):
    """Generate the ELBO plot.

    Parameters
    ----------
    elbo : sequence
        ELBO values, one per iteration.
    plot_step : int
        Only every ``plot_step``-th value is drawn.
    title : str
        Title of the axes.
    """
    fig, ax = plt.subplots(figsize=[12, 8])
    ax.set_title(title)
    ax.set_xlabel('Iteration')
    ax.set_ylabel('ELBO')
    n = len(elbo)
    # x positions match the iteration index of every plotted value
    plot_x = np.arange(0, n, plot_step)
    plot_y = elbo[::plot_step]
    ax.plot(plot_x, plot_y, color='b')
    ax.grid()
def fit(
        self,
        X,
        Y,
        sampler="variational",
        tune=500,
        draws=500,
        tune_size=0.1,
        thin_thresholds=1,
        vi_params=None,
        verbose=0,
        **kwargs):
    """
    Fit a generalized logit model on the provided set of queries X and
    choices Y of those objects. The provided queries and corresponding
    preferences are of a fixed size (numpy arrays). For learning this network
    the binary cross entropy loss function for each object :math:`x_i \\in Q`
    is defined as:

    .. math::

        C_{i} = -y(i)\\log(P_i) - (1 - y(i))\\log(1 - P_i) \\enspace,

    where :math:`y` is ground-truth choice vector of the objects in the
    given query set :math:`Q`. The value :math:`y(i) = 1` if object
    :math:`x_i` is chosen else :math:`y(i) = 0`.

    Parameters
    ----------
    X : numpy array (n_instances, n_objects, n_features)
        Feature vectors of the objects
    Y : numpy array (n_instances, n_objects)
        Choices for given objects in the query
    sampler : {'variational', 'metropolis', 'nuts'}, string
        The sampler used to estimate the posterior mean and mass matrix
        from the trace

        * **variational** : Run inference methods to estimate posterior
          mean and diagonal mass matrix
        * **metropolis** : Use the MAP as starting point and
          Metropolis-Hastings sampler
        * **nuts** : Use the No-U-Turn sampler
    vi_params : dict, optional
        The parameters for the **variational** inference method. When
        omitted, ``{"n": 20000, "method": "advi", "callbacks":
        [CheckParametersConvergence()]}`` is created freshly for this call.
    draws : int
        The number of samples to draw. Defaults to 500. The number of tuned
        samples are discarded by default
    tune : int
        Number of iterations to tune, defaults to 500. Ignored when using
        'SMC'. Samplers adjust the step sizes, scalings or similar during
        tuning. Tuning samples will be drawn in addition to the number
        specified in the `draws` argument, and will be discarded unless
        `discard_tuned_samples` is set to False.
    tune_size : float (range : [0,1])
        Percentage of instances to split off to tune the threshold for the
        choice function. With a value that is not positive no split is made
        and the threshold is fixed at 0.5.
    thin_thresholds : int
        The number of instances of scores to skip while tuning the threshold
    verbose : bool
        Print verbose information
    **kwargs :
        Keyword arguments for the fit function
    """
    if vi_params is None:
        # Build the default per call. A dict literal in the signature would
        # be a single object shared by every call, so any in-place change
        # made while fitting would leak into later fits.
        vi_params = {
            "n": 20000,
            "method": "advi",
            "callbacks": [CheckParametersConvergence()],
        }
    if tune_size > 0:
        X_train, X_val, Y_train, Y_val = train_test_split(
            X, Y, test_size=tune_size, random_state=self.random_state)
        try:
            # `tune` and `draws` are forwarded so that the values given by
            # the caller are honoured instead of the `_fit` defaults.
            self._fit(X_train,
                      Y_train,
                      sampler=sampler,
                      tune=tune,
                      draws=draws,
                      vi_params=vi_params,
                      **kwargs)
        finally:
            # The threshold is tuned even if fitting stopped early.
            self.logger.info(
                "Fitting utility function finished. Start tuning threshold."
            )
            self.threshold = self._tune_threshold(
                X_val,
                Y_val,
                thin_thresholds=thin_thresholds,
                verbose=verbose)
    else:
        # Without a validation split the same user-supplied settings are
        # used for fitting; the threshold falls back to 0.5.
        self._fit(X,
                  Y,
                  sampler=sampler,
                  tune=tune,
                  draws=draws,
                  vi_params=vi_params,
                  **kwargs)
        self.threshold = 0.5
# Names under which the six Fourier-series regressors are registered on every
# model, and the matching coefficient names, in the order they enter the
# linear predictor.
_FOURIER_FEATURES = (
    "fs_sin_1", "fs_sin_2", "fs_sin_3",
    "fs_cos_1", "fs_cos_2", "fs_cos_3",
)
_FOURIER_COEFFICIENTS = ("bs1", "bs2", "bs3", "bc1", "bc2", "bc3")
_TEMPERATURE_FEATURES = ("cooling_temp_lp", "heating_temp_lp")
_GROUP_INDEX_FEATURES = ("profile_cluster_idx", "daypart")
# Evaluation order within each fold; also the prefixes of the result columns.
_POOLING_SCHEMES = ("partial_pooling", "no_pooling", "complete_pooling")


def _slice_columns(columns, positions):
    """Return `columns` with every array restricted to the given row positions."""
    return {name: values[positions] for name, values in columns.items()}


def _build_pooling_model(scheme, coords, train, log_y_train):
    """Build the PyMC model for one pooling scheme on one training fold.

    Parameters
    ----------
    scheme : str
        One of ``"partial_pooling"``, ``"no_pooling"``, ``"complete_pooling"``.
    coords : dict
        Model coordinates (``obs_id``, ``profile_cluster``, ``daypart``,
        ``weekday``).
    train : dict of str -> numpy array
        Training-fold regressors keyed by their model data name.
    log_y_train : numpy array
        Log-transformed target of the training fold.

    Returns
    -------
    pm.Model

    Raises
    ------
    ValueError
        If `scheme` is not one of the supported names.
    """
    if scheme not in _POOLING_SCHEMES:
        raise ValueError(
            "Unknown pooling scheme {!r}; expected one of {}".format(
                scheme, _POOLING_SCHEMES
            )
        )
    grouped = scheme != "complete_pooling"

    with pm.Model(coords=coords) as model:
        if grouped:
            cluster_idx = pm.Data(
                "profile_cluster_idx", train["profile_cluster_idx"], dims="obs_id"
            )
            daypart = pm.Data("daypart", train["daypart"], dims="obs_id")
            # Registered on the model but not referenced by the linear
            # predictor below.
            pm.Data("weekday", train["weekday"], dims="obs_id")
        fourier = [
            pm.Data(name, train[name], dims="obs_id") for name in _FOURIER_FEATURES
        ]
        cooling = pm.Data("cooling_temp_lp", train["cooling_temp_lp"], dims="obs_id")
        heating = pm.Data("heating_temp_lp", train["heating_temp_lp"], dims="obs_id")

        if scheme == "partial_pooling":
            # Hyperpriors shared by the per-cluster intercepts and slopes.
            bf = pm.Normal("bf", mu=0.0, sigma=1.0)
            sigma_bf = pm.Exponential("sigma_bf", 1.0)
            a = pm.Normal("a", mu=0.0, sigma=1.0)
            sigma_a = pm.Exponential("sigma_a", 1.0)
            btclp = pm.Normal("btclp", mu=0.0, sigma=1.0, dims="daypart")
            bthlp = pm.Normal("bthlp", mu=0.0, sigma=1.0, dims="daypart")
            # Varying intercepts.
            intercept = pm.Normal(
                "a_cluster", mu=a, sigma=sigma_a, dims=("daypart", "profile_cluster")
            )
            # Varying slopes.
            slopes = [
                pm.Normal(name, mu=bf, sigma=sigma_bf, dims="profile_cluster")
                for name in _FOURIER_COEFFICIENTS
            ]
        elif scheme == "no_pooling":
            # Independent fixed priors for every group-level coefficient.
            intercept = pm.Normal(
                "a_cluster", mu=0.0, sigma=1.0, dims=("daypart", "profile_cluster")
            )
            btclp = pm.Normal("btclp", mu=0.0, sigma=1.0, dims="daypart")
            bthlp = pm.Normal("bthlp", mu=0.0, sigma=1.0, dims="daypart")
            slopes = [
                pm.Normal(name, mu=0.0, sigma=1.0, dims="profile_cluster")
                for name in _FOURIER_COEFFICIENTS
            ]
        else:
            # Complete pooling: one scalar coefficient per regressor.
            intercept = pm.Normal("a", mu=0.0, sigma=1.0)
            btclp = pm.Normal("btclp", mu=0.0, sigma=1.0)
            bthlp = pm.Normal("bthlp", mu=0.0, sigma=1.0)
            slopes = [
                pm.Normal(name, mu=0.0, sigma=1.0) for name in _FOURIER_COEFFICIENTS
            ]

        # Expected value of the log-target for every observation.
        if grouped:
            # Assumes daypart and cluster codes are valid 0-based positions
            # along their dims - TODO confirm against the preprocessing.
            mu = intercept[daypart, cluster_idx]
            for coefficient, feature in zip(slopes, fourier):
                mu = mu + coefficient[cluster_idx] * feature
            mu = mu + btclp[daypart] * cooling + bthlp[daypart] * heating
        else:
            mu = intercept
            for coefficient, feature in zip(slopes, fourier):
                mu = mu + coefficient * feature
            mu = mu + btclp * cooling + bthlp * heating

        # Model error and likelihood.
        sigma = pm.Exponential("sigma", 1.0)
        pm.Normal("y", mu, sigma=sigma, observed=log_y_train, dims="obs_id")

    return model


def _fit_and_score(model, test_data, actual, normalizer):
    """Fit `model`, predict the held-out fold and return (cvrmse, coverage).

    Parameters
    ----------
    model : pm.Model
        Model whose data containers currently hold the training fold.
    test_data : dict of str -> numpy array
        Held-out regressors keyed by model data name; swapped into the model
        before predicting.
    actual : numpy array
        Held-out target on the original (non-log) scale.
    normalizer : float
        Value the RMSE is divided by to obtain the CV(RMSE).

    Returns
    -------
    tuple of (float, float)
        CV(RMSE) and the percentage of held-out targets lying inside the
        highest-density interval of the posterior predictive.
    """
    with model:
        approx = pm.fit(
            n=50000,
            method="fullrank_advi",
            callbacks=[CheckParametersConvergence(tolerance=0.01)],
        )
        trace = approx.sample(1000)
        # Swap in the held-out regressors so the predictive draws are for
        # unseen data.
        pm.set_data(test_data)
        # keep_size=True keeps the leading (chain, draw) axes, which is the
        # layout passed to az.hdi below.
        predictive = pm.sample_posterior_predictive(trace, keep_size=True)

    draws = predictive["y"]
    # Collapse (chain, draw) into a single sample axis, average the draws,
    # then map the log-scale mean back to the original scale.
    predictions = np.exp(draws.reshape(-1, draws.shape[-1]).mean(axis=0))

    # No interval probability is passed, so the library default applies.
    hdi_bounds = az.hdi(predictive).to_array()
    lower = np.exp(hdi_bounds.sel(hdi="lower").values).flatten()
    upper = np.exp(hdi_bounds.sel(hdi="higher").values).flatten()

    rmse = sqrt(mean_squared_error(actual, predictions))
    cvrmse = rmse / normalizer
    inside = (lower <= actual) & (actual <= upper)
    coverage = inside.sum() * 100 / actual.size
    return cvrmse, coverage


def bayesian_model_comparison(df):
    """Compare partial, no and complete pooling models by 5-fold cross-validation.

    For every fold, three regressions of log electricity use on Fourier
    day-part terms and low-pass-filtered cooling/heating temperatures are
    fitted with full-rank ADVI and scored on the held-out rows.

    The input frame is only read, never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``total_electricity``, ``s`` (profile
        cluster) and ``weekday`` (both shifted down by one before use),
        ``daypart``, ``daypart_fs_sin_1..3``, ``daypart_fs_cos_1..3``,
        ``outdoor_temp_lp_c`` and ``outdoor_temp_lp_h``.

    Returns
    -------
    pandas.DataFrame
        A single row with the fold-averaged ``*_cvrmse`` and ``*_coverage``
        for ``partial_pooling``, ``no_pooling`` and ``complete_pooling``.
        The CV(RMSE) is normalised by the mean of ``total_electricity`` over
        the whole frame, not just the held-out fold.
    """
    # Everything is converted to plain arrays up front: the fold indices from
    # KFold are row positions, and arrays are always indexed by position
    # regardless of the frame's index labels.
    log_electricity = np.log(df["total_electricity"]).to_numpy()
    total_electricity = df["total_electricity"].to_numpy()
    mean_electricity = df["total_electricity"].mean()

    columns = {
        # Cluster and weekday codes are shifted so that they start from 0.
        "profile_cluster_idx": df["s"].to_numpy() - 1,
        "daypart": df["daypart"].to_numpy(),
        "weekday": df["weekday"].to_numpy() - 1,
        "fs_sin_1": df["daypart_fs_sin_1"].to_numpy(),
        "fs_sin_2": df["daypart_fs_sin_2"].to_numpy(),
        "fs_sin_3": df["daypart_fs_sin_3"].to_numpy(),
        "fs_cos_1": df["daypart_fs_cos_1"].to_numpy(),
        "fs_cos_2": df["daypart_fs_cos_2"].to_numpy(),
        "fs_cos_3": df["daypart_fs_cos_3"].to_numpy(),
        "cooling_temp_lp": df["outdoor_temp_lp_c"].to_numpy(),
        "heating_temp_lp": df["outdoor_temp_lp_h"].to_numpy(),
    }
    # Group-level coordinates are taken from the whole data set so that every
    # fold's model has the same group dimensions.
    group_coords = {
        "profile_cluster": pd.unique(columns["profile_cluster_idx"]),
        "daypart": pd.unique(columns["daypart"]),
        "weekday": pd.unique(columns["weekday"]),
    }

    regressors = _FOURIER_FEATURES + _TEMPERATURE_FEATURES
    cvrmse_by_scheme = {scheme: [] for scheme in _POOLING_SCHEMES}
    coverage_by_scheme = {scheme: [] for scheme in _POOLING_SCHEMES}

    for train_index, test_index in KFold(n_splits=5).split(df):
        train = _slice_columns(columns, train_index)
        test = _slice_columns(columns, test_index)
        coords = {"obs_id": np.arange(train_index.size)}
        coords.update(group_coords)

        for scheme in _POOLING_SCHEMES:
            model = _build_pooling_model(
                scheme, coords, train, log_electricity[train_index]
            )
            # The complete-pooling model has no group index containers.
            if scheme == "complete_pooling":
                swapped = regressors
            else:
                swapped = _GROUP_INDEX_FEATURES + regressors
            cvrmse, coverage = _fit_and_score(
                model,
                {name: test[name] for name in swapped},
                total_electricity[test_index],
                mean_electricity,
            )
            cvrmse_by_scheme[scheme].append(cvrmse)
            coverage_by_scheme[scheme].append(coverage)

    # Export results: all CV(RMSE) columns first, then all coverage columns.
    export_data = {}
    for scheme in _POOLING_SCHEMES:
        export_data[scheme + "_cvrmse"] = [np.mean(cvrmse_by_scheme[scheme])]
    for scheme in _POOLING_SCHEMES:
        export_data[scheme + "_coverage"] = [np.mean(coverage_by_scheme[scheme])]
    return pd.DataFrame(data=export_data)