def __init__(self, name, hparams, optimizer='RMS'):
    """Set up priors, data buffers and the network for the algorithm.

    Args:
        name: Name of the algorithm; also used to name the network
            ('<name>-bnn').
        hparams: Hyper-parameter object. The attributes read here are
            layer_sizes, num_actions, lambda_prior, mu_prior_flag,
            sigma_prior_flag, a0, b0, training_freq, training_freq_network,
            training_epochs, context_dim and mem.
        optimizer: Optimizer identifier forwarded to NeuralBanditModel.
    """
    num_actions = hparams.num_actions
    latent_dim = hparams.layer_sizes[-1]

    self.name = name
    self.hparams = hparams
    self.latent_dim = latent_dim

    # Gaussian prior for each beta_i
    self._lambda_prior = hparams.lambda_prior
    self.mu = [np.zeros(latent_dim) for _ in range(num_actions)]
    self.cov = [
        (1.0 / self.lambda_prior) * np.eye(latent_dim)
        for _ in range(num_actions)
    ]
    self.precision = [
        self.lambda_prior * np.eye(latent_dim) for _ in range(num_actions)
    ]

    # Prior handling: the flags are read later by update(); the precision
    # prior starts as a shallow copy of the initial precision list.
    self.mu_prior_flag = hparams.mu_prior_flag
    self.sigma_prior_flag = hparams.sigma_prior_flag
    self.precision_prior = list(self.precision)
    self.mu_prior = np.zeros((latent_dim, num_actions))

    # Inverse Gamma prior for each sigma2_i
    self._a0 = hparams.a0
    self._b0 = hparams.b0
    self.a = [self._a0] * num_actions
    self.b = [self._b0] * num_actions

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer
    self.num_epochs = hparams.training_epochs

    # One buffer for raw contexts, one for their latent representation.
    self.data_h = ContextualDataset(
        hparams.context_dim,
        num_actions,
        intercept=False,
        buffer_s=hparams.mem)
    self.latent_h = ContextualDataset(
        latent_dim,
        num_actions,
        intercept=False,
        buffer_s=hparams.mem)

    self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))
class NeuralLinearPosteriorSamplingFiniteMemory(BanditAlgorithm):
    """Full Bayesian linear regression on the last layer of a deep neural net.

    A network (``self.bnn``) maps contexts to a latent representation and one
    Bayesian linear regression per action is maintained on top of it. Raw
    contexts and latent features are stored in two ``ContextualDataset``
    buffers created with ``buffer_s=hparams.mem``.
    """

    def __init__(self, name, hparams, optimizer='RMS'):
        """Set up priors, data buffers and the network.

        Args:
            name: Name of the algorithm; also used to name the network.
            hparams: Hyper-parameter object (layer_sizes, num_actions,
                lambda_prior, mu_prior_flag, sigma_prior_flag, a0, b0,
                training_freq, training_freq_network, training_epochs,
                context_dim and mem are read here).
            optimizer: Optimizer identifier forwarded to NeuralBanditModel.
        """
        self.name = name
        self.hparams = hparams
        self.latent_dim = self.hparams.layer_sizes[-1]
        num_actions = self.hparams.num_actions

        # Gaussian prior for each beta_i
        self._lambda_prior = self.hparams.lambda_prior

        self.mu = [np.zeros(self.latent_dim) for _ in range(num_actions)]
        self.cov = [(1.0 / self.lambda_prior) * np.eye(self.latent_dim)
                    for _ in range(num_actions)]
        self.precision = [
            self.lambda_prior * np.eye(self.latent_dim)
            for _ in range(num_actions)
        ]

        self.mu_prior_flag = self.hparams.mu_prior_flag
        self.sigma_prior_flag = self.hparams.sigma_prior_flag

        # Shallow copy: shares the arrays with self.precision, which this
        # class never modifies in place.
        self.precision_prior = self.precision[:]
        self.mu_prior = np.zeros((self.latent_dim, num_actions))

        # Inverse Gamma prior for each sigma2_i
        self._a0 = self.hparams.a0
        self._b0 = self.hparams.b0

        self.a = [self._a0 for _ in range(num_actions)]
        self.b = [self._b0 for _ in range(num_actions)]

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=False,
                                        buffer_s=hparams.mem)
        self.latent_h = ContextualDataset(self.latent_dim,
                                          hparams.num_actions,
                                          intercept=False,
                                          buffer_s=hparams.mem)
        self.bnn = NeuralBanditModel(optimizer, hparams,
                                     '{}-bnn'.format(name))

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly.

        Args:
            context: Context array; reshaped to (1, hparams.context_dim).

        Returns:
            Index of the selected action.
        """
        num_actions = self.hparams.num_actions

        # Round robin until each action has been selected "initial_pulls" times
        if self.t < num_actions * self.hparams.initial_pulls:
            return self.t % num_actions

        # Sample sigma2, and beta conditional on sigma2
        sigma2_s = [
            self.b[i] * invgamma.rvs(self.a[i]) for i in range(num_actions)
        ]

        try:
            beta_s = [
                np.random.multivariate_normal(self.mu[i],
                                              sigma2_s[i] * self.cov[i])
                for i in range(num_actions)
            ]
        except np.linalg.LinAlgError as e:
            # Sampling could fail if covariance is not positive definite
            print('Exception when sampling for {}.'.format(self.name))
            # Format the exception itself; the `message` attribute does not
            # exist on Python 3 exceptions and would crash this handler.
            print('Details: {} | {}.'.format(e, e.args))
            d = self.latent_dim
            beta_s = [
                np.random.multivariate_normal(np.zeros((d)), np.eye(d))
                for i in range(num_actions)
            ]

        # Compute last-layer representation for the current context
        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(self.bnn.nn,
                                          feed_dict={self.bnn.x: c})

        # Apply Thompson Sampling to last-layer representation
        vals = [np.dot(beta_s[i], z_context.T) for i in range(num_actions)]
        return np.argmax(vals)

    def calc_precision_prior(self, contexts):
        """Fit a new per-action precision prior for re-computed features.

        For every action with stored data, a PSD matrix X is fitted with
        cvxpy so that trace(X * phi_i) matches c_i' cov c_i, where c_i are the
        currently stored latent features, cov is the current covariance of
        the action and phi_i is the outer product of the i-th row of
        `contexts`.

        Args:
            contexts: 2-D array with the new latent features, one row per
                stored data point.

        Returns:
            List with one precision matrix per action.
        """
        precisions_return = []
        n, m = contexts.shape
        prior = (0.01) * np.eye(self.latent_dim)

        if self.cov is not None:
            for action, cov in enumerate(self.cov):
                ind = np.array(
                    [i for i in range(n) if self.data_h.actions[i] == action])
                if len(ind) > 0:
                    # compute confidence scores for old data
                    d = [
                        np.dot(np.dot(c, cov), c.T)
                        for c in self.latent_h.contexts[ind, :]
                    ]

                    # compute new data correlations
                    phi = [np.outer(c, c) for c in contexts[ind, :]]

                    X = cvx.Variable((m, m), PSD=True)
                    # Form objective.
                    obj = cvx.Minimize(
                        sum([(cvx.trace(X * phi[i]) - d[i])**2
                             for i in range(len(d))]))
                    prob = cvx.Problem(obj)
                    prob.solve()
                    if X.value is None:
                        # Solver produced no solution: fall back to the
                        # isotropic prior alone.
                        precisions_return.append(np.linalg.inv(prior))
                    else:
                        precisions_return.append(
                            np.linalg.inv(X.value + prior))
                else:
                    precisions_return.append(np.linalg.inv(prior))

        return precisions_return

    def update(self, context, action, reward):
        """Updates the posterior using linear bayesian regression formula.

        Args:
            context: Observed context.
            action: Index of the action that was played.
            reward: Reward obtained for that action.
        """
        self.t += 1
        self.data_h.add(context, action, reward)
        c = context.reshape((1, self.hparams.context_dim))
        z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
        self.latent_h.add(z_context, action, reward)

        # Retrain the network on the original data (data_h)
        if self.t % self.update_freq_nn == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

            # Update the latent representation of every datapoint collected
            # so far
            new_z = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: self.data_h.contexts})

            # Update the confidence prior using feature uncertainty matching.
            # Must run before replace_data: it reads the old features from
            # latent_h.
            if self.sigma_prior_flag == 1:
                self.precision_prior = self.calc_precision_prior(
                    contexts=new_z)

            self.latent_h.replace_data(contexts=new_z)

            # Update the mean prior using the weights of the NN
            if self.mu_prior_flag == 1:
                self.mu_prior = self.bnn.get_mu_prior()

        # Update the Bayesian Linear Regression
        if self.t % self.update_freq_lr == 0:
            # Find all the actions to update
            # NOTE(review): this slice drops the LAST update_freq_lr actions
            # rather than selecting them — confirm this is intended.
            actions_to_update = self.latent_h.actions[:-self.update_freq_lr]

            for action_v in np.unique(actions_to_update):
                # Update action posterior with formulas:
                # \beta | z,y ~ N(mu_q, cov_q)
                z, y = self.latent_h.get_data(action_v)

                # The algorithm could be improved with sequential formulas
                # (cheaper)
                s = np.dot(z.T, z)

                # Get priors
                sigma0 = self.precision_prior[action_v]
                mu_0 = self.mu_prior[:, action_v]

                # Calc mean and precision using bayesian linear regression
                precision_a = s + sigma0
                cov_a = np.linalg.inv(precision_a)
                mu_a = np.dot(cov_a, (np.dot(z.T, y) + np.dot(sigma0, mu_0)))

                # Inverse Gamma posterior update
                a_post = self.a0 + z.shape[0] / 2.0
                b_upd = 0.5 * np.dot(y.T, y)
                b_upd += 0.5 * np.dot(mu_0.T, np.dot(sigma0, mu_0))
                b_upd -= 0.5 * np.dot(mu_a.T, np.dot(precision_a, mu_a))
                b_post = self.b0 + b_upd

                # Store new posterior distributions (self.precision is
                # deliberately left untouched here).
                self.mu[action_v] = mu_a
                self.cov[action_v] = cov_a
                self.a[action_v] = a_post
                self.b[action_v] = b_post

    @property
    def a0(self):
        """Initial shape parameter of the Inverse Gamma prior."""
        return self._a0

    @property
    def b0(self):
        """Initial scale parameter of the Inverse Gamma prior."""
        return self._b0

    @property
    def lambda_prior(self):
        """Scale of the initial isotropic Gaussian precision."""
        return self._lambda_prior
class NeuralLinearPosteriorSamplingOnline(BanditAlgorithm):
    """Full Bayesian linear regression on the last layer of a deep neural net.

    Online variant: the regression statistics (precision, f, yy) are updated
    incrementally on every step, and the network is trained through
    ``self.bnn.train_online`` every ``hparams.training_freq_network`` steps.
    """

    def __init__(self, name, hparams, textflag='no', optimizer='RMS'):
        """Set up priors, data buffers and the network.

        Args:
            name: Name of the algorithm; also used to name the network.
            hparams: Hyper-parameter object (layer_sizes, pgd_steps,
                pgd_batch_size, num_actions, lambda_prior, mu_prior_flag,
                sigma_prior_flag, a0, b0, training_freq,
                training_freq_network, training_epochs, context_dim, mem and
                batch_size are read here).
            textflag: 'yes' selects a TextCNN network; any other value
                selects NeuralBanditModel.
            optimizer: Optimizer identifier forwarded to NeuralBanditModel.
        """
        self.first_train = False
        self.name = name
        self.hparams = hparams
        self.latent_dim = self.hparams.layer_sizes[-1]
        self.intercept = False
        self.pgd_steps = self.hparams.pgd_steps
        self.pgd_batch_size = self.hparams.pgd_batch_size
        if self.intercept:
            self.param_dim = 1 + self.latent_dim
        else:
            self.param_dim = self.latent_dim
        self.EPSILON = 0.00001
        num_actions = self.hparams.num_actions

        # Gaussian prior for each beta_i
        self._lambda_prior = self.hparams.lambda_prior
        self.before = []
        self.after = []

        self.mu = [np.zeros(self.param_dim) for _ in range(num_actions)]
        # Running sums: f accumulates z * reward, yy accumulates reward ** 2.
        self.f = [np.zeros(self.param_dim) for _ in range(num_actions)]
        self.yy = [0 for _ in range(num_actions)]
        self.cov = [(1.0 / self.lambda_prior) * np.eye(self.param_dim)
                    for _ in range(num_actions)]
        self.precision = [
            self.lambda_prior * np.eye(self.param_dim)
            for _ in range(num_actions)
        ]

        self.mu_prior_flag = self.hparams.mu_prior_flag
        self.sigma_prior_flag = self.hparams.sigma_prior_flag

        self.precision_prior = [
            self.lambda_prior * np.eye(self.param_dim)
            for _ in range(num_actions)
        ]
        self.mu_prior = np.zeros((self.param_dim, num_actions))

        # Inverse Gamma prior for each sigma2_i
        self._a0 = self.hparams.a0
        self._b0 = self.hparams.b0

        self.a = [self._a0 for _ in range(num_actions)]
        self.b = [self._b0 for _ in range(num_actions)]

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=False,
                                        buffer_s=hparams.mem)
        self.latent_h = ContextualDataset(self.latent_dim,
                                          hparams.num_actions,
                                          intercept=self.intercept,
                                          buffer_s=hparams.mem)
        if textflag == 'yes':
            self.bnn = TextCNN('adam', self.hparams.num_actions,
                               self.hparams.batch_size,
                               '{}-bnn'.format(name))
        else:
            self.bnn = NeuralBanditModel(optimizer, hparams,
                                         '{}-bnn'.format(name))

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly.

        Args:
            context: Context array; reshaped to (1, hparams.context_dim).

        Returns:
            Index of the selected action.
        """
        num_actions = self.hparams.num_actions

        # Round robin until each action has been selected "initial_pulls" times
        if self.t < num_actions * self.hparams.initial_pulls:
            return self.t % num_actions

        # Sample sigma2, and beta conditional on sigma2
        sigma2_s = [
            self.b[i] * invgamma.rvs(self.a[i]) for i in range(num_actions)
        ]

        try:
            beta_s = [
                np.random.multivariate_normal(self.mu[i],
                                              sigma2_s[i] * self.cov[i])
                for i in range(num_actions)
            ]
        except np.linalg.LinAlgError:
            # Sampling could fail if covariance is not positive definite.
            # The fallback must have the regression dimension (param_dim) so
            # it stays compatible with z_context when an intercept is used.
            d = self.param_dim
            beta_s = [
                np.random.multivariate_normal(np.zeros((d)), np.eye(d))
                for i in range(num_actions)
            ]

        # Compute last-layer representation for the current context
        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(self.bnn.nn,
                                          feed_dict={self.bnn.x: c})
            if self.intercept:
                z_context = np.append(z_context, 1.0).reshape(
                    (1, self.latent_dim + 1))

        # Apply Thompson Sampling to last-layer representation
        vals = [np.dot(beta_s[i], z_context.T) for i in range(num_actions)]
        return np.argmax(vals)

    def calc_precision_prior(self, contexts):
        """Fit a new per-action covariance prior by projected gradient descent.

        For every action with stored data, a matrix X is optimised so that
        sum(X * phi_i) matches c_i' cov c_i, where c_i are the currently
        stored latent features, cov is the current covariance of the action
        and phi_i is the outer product of the i-th row of `contexts`. After
        each gradient step X is projected by zeroing its negative
        eigenvalues. ``self.cov`` is overwritten with the fitted covariance.

        Args:
            contexts: 2-D array with the new latent features, one row per
                stored data point.

        Returns:
            List with one precision matrix per action.
        """
        precisions_return = []
        n, _ = contexts.shape
        prior = (self.EPSILON) * np.eye(self.param_dim)

        if self.cov is not None:
            for action, cov in enumerate(self.cov):
                ind = np.array(
                    [i for i in range(n) if self.data_h.actions[i] == action])
                if len(ind) > 0:
                    # compute confidence scores for old data
                    d = np.array([
                        np.dot(np.dot(c, cov), c.T)
                        for c in self.latent_h.contexts[ind, :]
                    ])

                    # compute new data correlations
                    phi = np.array([np.outer(c, c) for c in contexts[ind, :]])

                    X = prior
                    alpha = 1.0
                    for t in range(self.pgd_steps):
                        alpha = alpha / (t + 1)
                        batch_ind = np.random.choice(len(ind),
                                                     self.pgd_batch_size)
                        phi_batch = phi[batch_ind]
                        # X broadcasts over the batch axis of phi_batch.
                        diff = np.sum(X * phi_batch, (1, 2)) - d[batch_ind]
                        diff = np.reshape(diff, (-1, 1, 1))
                        grad = np.sum(2.0 * phi_batch * diff, 0)
                        X = X - alpha * grad

                        # project X into PSD space by thresholding the
                        # eigenvalues; a plain boolean mask is used because
                        # indexing with a list that wraps the mask is
                        # rejected by current NumPy.
                        w, v = np.linalg.eigh(X)
                        w[w < 0.0] = 0.0
                        X = (v * w).dot(v.T)

                    precisions_return.append(np.linalg.inv(X + prior))
                    self.cov[action] = X + prior
                else:
                    precisions_return.append(np.linalg.inv(prior))
                    self.cov[action] = prior

        return precisions_return

    def update(self, context, action, reward):
        """Updates the posterior using linear bayesian regression formula.

        Args:
            context: Observed context.
            action: Index of the action that was played.
            reward: Reward obtained for that action.
        """
        self.t += 1
        self.data_h.add(context, action, reward)
        c = context.reshape((1, self.hparams.context_dim))
        z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
        self.latent_h.add(z_context, action, reward)
        cov_prior = (self.EPSILON) * np.eye(self.param_dim)

        if (self.t % self.update_freq_nn == 0
                and self.t >= self.hparams.batch_size):
            # Offline retraining (assign_lr + bnn.train) is intentionally not
            # used in the online setting; train_online returns the refreshed
            # prior and regression statistics instead.
            (self.precision_prior, self.precision, self.cov,
             self.f) = self.bnn.train_online(
                 self.data_h,
                 self.num_epochs,
                 self.cov,
                 cov_prior,
                 self.pgd_steps,
                 self.precision_prior,
                 pgd_freq=self.hparams.pgd_freq,
                 sig_prior=self.sigma_prior_flag)

            if self.mu_prior_flag == 1:
                weights_p, bias_p = self.bnn.get_mu_prior()
                self.mu_prior[:self.latent_dim] = weights_p
                # Only store the bias when the regression has an intercept
                # row; otherwise the last row belongs to the weights and
                # would be overwritten.
                if self.intercept:
                    self.mu_prior[-1] = bias_p
        else:
            # No network training this step: fold the new observation into
            # the running statistics of the played action.
            if self.intercept:
                z_context = np.append(z_context, 1.0).reshape(
                    (1, self.latent_dim + 1))
            self.precision[action] += np.dot(z_context.T, z_context)
            self.cov[action] = np.linalg.inv(self.precision[action] +
                                             self.precision_prior[action])
            self.f[action] += (z_context.T * reward)[:, 0]

        # NOTE(review): the posterior refresh below is assumed to run on
        # every call (not only in the else-branch) — confirm against the
        # original layout.

        # Calc mean and precision using bayesian linear regression
        prior_precision = self.precision_prior[action]
        prior_mean = self.mu_prior[:, action]
        self.mu[action] = np.dot(
            self.cov[action],
            (self.f[action] + np.dot(prior_precision, prior_mean)))

        # Inverse Gamma posterior update
        self.yy[action] += reward**2
        self.a[action] += 0.5
        b_upd = 0.5 * self.yy[action]
        b_upd += 0.5 * np.dot(prior_mean.T,
                              np.dot(prior_precision, prior_mean))
        b_upd -= 0.5 * np.dot(self.mu[action].T,
                              np.dot(self.precision[action],
                                     self.mu[action]))
        self.b[action] = self.b0 + b_upd

    @property
    def a0(self):
        """Initial shape parameter of the Inverse Gamma prior."""
        return self._a0

    @property
    def b0(self):
        """Initial scale parameter of the Inverse Gamma prior."""
        return self._b0

    @property
    def lambda_prior(self):
        """Scale of the initial isotropic Gaussian precision."""
        return self._lambda_prior

    def calc_model_evidence(self):
        """Return the evidence term averaged over all actions.

        For each action the value is computed in closed form from the
        current priors (precision_prior, mu_prior, a0, b0) and the data
        stored in latent_h, using extended-precision floats.
        """
        vval = 0
        for action in range(self.hparams.num_actions):
            sigma0 = self.precision_prior[action]
            mu_0 = self.mu_prior[:, action]
            z, y = self.latent_h.get_data(action)
            n = z.shape[0]
            s = np.dot(z.T, z)
            s_n = (sigma0 + s)
            cov_a = np.linalg.inv(s_n)
            mu_a = np.dot(cov_a, (np.dot(z.T, y) + np.dot(sigma0, mu_0)))
            a_post = (self.a0 + n / 2.0)
            b_upd = 0.5 * np.dot(y.T, y)
            b_upd += 0.5 * np.dot(mu_0.T, np.dot(sigma0, mu_0))
            b_upd -= 0.5 * np.dot(mu_a.T, np.dot(s_n, mu_a))
            b_post = self.b0 + b_upd
            val = np.float128(1)
            val /= ((np.float128(2.0) * math.pi)**(n / 2.0))
            val *= (gamma(a_post) / gamma(self.a0))
            val *= np.sqrt(np.linalg.det(sigma0) / np.linalg.det(s_n))
            val *= ((self.hparams.b0**self.hparams.a0) / (b_post**a_post))
            vval += val
        vval /= self.hparams.num_actions
        return vval
def __init__(self, name, hparams, textflag='no', optimizer='RMS'):
    """Set up priors, running statistics, data buffers and the network.

    Args:
        name: Name of the algorithm; also used to name the network
            ('<name>-bnn').
        hparams: Hyper-parameter object. The attributes read here are
            layer_sizes, pgd_steps, pgd_batch_size, num_actions,
            lambda_prior, mu_prior_flag, sigma_prior_flag, a0, b0,
            training_freq, training_freq_network, training_epochs,
            context_dim, mem and (for the text model) batch_size.
        textflag: 'yes' selects a TextCNN network; any other value selects
            NeuralBanditModel.
        optimizer: Optimizer identifier forwarded to NeuralBanditModel.
    """
    num_actions = hparams.num_actions

    self.first_train = False
    self.name = name
    self.hparams = hparams
    self.latent_dim = hparams.layer_sizes[-1]
    self.intercept = False
    self.pgd_steps = hparams.pgd_steps
    self.pgd_batch_size = hparams.pgd_batch_size
    # One extra regression parameter when an intercept is modelled.
    self.param_dim = (
        1 + self.latent_dim if self.intercept else self.latent_dim)
    dim = self.param_dim
    self.EPSILON = 0.00001

    # Gaussian prior for each beta_i
    self._lambda_prior = hparams.lambda_prior
    self.before = []
    self.after = []

    self.mu = [np.zeros(dim) for _ in range(num_actions)]
    self.f = [np.zeros(dim) for _ in range(num_actions)]
    self.yy = [0] * num_actions
    self.cov = [
        (1.0 / self.lambda_prior) * np.eye(dim) for _ in range(num_actions)
    ]
    self.precision = [
        self.lambda_prior * np.eye(dim) for _ in range(num_actions)
    ]

    self.mu_prior_flag = hparams.mu_prior_flag
    self.sigma_prior_flag = hparams.sigma_prior_flag

    # The precision prior gets its own, independent matrices.
    self.precision_prior = [
        self.lambda_prior * np.eye(dim) for _ in range(num_actions)
    ]
    self.mu_prior = np.zeros((dim, num_actions))

    # Inverse Gamma prior for each sigma2_i
    self._a0 = hparams.a0
    self._b0 = hparams.b0
    self.a = [self._a0] * num_actions
    self.b = [self._b0] * num_actions

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer
    self.num_epochs = hparams.training_epochs

    self.data_h = ContextualDataset(
        hparams.context_dim,
        num_actions,
        intercept=False,
        buffer_s=hparams.mem)
    self.latent_h = ContextualDataset(
        self.latent_dim,
        num_actions,
        intercept=self.intercept,
        buffer_s=hparams.mem)

    network_name = '{}-bnn'.format(name)
    if textflag == 'yes':
        self.bnn = TextCNN('adam', num_actions, hparams.batch_size,
                           network_name)
    else:
        self.bnn = NeuralBanditModel(optimizer, hparams, network_name)
class NeuralLinearPosteriorSamplingFiniteMemory(BanditAlgorithm):
    """Full Bayesian linear regression on the last layer of a deep neural net.

    A network (``self.bnn``) maps contexts to a latent representation and one
    Bayesian linear regression per action is maintained on top of it. The
    regression statistics are updated incrementally between network
    retrainings and recomputed from the stored data after each retraining.
    """

    def __init__(self, name, hparams, textflag='no', optimizer='RMS'):
        """Set up priors, data buffers and the network.

        Args:
            name: Name of the algorithm; also used to name the network.
            hparams: Hyper-parameter object (layer_sizes, num_actions,
                lambda_prior, mu_prior_flag, sigma_prior_flag, a0, b0,
                training_freq, training_freq_network, training_epochs,
                context_dim, mem and batch_size are read here).
            textflag: 'yes' selects a TextCNN network; any other value
                selects NeuralBanditModel.
            optimizer: Optimizer identifier forwarded to NeuralBanditModel.
        """
        self.name = name
        self.hparams = hparams
        self.latent_dim = self.hparams.layer_sizes[-1]
        self.intercept = False
        if self.intercept:
            self.param_dim = 1 + self.latent_dim
        else:
            self.param_dim = self.latent_dim
        self.EPSILON = 0.00001
        num_actions = self.hparams.num_actions

        # Gaussian prior for each beta_i
        self._lambda_prior = self.hparams.lambda_prior
        self.before = []
        self.after = []

        self.mu = [np.zeros(self.param_dim) for _ in range(num_actions)]
        # Running sums: f accumulates z * reward, yy accumulates reward ** 2.
        self.f = [np.zeros(self.param_dim) for _ in range(num_actions)]
        self.yy = [0 for _ in range(num_actions)]
        self.cov = [(1.0 / self.lambda_prior) * np.eye(self.param_dim)
                    for _ in range(num_actions)]
        self.precision = [
            self.lambda_prior * np.eye(self.param_dim)
            for _ in range(num_actions)
        ]

        self.mu_prior_flag = self.hparams.mu_prior_flag
        self.sigma_prior_flag = self.hparams.sigma_prior_flag

        # The prior needs its own arrays: update() does
        # `self.precision[action] += ...` in place, which would also modify a
        # prior that merely shares the array objects (as a list slice copy
        # does).
        self.precision_prior = [
            self.lambda_prior * np.eye(self.param_dim)
            for _ in range(num_actions)
        ]
        self.mu_prior = np.zeros((self.param_dim, num_actions))

        # Inverse Gamma prior for each sigma2_i
        self._a0 = self.hparams.a0
        self._b0 = self.hparams.b0

        self.a = [self._a0 for _ in range(num_actions)]
        self.b = [self._b0 for _ in range(num_actions)]

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=False,
                                        buffer_s=hparams.mem)
        self.latent_h = ContextualDataset(self.latent_dim,
                                          hparams.num_actions,
                                          intercept=self.intercept,
                                          buffer_s=hparams.mem)
        if textflag == 'yes':
            self.bnn = TextCNN('adam', self.hparams.num_actions,
                               self.hparams.batch_size,
                               '{}-bnn'.format(name))
        else:
            self.bnn = NeuralBanditModel(optimizer, hparams,
                                         '{}-bnn'.format(name))

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly.

        Args:
            context: Context array; reshaped to (1, hparams.context_dim).

        Returns:
            Index of the selected action.
        """
        num_actions = self.hparams.num_actions

        # Round robin until each action has been selected "initial_pulls" times
        if self.t < num_actions * self.hparams.initial_pulls:
            return self.t % num_actions

        # Sample sigma2, and beta conditional on sigma2
        sigma2_s = [
            self.b[i] * invgamma.rvs(self.a[i]) for i in range(num_actions)
        ]

        try:
            beta_s = [
                np.random.multivariate_normal(self.mu[i],
                                              sigma2_s[i] * self.cov[i])
                for i in range(num_actions)
            ]
        except np.linalg.LinAlgError:
            # Sampling could fail if covariance is not positive definite.
            # The fallback must have the regression dimension (param_dim) so
            # it stays compatible with z_context when an intercept is used.
            d = self.param_dim
            beta_s = [
                np.random.multivariate_normal(np.zeros((d)), np.eye(d))
                for i in range(num_actions)
            ]

        # Compute last-layer representation for the current context
        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(self.bnn.nn,
                                          feed_dict={self.bnn.x: c})
            if self.intercept:
                z_context = np.append(z_context, 1.0).reshape(
                    (1, self.latent_dim + 1))

        # Apply Thompson Sampling to last-layer representation
        vals = [np.dot(beta_s[i], z_context.T) for i in range(num_actions)]
        return np.argmax(vals)

    def calc_precision_prior(self, contexts):
        """Fit a new per-action covariance prior for re-computed features.

        For every action with stored data, a PSD matrix X is fitted with
        cvxpy so that trace(X * phi_i) matches c_i' cov c_i, where c_i are the
        features currently stored in latent_h, cov is the current covariance
        of the action and phi_i is the outer product of the i-th row of
        `contexts`. ``self.cov`` is overwritten with the fitted covariance.

        Args:
            contexts: 2-D array with the new latent features, one row per
                stored data point.

        Returns:
            List with one precision matrix per action.
        """
        precisions_return = []
        n, m = contexts.shape
        prior = (self.EPSILON) * np.eye(self.param_dim)

        if self.cov is not None:
            for action, cov in enumerate(self.cov):
                ind = np.array(
                    [i for i in range(n) if self.data_h.actions[i] == action])
                if len(ind) > 0:
                    # compute confidence scores for old data
                    d = [
                        np.dot(np.dot(c, cov), c.T)
                        for c in self.latent_h.contexts[ind, :]
                    ]

                    # compute new data correlations
                    phi = [np.outer(c, c) for c in contexts[ind, :]]

                    X = cvx.Variable((m, m), PSD=True)
                    # Form objective.
                    obj = cvx.Minimize(
                        sum([(cvx.trace(X * phi[i]) - d[i])**2
                             for i in range(len(d))]))
                    prob = cvx.Problem(obj)
                    prob.solve()
                    if X.value is None:
                        # Solver produced no solution: fall back to the
                        # isotropic prior alone.
                        precisions_return.append(np.linalg.inv(prior))
                        self.cov[action] = prior
                    else:
                        precisions_return.append(
                            np.linalg.inv(X.value + prior))
                        self.cov[action] = X.value + prior
                else:
                    precisions_return.append(np.linalg.inv(prior))
                    self.cov[action] = prior

        return precisions_return

    def update(self, context, action, reward):
        """Updates the posterior using linear bayesian regression formula.

        Args:
            context: Observed context.
            action: Index of the action that was played.
            reward: Reward obtained for that action.
        """
        self.t += 1
        self.data_h.add(context, action, reward)
        c = context.reshape((1, self.hparams.context_dim))
        z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
        self.latent_h.add(z_context, action, reward)

        # Retrain the network on the original data (data_h)
        if self.t % self.update_freq_nn == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

            # Update the latent representation of every datapoint collected
            # so far
            new_z = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: self.data_h.contexts})

            # Update the confidence prior using feature uncertainty matching.
            # This must happen BEFORE latent_h is overwritten, because
            # calc_precision_prior reads the old features from latent_h.
            if self.sigma_prior_flag == 1:
                # assumes new_z is 2-D (one row per stored context) — TODO
                # confirm against NeuralBanditModel
                if self.intercept:
                    ones = np.ones((new_z.shape[0], 1))
                    i_contexts = np.hstack((new_z, ones))
                else:
                    i_contexts = np.asarray(new_z)
                self.precision_prior = self.calc_precision_prior(
                    contexts=i_contexts)

            self.latent_h.replace_data(contexts=new_z)

            # Update the mean prior using the weights of the NN
            if self.mu_prior_flag == 1:
                weights_p, bias_p = self.bnn.get_mu_prior()
                self.mu_prior[:self.latent_dim] = weights_p
                # Only store the bias when the regression has an intercept
                # row; otherwise the last row belongs to the weights and
                # would be overwritten.
                if self.intercept:
                    self.mu_prior[-1] = bias_p

            # Update the Bayesian Linear Regression
            for action_v in range(self.hparams.num_actions):
                # Update action posterior with formulas:
                # \beta | z,y ~ N(mu_q, cov_q)
                z, y = self.latent_h.get_data(action_v)

                # The algorithm could be improved with sequential formulas
                # (cheaper)
                self.precision[action_v] = (np.dot(z.T, z) +
                                            self.precision_prior[action_v])
                self.f[action_v] = np.dot(z.T, y)
        else:
            # No retraining this step: fold the new observation into the
            # running statistics of the played action.
            if self.intercept:
                z_context = np.append(z_context, 1.0).reshape(
                    (1, self.latent_dim + 1))
            self.precision[action] += np.dot(z_context.T, z_context)
            self.cov[action] = np.linalg.inv(self.precision[action])
            self.f[action] += (z_context.T * reward)[:, 0]

        # NOTE(review): the posterior refresh below is assumed to run on
        # every call (not only in the else-branch) — confirm against the
        # original layout.

        # Calc mean and precision using bayesian linear regression
        prior_precision = self.precision_prior[action]
        prior_mean = self.mu_prior[:, action]
        self.mu[action] = np.dot(
            self.cov[action],
            (self.f[action] + np.dot(prior_precision, prior_mean)))

        # Inverse Gamma posterior update
        self.yy[action] += reward**2
        self.a[action] += 0.5
        b_upd = 0.5 * self.yy[action]
        b_upd += 0.5 * np.dot(prior_mean.T,
                              np.dot(prior_precision, prior_mean))
        b_upd -= 0.5 * np.dot(self.mu[action].T,
                              np.dot(self.precision[action],
                                     self.mu[action]))
        self.b[action] = self.b0 + b_upd

    @property
    def a0(self):
        """Initial shape parameter of the Inverse Gamma prior."""
        return self._a0

    @property
    def b0(self):
        """Initial scale parameter of the Inverse Gamma prior."""
        return self._b0

    @property
    def lambda_prior(self):
        """Scale of the initial isotropic Gaussian precision."""
        return self._lambda_prior

    def calc_model_evidence(self):
        """Return the evidence term averaged over all actions.

        For each action the value is computed in closed form from the
        current priors (precision_prior, mu_prior, a0, b0) and the data
        stored in latent_h, using extended-precision floats.
        """
        vval = 0
        for action in range(self.hparams.num_actions):
            sigma0 = self.precision_prior[action]
            mu_0 = self.mu_prior[:, action]
            z, y = self.latent_h.get_data(action)
            n = z.shape[0]
            s = np.dot(z.T, z)
            s_n = (sigma0 + s)
            cov_a = np.linalg.inv(s_n)
            mu_a = np.dot(cov_a, (np.dot(z.T, y) + np.dot(sigma0, mu_0)))
            a_post = (self.a0 + n / 2.0)
            b_upd = 0.5 * np.dot(y.T, y)
            b_upd += 0.5 * np.dot(mu_0.T, np.dot(sigma0, mu_0))
            b_upd -= 0.5 * np.dot(mu_a.T, np.dot(s_n, mu_a))
            b_post = self.b0 + b_upd
            val = np.float128(1)
            val /= ((np.float128(2.0) * math.pi)**(n / 2.0))
            val *= (gamma(a_post) / gamma(self.a0))
            val *= np.sqrt(np.linalg.det(sigma0) / np.linalg.det(s_n))
            val *= ((self.hparams.b0**self.hparams.a0) / (b_post**a_post))
            vval += val
        vval /= self.hparams.num_actions
        return vval