def update_optimal_actions(self):
    """Update the posterior distribution over optimal actions.

    Builds a dictionary of information about the optimal action
    for each posterior sample of the model parameters:
        keys: action = sorted tuple of items to propose in the assortment
        values: (p(action = a*), [indices of thetas for which the action is optimal])
    """
    posteriors_actions = act_optimally(self.posterior_belief,
                                       self.assortment_size)
    posteriors_actions = [
        tuple(posteriors_actions[ix, :]) for ix in range(self.n_samples)
    ]
    optimal_actions_information = defaultdict(list)
    for ix, action in enumerate(posteriors_actions):
        optimal_actions_information[action].append(ix)

    self.optimal_actions = {
        action: (len(theta_idxs) / self.n_samples, theta_idxs)
        for action, theta_idxs in optimal_actions_information.items()
    }
    self.actions_star = np.array(
        [list(key) for key in optimal_actions_information.keys()])
    self.counts_star = np.array(
        [len(val) for val in optimal_actions_information.values()])
    self.thetas_star = []
    for val in optimal_actions_information.values():
        self.thetas_star += val
    self.thetas_star = np.array(self.thetas_star)

    # Entropy of the induced distribution over optimal actions,
    # rescaled by the maximum achievable entropy.
    self.a_star_entropy = sum(
        -p * np.log(p)
        for (action, (p, _)) in self.optimal_actions.items() if p > 0.0)
    self.a_star_entropy = (self.max_entropy * self.a_star_entropy /
                           self.max_s_entropy)
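# Illustrative sketch (not part of the original module): how the entropy of
# p(a*) computed above behaves on a toy posterior. The counts come from
# grouping posterior samples by their optimal assortment, as done above.
def _sketch_a_star_entropy():
    import numpy as np
    # Suppose 4 posterior samples map to 2 distinct optimal assortments,
    # with counts 3 and 1, i.e. p(a*) = [0.75, 0.25].
    probabilities = np.array([0.75, 0.25])
    entropy = -(probabilities * np.log(probabilities)).sum()
    # entropy ~= 0.562 nats; it is 0 when all samples agree on one action
    # and log(#actions) when the posterior is split uniformly.
    return entropy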
def act(self):
    posterior_belief = self.sample_from_posterior(n_samples=1)
    action = act_optimally(np.squeeze(posterior_belief),
                           top_k=self.assortment_size)
    self.current_action = action
    # Sanity checks: outside of the sampling phase, item 0 must be proposed,
    # and the known top item (if any) must belong to the assortment.
    assert (0 in action) if (not self.sampling) else True
    assert (self.top_item_index in action
            if self.top_item_index is not None else True)
    return action
def action_selection(self):
    if self.top_item_index is None:
        fallback_taken, n_new = self.optimal_ids_action_parameters()
        action = self.sample_from_params(fallback_taken, n_new)
        return action
    else:
        return act_optimally(
            np.squeeze(self.sample_from_posterior(1)),
            top_k=self.assortment_size,
        )
def update_r_star(self):
    # Expected reward of the best assortment under each posterior sample.
    sorted_beliefs = np.sort(
        self.posterior_belief, axis=1)[:, -self.assortment_size:]  # shape (m, k)
    picking_probabilities = sorted_beliefs.sum(1)
    if self.dynamics == "epoch":
        self.r_star = picking_probabilities.mean()
    else:
        self.r_star = (picking_probabilities /
                       (1 + picking_probabilities)).mean()

    # Gap between r_star and the expected reward of the greedy assortment.
    a_greedy = act_optimally(self.posterior_belief.mean(0),
                             self.assortment_size)
    greedy_expected_reward = numba_expected_reward(self.posterior_belief,
                                                   a_greedy,
                                                   mode=self.dynamics)
    self.delta_min = self.r_star - greedy_expected_reward
    assert self.delta_min > -1e-12, (
        self.delta_min,
        self.r_star,
        greedy_expected_reward,
    )
    self.delta_min = max(1e-12, self.delta_min)
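# Illustrative sketch (an assumption about the model, not from the original
# module): under multinomial-logit choice, an assortment with summed
# preference weights s is purchased with probability s / (1 + s), the 1
# accounting for the no-purchase option; "epoch" dynamics use the raw sum,
# matching the two branches in update_r_star above.
def _sketch_r_star_formula():
    import numpy as np
    preferences = np.array([0.5, 0.3, 0.1])  # weights of one sampled theta
    s = preferences.sum()
    epoch_reward = s               # "epoch" dynamics
    mnl_reward = s / (1.0 + s)     # standard dynamics: 0.9 / 1.9 ~= 0.474
    return epoch_reward, mnl_reward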
def proposal(self):
    posterior_belief = self.sample_from_posterior(1)
    action = act_optimally(np.squeeze(posterior_belief),
                           top_k=self.assortment_size)
    self.current_action = action
    return action
def proposal(self):
    # Monte Carlo estimates from the posterior: optimal reward r_star,
    # per-item expected rewards, and per-item information gains.
    posterior_belief = self.sample_from_posterior(self.n_samples)
    sorted_beliefs = np.sort(posterior_belief, axis=1)
    thresholds = sorted_beliefs[:, -self.assortment_size].reshape(-1, 1)
    best_actions = sorted_beliefs[:, -self.assortment_size:]
    sum_rewards_best = best_actions.sum(1)
    r_star = sum_rewards_best.mean()
    expected_rewards = posterior_belief.mean(0)

    # p_star[i]: probability that item i belongs to the optimal assortment;
    # if_star[i]: expected preference for item i conditional on that event.
    mask = posterior_belief >= thresholds
    p_star = mask.sum(0) / mask.shape[0]
    if_star = (posterior_belief * mask).sum(0) / (mask.sum(0) + 1e-12)
    # Truncated variance proxy for the information gain: only the
    # "item is optimal" branch of the full variance decomposition is kept.
    variances = p_star * (if_star - expected_rewards) ** 2
    variances = np.maximum(variances, 1e-12)

    # Convex relaxation of information-directed sampling: minimize the
    # information ratio (squared expected regret over expected information
    # gain) over fractional assortments x in [0, 1]^n with sum(x) = k.
    x = cp.Variable(self.n_items, pos=True)
    rewards = cp.Parameter(self.n_items)
    gains = cp.Parameter(self.n_items, pos=True)
    deltas = r_star - x @ rewards
    exp_gain = x @ gains
    information_ratio = cp.quad_over_lin(deltas, exp_gain)
    objective = cp.Minimize(information_ratio)
    constraints = [0 <= x, x <= 1, cp.sum(x) == self.assortment_size]
    prob = cp.Problem(objective, constraints)
    rewards.value = expected_rewards
    gains.value = variances
    try:
        prob.solve(solver="ECOS")
        zeros_index = x.value < 1e-3
        ones_index = x.value > 1 - 1e-3
        nzeros = zeros_index.sum()
        nones = ones_index.sum()
        nitems = x.value.shape[0]
        logging.debug(
            f"{nitems - nones - nzeros} nstrict, {nones} ones, "
            f"{nzeros} zeroes, {nitems} total items")
        if (nitems - nones - nzeros) == 2:
            # Exactly two fractional coordinates: their values sum to 1,
            # so randomize the last assortment slot between the two items.
            all_items = np.arange(nitems)
            strict_items = all_items[~np.bitwise_or(zeros_index, ones_index)]
            probas = x.value[~np.bitwise_or(zeros_index, ones_index)]
            assert strict_items.shape[0] == 2, strict_items
            assert probas.shape[0] == 2, probas
            logging.debug(f"items: {strict_items}, with probas: {probas}")
            rho = probas[0]
            u = np.random.rand()
            # Pick strict_items[0] with probability rho (the original
            # comparison `rho <= u` selected it with probability 1 - rho).
            if u <= rho:
                remaining_item = strict_items[0]
            else:
                remaining_item = strict_items[1]
            action = np.sort(
                np.concatenate([
                    act_optimally(x.value, top_k=self.assortment_size - 1),
                    np.array([remaining_item]),
                ]))
        else:
            action = act_optimally(x.value, top_k=self.assortment_size)
        if self.c % 5 == 121234:  # debug logging, intentionally disabled
            logging.debug(
                f"a:{action},x:{(100 * x.value).astype(int)},"
                f"rew:{(100 * expected_rewards).astype(int)},"
                f"gain:{(100 * np.sqrt(variances)).astype(int)}")
            logging.debug(
                f"if_optimal: {logar(if_star)}, rew:{logar(expected_rewards)}, "
                f"probas: {logar(p_star)}")
            logging.debug(
                f"n{self.posterior_parameters[0]}, "
                f"v{self.posterior_parameters[1] / self.posterior_parameters[0]},")
            logging.debug(f"obj{prob.value}")
    except (cp.SolverError, TypeError):
        # TypeError arises when the solver fails and x.value is None;
        # fall back to a plain Thompson-sampling step.
        logging.warning("solver error")
        posterior_belief = self.sample_from_posterior(1)
        action = act_optimally(np.squeeze(posterior_belief),
                               top_k=self.assortment_size)
    self.current_action = action
    self.c += 1
    return action
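# Minimal standalone sketch (an assumption mirroring the relaxation above,
# not part of the original module): quad_over_lin makes the information
# ratio Delta(x)^2 / g(x) DCP-compliant, so the fractional-assortment
# problem can be solved directly with cvxpy. All names here are local to
# the sketch.
def _sketch_information_ratio_relaxation():
    import cvxpy as cp
    import numpy as np

    n_items, k = 5, 2
    rng = np.random.default_rng(0)
    expected_rewards = rng.random(n_items)
    gains = rng.random(n_items) + 1e-3
    r_star = np.sort(expected_rewards)[-k:].sum()

    x = cp.Variable(n_items)
    regret = r_star - x @ expected_rewards   # affine, >= 0 for feasible x
    info_gain = x @ gains                    # affine, > 0
    problem = cp.Problem(
        cp.Minimize(cp.quad_over_lin(regret, info_gain)),
        [x >= 0, x <= 1, cp.sum(x) == k],
    )
    problem.solve(solver="ECOS")
    return x.value  # fractional inclusion probabilities, summing to k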
def ts_cs_action(self):
    posterior_belief = self.sample_from_posterior(1)
    return act_optimally(np.squeeze(posterior_belief),
                         top_k=self.assortment_size)
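# Hedged sketch (an assumption, not the module's implementation): the
# Thompson-sampling steps above rely on act_optimally returning the sorted
# indices of the top_k largest preference entries; one plausible version:
def _sketch_act_optimally(belief, top_k):
    import numpy as np
    belief = np.asarray(belief)
    if belief.ndim == 1:
        return np.sort(np.argpartition(belief, -top_k)[-top_k:])
    # Batched case: one row of top_k item indices per posterior sample.
    return np.sort(np.argpartition(belief, -top_k, axis=1)[:, -top_k:], axis=1)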
def proposal(self):
    self.prior_belief = self.sample_from_posterior(
        self.ids_sampler.n_samples)
    self.ids_sampler.update_belief(self.prior_belief)
    # If the minimal regret gap is already below the threshold, skip the
    # information-ratio optimization and act greedily.
    greedy_proposal = (np.sqrt(self.ids_sampler.delta_min) <
                       self.regret_threshold)
    if greedy_proposal:
        assortment = act_optimally(
            np.squeeze(self.prior_belief.mean(0)),
            top_k=self.assortment_size,
        )
        g_approx = info_gain_step(
            action=assortment,
            sampled_preferences=self.prior_belief,
            actions_star=self.ids_sampler.actions_star,
            counts=self.ids_sampler.counts_star,
            thetas=self.ids_sampler.thetas_star,
        )
        g_approx = max(g_approx, 1e-12)
        d_approx = delta_step(
            action=assortment,
            sampled_preferences=self.prior_belief,
            r_star=self.ids_sampler.r_star,
        )
        rho_policy = 0.5
        ir_assortment = information_ratio(
            rho=rho_policy,
            d1=d_approx,
            d2=d_approx,
            g1=g_approx,
            g2=g_approx,
        )
        self.data_stored["greedy"].append(1)
    elif self.objective == "exact":
        assortment, ir_assortment, rho_policy = ids_exact_action(
            g_=self.ids_sampler.g_,
            d_=self.ids_sampler.d_,
            actions_set=self.all_actions,
            sampled_preferences=self.prior_belief,
            r_star=self.ids_sampler.r_star,
            actions_star=self.ids_sampler.actions_star,
            counts_star=self.ids_sampler.counts_star,
            thetas_star=self.ids_sampler.thetas_star,
        )
    else:
        assert self.objective == "lambda", "objective must be one of [exact, lambda]."
        if self.scaling == "autoreg":
            lambda_scaler = self.fitted_scaler
        elif self.scaling == "time":
            lambda_scaler = self.ids_sampler.lambda_algo * (
                self.T - self.current_step)
        else:
            raise ValueError("scaling must be one of [autoreg, time].")
        assortment, ir_assortment, rho_policy = greedy_ids_action(
            scaling_factor=lambda_scaler,
            g_=self.ids_sampler.g_,
            d_=self.ids_sampler.d_,
            sampled_preferences=self.prior_belief,
            r_star=self.ids_sampler.r_star,
            actions_star=self.ids_sampler.actions_star,
            counts_star=self.ids_sampler.counts_star,
            thetas_star=self.ids_sampler.thetas_star,
        )
        self.data_stored["greedy"].append(0)
    self.current_action = assortment
    self.fitted_scaler = ir_assortment
    self.data_stored["info_ratio"].append(ir_assortment)
    self.data_stored["entropy_a_star"].append(
        self.ids_sampler.a_star_entropy)
    self.data_stored["rho_policy"].append(rho_policy)
    self.data_stored["delta_min_2"].append(self.ids_sampler.delta_min ** 2)
    return assortment
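# Illustrative sketch (an assumption about information_ratio's formula, not
# confirmed by the original module): for a policy that plays action 1 with
# probability rho and action 2 otherwise, the information ratio is squared
# expected regret over expected information gain. With d1 == d2 and
# g1 == g2, as in the greedy branch above, it reduces to d1**2 / g1.
def _sketch_information_ratio(rho, d1, d2, g1, g2):
    expected_regret = rho * d1 + (1 - rho) * d2
    expected_gain = rho * g1 + (1 - rho) * g2
    return expected_regret ** 2 / expected_gain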