def _compute_IL_posterior(self, t):
    max_pulls = max([arm.pulls for arm in self.bandit.arms])
    for (arm_index, arm) in enumerate(self.bandit.arms):
        self.prediction_history[arm_index] = []
        self.posterior_history[arm_index] = [BetaDistribution(0.5, 0.5)]
        pre_alpha, pre_beta = copy.deepcopy(arm.reward_dist.get_params())
        num_reports = 0
        weight_0 = 1  # TODO: make the prior weight depend on the initial reputation
        weight = copy.deepcopy(weight_0)
        running_sum = 0.5 * weight_0
        # iterate through each agent and process their report
        for agent_index, agent in enumerate(self.agency.agents):
            gamma = min(self.agent_reputations_overall[agent], 1)
            # give full weight to the current agent's report for the prediction
            temp_running_sum = running_sum + self.agency.agent_reports[agent][arm_index]
            temp_weight = weight + 1
            q_j = temp_running_sum / temp_weight
            alpha_j = q_j * agent.num_reports
            beta_j = (1 - q_j) * agent.num_reports
            running_sum += self.agency.agent_reports[agent][arm_index] * gamma
            weight += gamma
            self.prediction_history[arm_index].append(
                BetaDistribution(copy.deepcopy(alpha_j), copy.deepcopy(beta_j)))
            q_j_tilde = running_sum / weight
            num_reports += gamma * agent.num_reports
            alpha_tilde = q_j_tilde * num_reports
            beta_tilde = (1 - q_j_tilde) * num_reports
            self.posterior_history[arm_index].append(
                BetaDistribution(copy.deepcopy(alpha_tilde), copy.deepcopy(beta_tilde)))
        # remove the 0.5-pseudo-report prior before forming the final posterior
        running_sum -= 0.5 * weight_0
        weight -= weight_0
        # guard against division by zero when every agent has zero reputation
        q_j_tilde = running_sum / weight if weight > 0 else 0.5
        # alternative caps on the report mass that were tried:
        # num_reports = min(num_reports, max(100 - arm.pulls, 0))
        # num_reports = min(num_reports, 30)
        # num_reports = min(num_reports, max_pulls - arm.pulls)
        alpha_tilde = q_j_tilde * num_reports
        beta_tilde = (1 - q_j_tilde) * num_reports
        arm.influence_reward_dist.set_params(alpha_tilde + pre_alpha,
                                             beta_tilde + pre_beta)
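# --- Illustration (not part of the class) ----------------------------------
# A minimal, self-contained sketch of the reputation-weighted update above,
# using made-up reports, reputations, and report counts. It shows how the
# weighted mean q_tilde and the pseudo-counts (alpha_tilde, beta_tilde) are
# formed before being added to the arm's pre-report Beta parameters.
def weighted_posterior_sketch(reports, reputations, counts, prior=(1.0, 1.0)):
    running_sum, weight, num_reports = 0.0, 0.0, 0.0
    for report, reputation, n in zip(reports, reputations, counts):
        gamma = min(1, reputation)      # influence cap, as in the method above
        running_sum += gamma * report
        weight += gamma
        num_reports += gamma * n
    q_tilde = running_sum / weight if weight > 0 else 0.5
    return (prior[0] + q_tilde * num_reports,
            prior[1] + (1 - q_tilde) * num_reports)

# e.g. two trusted agents and one low-reputation agent:
# weighted_posterior_sketch([0.9, 0.8, 0.1], [1.0, 1.0, 0.2], [10, 10, 10])
# -> approximately (18.2, 5.8): the low-reputation outlier has limited pull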
def _compute_IL_posterior(self, t):
    for (arm_index, arm) in enumerate(self.bandit.arms):
        self.prediction_history[arm_index] = []
        q_j_tilde = 0.5
        self.posterior_history[arm_index] = [BetaDistribution(0.5, 0.5)]
        pre_alpha, pre_beta = copy.deepcopy(arm.reward_dist.get_params())
        num_reports = 0
        weight_0 = np.log(t)  # prior weight grows with the round number; assumes t >= 1
        weight = copy.deepcopy(weight_0)  # TODO: make dependent on the initial reputation
        running_sum = 0.5 * weight
        # iterate through each agent and process their report
        for agent_index, agent in enumerate(self.agency.agents):
            time_factor = 1
            gamma = min(1, self.agent_reputations[agent])
            # give full weight to the current agent's report for the prediction
            temp_running_sum = running_sum + (self.agency.agent_reports[agent][arm_index] * time_factor)
            temp_weight = weight + time_factor
            q_j = temp_running_sum / temp_weight
            alpha_j = q_j * agent.num_reports
            beta_j = (1 - q_j) * agent.num_reports
            running_sum += self.agency.agent_reports[agent][arm_index] * gamma
            weight += gamma
            self.prediction_history[arm_index].append(
                BetaDistribution(copy.deepcopy(alpha_j), copy.deepcopy(beta_j)))
            # guard: at t = 1 the prior weight log(t) is 0, so weight can still be 0
            q_j_tilde = running_sum / weight if weight > 0 else q_j_tilde
            num_reports += gamma * agent.num_reports
            alpha_tilde = q_j_tilde * num_reports
            beta_tilde = (1 - q_j_tilde) * num_reports
            self.posterior_history[arm_index].append(
                BetaDistribution(copy.deepcopy(alpha_tilde), copy.deepcopy(beta_tilde)))
        # remove the log(t)-weighted 0.5 prior before forming the final posterior
        running_sum -= 0.5 * weight_0
        weight -= weight_0
        # guard against division by zero when every agent has zero reputation
        q_j_tilde = running_sum / weight if weight > 0 else 0.5
        alpha_tilde = q_j_tilde * num_reports
        beta_tilde = (1 - q_j_tilde) * num_reports
        arm.influence_reward_dist.set_params(alpha_tilde + pre_alpha,
                                             beta_tilde + pre_beta)
def _compute_IL_posterior(self, t):
    for (arm_index, arm) in enumerate(self.bandit.arms):
        self.posterior_history[arm_index] = [BetaDistribution(1, 1)]
        self.prediction_history[arm_index] = []
        pre_alpha, pre_beta = copy.deepcopy(arm.reward_dist.get_params())
        # prior weight decays toward 1 as log(T)/t shrinks over the horizon
        weight = max(0, 1 - (np.log(self.bandit.T) / t))
        running_weighted_sum = 0.5 * weight
        num_trust = 1
        # iterate through each agent and process their report
        for agent_index, agent in enumerate(self.agency.agents):
            gamma = min(1, self.agent_reputations[agent_index])
            if gamma >= 1:
                num_trust += 1
            alpha_j = self.agency.agent_reports[agent_index][arm_index] * agent.num_reports
            beta_j = (1 - self.agency.agent_reports[agent_index][arm_index]) * agent.num_reports
            self.prediction_history[arm_index].append(
                BetaDistribution(alpha_j, beta_j))
            running_weighted_sum += gamma * self.agency.agent_reports[agent_index][arm_index]
            weight += gamma
            # guard against division by zero when the prior weight and all reputations are zero
            q_tilde = running_weighted_sum / weight if weight > 0 else 0.5
            alpha_tilde = q_tilde * (agent.num_reports * num_trust)
            beta_tilde = (1 - q_tilde) * (agent.num_reports * num_trust)
            self.posterior_history[arm_index].append(
                BetaDistribution(alpha_tilde, beta_tilde))
        # uses the last agent's alpha_tilde/beta_tilde; assumes at least one agent
        arm.influence_reward_dist.set_params(alpha_tilde + pre_alpha,
                                             beta_tilde + pre_beta)
def _compute_IL_posterior(self, t):
    for (arm_index, arm) in enumerate(self.bandit.arms):
        self.prediction_history[arm_index] = []
        q_j_tilde = 0.5
        self.posterior_history[arm_index] = [BetaDistribution(q_j_tilde, 1 - q_j_tilde)]
        pre_alpha, pre_beta = copy.deepcopy(arm.reward_dist.get_params())
        weight = 0
        running_sum = 0
        num_reports = 0
        # iterate through each agent and process their report
        for agent_index, agent in enumerate(self.agency.agents):
            gamma = min(1, self.agent_reputations[agent_index])
            # give full weight to the current agent's report for the prediction
            temp_running_sum = running_sum + self.agency.agent_reports[agent_index][arm_index]
            temp_weight = weight + 1
            q_j = temp_running_sum / temp_weight
            temp_num_reports = num_reports + agent.num_reports
            alpha_j = q_j * (agent.num_reports * temp_num_reports)
            beta_j = (1 - q_j) * (agent.num_reports * temp_num_reports)
            running_sum += self.agency.agent_reports[agent_index][arm_index] * gamma
            num_reports += gamma * agent.num_reports
            weight += gamma
            self.prediction_history[arm_index].append(
                BetaDistribution(copy.deepcopy(alpha_j), copy.deepcopy(beta_j)))
            # alpha_j / (alpha_j + beta_j) reduces to q_j, so reuse it directly
            # (also avoids 0/0 when agent.num_reports == 0)
            q_j_tilde = (1 - gamma) * q_j_tilde + gamma * q_j
            alpha_tilde = q_j_tilde * num_reports
            beta_tilde = (1 - q_j_tilde) * num_reports
            self.posterior_history[arm_index].append(
                BetaDistribution(copy.deepcopy(alpha_tilde), copy.deepcopy(beta_tilde)))
        arm.influence_reward_dist.set_params(alpha_tilde + pre_alpha,
                                             beta_tilde + pre_beta)
def _compute_IL_posterior(self, t):
    for (arm_index, arm) in enumerate(self.bandit.arms):
        self.prediction_history[arm_index] = []
        pre_alpha, pre_beta = copy.deepcopy(arm.reward_dist.get_params())
        # seed the EMA with the first agent's report
        prev_ema = copy.deepcopy(self.agency.agent_reports[0][arm_index])
        self.posterior_history[arm_index] = [BetaDistribution(0.5, 0.5)]
        q_j_tilde = 0.5
        # iterate through each agent and process their report
        for agent_index, agent in enumerate(self.agency.agents):
            # smoothing factor approaches 1 as t grows, so later rounds
            # weight the incoming report more heavily
            k = 1 - 1 / (t + 1)
            gamma = min(1, self.agent_reputations[agent_index])
            current_ema = (self.agency.agent_reports[agent_index][arm_index] - prev_ema) * k + prev_ema
            prev_ema = copy.deepcopy(current_ema)
            alpha_j = current_ema * agent.num_reports
            beta_j = (1 - current_ema) * agent.num_reports
            self.prediction_history[arm_index].append(
                BetaDistribution(alpha_j, beta_j))
            # alpha_j / (alpha_j + beta_j) reduces to current_ema
            # (avoids 0/0 when agent.num_reports == 0)
            q_j = current_ema
            q_j_tilde = (1 - gamma) * q_j_tilde + gamma * q_j
            alpha_tilde = q_j_tilde * agent.num_reports
            beta_tilde = (1 - q_j_tilde) * agent.num_reports
            self.posterior_history[arm_index].append(
                BetaDistribution(alpha_tilde, beta_tilde))
        arm.influence_reward_dist.set_params(alpha_tilde + pre_alpha,
                                             beta_tilde + pre_beta)
def _compute_IL_posterior(self, t):
    for (arm_index, arm) in enumerate(self.bandit.arms):
        self.prediction_history[arm_index] = []
        pre_alpha, pre_beta = copy.deepcopy(arm.reward_dist.get_params())
        new_mean = copy.deepcopy(arm.reward_dist.mean())
        weight = 1
        running_weighted_sum = weight * new_mean
        q_tilde = running_weighted_sum / weight
        self.posterior_history[arm_index] = [BetaDistribution(q_tilde, 1 - q_tilde)]
        # standard span-based EMA smoothing factor
        k = 2 / (len(self.agency.agents) + 1)
        # note: prev_ema stays fixed at the SMA baseline, so every report is
        # smoothed against the same anchor rather than a running EMA
        prev_ema = self._compute_SMA(arm_index)
        # iterate through each agent and process their report
        for agent_index, agent in enumerate(self.agency.agents):
            gamma = min(1, self.agent_reputations[agent_index])
            current_ema = (self.agency.agent_reports[agent_index][arm_index] - prev_ema) * k + prev_ema
            alpha_j = current_ema * agent.num_reports
            beta_j = (1 - current_ema) * agent.num_reports
            self.prediction_history[arm_index].append(
                BetaDistribution(alpha_j, beta_j))
            q_j = copy.deepcopy(current_ema)
            running_weighted_sum += gamma * q_j
            weight += gamma
            q_tilde = running_weighted_sum / weight
            alpha_tilde = q_tilde * agent.num_reports
            beta_tilde = (1 - q_tilde) * agent.num_reports
            self.posterior_history[arm_index].append(
                BetaDistribution(alpha_tilde, beta_tilde))
        arm.influence_reward_dist.set_params(alpha_tilde + pre_alpha,
                                             beta_tilde + pre_beta)
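# --- Illustration (not part of the class) ----------------------------------
# A small sketch of the span-based EMA smoothing used above, with
# k = 2 / (N + 1). Note that the method above keeps prev_ema fixed at the
# SMA baseline; a conventional EMA advances it after each report, as here.
# Input values are hypothetical.
def ema_sketch(reports, baseline):
    k = 2 / (len(reports) + 1)
    ema = baseline
    smoothed = []
    for r in reports:
        ema = (r - ema) * k + ema   # same update form as in the loop above
        smoothed.append(ema)
    return smoothed

# e.g. ema_sketch([0.9, 0.2, 0.85], baseline=0.5)
# -> [0.7, 0.45, 0.65]: each report pulls the estimate halfway (k = 0.5)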
import copy

import numpy as np
import scipy.stats

# BetaDistribution, Nature, BayesUCB, Random, ThompsonSampling, and Oracle
# are this repo's own classes; import them from their local modules.


def mean_confidence_interval(data, confidence=0.95):
    """Column-wise mean and half-width of the t-based confidence interval."""
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a, axis=0), scipy.stats.sem(a, axis=0)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h


T = 500
K = 5
num_exp = 10
num_reports = 10
trust = [False, False, True, False]
initial_reputations = 1

world_priors = [BetaDistribution(1, 1) for k in range(K)]
nature = Nature(K, world_priors, len(trust))

bayes_ucb = BayesUCB(T, K, world_priors)
random = Random(T, K, world_priors)  # note: shadows the stdlib `random` module
thompson = ThompsonSampling(T, K, world_priors)
oracle = Oracle(copy.deepcopy(bayes_ucb), nature.agency)

bandits = [thompson, bayes_ucb, random]
key_map = {thompson: "Thompson", bayes_ucb: "Bayes UCB", random: "Random"}
key_color = {thompson: "red", bayes_ucb: "blue", random: "green"}

cumulative_regret_history = {
    bandit: np.zeros((num_exp, T)) for bandit in bandits
}
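# --- Sketch: experiment loop and plotting -----------------------------------
# A hedged sketch of how this setup might be consumed. The bandit/nature
# method names below (select_arm, generate_reward, update, best_mean) are
# assumptions for illustration -- substitute this repo's actual API. The
# plotting half only uses mean_confidence_interval as defined above plus
# standard matplotlib calls.
import matplotlib.pyplot as plt

for bandit in bandits:
    for exp in range(num_exp):
        regret = 0.0
        for t in range(T):
            arm = bandit.select_arm(t)            # hypothetical API
            reward = nature.generate_reward(arm)  # hypothetical API
            bandit.update(arm, reward)            # hypothetical API
            regret += nature.best_mean - reward   # hypothetical attribute
            cumulative_regret_history[bandit][exp][t] = regret

x = np.arange(T)
for bandit in bandits:
    mean, h = mean_confidence_interval(cumulative_regret_history[bandit])
    plt.plot(x, mean, color=key_color[bandit], label=key_map[bandit])
    plt.fill_between(x, mean - h, mean + h, color=key_color[bandit], alpha=0.2)
plt.xlabel("round")
plt.ylabel("cumulative regret")
plt.legend()
plt.show()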