Example #1
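All three examples rely on NumPy, SciPy, CVXPY, and the standard random module, and on a Bandit helper class that is never defined in this section. The block below is a minimal sketch of that missing scaffolding, assuming unit-variance Gaussian rewards and a pseudo-regret computed from pull counts; the actual Bandit implementation may differ.

import random

import numpy as np
import cvxpy as cp
from scipy.stats import norm


class Bandit():
    def __init__(self, means):
        # true mean-reward vector of the sampled task
        self.means = np.asarray(means, dtype=float)

    def pull_arm(self, arm):
        # noisy reward: true mean plus unit-variance Gaussian noise (assumed)
        return self.means[arm] + np.random.randn()

    def calculate_regret(self, counts):
        # pseudo-regret implied by the pull counts: sum_i (mu_star - mu_i) * N_i
        gaps = np.max(self.means) - self.means
        return float(np.dot(gaps, counts))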
class UCB():
    def __init__(self, models, n, alpha_ucb, task=None):
        self.models = models
        self.n = n
        self.alpha = alpha_ucb
        self.num_arms = models.shape[1]
        self.num_models = models.shape[0]
        self.counts = np.zeros([self.num_arms])
        self.means = np.zeros([self.num_arms])
        if task is None:
            self.task = np.random.choice(self.num_models)
        else:
            self.task = task
        self.bandit = Bandit(self.models[self.task])

    def select_arm_ucb(self, t):
        # play each arm once before using the UCB index
        for a in range(self.num_arms):
            if self.counts[a] == 0:
                return a
        # UCB index: empirical mean + sqrt(alpha * log(t) / N_i)
        ucb_values = self.means + np.sqrt(
            (self.alpha * np.log(t)) / self.counts)
        return np.argmax(ucb_values)

    def update_arm(self, action, reward):
        self.counts[action] += 1

        n = self.counts[action]
        value = self.means[action]
        self.means[action] = ((n - 1) / n) * value + (1. / n) * reward

    def run(self, T_list):
        regrets = []
        counts = np.zeros([self.n, self.num_arms])
        for t in range(self.n):
            action = self.select_arm_ucb(t)
            reward = self.bandit.pull_arm(action)
            self.update_arm(action, reward)
            if (t + 1) in T_list:
                regret = self.bandit.calculate_regret(
                    self.counts)  # keep track of regrets
                regrets.append(regret)
            counts[t, :] = self.counts

        return regrets, np.asarray(counts)
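A minimal usage sketch under the assumptions above; the two-model matrix, horizon, and alpha_ucb value are illustrative, not taken from the source:

models = np.array([[0.2, 0.5, 0.8],   # each row is one candidate mean vector
                   [0.8, 0.5, 0.2]])
agent = UCB(models, n=1000, alpha_ucb=2.0, task=0)
regrets, counts = agent.run(T_list=[100, 500, 1000])
print(regrets)  # regret recorded at t = 100, 500, 1000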
Example #2
class HypoTest(): 
    def __init__(self, models, n, alpha, beta, task=None): 
        self.models = models
        self.n = n
        self.num_arms = models.shape[1]
        self.num_models = models.shape[0]
        self.counts = np.zeros([self.num_arms])
        self.means = np.zeros([self.num_arms])
        if task is None: 
            self.task = np.random.choice(self.num_models)
        else: 
            self.task = task
        self.bandit = Bandit(self.models[self.task])

        self.alpha = alpha
        self.beta = beta 

    def get_num_pulls(self, arm):
        # number of pulls needed to separate the smallest and largest candidate
        # means of `arm` (standard normal sample-size formula)
        num = norm.ppf(self.beta) - norm.ppf(1. - self.alpha)
        den = np.abs(np.min(self.models[:, arm]) - np.max(self.models[:, arm]))
        return (num / den)**2

    def calculate_c(self, arm, num_pulls):
        # acceptance threshold on the empirical mean of `arm`
        c = np.min(self.models[:, arm]) + 1. / np.sqrt(num_pulls) * norm.ppf(1. - self.alpha)
        return c

    def calculate_power(self, arm, num_pulls, c):
        # probability that the empirical mean stays below c under models[1]'s mean
        beta = norm.cdf(np.sqrt(num_pulls) * (c - self.models[1, arm]))
        return beta

    def update(self, arm_t, r): 
        self.counts[arm_t] += 1
        n = self.counts[arm_t]
        value = self.means[arm_t]
        self.means[arm_t] = ((n-1)/n) * value + (1./n) * r


    def run(self, arm, num_pulls=None, c=None):
        # derive the sample size and threshold from the models if not supplied
        if num_pulls is None:
            num_pulls = np.ceil(self.get_num_pulls(arm))
            c = self.calculate_c(arm, num_pulls)
        action = arm
        total = 0
        for t in range(self.n):
            if t < num_pulls:
                # exploration phase: keep sampling the tested arm
                reward = self.bandit.pull_arm(action)
                total += reward
            elif t == num_pulls:
                # decision step: accept or reject based on the empirical mean,
                # then commit to the arm indexed by the selected model
                # (assumes model i's best arm is arm i)
                average = total / num_pulls
                if average > c:
                    action = np.argmax(self.models[:, arm])
                else:
                    action = np.argmin(self.models[:, arm])
                reward = self.bandit.pull_arm(action)
            else:
                # commitment phase: keep playing the chosen arm
                reward = self.bandit.pull_arm(action)
            self.update(action, reward)

        regret = self.bandit.calculate_regret(self.counts)
        return regret
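A usage sketch for the hypothesis-test policy. Here beta is treated as the target type-II error level, and the (illustrative) model matrix is arranged so that model i's best arm is arm i, which is what the commit step above assumes:

models = np.array([[0.9, 0.2],   # model 0: arm 0 is best
                   [0.1, 0.8]])  # model 1: arm 1 is best
agent = HypoTest(models, n=1000, alpha=0.05, beta=0.05, task=0)
regret = agent.run(arm=0)  # sample size and threshold derived internally
print(regret)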
Example #3
class mUCB():
    def __init__(self, models, n, alpha_eps, task=None):
        self.models = models
        self.n = n
        self.delta = 1. / self.n
        self.alpha_eps = alpha_eps
        self.num_arms = models.shape[1]
        self.num_models = models.shape[0]
        self.counts = np.zeros([self.num_arms])
        self.means = np.zeros([self.num_arms])
        if task is None:
            self.task = np.random.choice(self.num_models)
        else:
            self.task = task
        self.bandit = Bandit(self.models[self.task])

    # calculate confidence epsilon_i,t
    def calculate_e(self):
        eps = np.sqrt(
            np.log(self.num_models * self.n**self.alpha_eps / self.delta) /
            (2 * self.counts))
        return eps

    # calculate set of compatible models Theta_t
    def get_Theta_t(self, eps):
        indices = []
        for i in range(self.num_models):
            model = self.models[i]
            check = True
            for j in range(self.num_arms):
                if np.abs(model[j] - self.means[j]) > eps[j]:
                    check = False
            if check == True:
                indices.append(i)
        return self.models[indices, :]

    # get arm with highest reward
    def get_arm_t(self, Theta_t):
        index = np.unravel_index(Theta_t.argmax(), Theta_t.shape)
        return index[1]

    # update empirical estimates
    def update(self, arm_t, r):
        self.counts[arm_t] += 1
        n = self.counts[arm_t]
        value = self.means[arm_t]
        self.means[arm_t] = ((n - 1) / n) * value + (1. / n) * r

    def run(self, T_list):
        regrets = []
        counts = np.zeros([self.n, self.num_arms])

        # pull each arm once
        for t in range(self.num_arms):
            r = self.bandit.pull_arm(t)
            self.update(t, r)
            counts[t] = self.counts

        # main mUCB loop
        for t in range(self.num_arms, self.n):
            eps = self.calculate_e()  # confidence widths
            Theta_t = self.get_Theta_t(eps)  # get compatible models
            if len(Theta_t) == 0:
                # no model is consistent with the empirical means: signal failure
                return False, np.asarray(counts)
            else:
                arm_t = self.get_arm_t(Theta_t)  # get arm with maximum reward
            r = self.bandit.pull_arm(arm_t)  # pull the arm
            self.update(arm_t, r)

            if (t + 1) in T_list:
                regret = self.bandit.calculate_regret(
                    self.counts)  # keep track of regrets
                regrets.append(regret)
            counts[t, :] = self.counts
        return regrets, np.asarray(counts)
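A usage sketch for mUCB with the same illustrative two-model, three-arm instance; run returns (False, counts) if every candidate model has been eliminated:

models = np.array([[0.2, 0.5, 0.8],
                   [0.8, 0.5, 0.2]])
agent = mUCB(models, n=1000, alpha_eps=2.0, task=1)
result = agent.run(T_list=[100, 500, 1000])
if result[0] is False:
    print('no compatible model survived')
else:
    regrets, counts = result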
class StructuredBandit():
    def __init__(self, models, n, a, task=None):
        self.models = models
        self.n = n
        self.a = a
        self.num_arms = models.shape[1]
        self.num_models = models.shape[0]
        self.counts = np.zeros([self.num_arms])
        self.means = np.zeros([self.num_arms])
        if task is None:
            self.task = np.random.choice(self.num_models)
        else:
            self.task = task
        self.bandit = Bandit(self.models[self.task])

    def C_theta(self, theta):
        # per-arm squared gaps to each model whose best arm differs from theta's
        i_star = np.argmax(theta)
        i_list = []
        for m in range(len(self.models)):
            j_star = np.argmax(self.models[m])
            if i_star != j_star:
                i_list.append(np.square(self.models[m] - theta))
        return 0.5 * np.array(i_list)

    def get_alpha(self, theta):
        # integer program for the exploration allocation that certifies
        # theta's best arm
        d_theta = np.max(theta) - theta
        i_list = self.C_theta(theta)

        alpha = cp.Variable(self.num_arms, integer=True)
        objective = cp.Minimize(d_theta @ alpha)
        constraints = [
            alpha >= 0,
            i_list @ alpha >= 1,
        ]
        prob = cp.Problem(objective, constraints)
        prob.solve()  # requires a mixed-integer-capable solver
        return np.array(alpha.value)

    def get_constraints(self, theta):
        # feasibility LP relaxation of the allocation problem
        d_theta = np.max(theta) - theta
        i_list = self.C_theta(theta)

        alpha = cp.Variable(self.num_arms)
        objective = cp.Minimize(0)
        constraints = [
            alpha >= 0,
            i_list @ alpha >= 1,
        ]
        prob = cp.Problem(objective, constraints)
        prob.solve()
        return np.array(alpha.value)

    def run(self, T_list, method=None):
        counts = np.zeros([self.num_arms])
        totals = np.zeros([self.num_arms])
        regrets = []
        n_e = 0  # number of exploration rounds taken so far
        temp = [0, 0, 0]  # how often the exploit / forced / targeted branches fire
        counts_all = np.zeros([self.n, self.num_arms])
        exps = np.zeros([self.num_models])  # cumulative squared prediction error per model

        for t in range(self.n):
            if t < self.num_arms:
                reward = self.bandit.pull_arm(t)
                exps += np.square(self.models[:, t] - reward)
                totals[t] += reward
                counts[t] += 1
            else:
                if method == 'mle':
                    # pick the model with the smallest cumulative squared
                    # prediction error on the observed rewards
                    estimate = self.models[np.argmin(exps)]
                else:
                    # plug-in estimate: empirical mean of each arm
                    estimate = np.divide(totals, counts)

                # line 6
                i_list = self.C_theta(estimate)
                test = counts / (self.a * np.log(t))
                if np.all(np.dot(i_list, test) >= 1):
                    arm = np.argmax(estimate)
                    temp[0] += 1
                # line 9
                else:
                    # line 10
                    if np.min(counts) < (n_e / (2 * self.num_arms)):
                        arm = np.argmin(counts)
                        temp[1] += 1
                    # line 12
                    else:
                        alpha = self.get_alpha(estimate)
                        if alpha.size == self.num_arms:
                            # pull a random arm whose count is still below its
                            # target allocation
                            indices = np.where(test < alpha)[0]
                            arm = np.random.choice(indices)
                        else:
                            # solver failed: fall back to a uniformly random arm
                            arm = random.choice(range(self.num_arms))
                        temp[2] += 1
                    n_e += 1

                reward = self.bandit.pull_arm(arm)
                exps += np.square(self.models[:, arm] - reward)
                totals[arm] += reward
                counts[arm] += 1
            if (t + 1) in T_list:
                regret = self.bandit.calculate_regret(
                    counts)  # keep track of regrets
                regrets.append(regret)
            counts_all[t, :] = counts
        return regrets, np.asarray(counts_all)
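A usage sketch for StructuredBandit; the a value is illustrative, and get_alpha needs a CVXPY installation with a mixed-integer-capable solver:

models = np.array([[0.2, 0.5, 0.8],
                   [0.8, 0.5, 0.2]])
agent = StructuredBandit(models, n=1000, a=2.0, task=0)
regrets, counts_all = agent.run(T_list=[100, 500, 1000], method='mle')
print(regrets)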