Example #1
def compare_all(runs=2000, time=1000):
    """Compare all algorithms.
    """
    labels = ['epsilon-greedy', 'gradient bandit',
              'UCB', 'optimistic initialization']
    generators = [lambda epsilon: Bandit(epsilon=epsilon, sample_averages=True),
                  lambda alpha: Bandit(gradient=True, step_size=alpha, gradient_baseline=True),
                  lambda coef: Bandit(epsilon=0, UCB_param=coef, sample_averages=True),
                  lambda initial: Bandit(epsilon=0, optimistic_init=initial, step_size=0.1)]
    parameters = [np.arange(-7, -1, dtype=float),
                  np.arange(-5, 2, dtype=float),
                  np.arange(-4, 3, dtype=float),
                  np.arange(-2, 3, dtype=float)]

    bandits = []
    for generator, parameter in zip(generators, parameters):
        for param in parameter:
            bandits.append(generator(pow(2, param)))

    _, average_rewards = simulate(bandits, runs, time)
    rewards = np.mean(average_rewards, axis=1)

    i = 0
    for label, parameter in zip(labels, parameters):
        l = len(parameter)
        plt.plot(parameter, rewards[i:i+l], label=label)
        i += l
    plt.xlabel('Parameter(2^x)')
    plt.ylabel('Average reward')
    plt.legend()

    plt.savefig(os.path.join(SAVING_PATH, "figure_2_6_compare_all.png"))
    plt.close()
def run_experiment(m1, m2, m3, eps, N, experiment_name):
    bandits = [Bandit(m1), Bandit(m2), Bandit(m3)]

    data = np.empty(N)

    for i in range(N):
        # epsilon greedy
        p = np.random.random()
        if p < eps:
            j = np.random.choice(3)
        else:
            j = np.argmax([b.mean for b in bandits])
        x = bandits[j].pull()
        bandits[j].update(x)

        # for the plot
        data[i] = x
    cumulative_average = np.cumsum(data) / (np.arange(N) + 1)

    # # plot moving average ctr
    # plt.plot(cumulative_average)
    # plt.plot(np.ones(N) * m1)
    # plt.plot(np.ones(N) * m2)
    # plt.plot(np.ones(N) * m3)
    # plt.xscale('log')
    # plt.show()

    for i, b in enumerate(bandits):
        print(f'{experiment_name}, bandit {i + 1} mean: {b.mean} win: {np.sum(data)}')

    return cumulative_average
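A hypothetical usage sketch for the epsilon-greedy run_experiment above; the arm means, epsilon values, and pull count below are illustrative (not from the original project), and the log-scale plot mirrors the commented-out block inside the function:

if __name__ == '__main__':
    # illustrative settings: three arms with true means 1.0, 2.0, 3.0,
    # two exploration rates, and 100000 pulls per experiment
    c_10 = run_experiment(1.0, 2.0, 3.0, 0.1, 100000, 'eps = 0.1')
    c_05 = run_experiment(1.0, 2.0, 3.0, 0.05, 100000, 'eps = 0.05')

    # compare cumulative average reward of the two settings on a log x-axis
    plt.plot(c_10, label='eps = 0.1')
    plt.plot(c_05, label='eps = 0.05')
    plt.xscale('log')
    plt.legend()
    plt.show()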
Example #3
def gradient(runs=2000, time=1000):
    """Test the k-armed bandit with gradient.
    Args:
        runs (int): test each bandit with the # of runs.
        time (int): the # of the time-steps in each run.
    """
    bandits = []
    bandits.append(Bandit(gradient=True, step_size=0.1,
                          gradient_baseline=True, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.1,
                          gradient_baseline=False, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.4,
                          gradient_baseline=True, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.4,
                          gradient_baseline=False, true_reward=4))

    print("===== %s =====" % ("Gradient"))
    best_action_counts, _ = simulate(bandits, runs, time)
    labels = ['alpha = 0.1, with baseline',
              'alpha = 0.1, without baseline',
              'alpha = 0.4, with baseline',
              'alpha = 0.4, without baseline', ]

    for i in range(0, len(bandits)):
        plt.plot(best_action_counts[i], label=labels[i])
    plt.xlabel('Steps')
    plt.ylabel('% Optimal action')
    plt.legend()

    plt.savefig(os.path.join(SAVING_PATH, "figure_2_5_gradient.png"))
    plt.close()
Example #4
def experiment2():
    params = [{
        "time_horizon" : 500,
        "number_of_arms" : 5
    },
    {
        "time_horizon" : 5000,
        "number_of_arms" : 5
    },
    {
        "time_horizon" : 500,
        "number_of_arms" : 10
    },
    {
        "time_horizon" : 5000,
        "number_of_arms" : 10
    },
    {
        "time_horizon" : 500,
        "number_of_arms" : 20
    },
    {
        "time_horizon" : 5000,
        "number_of_arms" : 20
    },
    ]
    
    for i in range(len(params)):
        results = []
        time_horizon = params[i]["time_horizon"]
        number_of_arms = params[i]["number_of_arms"]
        agent1 = agentFactory("random",time_horizon,number_of_arms)
        
        epsilons = []
        for j in range(time_horizon):
            # decaying exploration rate: epsilon_t ~ (K * log t / t)^(1/3), capped at 1
            epsilons.append(min(1.0, math.pow(number_of_arms * math.log(j + 1) / (j + 1), 1 / 3)))
        
        agent2 = agentFactory("epsilon-greedy",time_horizon,number_of_arms,epsilons)
        agent3 = agentFactory("explore-then-exploit",time_horizon,number_of_arms,time_horizon/100)
        agent4 = agentFactory("ucb1",time_horizon,number_of_arms)
        agent5 = agentFactory("successive-elimination",time_horizon,number_of_arms)

        bandit = Bandit(time_horizon,number_of_arms,agent1)
        results.append(mc_simulate(n_sim,bandit))
 
        bandit = Bandit(time_horizon,number_of_arms,agent2)
        results.append(mc_simulate(n_sim,bandit))
        
        bandit = Bandit(time_horizon,number_of_arms,agent3)
        results.append(mc_simulate(n_sim,bandit,"N=T/100"))

        bandit = Bandit(time_horizon,number_of_arms,agent4)
        results.append(mc_simulate(n_sim,bandit))
        
        bandit = Bandit(time_horizon,number_of_arms,agent5)
        results.append(mc_simulate(n_sim,bandit))

        plot(results,time_horizon,params[i])
Example #5
def plot_figures(k,
                 n_bandits,
                 n_steps,
                 eps_list,
                 weight_fn=sample_average,
                 random_walk=False,
                 y_bounds=[0, 1.5],
                 Q_1=0,
                 show=True,
                 method='epsilon-greedy',
                 extra_label='',
                 title=None,
                 percentage=False):
    avg_rew_per_eps = [np.zeros(n_steps) for _ in range(len(eps_list))]
    avg_rew_in_perc = [np.zeros(n_steps) for _ in range(len(eps_list))]
    for n in range(n_bandits):
        print(n)
        bandit_pb = Bandit(k)
        for i, eps in enumerate(eps_list):
            _, per, avg_rew, _ = a_simple_bandit_algorithm(
                bandit_pb,
                n_iterations=n_steps,
                eps=eps,
                weight_fn=weight_fn,
                random_walk=random_walk,
                Q_1=Q_1,
                method=method)
            avg_rew_per_eps[i] += avg_rew
            avg_rew_in_perc[i] += per

    to_plot = avg_rew_in_perc if percentage else avg_rew_per_eps
    bounds = [0, 100] if percentage else y_bounds
    plot_average(to_plot, eps_list, n_bandits, bounds, show, extra_label,
                 title, percentage)
Example #6
def epsilon_greedy(runs=2000, time=1000):
    """Test the k-armed bandit with the policy of epsilon greedy. 
    Args:
        runs (int): test each bandit with the # of runs.
        time (int): the # of the time-steps in each run.
    """
    epsilons = [0, 0.01, 0.1, 0.2]
    bandits = [Bandit(epsilon=eps, sample_averages=True) for eps in epsilons]

    print("===== {} =====".format("Epsilon greedy"))
    best_action_counts, rewards = simulate(bandits, runs, time)

    plt.figure(figsize=(10, 20))
    plt.subplot(2, 1, 1)
    for eps, reward in zip(epsilons, rewards):
        plt.plot(reward, label="epsilon = %.02f" % (eps))
    plt.xlabel('steps')
    plt.ylabel('average reward')
    plt.legend()

    plt.subplot(2, 1, 2)
    for eps, counts in zip(epsilons, best_action_counts):
        plt.plot(counts, label="epsilon = %.02f" % (eps))
    plt.xlabel('steps')
    plt.ylabel('% optimal action')
    plt.legend()

    plt.savefig(os.path.join(SAVING_PATH, "figure_2_2_epsilon_greedy.png"))
    plt.close()
Example #7
def fig_2_5(n_bandits=2000, n_steps=1000, k=10, alpha_list=[0.1, 0.4]):
  d = {}
  for baseline in [False, True]:
    for alpha in alpha_list:
      d[(baseline, alpha)] = np.zeros(n_steps)
  for n in range(n_bandits):
    print(n)
    bandit = Bandit(k, mean=4)
    for baseline in [False, True]:
      for alpha in alpha_list:
        result_arr, _ = gradient_bandit(bandit, n_steps=n_steps,
                                                alpha=alpha, baseline=baseline)
        d[(baseline, alpha)] += result_arr

  def label(baseline, alpha):
    return ("with" if baseline else "without") + f" baseline, alpha={alpha}"
  for key, avg_rew in d.items():
    plt.plot((avg_rew / n_bandits) * 100, label=label(key[0], key[1]))
  axes = plt.gca()
  axes.set_ylim([0, 100])
  plt.xlabel("Steps")
  plt.ylabel("Optimal Action %")
  plt.title("Figure 2.5")
  plt.legend()
  plt.show()
Example #8
def create_bandit(means, variances):
    def reward_fn(mu, variance):
        stddev = np.sqrt(variance)
        return lambda: np.random.normal(mu, stddev)

    arms = list(map(lambda a: reward_fn(*a), zip(means, variances)))
    return Bandit(arms)
Example #9
File: train.py Project: zhasulan/mab
    def run(self, name, agent_name, **agent_parameters):
        """

        :param name: Name of experiments
        :param agent_name: Name of Agent
        :param agent_parameters: Parameters of Agent
        """
        rewards = np.zeros(self.pulls)
        optimal_actions = np.zeros(self.pulls)

        for _ in tqdm(range(self.experiments)):
            bandit = Bandit(self.pulls, self.actions, agent_name,
                            **agent_parameters)
            reward, optimal_action = bandit.experiment()
            rewards += reward
            optimal_actions += optimal_action

        rewards /= float(self.experiments)
        optimal_actions /= float(self.experiments)

        self.values[name] = {}
        self.values[name]['rewards'] = rewards
        self.values[name]['optimal_actions'] = optimal_actions

Example #10
def run_greedy(number_bandits, epsilon, iterations):
    bandits = [Bandit(i + 1, 0) for i in range(number_bandits)]
    current_best = bandits[0]
    data = np.empty(iterations)
    print(f'Starting with bandit {current_best.true_mean}.')
    for i in range(iterations):
        explore_exploit = np.random.rand()
        bandit = current_best
        # explore
        if explore_exploit < epsilon:
            selection = np.random.randint(0, number_bandits)
            #print(f'Machine {selection} selected.')
            bandit = bandits[selection]
        # pull the chosen bandit (explore or exploit) and record the reward
        value = bandit.pull()
        bandit.update(value)
        data[i] = value
        # update the running best bandit
        if current_best.current_mean < bandit.current_mean:
            print(f'Updated to bandit {bandit.true_mean}')
            current_best = bandit

    print(f'Chose bandit {current_best.true_mean}')
    cumulative_average = np.cumsum(data) / (np.arange(iterations) + 1)
    plt.plot(cumulative_average)
    for i in range(number_bandits):
        plt.plot(np.ones(iterations) * (i + 1))
    plt.xscale('log')
    plt.show()
    return cumulative_average
Example #11
def captions_bandit():
    data = captions_data

    def reward_fn(percentage):
        return lambda: np.random.choice((0, 0.5, 1), 1, p=percentage)[0]

    arms = list(map(reward_fn, data))
    return Bandit(arms)
Example #12
def bandit():
    bandit = Bandit()
    game.init_bandit(bandit)
    player = game.get_player()
    return render_template('bandit.html',
                           credits_demanded=bandit.get_credits_demanded(),
                           fskills=player.get_fighter_skills(),
                           pskills=player.get_pilotskills())
Example #13
def experiment1():
    params = [{
        "time_horizon" : 1000,
        "number_of_arms" : 5
    },
    {
        "time_horizon" : 10000,
        "number_of_arms" : 5
    },
    {
        "time_horizon" : 1000,
        "number_of_arms" : 10
    },
    {
        "time_horizon" : 10000,
        "number_of_arms" : 10
    },
    {
        "time_horizon" : 1000,
        "number_of_arms" : 20
    },
    {
        "time_horizon" : 10000,
        "number_of_arms" : 20
    },
    ]
    
    for i in range(len(params)):
        results = []
        time_horizon = params[i]["time_horizon"]
        number_of_arms = params[i]["number_of_arms"]
        agent1 = agentFactory("explore-then-exploit",time_horizon,number_of_arms,5)
        agent2 = agentFactory("explore-then-exploit",time_horizon,number_of_arms,time_horizon/10)
        agent3 = agentFactory("explore-then-exploit",time_horizon,number_of_arms,time_horizon/100)
        
        bandit = Bandit(time_horizon,number_of_arms,agent1)
        results.append(mc_simulate(n_sim,bandit,"N=5"))
 
        bandit = Bandit(time_horizon,number_of_arms,agent2)
        results.append(mc_simulate(n_sim,bandit,"N=T/10"))
        
        bandit = Bandit(time_horizon,number_of_arms,agent3)
        results.append(mc_simulate(n_sim,bandit,"N=T/100"))

        plot(results,time_horizon,params[i])
def main():
    """
    Constants
    """
    k: int = 20
    epsilon: float = 0.01
    init_val: int = 10
    c: float = 1.0
    max_time: int = 1000
    rounds: int = 1000
    policy: Policy = EpsilonGreedyPolicy(epsilon)
    agent = Agent(k, policy)
    bandits = [Bandit() for _ in range(k)]

    def play_round():
        """
        Simulates one round of the game.
        """
        # get the next action
        action = agent.choose_action()
        # get a reward from the bandit
        reward = bandits[action].get_reward()
        # play the action
        agent.play_action(action, reward)
        return reward

    def reset():
        agent.reset()
        for bandit in bandits:
            bandit.reset()
        optimal_bandit = np.argmax([bandit.get_reward() for bandit in bandits])

    def print_bandits():
        for i, bandit in enumerate(bandits):
            print('Bandit {} reward={}'.format(i, bandit.get_reward()))

    def experiment():
        scores = np.zeros(max_time, dtype=float)
        for _ in range(rounds):
            for t in range(max_time):
                scores[t] += play_round()
            reset()

        return scores / rounds

    def plot(label):
        print_bandits()
        scores = experiment()
        time = range(max_time)
        plt.title(label + " for k = " + str(k))
        plt.ylim([0.0, 2.0])
        plt.xlabel('Steps')
        plt.ylabel('Avg. Reward')
        plt.scatter(x=time, y=scores, s=0.5)
        plt.show()

    plot(policy.__str__())
Example #15
def captions_bandit(n):
    categories = np.array([0, 0.5, 1])
    data = captions_data()[:n]

    def reward_fn(percentage):
        return lambda: np.random.choice(categories, 1, p=percentage)[0]

    arms = list(map(reward_fn, data))
    return Bandit(arms)
Example #16
def poisson_exp_bandit():
    n = n_arms()
    means_ = poisson_exp_means()

    def reward_fn(lambda_):
        return lambda: np.random.poisson(lambda_)

    arms = list(map(reward_fn, means_))
    return Bandit(arms)
def sparse_bandit(n_arms, variance):
    means = sparse_means(n_arms)
    stddev = np.sqrt(variance)

    def reward_fn(mu):
        return lambda: np.random.normal(mu, stddev)

    arms = list(map(reward_fn, means))
    return Bandit(arms)
Example #18
def polynomial_bandit(n, variance):
    means = polynomial_means(n)
    stddev = np.sqrt(variance)

    def reward_fn(mu):
        return lambda: np.random.normal(mu, stddev)

    arms = list(map(reward_fn, means))
    return Bandit(arms)
Example #19
def upper_confidence_bound(runs=2000, time=1000):
    """Test the k-armed bandit with UCB.
    Args:
        runs (int): the number of runs for each bandit.
        time (int): the number of time-steps in each run.
    """
    bandits = []
    bandits.append(Bandit(epsilon=0, UCB_param=2, sample_averages=True))
    bandits.append(Bandit(epsilon=0.1, sample_averages=True))
    _, average_rewards = simulate(bandits, runs, time)

    plt.plot(average_rewards[0], label='UCB c = 2')
    plt.plot(average_rewards[1], label='epsilon greedy epsilon = 0.1')
    plt.xlabel('Steps')
    plt.ylabel('Average reward')
    plt.legend()

    plt.savefig(os.path.join(SAVING_PATH, "figure_2_4_UCB.png"))
    plt.close()
Example #20
def get_market_data():
    global username
    if request.method == "GET":
        band1 = Bandit()
        trader1 = Trader()
        police1 = Police()
        cost1 = Cost()

        market_inventory = regions.child(currRegion).child('inventory').get()
        ship_inventory = users.child(username).child('ship').child(
            'inventory').get()
        difficulty = users.child(username).child('difficulty').get()
        ship_cargo = users.child(username).child('ship').child('cargo').get()
        credit_val = users.child(username).child('credit').get()
        ship_health = users.child(username).child('ship').child('health').get()
        pilot_skill = users.child(username).child('skills').child(
            'pilot').get()
        engineer = users.child(username).child('skills').child(
            'engineer').get()
        fighter_skill = users.child(username).child('skills').child(
            'fighter').get()
        merchant_skill = users.child(username).child('skills').child(
            'merchant').get()
        fuel = users.child(username).child('ship').child('fuel').get()
        fuelcost = cost1.calculate_fuel(difficulty, credit_val)
        demand = band1.calculate_demand(difficulty, credit_val)
        repair = cost1.calculate_repair(difficulty, engineer, credit_val)
        price = trader1.item_to_sell(difficulty, credit_val)
        qty = trader1.qty
        item = trader1.item
        stolen = police1.stolen_item(ship_inventory)
        to_return = {
            'username': username,
            'currRegion': currRegion,
            'market_inventory': market_inventory,
            'ship_inventory': ship_inventory,
            'cargo': ship_cargo,
            'credit': credit_val,
            'health': ship_health,
            'demand': demand,
            'qty': qty,
            'item': item,
            'price': price,
            'eng': engineer,
            'stolen': stolen,
            'difficulty': difficulty,
            'pilot': pilot_skill,
            'fighter': fighter_skill,
            'fuel': fuel,
            'fuelcost': fuelcost,
            'merch': merchant_skill,
            'repair': repair
        }
        return to_return
    return None
Example #21
def run_experiment(m1, m2, m3, N, experiment_name):
    bandits = [Bandit(m1), Bandit(m2), Bandit(m3)]

    data = np.empty(N)

    for i in range(N):
        j = np.argmax([ucb(b.mean, i + 1, b.N) for b in bandits])
        x = bandits[j].pull()
        bandits[j].update(x)

        # for the plot
        data[i] = x
    cumulative_average = np.cumsum(data) / (np.arange(N) + 1)

    for i, b in enumerate(bandits):
        print(
            f'{experiment_name}, bandit {i + 1} mean: {b.mean} win: {np.sum(data)}'
        )

    return cumulative_average
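The ucb helper called above is not defined in this snippet; a minimal sketch of what it could look like, using the standard UCB1 bound (the small constant in the denominator, added here to avoid division by zero for unpulled arms, is an assumption):

def ucb(mean, n, nj):
    # UCB1 score: empirical mean plus an exploration bonus that shrinks as the
    # arm's pull count nj grows relative to the total number of pulls n
    return mean + np.sqrt(2 * np.log(n) / (nj + 1e-6))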
Example #22
 def __init__(self, predict_market, num_bids, trials,
              label='Multi-Armed Prediction Market Bandit'):
     self.predict_market = predict_market
     self.n_arms = predict_market.arms
     self.agents = predict_market.agents
     self.data = predict_market.dataframe
     self.num_bids = num_bids
     self.label = label
     self.bandit = Bandit(self.n_arms, self.data)
     self.trials = trials
     self.scores = None
     self.optimal = None
Example #23
 def init_bandits(self, holdout=True):
     """Specify some way to split up the indices?"""
     df_len = len(self.data_sim.df.index)
     num_divide = self.n_bandits + 1 if holdout else self.n_bandits
     increment = int(df_len / num_divide)
     for i in range(self.n_bandits):
         indices = [i * increment, (i + 1) * increment]
         self.bandits.append(Bandit(self.n_arms, self.data_sim.df,
                                    data_indices=indices))
     if holdout:
         self.df_holdout = self.data_sim.df[df_len - increment + 1:]
     self.trials = increment if self.trials is None else self.trials
Example #24
def createBanditInstancesAndSimulate(params,n_mc_sim):
    n_sim = n_mc_sim
    for i in range(len(params)):
        results = []
        time_horizon = params[i]['time_horizon']
        number_of_arms = params[i]['number_of_arms']
        number_of_exploration_per_arm = params[i]['number_of_exploration_per_arm']

        exp_agent = ExploreThenExploit(time_horizon,number_of_arms,number_of_exploration_per_arm)
        epsilon_greedy_constant_half_epsilonAgent = EpsilonGreedy(time_horizon,number_of_arms,[1/2]*time_horizon)
        epsilon_greedy_constant_epsilonAgent = EpsilonGreedy(time_horizon,number_of_arms,[number_of_exploration_per_arm*number_of_arms/time_horizon]*time_horizon)
        ubc_agent = UBC1Agent(time_horizon,number_of_arms)
        se_agent = SuccessiveEliminationAgent(time_horizon,number_of_arms) 
        random_agent = Agent()
        
        bandit = Bandit(time_horizon,number_of_arms,random_agent)
        results.append(mc_simulate(n_sim,bandit))

        bandit = Bandit(time_horizon,number_of_arms,exp_agent)
        results.append(mc_simulate(n_sim,bandit))

        bandit = Bandit(time_horizon,number_of_arms,epsilon_greedy_constant_epsilonAgent)
        results.append(mc_simulate(n_sim,bandit,"constant-epsilon=rate-of-explore-exploit"))

        bandit = Bandit(time_horizon,number_of_arms,epsilon_greedy_constant_half_epsilonAgent)
        results.append(mc_simulate(n_sim,bandit,"constant-epsilon=0.5"))

        bandit = Bandit(time_horizon,number_of_arms,ubc_agent)
        results.append(mc_simulate(n_sim,bandit))

        bandit = Bandit(time_horizon,number_of_arms,se_agent)
        results.append(mc_simulate(n_sim,bandit))

        plot(results,time_horizon,params[i])
 def __init__(self, models, n, a, task=None):
     self.models = models
     self.n = n
     self.a = a
     self.num_arms = models.shape[1]
     self.num_models = models.shape[0]
     self.counts = np.zeros([self.num_arms])
     self.means = np.zeros([self.num_arms])
     if task is None:
         self.task = np.random.choice(self.num_models)
     else:
         self.task = task
     self.bandit = Bandit(self.models[self.task])
Example #26
def run(algorithm, non_stationary=False):
    """Runs an algorithm with the specified paramters.

    algorithm: Instance of an algorithm from `algorithms.py`.
    non_stationary: Whether to run the stationary or non_stationary test bench.
    """
    with open('config.yml') as cfile:
        config = y.safe_load(cfile)['run']
    runs, steps = config['runs'], config['steps']

    avg_rewards, optim_action_percent = np.zeros(steps), np.zeros(steps)

    for run in range(runs):
        bandit = Bandit(non_stationary)
        print(f'Run number {run + 1}.')

        # One-run rewards
        or_rewards = []
        # One-run actions
        or_actions = []
        optim_action = np.argmax([bandit.q_star(a) for a in range(10)])
        # One-run optimal actions
        or_optim_actions = [] if non_stationary else optim_action

        for step in range(1, steps + 1):
            action = algorithm.act(step)
            reward = bandit(action)
            if non_stationary:
                optim_action = np.argmax([bandit.q_star(a) for a in range(10)])
            algorithm.update(action, reward, step)

            or_rewards.append(reward)
            or_actions.append(action)
            if non_stationary:
                or_optim_actions.append(optim_action)
        
        avg_rewards += or_rewards

        if non_stationary:
            a, o = np.array(or_actions), np.array(or_optim_actions)
        else:
            a, o = np.array(or_actions), or_optim_actions
        optim_action_percent += (a == o)
        
        algorithm.reset()
    
    avg_rewards /= runs
    optim_action_percent = (optim_action_percent / runs) * 100.0

    return avg_rewards, optim_action_percent
Example #27
 def __init__(self, agent, k=10, stationary=True):
     print(f'Initialising {k} Bandits')
     self.k = k
     self.timestep = 0
     self.player = agent
     self.stationary = stationary
     if agent.character == 'optimistic':
         self.values = [np.random.randint(50, 100)] * self.k
     else:
         self.values = np.zeros(self.k)
     mu = [np.random.randint(10) for _ in range(self.k)]
     sig = [np.random.rand() for _ in range(self.k)]
     print(f"Initial actual average rewards are {mu}")
     self.bandits = [
         Bandit(mean, sd, stationary) for (mean, sd) in zip(mu, sig)
     ]
Example #28
def run_optimistic(number_bandits, iterations):
    bandits = [Bandit(i + 1, 10) for i in range(number_bandits)]
    data = np.empty(iterations)
    for i in range(iterations):
        bandit = bandits[np.argmax([bandit.current_mean
                                    for bandit in bandits])]
        # exploit
        value = bandit.pull()
        bandit.update(value)
        data[i] = value

    cumulative_average = np.cumsum(data) / (np.arange(iterations) + 1)
    plt.plot(cumulative_average)
    for i in range(number_bandits):
        plt.plot(np.ones(iterations) * (i + 1))
    plt.xscale('log')
    plt.show()
    return cumulative_average
def compare_epsilons(
        epsilons: List[float],
        bandits_true_means: List[float],
        iterations: int,
) -> Tuple[List[EpsilonGreedyAgent], List[float]]:
    """
    Compare different epsilons for epsilon-greedy algorithm.
    """
    agents = []
    bandits = [Bandit(m) for m in bandits_true_means]

    for epsilon in epsilons:
        logger.info("Running epsilon-greedy for epsilon = %f", epsilon)
        agent = EpsilonGreedyAgent(bandits=bandits, epsilon=epsilon)
        agent.take_actions(iterations)
        agents.append(agent)

    return agents, epsilons
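A hypothetical call to compare_epsilons; the epsilon grid, true means, and iteration count are illustrative, and the plotting loop assumes EpsilonGreedyAgent exposes a per-step rewards history as agent.rewards (an assumption about that class's API):

agents, eps_values = compare_epsilons(
    epsilons=[0.01, 0.05, 0.1],
    bandits_true_means=[1.0, 2.0, 3.0],
    iterations=10000,
)
for agent, eps in zip(agents, eps_values):
    # running average reward per step, assuming agent.rewards is a list of per-step rewards
    running_avg = np.cumsum(agent.rewards) / (np.arange(len(agent.rewards)) + 1)
    plt.plot(running_avg, label=f"epsilon = {eps}")
plt.legend()
plt.show()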
Example #30
def test_greedy(epsilon, num_iterations):
    # Problem setup
    num_bandits = 5 # set number of bandits
    m_vals = [np.random.randint(30,41) for _ in range(num_bandits)] # create True Mean values
    bandits = [Bandit(m=i) for i in m_vals] # create bandits
    data = np.empty(num_iterations) # create an empty array for output data

    # Epsilon-Greedy algorithm
    for i in range(num_iterations):
        probability = np.random.rand()
        if probability < epsilon:
            bandit = bandits[np.random.choice(num_bandits)]
        else:
            bandit = bandits[np.argmax([bandit.mean for bandit in bandits])]
        
        output = bandit.pull()
        bandit.update(output)
        data[i] = output
    
    # Get cumulative average of all spins
    cumulative_average = np.cumsum(data) / (np.arange(num_iterations) + 1)

    return cumulative_average