def step(self, bandit: "Bandit", epsilon=0):
    """Select an arm via epsilon-greedy, pull it, and update the estimate.

    With probability ``epsilon`` a uniformly random arm is chosen
    (exploration); otherwise one of the arms tied for the highest
    estimated value is sampled uniformly (exploitation, random
    tie-breaking instead of an argmax bias toward low indices).

    :param bandit: environment exposing ``pull(lever) -> reward``.
    :param epsilon: exploration probability in [0, 1]; 0 is pure greedy.
    :return: tuple ``(chosen_lever, reward)``.
    """
    # Flip a coin to decide between a random and a greedy action.
    flip_coin = self.rng.rand()
    if flip_coin < epsilon:
        # Explore: uniformly random arm.
        chosen_lever = self.rng.randint(self.n_arms)
    else:
        # Exploit: collect every arm tied for the best estimate, then
        # sample one at random.  (Renamed from `reward` — the original
        # reused that name for both the best Q-value and the pulled
        # reward below.)
        best_value = max(self.Q)
        best_levers = [lever for lever in range(self.n_arms)
                       if self.Q[lever] == best_value]
        chosen_lever = best_levers[self.rng.randint(len(best_levers))]

    reward = bandit.pull(chosen_lever)
    self.N[chosen_lever] += 1
    # Incremental sample-average update: Q <- Q + (1/N) * (r - Q).
    self.Q[chosen_lever] = self.Q[chosen_lever] + (1 / self.N[chosen_lever]) * (reward - self.Q[chosen_lever])
    return chosen_lever, reward
class Training(TrainingBase):
    """Drives a fixed-horizon bandit episode: alternate agent updates and pulls."""

    def __init__(self):
        super(Training, self).__init__()
        # Episode length is configurable; default to a 1000-step horizon.
        self.steps = options.get('environment/steps', 1000)
        self.bandit = Bandit()

    def episode(self, number):
        # Prime the loop: the agent proposes its first action with no
        # reward observed yet.
        action = self.agent.update(reward=None, state=[])
        # Feed each pull's payoff back to the agent for the whole horizon.
        for t in range(self.steps):
            payoff = self.bandit.pull(action)
            action = self.agent.update(reward=payoff, state=[])
            log.info('step: %s, action: %s' % (t, action))
class PredictionMarketEnv(object):
    """Multi-armed bandit test bed, optionally mediated by a prediction market.

    Builds a ``Bandit`` from the market's arm count and data, then runs
    repeated trials in which agents either act individually
    (``market=False``) or pool their beliefs through the prediction
    market and take one joint action (``market=True``).
    """

    def __init__(self, predict_market, num_bids, trials,
                 label='Multi-Armed Prediction Market Bandit'):
        self.predict_market = predict_market
        self.n_arms = predict_market.arms
        self.agents = predict_market.agents
        self.data = predict_market.dataframe
        self.num_bids = num_bids
        self.label = label
        self.bandit = Bandit(self.n_arms, self.data)
        self.trials = trials
        self.scores = None   # (trials, n_agents) average rewards, set by run()
        self.optimal = None

    def run(self, experiments=1, market=True):
        """Run the trial with or without the prediction market.

        :param experiments: independent repetitions to average over.
        :param market: if True, agents trade bids through the market and a
            single joint action is chosen; otherwise each agent acts alone.
        :return: ndarray of shape (trials, n_agents) of averaged rewards.
        """
        scores = np.zeros((self.trials, len(self.agents)))
        if market is False:
            for _ in range(experiments):
                for trial in range(self.trials):
                    self.bandit.reset()
                    for idx, agent in enumerate(self.agents):
                        action = agent.choose()
                        reward, max_reward = self.bandit.pull(action)
                        agent.observe(reward, max_reward, update=True)
                        scores[trial, idx] += reward
        else:
            for _ in range(experiments):
                for trial in range(self.trials):
                    self.predict_market.reset()
                    # Each agent submits the column-wise mean of num_bids
                    # sampled bids.  (Bug fix: the original reused the loop
                    # variable `i` for the agent, bid and parameter loops;
                    # distinct names prevent accidental clobbering.)
                    for agent in self.agents:
                        bids = [agent.bid() for _ in range(self.num_bids)]
                        bid = np.mean(bids, axis=0)  # column-wise mean
                        self.predict_market.get_bids(bid, agent.id)
                    normal_params = self.predict_market.settle_market()
                    # Thompson-style draw: sample each arm from its settled
                    # (mean, variance) pair and act greedily on the samples.
                    # NOTE(review): uses the global np.random state — consider
                    # a seeded Generator for reproducibility.
                    arm_samples = [np.random.normal(params[0], params[1] ** 0.5)
                                   for params in normal_params]
                    action = np.argmax(arm_samples)
                    reward, max_reward = self.bandit.pull(action)
                    # Every agent observes the jointly chosen action's outcome.
                    for idx, agent in enumerate(self.agents):
                        agent.current_action = action
                        agent.observe(reward, max_reward, update=True)
                        scores[trial, idx] += reward
        self.scores = scores / experiments
        return self.scores

    def plot_results(self, market=True):
        """Plot average per-trial rewards (shared layout for both modes)."""
        scores = self.scores
        plt.figure()
        ax = plt.subplot(111)
        if market:
            ax.set_title('Multi-Armed Bandit Market Reward')
            ax.plot(scores, 'b.')
        else:
            ax.set_title('Multi-Armed Bandit Rewards')
            ax.plot(scores, '.')
        # Shrink the axes to make room for the legend on the right.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.set_ylabel('Average Reward')
        # NOTE(review): the legend says 'Prediction Market' in both modes;
        # the market=False branch probably wants per-agent labels — confirm.
        ax.legend(['Prediction Market'], loc='center left', bbox_to_anchor=(1, 0.5))
        plt.show()