示例#1
0
    def step(self, bandit : Bandit, epsilon= 0):
        '''
        Play one round using epsilon-greedy action selection.

        One draw from ``self.rng.rand()`` is compared against ``epsilon``. When the
        draw is smaller, a lever index is drawn with ``self.rng.randint(self.n_arms)``.
        Otherwise one of the levers whose estimate in ``self.Q`` equals the current
        maximum is picked, with ties broken by another ``self.rng.randint`` draw.
        The chosen lever is pulled and its pull count ``self.N`` and running-mean
        estimate ``self.Q`` are updated in place.

        :param bandit: object whose ``pull(lever)`` returns the reward for that lever
        :param epsilon: exploration threshold compared against the random draw
            (presumably a probability in [0, 1]; confirm against ``self.rng``)
        :return: tuple ``(chosen_lever, reward)``
        '''

        # The draw happens on every call, even when epsilon is 0, so the random
        # stream advances identically regardless of the branch taken.
        explore = self.rng.rand() < epsilon

        if explore:
            # exploration: any lever
            arm = self.rng.randint(self.n_arms)
        else:
            # exploitation: collect every lever tied for the highest estimate,
            # then pick one of them at random
            top_estimate = max(self.Q)
            candidates = [idx for idx in range(self.n_arms) if self.Q[idx] == top_estimate]
            pick = self.rng.randint(len(candidates))
            arm = candidates[pick]

        payout = bandit.pull(arm)

        # Incremental sample-mean update for the lever that was actually pulled:
        # Q <- Q + (1 / N) * (reward - Q)
        self.N[arm] += 1
        step_size = 1/self.N[arm]
        error = payout - self.Q[arm]
        self.Q[arm] = self.Q[arm] + step_size*error

        return arm, payout
示例#2
0
class Training(TrainingBase):
    """Training run that lets the agent interact with a single ``Bandit``."""

    def __init__(self):
        """Initialise the base class, then set the step count and the bandit.

        The number of steps per episode is read from the ``'environment/steps'``
        option and falls back to 1000 when that option is absent.
        """
        super(Training, self).__init__()
        self.steps = options.get('environment/steps', 1000)
        self.bandit = Bandit()

    def episode(self, number):
        """Run one episode of ``self.steps`` pull/update rounds.

        ``self.agent`` is not set in this class; presumably ``TrainingBase``
        provides it (confirm against the base class).

        :param number: episode number; not used by this implementation
        """
        # The agent is asked for an initial action before any reward exists.
        action = self.agent.update(state=[], reward=None)

        # Each round: pull the chosen arm, feed the reward back, get the next action.
        for step_index in range(self.steps):
            payout = self.bandit.pull(action)
            action = self.agent.update(state=[], reward=payout)
            message = 'step: %s, action: %s' % (step_index, action)
            log.info(message)
示例#3
0
class PredictionMarketEnv(object):
    """Run a set of agents against a multi-armed bandit, with or without a market.

    Without the market, every agent chooses and pulls its own arm each trial.
    With the market, every agent submits averaged bids, the market is settled
    into per-arm parameters, a single arm is sampled from them and pulled, and
    all agents observe that shared outcome.
    """

    def __init__(self, predict_market, num_bids, trials,
                 label='Multi-Armed Prediction Market Bandit'):
        """Store the market, derive agents/arms/data from it and build the bandit.

        :param predict_market: market object providing ``arms``, ``agents``,
            ``dataframe``, ``reset()``, ``get_bids()`` and ``settle_market()``
        :param num_bids: number of bids drawn from each agent per trial; the
            bids are averaged before being handed to the market
        :param trials: number of trials per experiment
        :param label: descriptive label stored on the instance
        """
        self.predict_market = predict_market
        self.n_arms = predict_market.arms
        self.agents = predict_market.agents
        self.data = predict_market.dataframe
        self.num_bids = num_bids
        self.label = label
        self.bandit = Bandit(self.n_arms, self.data)
        self.trials = trials
        self.scores = None
        self.optimal = None

    def run(self, experiments=1, market=True):
        """Run the trials with or without the prediction market.

        :param experiments: number of times the full set of trials is repeated;
            rewards are accumulated and then divided by this number
        :param market: truthy to route decisions through the prediction market,
            falsy to let every agent act on its own
        :return: array of shape ``(trials, len(agents))`` holding the reward per
            trial and agent averaged over the experiments; also stored in
            ``self.scores``
        """
        scores = np.zeros((self.trials, len(self.agents)))

        # Truthiness instead of ``market is False``: an identity test sends
        # falsy non-``False`` values such as ``np.bool_(False)`` or ``0`` down
        # the market path, and ``plot_results`` already relies on truthiness.
        run_trial = self._run_market_trial if market else self._run_bandit_trial

        for _ in range(experiments):
            for trial in range(self.trials):
                run_trial(trial, scores)

        self.scores = scores / experiments
        return self.scores

    def _run_bandit_trial(self, trial, scores):
        """Let every agent pull its own arm once and add the rewards to ``scores[trial]``."""
        self.bandit.reset()
        for column, agent in enumerate(self.agents):
            action = agent.choose()
            reward, max_reward = self.bandit.pull(action)
            agent.observe(reward, max_reward, update=True)
            scores[trial, column] += reward

    def _run_market_trial(self, trial, scores):
        """Settle the market once, pull the sampled arm and add the shared reward to ``scores[trial]``."""
        # NOTE(review): unlike the no-market path, self.bandit.reset() is not
        # called here; only the market is reset. Confirm this is intended.
        self.predict_market.reset()

        for agent in self.agents:
            bids = [agent.bid() for _ in range(self.num_bids)]
            bid = np.mean(bids, axis=0)  # column-wise mean
            self.predict_market.get_bids(bid, agent.id)

        normal_params = self.predict_market.settle_market()
        # One sample per arm; params[1] is square-rooted before being used as
        # the scale, so it is presumably a variance (confirm with the market).
        arm_samples = [np.random.normal(params[0], params[1]**0.5)
                       for params in normal_params]
        action = np.argmax(arm_samples)
        reward, max_reward = self.bandit.pull(action)

        # Every agent observes the single market-selected action and its reward.
        for column, agent in enumerate(self.agents):
            agent.current_action = action
            agent.observe(reward, max_reward, update=True)
            scores[trial, column] += reward

    def plot_results(self, market=True):
        """Plot ``self.scores`` (set by ``run``) as points and show the figure.

        :param market: truthy when the scores came from a market run; selects
            the title, the marker format and the legend labels
        """
        scores = self.scores
        plt.figure()
        ax = plt.subplot(111)

        if market:
            title = 'Multi-Armed Bandit Market Reward'
            fmt = 'b.'
            labels = ['Prediction Market']
        else:
            title = 'Multi-Armed Bandit Rewards'
            fmt = '.'
            # Without the market each score column belongs to one agent, so
            # label them individually rather than as 'Prediction Market'.
            labels = ['Agent %d' % index for index in range(len(self.agents))]

        ax.set_title(title)
        ax.plot(scores, fmt)
        # Shrink the axes horizontally to leave room for the legend on the right.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.set_ylabel('Average Reward')
        ax.legend(labels, loc='center left',
                  bbox_to_anchor=(1, 0.5))
        plt.show()