def optimistic_initial_values(runs=2000, time=1000):
    bandits = []
    bandits.append(Bandit(epsilon=0, initial=5, step_size=0.1))
    bandits.append(Bandit(epsilon=0.1, initial=0, step_size=0.1))
    best_action_counts, _ = simulate(runs, time, bandits)

    plt.plot(best_action_counts[0], label='epsilon = 0, q = 5')
    plt.plot(best_action_counts[1], label='epsilon = 0.1, q = 0')
    plt.xlabel('Steps')
    plt.ylabel('% optimal action')
    plt.legend()

    plt.savefig('../images/figure_2_3.png')
    plt.close()
def UCB(runs=2000, time=1000):
    bandits = []
    bandits.append(Bandit(epsilon=0, UCB_param=2, sample_averages=True))
    bandits.append(Bandit(epsilon=0.1, sample_averages=True))
    _, average_rewards = simulate(runs, time, bandits)

    plt.plot(average_rewards[0], label='UCB c = 2')
    plt.plot(average_rewards[1], label='epsilon greedy epsilon = 0.1')
    plt.xlabel('Steps')
    plt.ylabel('Average reward')
    plt.legend()

    plt.savefig('../images/figure_2_4.png')
    plt.close()
def init_bandits(reward_interval_lists):
    # init six bandits with their rewards
    bandit_list = []
    for i in range(len(reward_interval_lists)):
        b = Bandit(i, reward_interval_lists[i])
        bandit_list.append(b)
    return bandit_list
def run_experiment(true_means, N, upper_limit):
    bandits = []
    # pdb.set_trace()
    for tm in true_means:
        bandits.append(Bandit(tm, upper_limit))

    data = np.empty(N, dtype=np.float16)
    for n in range(N):
        # Greedy selection: the optimistic initial estimates provide the exploration.
        i = np.argmax([b.est_mean for b in bandits])
        sample = bandits[i].pull()
        data[n] = sample
        bandits[i].update(sample)

    mean_winning = np.cumsum(data) / np.arange(1, N + 1)
    plt.figure()
    plt.plot(mean_winning)
    plt.title('Mean Winnings')

    colors = ['orange', 'blue', 'green']
    for b, c in zip(bandits, colors):
        # plt.plot(np.arange(1, N + 1), b.est_mean * np.ones((N,)))
        plt.fill_between(np.arange(1, N + 1), b.true_mean,
                         b.est_mean * np.ones((N,)), color=c)
        plt.annotate(str(b.true_mean), xy=(N - 5, b.true_mean))
        print(b.est_mean, b.true_mean)
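# For reference, a minimal sketch of the Bandit interface the experiment above
# assumes (the names true_mean, est_mean, pull, update come from the snippet;
# the Gaussian reward model and the optimistic upper_limit initialization are
# assumptions):
import numpy as np

class Bandit:
    """One slot machine: fixed true mean, running estimate of that mean."""
    def __init__(self, true_mean, upper_limit=10.0):
        self.true_mean = true_mean
        self.est_mean = upper_limit  # optimistic initial value drives exploration
        self.N = 0                   # number of pulls so far

    def pull(self):
        # Assumed reward model: unit-variance Gaussian around the true mean.
        return np.random.randn() + self.true_mean

    def update(self, x):
        # Incremental sample-average update: Q += (x - Q) / n.
        self.N += 1
        self.est_mean += (x - self.est_mean) / self.N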
def gradient_bandit(runs=2000, time=1000):
    bandits = []
    bandits.append(Bandit(gradient=True, step_size=0.1, gradient_baseline=True, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.1, gradient_baseline=False, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.4, gradient_baseline=True, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.4, gradient_baseline=False, true_reward=4))
    best_action_counts, _ = simulate(runs, time, bandits)
    labels = ['alpha = 0.1, with baseline',
              'alpha = 0.1, without baseline',
              'alpha = 0.4, with baseline',
              'alpha = 0.4, without baseline']

    for i in range(len(bandits)):
        plt.plot(best_action_counts[i], label=labels[i])
    plt.xlabel('Steps')
    plt.ylabel('% Optimal action')
    plt.legend()

    plt.savefig('../images/figure_2_5.png')
    plt.close()
def main():
    if len(sys.argv) < 3:
        print_usage()
        exit()

    num_pulls = int(sys.argv[1])
    goal = int(sys.argv[2])
    arms = [Arm(arm.split(",")) for arm in sys.argv[3:]]
    bandit = Bandit(arms)

    alg = IncrementalUniformBandit(num_pulls, bandit)
    print("NOTE: cumulative regret is incorrect right now")
    print("uniform simple regret", alg.get_simple_regret())
    print("uniform cumulative regret", alg.get_cum_regret())
    print("uniform best arm", alg.get_best_arm())

    alg = UCBBandit(num_pulls, bandit)
    print("ucb simple regret", alg.get_simple_regret())
    print("ucb cumulative regret", alg.get_cum_regret())
    print("ucb best arm", alg.get_best_arm())
def run_experiment(true_means, N, eps):
    """Run one epsilon-greedy experiment; written as a function so it can be
    repeated with different settings."""
    bandits = []
    for tm in true_means:
        bandits.append(Bandit(tm))

    data = np.empty(N)
    for pix in range(N):
        p = np.random.random()
        if p < eps:
            # Explore: choose any slot machine uniformly (note: chosen equally,
            # without excluding the current 'best').
            j = np.random.choice(len(bandits))
        else:
            # Exploit: recompute the estimates every step, then take the best one.
            j = np.argmax([b.est_mean for b in bandits])
        pull_result = bandits[j].pull()
        bandits[j].update(pull_result)
        data[pix] = pull_result

    # see how the average reward has fluctuated
    cumulative_average = np.cumsum(data) / (np.arange(N) + 1)

    plt.figure()
    plt.plot(cumulative_average)
    for tm in true_means:
        plt.plot(np.ones(N) * tm, c='orange')
        plt.annotate(str(tm), xy=(0, tm))
    for b in bandits:
        plt.plot(np.ones(N) * b.est_mean, c='blue')
        plt.annotate(str(b.est_mean), xy=(N - 1, b.est_mean))
    # plt.xscale('log')
    plt.title('eps=' + str(eps))
    plt.show()

    # serves as a test of how quickly you get to the optimal arm
    return cumulative_average
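# A possible driver for run_experiment above; the arm means, horizon, and
# epsilon values here are illustrative assumptions, not from the source.
if __name__ == '__main__':
    # Larger epsilon explores more and learns faster but keeps paying for
    # exploration; smaller epsilon converges slower but plateaus higher.
    c_10 = run_experiment([1.0, 2.0, 3.0], 100000, eps=0.10)
    c_05 = run_experiment([1.0, 2.0, 3.0], 100000, eps=0.05)
    c_01 = run_experiment([1.0, 2.0, 3.0], 100000, eps=0.01)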
def greedy(runs=2000, time=1000):
    epsilons = [0, 0.1, 0.01]
    bandits = [Bandit(epsilon=eps, sample_averages=True) for eps in epsilons]
    best_action_counts, rewards = simulate(runs, time, bandits)

    plt.figure(figsize=(10, 20))

    plt.subplot(2, 1, 1)
    for eps, reward in zip(epsilons, rewards):  # avoid shadowing `rewards`
        plt.plot(reward, label='epsilon = %.02f' % eps)
    plt.xlabel('steps')
    plt.ylabel('average reward')
    plt.legend()

    plt.subplot(2, 1, 2)
    for eps, counts in zip(epsilons, best_action_counts):
        plt.plot(counts, label='epsilon = %.02f' % eps)
    plt.xlabel('steps')
    plt.ylabel('% optimal action')
    plt.legend()

    plt.savefig('../images/figure_2_2.png')
    plt.close()
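# The figure functions above (greedy, optimistic_initial_values, UCB,
# gradient_bandit) all rely on a simulate() helper that is not shown. A minimal
# sketch of what it presumably does; the Bandit methods reset/act/step and the
# best_action attribute are assumptions consistent with how the results are used.
import numpy as np

def simulate(runs, time, bandits):
    """Return per-bandit curves of % optimal action and reward, averaged over runs."""
    best_action_counts = np.zeros((len(bandits), runs, time))
    rewards = np.zeros((len(bandits), runs, time))
    for i, bandit in enumerate(bandits):
        for r in range(runs):
            bandit.reset()
            for t in range(time):
                action = bandit.act()
                rewards[i, r, t] = bandit.step(action)
                if action == bandit.best_action:
                    best_action_counts[i, r, t] = 1
    # Average over runs, leaving one curve of length `time` per bandit.
    return best_action_counts.mean(axis=1), rewards.mean(axis=1)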
def encounter(self):
    encounterChance = random.randint(0, 1)
    if encounterChance == 1:
        if self.difficulty == "Easy":
            chance = random.randint(1, 6)
            if chance == 1:
                randAttacker = random.randint(0, 1)
                if randAttacker == 0:
                    if len(self.player.ship.inventory) > 0:
                        self.npc = Police()
                    else:
                        self.npc = Bandit(self.difficulty)
                else:
                    self.npc = Bandit(self.difficulty)
            else:
                self.npc = Trader(self.curr_region)
        if self.difficulty == "Medium":
            chance = random.randint(1, 6)
            if chance > 1 and chance < 4:
                randAttacker = random.randint(0, 1)
                if randAttacker == 0:
                    if len(self.player.ship.inventory) > 0:
                        self.npc = Police()
                    else:
                        self.npc = Bandit(self.difficulty)
                else:
                    self.npc = Bandit(self.difficulty)
            else:
                self.npc = Trader(self.curr_region)
        if self.difficulty == "Hard":
            chance = random.randint(3, 6)
            if chance > 3:
                randAttacker = 0
                if randAttacker == 0:
                    if len(self.player.ship.inventory) > 0:
                        self.npc = Police()
                    else:
                        self.npc = Bandit(self.difficulty)
                else:
                    self.npc = Bandit(self.difficulty)
            else:
                self.npc = Trader(self.curr_region)
    else:
        self.npc = None
import matplotlib.pyplot as plt
from Bandit import Bandit
from Player import Player
from EpsilonGreedyStrategy import EpsilonGreedyStrategy
from UCBStrategy import UCBStrategy
import random

bandit1 = Bandit(10)
playerA = UCBStrategy(bandit1, "A")  # from the perspective of some player


def plotRegret(history):
    # Cumulative regret on the y-axis against time on the x-axis.
    time = [i for i in range(len(history))]
    plt.plot(time, history)


def run_simulation(max_time, player, bandit):
    curr_time = 0
    while curr_time < max_time:
        player.playArm(curr_time)
        curr_time += 1
    print("Player estimates:{}".format(player.estimated_values))
    print("True bandit:{}".format(bandit.arms))
    print("Player's reward: {}".format(player.reward))
    plotRegret(player.cum_regret_history)
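# The script defines run_simulation but never calls it; a plausible driver,
# reusing the 3000-step horizon that plotRegret originally hard-coded
# (the plt.show() call is an assumption):
if __name__ == "__main__":
    run_simulation(3000, playerA, bandit1)
    plt.show()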
from Bandit import Bandit
from Agent import Agent
import numpy as np
import matplotlib.pyplot as plt
import pickle

# initialise variables
no_of_iterations = 2000
no_of_time_steps = 1000
all_rewards = np.zeros((no_of_time_steps, no_of_iterations))

# learn: a fresh 10-armed bandit and agent for every iteration
for i in np.arange(no_of_iterations):
    bandit = Bandit(10)
    agent = Agent(bandit, no_of_time_steps, 0.1, 0)
    agent.learn()
    all_rewards[:, i] = agent.rewards_per_time
    # print(bandit.action_values)
    # print(agent.action_selection_counts)
    # print(agent.action_value_estimate)

# plot the reward curve averaged over all iterations
plt.plot(np.arange(0, 1000) + 1, np.mean(all_rewards, axis=1))
plt.show()

# save to file
pickle.dump(all_rewards, open("stationary_rewards_eps0.1.pkl", "wb"))
# all_rewards = pickle.load(open("save.pkl", "rb"))
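# The Agent class is not shown. Judging from Agent(bandit, no_of_time_steps, 0.1, 0),
# the last two arguments are presumably epsilon and the initial action-value
# estimate. A minimal epsilon-greedy sketch under that assumption; bandit.k and
# bandit.step(a) are also assumed names.
import numpy as np

class Agent:
    """Epsilon-greedy agent with incremental sample-average estimates."""
    def __init__(self, bandit, time_steps, epsilon, initial):
        self.bandit = bandit
        self.time_steps = time_steps
        self.epsilon = epsilon
        self.action_value_estimate = np.full(bandit.k, float(initial))
        self.action_selection_counts = np.zeros(bandit.k)
        self.rewards_per_time = np.zeros(time_steps)

    def learn(self):
        for t in range(self.time_steps):
            if np.random.random() < self.epsilon:
                a = np.random.randint(self.bandit.k)            # explore
            else:
                a = int(np.argmax(self.action_value_estimate))  # exploit
            r = self.bandit.step(a)  # assumed reward call
            self.action_selection_counts[a] += 1
            # Incremental sample average: Q += (R - Q) / n.
            self.action_value_estimate[a] += (
                (r - self.action_value_estimate[a]) / self.action_selection_counts[a]
            )
            self.rewards_per_time[t] = r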
class ActiveLearning:
    def __init__(self, Lx, Ly, Ux, Uy, Tx, Ty, method="svm", budget=251, optimize=50, datasetname="dataset"):
        self.datasetname = datasetname
        self.Lx = Lx
        self.Ly = Ly
        self.Ux = Ux
        self.Uy = Uy  # TODO should not be here
        self.Tx = Tx  # TODO should not be here
        self.Ty = Ty  # TODO should not be here
        self.optimize = optimize
        self.budget = budget
        self.accuracys = []
        self.clf = Classification(self.Lx, self.Ly, method=method, Vx=Lx + Ux, Vy=Ly + Uy)
        self.clf.train()
        self.mab = Bandit(algos=np.arange(0., 1.1, 0.1), method="boltzmann")
        # self.mab = Bandit(algos=np.arange(0., 1.1, 0.1), method="UCB")
        # self.mab2 = Bandit(algos=["disag1", "disag2"], method="boltzmann")
        self.mab2 = Bandit(algos=["disag1", "disag2"], method="reinforcement")
        # self.mab2 = Bandit(algos=["disag1", "disag2"], method="EXP3")

    # ---------------------------------------
    def train(self, mtd="margin", backupfile="backupfile.txt"):
        # TODO implement sample_weight + make method to shuffle and return sublist with data_limit
        for i in range(self.budget):
            if len(self.Ux) <= 1:
                break
            # Dispatch to the requested query strategy.
            if mtd == "margin": ids, scores = self.query_margin()
            if mtd == "proba": ids, scores = self.query_proba()
            if mtd == "entropy": ids, scores = self.query_entropy()
            if mtd == "random": ids, scores = self.query_random()
            if mtd == "weight": ids, scores = self.query_sufficient_weight()
            if mtd == "eer": ids, scores = self.query_eer()
            if mtd == "dist": ids, scores = self.query_sufficient_distance()
            if mtd == "disag1": ids, scores = self.query_disagreement1()
            if mtd == "disag2": ids, scores = self.query_disagreement2()
            if mtd == "disag3": ids, scores = self.query_disagreement3()
            if mtd == "balance": ids, scores = self.query_balance()
            if mtd == "balanced_disag1": ids, scores = self.query_balanced_disag1()
            if mtd == "balanced_disag2": ids, scores = self.query_balanced_disag2()
            if mtd == "disag1_balanced": ids, scores = self.query_disag1_balanced()
            if mtd == "disag2_balanced": ids, scores = self.query_disag2_balanced()
            if mtd == "exp": ids, scores = self.query_explote_explore()
            if mtd == "test": ids, scores = self.query_disagreement_test()

            # Move the top-ranked unlabeled point into the labeled set.
            id = ids[0]
            qx = self.Ux[id]
            qy = self.Uy[id]
            self.Lx.append(qx)
            self.Ly.append(qy)
            self.Ux.pop(id)
            self.Uy.pop(id)

            self.clf.X = self.Lx
            self.clf.Y = self.Ly
            self.clf.train()
            test_accuracy = self.clf.getTestAccuracy(self.Tx, self.Ty)
            self.accuracys.append(test_accuracy)

            print("i=", i + 1, "-- acc=%.4f" % (test_accuracy * 100),
                  "-- %.4f" % (np.mean(self.accuracys) * 100),
                  "%.4f" % (np.average(self.accuracys, weights=range(1, 1 + len(self.accuracys))) * 100),
                  "--", scores[0], scores[1])

            if (i + 1) % 10 == 0:
                Util.pickleSave(backupfile, self)
                # viz = Visualize()
                # viz.plot([range(len(self.accuracys)), self.accuracys], fig=backupfile + ".png", color='r', marker='-')
                '''
                colors = ['r', 'b', 'g', 'k', 'm', 'c', '0.10', '0.35', '0.60', '0.90']
                viz.plot( zip(*self.Lx+self.Ux), fig = backupfile+"__.png", color = [colors[int(l)] for l in self.Ly+self.Uy], marker = 'o' )
                viz.do_plot( zip(*self.Ux), color = ['y']*len(self.Ux), marker = '.' )
                viz.do_plot( zip(*self.Lx), color = [colors[int(l)] for l in self.Ly], marker = 'o' )
                viz.end_plot( fig = backupfile+"_.png" )
                '''

    # ---------------------------------------
    def sort_scores(self, scores):
        # Fall back to margin uncertainty when every score is zero.
        if sum(scores) == 0.:
            scores = [self.clf.uncertainty_margin(x) for x in self.Ux]
        ids = (-np.array(scores)).argsort()
        sorted_scores = [scores[id] for id in ids]
        return ids, sorted_scores

    # ---------------------------------------
    def query_margin(self):
        return self.sort_scores([self.clf.uncertainty_margin(x) for x in self.Ux])

    # ---------------------------------------
    def query_proba(self):
        return self.sort_scores([self.clf.uncertainty_prediction(x) for x in self.Ux])

    # ---------------------------------------
    def query_entropy(self):
        return self.sort_scores([self.clf.uncertainty_entropy(x) for x in self.Ux])

    # ---------------------------------------
    def query_random(self):
        return self.sort_scores([random.uniform(0., 1.) for x in self.Ux])

    # ---------------------------------------
    def query_sufficient_weight(self):
        ids, _ = self.query_margin()
        return self.sort_scores([self.clf.uncertainty_weight(x, self.Lx, self.Ly)
                                 if ix in ids[:self.optimize] else 0.
                                 for ix, x in enumerate(self.Ux)])

    # ---------------------------------------
    def query_eer(self, limit_Y=20):
        ids, _ = self.query_margin()
        scores = []
        for ix, x in enumerate(self.Ux):
            if ix in ids[:self.optimize]:
                YP = self.clf.predict(x, all=True)
                YP.sort(key=operator.itemgetter(1), reverse=True)
                sums = 0.
                for ir, (yy, proba) in enumerate(YP):
                    if ir == limit_Y:
                        break
                    temp_clf = Classification(self.Lx + [x], self.Ly + [yy], method=self.clf.method)
                    temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C  # TODO FIXME: do it in general, not specifically for svm
                    temp_clf.train()
                    e_h1 = sum([temp_clf.uncertainty_entropy(dp) for dp in self.Ux if dp != x])
                    sums += proba * e_h1
                informativeness = 1. / sums
            else:
                informativeness = 0.
            scores.append(informativeness)
        return self.sort_scores(scores)

    # ---------------------------------------
    def query_sufficient_distance(self):
        ids, _ = self.query_margin()
        scores = []
        for ix, x in enumerate(self.Ux):
            if ix in ids[:self.optimize]:
                y1, y2, p1, p2 = self.clf.getMarginInfo(x)
                C = [dp for idp, dp in enumerate(self.Lx) if self.Ly[idp] == y2]
                CDx = [Util.dist(dp, x) for idp, dp in enumerate(self.Lx) if self.Ly[idp] == y2]
                idsC = (np.array(CDx)).argsort()
                xx = Util.medoid([C[idp] for idp in idsC[:1]])
                # Binary search for the smallest step toward xx that flips the label.
                step = 0.01
                lower = 0.
                upper = 1.
                while (upper - lower) > step:
                    w = (upper + lower) / 2.
                    px = np.array(x) + w * (np.array(xx) - np.array(x))
                    if self.clf.predict_label(px) != y1:
                        upper = w
                    else:
                        lower = w
                informativeness = 1. - w
            else:
                informativeness = 0.
            scores.append(informativeness)
        return self.sort_scores(scores)

    # ---------------------------------------
    def query_disagreement1(self, weighted=False, op=1):
        ids, _ = self.query_margin()
        scores = []
        for ix, x in enumerate(self.Ux):
            if ix in ids[:self.optimize * op]:
                # true_y = self.Uy[ix]
                true_y = self.clf.predict_label(x)
                temp_clf = Classification(self.Lx + [x], self.Ly + [true_y], method=self.clf.method)
                temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C
                temp_clf.train()
                if not weighted:
                    diff = sum([1. if temp_clf.predict_label(dp) != self.clf.predict_label(dp) else 0.
                                for dp in self.Ux if dp != x])
                else:
                    diff = sum([abs(temp_clf.getPredictProba(1, dp) - self.clf.getPredictProba(1, dp))
                                if temp_clf.predict_label(dp) != self.clf.predict_label(dp) else 0.
                                for dp in self.Ux if dp != x])
                    # diff = sum([ Util.dist(temp_clf.h.predict_proba(dp)[0], self.clf.h.predict_proba(dp)[0]) if temp_clf.predict_label(dp) != self.clf.predict_label(dp) else 0. for dp in self.Ux if dp != x ])
                informativeness = diff
            else:
                informativeness = 0.
            scores.append(informativeness)
        return self.sort_scores(scores)

    # ---------------------------------------
    def query_disagreement2(self, weighted=False, op=1):
        ids, _ = self.query_margin()
        scores = []
        commitee = []
        for idp, dp in enumerate(self.Ux):
            if idp in ids[:self.optimize * op]:
                # true_y = self.Uy[idp]
                true_y = self.clf.predict_label(dp)
                temp_clf = Classification(self.Lx + [dp], self.Ly + [true_y], method=self.clf.method)
                temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C
                temp_clf.train()
                commitee.append((temp_clf, 1))
        for ix, x in enumerate(self.Ux):
            if ix in ids[:self.optimize * op]:
                preds = Counter()
                if weighted:  # weight using the probability distribution of the committee
                    for (clf, _) in commitee:
                        if self.clf.predict_label(x) != clf.predict_label(x):
                            YP = zip(clf.h.classes_, clf.h.predict_proba(x)[0])
                            for (y, p) in YP:
                                preds[y] += p
                    preds = preds.most_common()
                    diff = 0. if preds == [] else preds[0][1]
                    # diff = 0. if preds == [] else (preds[0][1] - preds[1][1] if len(preds) > 1 else preds[0][1])
                else:
                    # confis = [clf.getPredictProba(1, x) for (clf, _) in commitee if self.clf.predict_label(x) != clf.predict_label(x)]
                    labels = [clf.predict_label(x) for (clf, _) in commitee if self.clf.predict_label(x) != clf.predict_label(x)]
                    preds = Counter(labels)
                    preds = preds.most_common()
                    diff = 0. if preds == [] else sum([pred[1] for pred in preds])
                informativeness = diff
            else:
                informativeness = 0.
            scores.append(informativeness)
        return self.sort_scores(scores)

    # ---------------------------------------
    def query_disagreement3(self):
        # Let a bandit choose between the two disagreement strategies.
        id_algo = self.mab2.choose()
        algo = self.mab2.algos[id_algo]
        print("Chosen =", algo, "nb_choices =", self.mab2.nb_choices,
              "mean rew=", [np.mean(L) for L in self.mab2.rewards])
        if algo == "disag1":
            ids, scores = self.query_disagreement1(weighted=True)
        if algo == "disag2":
            ids, scores = self.query_disagreement2()
        reward = self.get_change(self.Ux[ids[0]], self.Uy[ids[0]])
        self.mab2.update(id_algo, reward)
        return ids, scores

    # ---------------------------------------
    def query_balanced_disag1(self, weighted=True, op=1):
        ids, _ = self.query_margin()
        scores = []
        scores_B = []
        for ix, x in enumerate(self.Ux):
            if ix in ids[:self.optimize * op]:
                # true_y = self.Uy[ix]
                true_y = self.clf.predict_label(x)
                temp_clf = Classification(self.Lx + [x], self.Ly + [true_y], method=self.clf.method)
                temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C
                temp_clf.train()
                if not weighted:
                    diff = sum([1. if temp_clf.predict_label(dp) != self.clf.predict_label(dp) else 0.
                                for dp in self.Ux if dp != x])
                else:
                    diff = sum([abs(temp_clf.getPredictProba(1, dp) - self.clf.getPredictProba(1, dp))
                                if temp_clf.predict_label(dp) != self.clf.predict_label(dp) else 0.
                                for dp in self.Ux if dp != x])
                    # diff = sum([ Util.dist(temp_clf.h.predict_proba(dp)[0], self.clf.h.predict_proba(dp)[0]) if temp_clf.predict_label(dp) != self.clf.predict_label(dp) else 0. for dp in self.Ux if dp != x ])
                balance = self.get_balance(x)
                informativeness = diff
            else:
                informativeness = 0.
                balance = 0.
            scores.append(informativeness)
            scores_B.append(balance)
        # scores_B = Util.normalize(scores_B)
        scores = [scr * scores_B[iscr] for iscr, scr in enumerate(scores)]
        return self.sort_scores(scores)

    # ---------------------------------------
    def query_balanced_disag2(self, weighted=True, op=1):
        ids, _ = self.query_margin()
        scores = []
        scores_B = []
        commitee = []
        for idp, dp in enumerate(self.Ux):
            if idp in ids[:self.optimize * op]:
                # true_y = self.Uy[idp]
                true_y = self.clf.predict_label(dp)
                temp_clf = Classification(self.Lx + [dp], self.Ly + [true_y], method=self.clf.method)
                temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C
                temp_clf.train()
                commitee.append((temp_clf, 1))
        for ix, x in enumerate(self.Ux):
            if ix in ids[:self.optimize * op]:
                preds = Counter()
                if weighted:  # weight using the probability distribution of the committee
                    for (clf, _) in commitee:
                        if self.clf.predict_label(x) != clf.predict_label(x):
                            YP = zip(clf.h.classes_, clf.h.predict_proba(x)[0])
                            for (y, p) in YP:
                                preds[y] += p
                    preds = preds.most_common()
                    diff = 0. if preds == [] else preds[0][1]
                    # diff = 0. if preds == [] else (preds[0][1] - preds[1][1] if len(preds) > 1 else preds[0][1])
                else:
                    # confis = [clf.getPredictProba(1, x) for (clf, _) in commitee if self.clf.predict_label(x) != clf.predict_label(x)]
                    labels = [clf.predict_label(x) for (clf, _) in commitee if self.clf.predict_label(x) != clf.predict_label(x)]
                    preds = Counter(labels)
                    preds = preds.most_common()
                    diff = 0. if preds == [] else sum([pred[1] for pred in preds])
                balance = self.get_balance(x)
                informativeness = diff
            else:
                informativeness = 0.
                balance = 0.
            scores.append(informativeness)
            scores_B.append(balance)
        # scores_B = Util.normalize(scores_B)
        scores = [scr * scores_B[iscr] for iscr, scr in enumerate(scores)]
        return self.sort_scores(scores)

    # ---------------------------------------
    def query_balance(self):
        ids, _ = self.query_margin()
        scores = []
        for ix, x in enumerate(self.Ux):
            if ix in ids[:self.optimize * 4]:
                informativeness = self.get_balance(x)
            else:
                informativeness = 0.
            scores.append(informativeness)
        return self.sort_scores(scores)

    # ---------------------------------------
    def query_disag1_balanced(self, weighted=True):
        ids, _ = self.query_disagreement1(weighted=weighted, op=2)
        # ids, _ = self.query_margin()
        scores = []
        for ix, x in enumerate(self.Ux):
            if ix in ids[:self.optimize // 2]:  # integer division (this was Python 2 code)
                informativeness = self.get_balance(x)
            else:
                informativeness = 0.
            scores.append(informativeness)
        return self.sort_scores(scores)

    # ---------------------------------------
    def query_disag2_balanced(self, weighted=True):
        ids, _ = self.query_disagreement2(weighted=weighted, op=2)
        # ids, _ = self.query_margin()
        scores = []
        for ix, x in enumerate(self.Ux):
            if ix in ids[:self.optimize // 2]:
                informativeness = self.get_balance(x)
            else:
                informativeness = 0.
            scores.append(informativeness)
        return self.sort_scores(scores)

    # ---------------------------------------
    def query_explote_explore(self):
        # Bandit-chosen epsilon: explore with query_random, exploit with disagreement.
        id_eps = self.mab.choose()
        eps = self.mab.algos[id_eps]
        # print("Chosen =", eps, "Expected =", sum([a*l for a, l in zip(self.mab.algos, self.mab.nb_choices)]) / sum(self.mab.nb_choices))
        rnd = random.uniform(0., 1.)
        # if rnd > eps: ids, scores = self.query_disagreement1(weighted=False)
        # if rnd > eps: ids, scores = self.query_disagreement1(weighted=True)
        if rnd > eps:
            ids, scores = self.query_disagreement2()
        # else: ids, scores = self.query_balance()
        else:
            ids, scores = self.query_random()
        reward = self.get_change(self.Ux[ids[0]], self.Uy[ids[0]])
        self.mab.update(id_eps, reward)
        return ids, scores

    # ---------------------------------------
    def get_disag1(self, x, weighted=False):
        true_y = self.Uy[self.Ux.index(x)]
        # true_y = self.clf.predict_label(x)
        temp_clf = Classification(self.Lx + [x], self.Ly + [true_y], method=self.clf.method)
        temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C
        temp_clf.train()
        if not weighted:
            diff = sum([1. if temp_clf.predict_label(dp) != self.clf.predict_label(dp) else 0.
                        for dp in self.Ux if dp != x])
        else:
            diff = sum([1. - abs(temp_clf.getPredictProba(1, dp) - self.clf.getPredictProba(1, dp))
                        if temp_clf.predict_label(dp) != self.clf.predict_label(dp) else 0.
                        for dp in self.Ux if dp != x])
        informativeness = diff
        return informativeness

    # ---------------------------------------
    def get_disag2(self, x, commitee, weighted=False):
        preds = Counter()
        if weighted:  # weight using the probability distribution of the committee
            for (clf, _) in commitee:
                if self.clf.predict_label(x) != clf.predict_label(x):
                    YP = zip(clf.h.classes_, clf.h.predict_proba(x)[0])
                    for (y, p) in YP:
                        preds[y] += p
            preds = preds.most_common()
            diff = 0. if preds == [] else preds[0][1]
        else:
            # confis = [clf.getPredictProba(1, x) for (clf, _) in commitee if self.clf.predict_label(x) != clf.predict_label(x)]
            labels = [clf.predict_label(x) for (clf, _) in commitee if self.clf.predict_label(x) != clf.predict_label(x)]
            preds = Counter(labels)
            preds = preds.most_common()
            diff = 0. if preds == [] else sum([pred[1] for pred in preds])
        informativeness = diff
        return informativeness

    # ---------------------------------------
    def query_disagreement_test(self):
        ids, _ = self.query_margin()
        scores = []
        plots_Y = []
        plots_X0 = []; plots_X1 = []; plots_X2 = []; plots_X3 = []
        plots_X4 = []; plots_X5 = []; plots_X6 = []
        viz = Visualize()
        commitee = []
        for idp, dp in enumerate(self.Ux):
            if idp in ids[:self.optimize]:
                true_y = self.Uy[idp]
                # true_y = self.clf.predict_label(dp)
                temp_clf = Classification(self.Lx + [dp], self.Ly + [true_y], method=self.clf.method)
                temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C
                temp_clf.train()
                commitee.append((temp_clf, 1))
        # ===========================
        # sampled = random.sample(ids, 100)
        for ix, x in enumerate(self.Ux):
            # if ix in sampled:
            if ix in ids[:self.optimize * 9999999]:
                informativeness1 = self.get_disag1(x, weighted=False)
                informativeness2 = self.get_disag2(x, commitee, weighted=False)
                informativeness3 = self.get_disag1(x, weighted=True)
                informativeness4 = self.get_disag2(x, commitee, weighted=True)
                informativeness5 = self.clf.uncertainty_prediction(x)
                informativeness6 = self.get_balance(x)
                temp_clf = Classification(self.Lx + [x], self.Ly + [self.Uy[ix]], method=self.clf.method)
                temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C
                temp_clf.train()
                acc = temp_clf.getTestAccuracy(self.Tx, self.Ty)
                plots_X0.append(acc)
                plots_X1.append(informativeness1)
                plots_X2.append(informativeness2)
                plots_X3.append(informativeness3)
                plots_X4.append(informativeness4)
                plots_X5.append(informativeness5)
                plots_X6.append(informativeness6)
                plots_Y.append('r' if self.Uy[ix] != self.clf.predict_label(x) else 'b')

                # Scatter each informativeness measure against the others / accuracy.
                fig, axs = plt.subplots(1, 1, sharex=True)
                axs.scatter(plots_X1, plots_X2, c=plots_Y, marker="o", cmap=plt.copper())
                plt.savefig(str(len(self.Lx)) + self.datasetname + '.1-2.png'); plt.close()
                fig, axs = plt.subplots(1, 1, sharex=True)
                axs.scatter(plots_X3, plots_X4, c=plots_Y, marker="o", cmap=plt.copper())
                plt.savefig(str(len(self.Lx)) + self.datasetname + '.3-4.png'); plt.close()
                fig, axs = plt.subplots(1, 1, sharex=True)
                axs.scatter(plots_X1, plots_X0, c=plots_Y, marker="o", cmap=plt.copper())
                plt.savefig(str(len(self.Lx)) + self.datasetname + '.1-acc.png'); plt.close()
                fig, axs = plt.subplots(1, 1, sharex=True)
                axs.scatter(plots_X2, plots_X0, c=plots_Y, marker="o", cmap=plt.copper())
                plt.savefig(str(len(self.Lx)) + self.datasetname + '.2-acc.png'); plt.close()
                fig, axs = plt.subplots(1, 1, sharex=True)
                axs.scatter(plots_X3, plots_X0, c=plots_Y, marker="o", cmap=plt.copper())
                plt.savefig(str(len(self.Lx)) + self.datasetname + '.3-acc.png'); plt.close()
                fig, axs = plt.subplots(1, 1, sharex=True)
                axs.scatter(plots_X4, plots_X0, c=plots_Y, marker="o", cmap=plt.copper())
                plt.savefig(str(len(self.Lx)) + self.datasetname + '.4-acc.png'); plt.close()
                fig, axs = plt.subplots(1, 1, sharex=True)
                axs.scatter(plots_X5, plots_X0, c=plots_Y, marker="o", cmap=plt.copper())
                plt.savefig(str(len(self.Lx)) + self.datasetname + '.5-acc.png'); plt.close()
                fig, axs = plt.subplots(1, 1, sharex=True)
                axs.scatter(plots_X6, plots_X0, c=plots_Y, marker="o", cmap=plt.copper())
                plt.savefig(str(len(self.Lx)) + self.datasetname + '.6-acc.png'); plt.close()

                # plots = [plots_X1, plots_X2, plots_X3, plots_X4, plots_X5, plots_X6]
                # fig, axs = plt.subplots(5, 1, sharex=True)
                # axs[0].scatter(Util.normalize(plots_X1), plots_X0, c=plots_Y, marker="o", cmap=plt.copper())
                # axs[1].scatter(Util.normalize(plots_X2), plots_X0, c=plots_Y, marker="o", cmap=plt.copper())
                # axs[2].scatter(Util.normalize(plots_X3), plots_X0, c=plots_Y, marker="o", cmap=plt.copper())
                # axs[3].scatter(Util.normalize(plots_X4), plots_X0, c=plots_Y, marker="o", cmap=plt.copper())
                # axs[4].scatter(Util.normalize(plots_X5), plots_X0, c=plots_Y, marker="o", cmap=plt.copper())
                # axs[5].scatter(Util.normalize(plots_X6), plots_X0, c=plots_Y, marker="o", cmap=plt.copper())
                # plt.savefig(str(len(self.Lx)) + self.datasetname + '.png')
                # plt.close()

                informativeness = acc
            else:
                informativeness = 0.
            scores.append(informativeness)
        return self.sort_scores(scores)

    # ---------------------------------------
    def get_balance(self, x):
        # y = self.Uy[self.Ux.index(x)]
        y = self.clf.predict_label(x)
        temp_clf = Classification(self.Lx + [x], self.Ly + [y], method=self.clf.method)
        temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C
        temp_clf.train()
        cnt = Counter()
        for dp in self.Ux:
            cnt[temp_clf.predict_label(dp)] += 1. / len(self.Ux)
        P = [cnt[key] for key in cnt]
        # Entropy of the predicted class distribution (log base = number of classes).
        informativeness = -1.0 * sum([p * math.log(p, len(P)) for p in P if p > 0])
        return informativeness

    # ---------------------------------------
    def get_change(self, x, y=None):
        if y is None:
            y = self.Uy[self.Ux.index(x)]
            # y = self.clf.predict_label(x)
        temp_clf = Classification(self.Lx + [x], self.Ly + [y], method=self.clf.method)
        temp_clf.GAMMA, temp_clf.C = self.clf.GAMMA, self.clf.C
        temp_clf.train()
        v1 = [self.clf.getPredictProba(1, dp) for dp in self.Ux if x != dp]
        v2 = [temp_clf.getPredictProba(1, dp) for dp in self.Ux if x != dp]
        # informativeness = Util.dist(v1, v2)
        informativeness = math.acos(cosine_similarity(v1, v2)) / math.pi
        # v1 = []; v2 = []
        # for dp in self.Ux:
        #     if x != dp:
        #         v1 += [v for v in self.clf.h.predict_proba(dp)[0]]
        #         v2 += [v for v in temp_clf.h.predict_proba(dp)[0]]
        # informativeness = distance.cosine(v1, v2)
        return informativeness
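# A plausible way to drive the class above; the data variables and the
# Classification/Util/Visualize helpers are assumed to come from the
# surrounding project, so this is only an illustrative sketch:
# al = ActiveLearning(Lx, Ly, Ux, Uy, Tx, Ty, method="svm", budget=251, optimize=50)
# al.train(mtd="disag3", backupfile="backup_disag3.txt")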
def solve_bandit_randomly(bandit, timesteps=1000):
    '''
    Choose random actions on a k-arm bandit for a certain number of timesteps,
    keeping track of accumulated reward. Use for benchmarking against more
    intelligent methods.
    '''
    n_steps = 0
    average_reward = 0
    for _ in range(timesteps):
        arm_i = random.randint(0, bandit.k - 1)
        reward = bandit.crank_arm(arm_i)
        # incremental update of the average reward
        n_steps += 1
        average_reward += 1 / n_steps * (reward - average_reward)
    reward_ratio = max(average_reward / bandit.max_possible_expected_reward() * 100, 0)
    return average_reward, reward_ratio


# observe some runs
bandit = Bandit(5)
print(bandit.max_possible_expected_reward())
print(solve_bandit_randomly(bandit))
print(solve_bandit(bandit))
print(solve_bandit(bandit))
print()

bandit = Bandit(5)
print(bandit.max_possible_expected_reward())
print(solve_bandit_randomly(bandit))
print(solve_bandit(bandit))
print(solve_bandit(bandit))
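# The benchmark above assumes a Bandit exposing k, crank_arm, and
# max_possible_expected_reward (solve_bandit is defined elsewhere). A minimal
# sketch consistent with those calls; the Gaussian arms are an assumption.
import numpy as np

class Bandit:
    """k-armed bandit whose arms pay unit-variance Gaussian rewards."""
    def __init__(self, k):
        self.k = k
        self.arm_means = np.random.randn(k)  # assumed: random true means

    def crank_arm(self, arm_i):
        return np.random.randn() + self.arm_means[arm_i]

    def max_possible_expected_reward(self):
        return float(self.arm_means.max())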
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 14:45:55 2020

Upper Confidence Bound 1 (UCB1) implementation

@author: Aditya Ojha
"""

### Libraries needed
from Bandit import Bandit
import numpy as np
import matplotlib.pyplot as plt
import random

winrates = [0.1, .4, .6, .7]
bandits = []
for winrate in winrates:
    bandits.append(Bandit(winrate))

win_list = []
total_time_steps = int(1e5)
band_n = [0.1, 0.1, 0.1, 0.1]  # number of times each bandit was chosen (0.1 avoids dividing by zero)
win_avg = [0, 0, 0, 0]  # running average reward per bandit
win_UCB = [0, 0, 0, 0]  # UCB1 score per bandit
wins = 0  # total number of wins
best_bandit = 0  # guess that the best bandit is zero
data = []
cum_avg = 0

for t in range(total_time_steps):
    band_choosen = np.argmax(win_UCB)
    reward = bandits[band_choosen].pull_arm()  # pull the slot machine's arm
    data.append(reward)
    wins += reward  # update total wins
    band_n[band_choosen] += 1  # increment counter for this bandit
    # Assumed completion (the original snippet is truncated here): the standard
    # UCB1 update, i.e. the incremental mean for the chosen arm followed by
    # score = mean + sqrt(2*ln(t) / n_j) for every arm.
    win_avg[band_choosen] += (reward - win_avg[band_choosen]) / band_n[band_choosen]
    for j in range(len(bandits)):
        win_UCB[j] = win_avg[j] + np.sqrt(2 * np.log(t + 1) / band_n[j])
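# The Bandit imported above takes a win rate and exposes pull_arm(); a minimal
# Bernoulli sketch consistent with that usage (the class body is an assumption):
import numpy as np

class Bandit:
    """Bernoulli arm: pays 1 with probability winrate, else 0."""
    def __init__(self, winrate):
        self.winrate = winrate

    def pull_arm(self):
        return 1 if np.random.random() < self.winrate else 0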
# epsilon, array of bandits, number of trials
def epsilonGreedy(epsilon, bandits, iterations):
    bestBandit = 0
    totalReward = 0
    for i in range(iterations):
        # Uniform draw in [0, 1) for the explore test (np.random.randn(), a
        # standard normal draw, would make the comparison against epsilon meaningless).
        if np.random.random() <= epsilon:
            # explore: pick a bandit at random
            choice = np.random.choice(len(bandits))
            bandit = bandits[choice]
        else:
            # exploit: pull the current best bandit
            bandit = bandits[bestBandit]
        reward = bandit.pull()
        totalReward += reward
        bandit.updateMean(reward)
        bestBandit = np.argmax([b.xBar for b in bandits])
    return totalReward


bandits = []
for i in range(4):
    bandits.append(Bandit(i))

# Note: the same bandit objects, and their learned estimates, carry over
# between these three calls.
print(epsilonGreedy(0.1, bandits, 10000))
print(epsilonGreedy(0.01, bandits, 10000))
print(epsilonGreedy(0.001, bandits, 10000))
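# A minimal sketch of the Bandit these calls assume, where xBar is the running
# sample mean (the Gaussian reward around the constructor argument is an assumption):
import numpy as np

class Bandit:
    def __init__(self, true_mean):
        self.true_mean = true_mean
        self.xBar = 0.0  # running sample mean of observed rewards
        self.n = 0

    def pull(self):
        return np.random.randn() + self.true_mean  # assumed reward model

    def updateMean(self, reward):
        self.n += 1
        self.xBar += (reward - self.xBar) / self.n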
class OnlineActiveLearning:
    def __init__(self, Lx, Ly, Ux, Uy, Tx, Ty, method="svm", budget=1000):
        self.Lx0 = Lx[:]
        self.Ly0 = Ly[:]
        self.Lx = Lx
        self.Ly = Ly
        self.Ux = Ux  # TODO should not be here
        self.Uy = Uy  # TODO should not be here
        self.Tx = Tx  # TODO should not be here
        self.Ty = Ty  # TODO should not be here

        self.th = 0.9
        self.queried = 0
        self.queries = []
        self.ths = []
        self.infos = []
        self.accuracys = []

        self.clf = Classification(self.Lx, self.Ly, method=method, Vx=Lx + Ux, Vy=Ly + Uy)
        self.clf.train()

        self.sup_infos = []  # TODO should not be here
        self.sup_accuracys = []  # TODO should not be here
        self.sup_clf = Classification(self.Lx, self.Ly, method=method, Vx=Lx + Ux, Vy=Ly + Uy)
        self.sup_clf.train()  # TODO should not be here

        # self.mab = Bandit(algos=np.arange(0., 1.1, 0.1), method="UCB", alpha=1)
        self.mab = Bandit(algos=np.arange(0., 1.1, 0.1), method="reinforcement", alpha=1)

    # ---------------------------------------
    def train(self, mtd="margin", backupfile="backupfile.txt"):
        for i, x in enumerate(self.Ux):
            y1 = self.clf.predict_label(x)
            if mtd == "supervised":
                informativeness = sys.float_info.max
            if mtd == "margin":
                informativeness = self.clf.uncertainty_margin(x)

            # ===============================
            # Let the bandit pick the query threshold for this step.
            id_th = self.mab.choose()
            self.th = self.mab.algos[id_th]
            print("Chosen =", self.th, "nb_choices =", self.mab.nb_choices,
                  ("avg rwd=", [np.mean(L) for L in self.mab.rewards] if self.mab.rewards[0] != [] else " "),
                  "expected=", sum([a * l for a, l in zip(self.mab.algos, self.mab.nb_choices)]) / sum(self.mab.nb_choices))
            prev_clf = Classification(self.Lx, self.Ly, method=self.clf.method)
            prev_clf.GAMMA, prev_clf.C = self.clf.GAMMA, self.clf.C
            prev_clf.train()
            # ===============================
            # avg_rewards = [np.mean(L[:-20]) if len(L) > 0 else 1. for L in self.mab.rewards]
            # self.th = sum([a * l for a, l in zip(self.mab.algos, avg_rewards)]) / sum(avg_rewards)
            # print("Chosen =", self.th, "avg rwd=", avg_rewards)
            # ===============================

            if informativeness > self.th:
                qx = x
                qy = self.Uy[i]
                self.Lx.append(qx)
                self.Ly.append(qy)
                self.queried += 1
                self.clf.X = self.Lx
                self.clf.Y = self.Ly
                self.clf.train()

            # ===============================
            # Reward the chosen threshold for keeping the query rate near 10%.
            reward = 1. - abs(0.1 - self.queried / (i + 1.))
            self.mab.update(id_th, reward)
            # ===============================
            # for idt in range(len(self.mab.algos)):
            #     reward = 1. - abs(0.3 - (self.queried - 1 + 1) / (i + 1.)) if informativeness > self.mab.algos[idt] else 1. - abs(0.4 - (self.queried - 1) / (i + 1.))
            #     self.mab.update(idt, reward)
            # ===============================

            self.ths.append(self.th)
            self.infos.append(informativeness)
            self.accuracys.append(self.clf.getTestAccuracy(self.Tx, self.Ty))
            self.queries.append(self.queried)

            self.sup_infos.append(self.sup_clf.uncertainty_margin(x))  # TODO should not be here
            self.sup_clf.X = self.Lx0 + self.Ux[:i + 1]
            self.sup_clf.Y = self.Ly0 + self.Uy[:i + 1]
            self.sup_clf.train()  # TODO should not be here
            self.sup_accuracys.append(self.sup_clf.getTestAccuracy(self.Tx, self.Ty))  # TODO should not be here

            '''
            if i > 10:
                # last_infos = self.infos[-100:] if len(self.infos) > 100 else self.infos[:]
                # self.th = np.mean( last_infos )
                if informativeness > self.th:  # queried
                    if y1 == qy:  # but was correctly predicted
                        self.th = self.th + 0.1 * (informativeness - self.th)
                else:
                    if y1 != qy:
                        self.th = self.th - 0.1 * (self.th - informativeness)
            '''

            print("i=", i + 1, self.queried, self.queried / (i + 1.),
                  "-- acc=%.4f" % (self.accuracys[-1] * 100), "%.4f" % (self.sup_accuracys[-1] * 100),
                  "-- %.4f" % (np.mean(self.accuracys) * 100),
                  "%.4f" % (np.average(self.accuracys, weights=range(1, 1 + len(self.accuracys))) * 100),
                  "--", informativeness)

            if (i + 1) % 10 == 0:
                Util.pickleSave(backupfile, self)
                viz = Visualize()

                viz.do_plot([range(len(self.infos)), self.ths], color='b', marker='-')
                viz.do_plot([range(len(self.infos)), self.infos], color='r', marker='-')
                viz.do_plot([range(len(self.sup_infos)), self.sup_infos], color='y', marker='-')
                viz.end_plot(fig=backupfile + "_stream_inf.png")

                viz.do_plot([range(len(self.accuracys)), self.accuracys], color='r', marker='-')
                viz.do_plot([range(len(self.sup_accuracys)), self.sup_accuracys], color='y', marker='-')
                viz.end_plot(fig=backupfile + "_stream_acc.png")

                viz.do_plot([range(len(self.queries)), self.queries], color='r', marker='-')
                viz.do_plot([range(len(self.queries)), range(len(self.queries))], color='y', marker='-')
                viz.end_plot(fig=backupfile + "_stream_lab.png")
                '''
                colors = ['r', 'b', 'g', 'k', 'm', 'c', '0.10', '0.35', '0.60', '0.90']
                viz.plot( zip(*self.Lx+self.Ux), fig = backupfile+"__.png", color = [colors[int(l)] for l in self.Ly+self.Uy], marker = 'o' )
                viz.do_plot( zip(*self.Ux), color = ['y']*len(self.Ux), marker = '.' )
                viz.do_plot( zip(*self.Lx), color = [colors[int(l)] for l in self.Ly], marker = 'o' )
                viz.end_plot( fig = backupfile+"_.png" )
                '''

    # ---------------------------------------
    def get_change(self, prev_clf, curr_clf, U):
        v1 = [prev_clf.getPredictProba(1, dp) for dp in U]
        v2 = [curr_clf.getPredictProba(1, dp) for dp in U]
        if v1 == v2:
            return 0.
        return math.acos(cosine_similarity(v1, v2)) / math.pi
def main(args):
    steps = 10000
    runs = 300

    # average rewards over time steps
    avgReward1 = np.zeros(steps)
    avgReward2 = np.zeros(steps)
    # percent of optimal actions
    opActions1 = np.zeros(steps)
    opActions2 = np.zeros(steps)

    # validate command line args
    if len(args) != 2:
        print("Error: Output file not provided")
        print("Usage: driver.py result.out")
        exit()

    """ Bandits using sample averages """
    for r in range(runs):
        bandit = Bandit(steps)
        for step in range(1, steps + 1):
            reward = bandit.takeStep(step)
            avgReward1[step - 1] += reward
        opActions1 += bandit.getOpActions()
    opActions1 /= runs
    avgReward1 /= runs

    """ Bandits using step size parameters """
    for r in range(runs):
        bandit = Bandit(steps)
        for step in range(1, steps + 1):
            reward = bandit.takeStep(step, stepSize=True)
            avgReward2[step - 1] += reward
        opActions2 += bandit.getOpActions()
    opActions2 /= runs
    avgReward2 /= runs

    # save data in file ('with' closes the file even when an error occurs)
    try:
        with open(args[1], 'w') as f:
            np.savetxt(f, (avgReward1, opActions1), newline="\n")
            np.savetxt(f, (avgReward2, opActions2), newline="\n")
    except FileNotFoundError as e:
        print(f"Error: {e.strerror}")
def run_experiment(m1, m2, m3, N, method="eps", eps=None, decay=True, upper_limit=None):
    """
    m1, m2, m3 = means of the three bandits to be compared
    eps = epsilon for Epsilon-Greedy
    upper_limit = initial value for the mean reward estimate
    N = int, the number of times we pull

    Returns the cumulative average after every play.
    """
    data = np.empty(N)

    if method == "eps":
        b1 = Bandit(m1, eps=eps)
        b2 = Bandit(m2, eps=eps)
        b3 = Bandit(m3, eps=eps)
        bandits = [b1, b2, b3]
        for i in range(N):
            if decay:
                eps = 1 / (i + 0.01)
            p = np.random.random()
            # Epsilon-greedy part
            if p < eps:
                chosen = random.choice([0, 1, 2])
                target = bandits[chosen]
            else:
                bandits_means = [bi.mean for bi in bandits]
                target = bandits[np.argmax(bandits_means)]
            new_reward = target.pull()
            target.update(new_reward)
            data[i] = new_reward

    elif method == "upper_limit" or method == "ucb1":
        b1 = Bandit(m1, upper_limit=upper_limit, method=method)
        b2 = Bandit(m2, upper_limit=upper_limit, method=method)
        b3 = Bandit(m3, upper_limit=upper_limit, method=method)
        bandits = [b1, b2, b3]
        if method == "upper_limit":
            for i in range(N):
                bandits_means = [bi.mean for bi in bandits]
                target = bandits[np.argmax(bandits_means)]
                new_reward = target.pull()
                target.update(new_reward)
                data[i] = new_reward
        elif method == "ucb1":
            for i in range(N):
                # UCB1 score: estimated mean plus the exploration bonus
                # sqrt(2*ln(t) / n_i), where t is the number of plays so far
                # and n_i the number of pulls of arm i; the small constant
                # guards against division by zero.
                bandits_means = [
                    bi.mean + sqrt(2 * log(i + 1) / (bi.N + pow(10, -5)))
                    for bi in bandits
                ]
                target = bandits[np.argmax(bandits_means)]
                new_reward = target.pull()
                target.update(new_reward)
                data[i] = new_reward
    else:
        raise ValueError("The explore-exploit method chosen is not recognized")

    cumulative_avg = np.cumsum(data) / (np.arange(N) + 1)
    plt.plot(cumulative_avg, label=method)
    plt.plot(np.ones(N) * m1)
    plt.plot(np.ones(N) * m2)
    plt.plot(np.ones(N) * m3)
    plt.xlabel("Iteration")
    plt.ylabel("Reward mean")

    print(b1.mean)
    print(b2.mean)
    print(b3.mean)
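# A possible comparison driver for run_experiment above; the means, horizon,
# and the plt.legend()/plt.show() calls are illustrative assumptions:
if __name__ == "__main__":
    N = 100000
    run_experiment(1.0, 2.0, 3.0, N, method="eps", eps=0.1, decay=True)
    run_experiment(1.0, 2.0, 3.0, N, method="upper_limit", upper_limit=10)
    run_experiment(1.0, 2.0, 3.0, N, method="ucb1", upper_limit=10)
    plt.legend()
    plt.show()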