def main(args, log_dir):
    """Build the learner network, create the train/test bandit environments,
    and run the evaluation pass.

    args    -- namespace with num_layers, num_hidden, num_bandits,
               num_tasks_train, num_tasks_test
    log_dir -- directory handed through to the evaluation routine
    """
    learner = network(args.num_layers, args.num_hidden, args.num_bandits)
    # One training environment per task index.
    train_envs = [bandit(args, task) for task in range(args.num_tasks_train)]
    # Each test environment gets a randomly chosen "good" arm index.
    test_envs = [bandit(args, np.random.randint(0, args.num_bandits))
                 for _ in range(args.num_tasks_test)]
    # Training and checkpointing are currently disabled; evaluation only.
    #learn(learner,args,train_envs,test_envs,log_dir)
    test(learner, args, train_envs, test_envs, log_dir)
    #save(args)
    return
def generate_graph(todo):
    """For every bandit instance and every algorithm in *todo*, average the
    regret over a fixed number of seeds at each horizon and print the result.

    todo -- iterable of algorithm names to evaluate.

    Relies on module-level globals: instances, horizons, epsilon, bandit.
    Fixes vs. original: removed unused local `inst`, removed large blocks of
    commented-out file-writing code, named the magic seed count.
    """
    NUM_SEEDS = 50  # seeds averaged per (instance, algorithm, horizon) cell
    start = time.time()
    for i in instances:
        print(i)
        for algo in todo:
            print(algo)
            for hz in horizons:
                regret = 0.0
                for seed in range(NUM_SEEDS):
                    args = [i, algo, seed, epsilon, hz]
                    REG = bandit(args).run()
                    regret += REG
                    # Carriage return keeps the progress report on one line.
                    sys.stdout.write("\rseed: %i, time elapsed %.2f" % (seed, (time.time() - start)))
                    sys.stdout.flush()
                regret /= NUM_SEEDS
                print("\nhorizon:", hz, "Regret:", regret)
    print("time taken:", time.time() - start)
def upperConfidenceBound(ucb):
    """Average per-timestep reward of a UCB bandit across `run` runs.

    ucb -- exploration coefficient forwarded to the bandit constructor.
    Uses module globals: ts, run, variance, limMin, limMax, bandit.
    """
    totals = np.zeros(ts)
    for _ in range(run):
        machine = bandit(variance=variance, min=limMin, max=limMax, ucb=ucb)
        # One sequential pull per timestep; accumulate element-wise.
        totals += np.array([machine.takeAction() for _ in range(ts)])
    return totals / run
def optimisticInitialValues(initial):
    """Average per-timestep reward for optimistic-initial-value bandits.

    initial -- starting value estimate forwarded to the bandit constructor.
    Uses module globals: ts, run, variance, limMin, limMax, bandit.
    """
    totals = np.zeros(ts)
    for _ in range(run):
        machine = bandit(variance=variance, min=limMin, max=limMax, initial=initial)
        # One sequential pull per timestep; accumulate element-wise.
        totals += np.array([machine.takeAction() for _ in range(ts)])
    return totals / run
def epsilonGreedyPolicy(epsilon):
    """Average per-timestep reward of an epsilon-greedy bandit across runs.

    epsilon -- exploration probability forwarded to the bandit constructor.
    Uses module globals: ts, run, variance, limMin, limMax, bandit.
    """
    totals = np.zeros(ts)
    for _ in range(run):
        machine = bandit(variance=variance, min=limMin, max=limMax, epsilon=epsilon)
        # One sequential pull per timestep; accumulate element-wise.
        totals += np.array([machine.takeAction() for _ in range(ts)])
    return totals / run
def generate_graph(todo):
    """Compare two Thompson-sampling variants on every instance: log per-seed
    regret to outputDataT2.txt and plot average regret against log-horizon.

    todo -- two-element sequence of algorithm names; todo[0] is plotted as
            'thompson-sampling' and todo[1] as 'thompson-sampling-with-hint'.

    Relies on module globals: instances, horizons, epsilon, bandit, math, plt.
    Fixes vs. original: file handle now managed by `with` (was leaked on an
    exception), dead commented-out code removed, seed count named, plot-title
    typo ("Comparision") corrected.
    """
    NUM_SEEDS = 50  # seeds averaged per (algorithm, horizon) point
    start = time.time()
    for i in instances:
        print(i)
        x = {}  # algo -> list of log(horizon) values
        y = {}  # algo -> list of average regrets
        for algo in todo:
            x[algo] = []
            y[algo] = []
            # Append mode so repeated runs accumulate in a single log file.
            with open("outputDataT2.txt", "a") as f:
                for hz in horizons:
                    regret = 0.0
                    for seed in range(NUM_SEEDS):
                        args = [i, algo, seed, epsilon, hz]
                        REG = bandit(args).run()
                        regret += REG
                        f.write(i + ', ' + algo + ', ' + str(seed) + ', ' + str(epsilon) + ', ' + str(hz) + ', ' + str(REG) + '\n')
                    regret /= NUM_SEEDS
                    x[algo].append(math.log(hz))
                    y[algo].append(regret)
        plt.title("Task 2 Comparison for: " + i)
        plt.plot(x[todo[0]], y[todo[0]], label='thompson-sampling')
        plt.plot(x[todo[1]], y[todo[1]], label='thompson-sampling-with-hint')
        plt.legend(loc='upper left', frameon=True)
        plt.ylabel('Regret')
        plt.xlabel('Horizon (log scale)')
        plt.show()
    print("time taken:", time.time() - start)
def generate_graph(todo):
    """Sweep epsilon values for a single algorithm, averaging regret over a
    fixed number of seeds at a fixed horizon, and print each average.

    todo -- the algorithm name to evaluate (passed whole into the args list).

    Relies on module globals: instances, epsilons, horizon, bandit.
    Fixes vs. original: removed unused local `inst` and a junk comment,
    named the magic seed count.
    """
    NUM_SEEDS = 50  # seeds averaged per (instance, epsilon) cell
    start = time.time()
    for i in instances:
        print(i)
        print(todo)
        for epsilon in epsilons:
            regret = 0.0
            for seed in range(NUM_SEEDS):
                args = [i, todo, seed, epsilon, horizon]
                REG = bandit(args).run()
                regret += REG
                # Carriage return keeps the progress report on one line.
                sys.stdout.write("\rseed: %i, time elapsed %.2f" % (seed, (time.time() - start)))
                sys.stdout.flush()
            regret /= NUM_SEEDS
            print("\nepsilon:", epsilon, "Regret:", regret)
    print("time taken:", time.time() - start)
#asks user for number of enemies numEnemies = int(input('How many monsters will %s battle?\n' % Hero.getName())) #list that contains enemies enemyList = [] #list that contains random numbers that are associated with enemies enemyNumList = [] #sets a random seed with value 0 random.seed(0) #randomly puts enemies in enemyList, but does so with regards to varying enemy strengths for i in range(numEnemies): enemyNumList.append(random.randint(1, 14)) for i in range(numEnemies): Y = enemyNumList[i] if Y <= 5: enemyList.append(bandit()) elif Y <= 9: enemyList.append(soldier76()) elif Y <= 12: enemyList.append(zergling()) else: enemyList.append(illidan()) #sets maximum health for hero as starting health startHealth = Hero.getHealth() #hero and enemy combat, one by one for enemy in enemyList: print('\nYou have encountered a %s!' % enemy) enemyHealth = enemy.getHealth() while True:
import numpy as np
import bandit as b
import matplotlib.pyplot as plt

# --- experiment constants ---
numArms = 10        # number of bandit arms
excess = 0.2        # forwarded to pick_weights (semantics defined in bandit module)
neuronsPerArm = 1
epsilon = 0.08      # exploration rate
tEpoch = 128        # timesteps per epoch
epochs = 2000

# --- set up and run the bandit ---
p_reward = b.pick_weights(numArms, excess)
bestarm = np.argmax(p_reward)  # NOTE(review): computed but unused in this excerpt
btest = b.bandit(numArms, neuronsPerArm, tEpoch, epochs, probabilities=p_reward, epsilon=epsilon, recordWeights=False)
(choices, rewards, spikes) = btest.run(epochs)
btest.stop()
print("Plotting results...")

# --- plot: arm chosen at each timestep ---
fig = plt.figure()
plt.scatter(np.arange(epochs), choices, alpha=0.1)
plt.ylabel("Arm Chosen")
plt.xlabel("Time step")
fig.savefig("choices_over_time.png")

# --- plot: reward probabilities vs empirical choice frequencies ---
fig = plt.figure()
plt.plot(p_reward/100)  # assumes p_reward is expressed in percent -- TODO confirm
# Bin edges at k-0.5 center each histogram bar on an integer arm index.
plt.hist(choices, bins=np.arange(numArms)-0.5, density=True)
import numpy as np

from bandit import bandit
from epsi_greedyPolicy import epsi_greedyPolicy
from ucbPolicy import ucbPolicy

if __name__ == "__main__":
    k = 15             # Number of bandits
    NUM_CYCLES = 500   # Number of cycles

    # One true mean per bandit, drawn uniformly from [-3, 3).
    means = np.random.randint(-3, high=3, size=k)
    bandits = [bandit(means[i], 1) for i in range(k)]

    # Track the index/indices of the best arm(s).
    # BUG FIX: the original seeded best_bandit with [0] and then, on the
    # i == 0 iteration, hit the `means[0] == best_bandit_val` branch and
    # appended 0 again, duplicating index 0 in the list. Start the scan at 1.
    # (Also fixed: `np` was used without `import numpy as np` at the top.)
    best_bandit_val = means[0]
    best_bandit = [0]
    for i in range(1, k):
        if means[i] > best_bandit_val:
            best_bandit = [i]
            best_bandit_val = means[i]
        elif means[i] == best_bandit_val:
            best_bandit.append(i)

    INITIAL_VAL = 0  # starting value estimate handed to every policy

    # Policies under comparison: epsilon-greedy at several exploration rates
    # plus UCB with exploration coefficient 1.
    policies = [
        epsi_greedyPolicy(0.5, range(k), INITIAL_VAL),
        epsi_greedyPolicy(0.1, range(k), INITIAL_VAL),
        epsi_greedyPolicy(0.01, range(k), INITIAL_VAL),
        epsi_greedyPolicy(0.0, range(k), INITIAL_VAL),
        ucbPolicy(1, range(k), INITIAL_VAL)
    ]
    num_policies = len(policies)