Example #1
File: rl_toy.py Project: NagisaZj/meta
def main(args, log_dir):
    # Build the learner network and one bandit environment per training task.
    learner = network(args.num_layers, args.num_hidden, args.num_bandits)
    train_envs = []
    test_envs = []
    for i in range(args.num_tasks_train):
        train_envs.append(bandit(args, i))
    for i in range(args.num_tasks_test):
        # Test tasks are built from a randomly drawn arm index.
        good_num = np.random.randint(0, args.num_bandits)
        test_envs.append(bandit(args, good_num))
    #learn(learner,args,train_envs,test_envs,log_dir)
    test(learner, args, train_envs, test_envs, log_dir)
    #save(args)

    return
Example #2
def generate_graph(todo):
    start = time.time()
    #jeeeezzz
    inst = 0
    for i in instances:
        print(i)
        for algo in todo:
            #f = open("../data-for-graph/"+"i-"+i[-5]+"/"+str(algo)+".txt","a")
            #g = open("../data-for-graph/"+str(algo)+".txt","a")
            #g.write(i+"\n")
            print(algo)
            for hz in horizons:
                regret = 0.0
                for seed in range(50):
                    args = [i, algo, seed, epsilon, hz]
                    bandit_instance = bandit(args)
                    REG = bandit_instance.run()
                    regret += REG
                    #write file
                    #f.write(i+' '+algo+' '+str(hz)+' '+str(seed)+' '+str(REG)+'\n')
                    #print progress
                    sys.stdout.write("\rseed: %i, time elapsed %.2f" %
                                     (seed, (time.time() - start)))
                    sys.stdout.flush()
                regret /= 50.0
                print("\nhorizon:", hz, "Regret:", regret)
                #g.write(i+"  ----"+str(regret)+"\n")
            #f.close()
            #g.close()
    print("time taken:", time.time() - start)
Example #3
def upperConfidenceBound(ucb):
    averageRewards = np.zeros(ts)

    for num in range(run):
        bnd = bandit(variance=variance, min=limMin, max=limMax, ucb=ucb)

        for t in range(ts):
            averageRewards[t] += bnd.takeAction()

    return averageRewards / run
Example #4
def optimisticInitialValues(initial):
    averageRewards = np.zeros(ts)

    for num in range(run):
        bnd = bandit(variance=variance, min=limMin, max=limMax, initial=initial)

        for t in range(ts):
            averageRewards[t] += bnd.takeAction()

    return averageRewards / run
Example #5
def epsilonGreedyPolicy(epsilon):
    averageRewards = np.zeros(ts)

    for num in range(run):
        bnd = bandit(variance=variance, min=limMin, max=limMax, epsilon=epsilon)

        for t in range(ts):
            averageRewards[t] += bnd.takeAction()

    return averageRewards / run
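Examples #3-#5 all rely on a `bandit` class that the snippets do not show. The sketch below is not taken from any of the projects above; it is a minimal stand-in whose constructor keywords (`variance`, `min`, `max`, `epsilon`, `ucb`, `initial`) and `takeAction()` method are inferred from the calls, so the three functions can be run end to end.

import numpy as np

class bandit:
    # Stand-in Gaussian k-armed bandit with epsilon-greedy, UCB, or
    # optimistic-initial-value action selection (not the original class).
    # The `min`/`max` parameter names mirror the keyword calls above.
    def __init__(self, variance=1.0, min=0.0, max=1.0,
                 epsilon=0.0, ucb=None, initial=0.0, k=10):
        self.variance = variance
        self.epsilon = epsilon
        self.ucb = ucb
        self.k = k
        self.means = np.random.uniform(min, max, k)   # true arm means
        self.estimates = np.full(k, float(initial))   # value estimates
        self.counts = np.zeros(k)
        self.t = 0

    def takeAction(self):
        self.t += 1
        if self.ucb is not None:
            # UCB-style exploration bonus; unpulled arms keep a huge bonus
            bonus = self.ucb * np.sqrt(np.log(self.t) /
                                       np.maximum(self.counts, 1e-12))
            arm = int(np.argmax(self.estimates + bonus))
        elif np.random.rand() < self.epsilon:
            arm = np.random.randint(self.k)           # explore
        else:
            arm = int(np.argmax(self.estimates))      # exploit
        reward = np.random.normal(self.means[arm], np.sqrt(self.variance))
        self.counts[arm] += 1
        # incremental sample-average update of the chosen arm's estimate
        self.estimates[arm] += (reward - self.estimates[arm]) / self.counts[arm]
        return reward

With module-level settings such as `ts`, `run`, `variance`, `limMin`, and `limMax` defined, `upperConfidenceBound`, `optimisticInitialValues`, and `epsilonGreedyPolicy` then return averaged learning curves unchanged.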
Example #6
def generate_graph(todo):
    start = time.time()
    inst = 0
    for i in instances:
        print(i)
        x = {}
        y = {}
        for algo in todo:
            x[algo] = []
            y[algo] = []
            f = open("outputDataT2.txt", "a")
            #f = open("../data-for-graph/"+"i-"+i[-5]+"/"+str(algo)+".txt","a")
            #g = open("../data-for-graph/"+str(algo)+".txt","a")
            #g.write(i+"\n")
            #print(algo)
            for hz in horizons:
                regret = 0.0
                for seed in range(50):
                    args = [i, algo, seed, epsilon, hz]
                    bandit_instance = bandit(args)
                    REG = bandit_instance.run()
                    regret += REG
                    #write file
                    f.write(i + ', ' + algo + ', ' + str(seed) + ', ' +
                            str(epsilon) + ', ' + str(hz) + ', ' + str(REG) +
                            '\n')
                    #print progress
                    # sys.stdout.write("\rseed: %i, time elapsed %.2f" % (seed ,(time.time()-start)))
                    # sys.stdout.flush()
                regret /= 50.0
                x[algo].append(math.log(hz))
                y[algo].append(regret)
                #print("\nhorizon:", hz, "Regret:", regret)
                #g.write(i+"  ----"+str(regret)+"\n")
            f.close()
        plt.title("Task 2 Comparison for: " + i)
        plt.plot(x[todo[0]], y[todo[0]], label='thompson-sampling')
        plt.plot(x[todo[1]], y[todo[1]], label='thompson-sampling-with-hint')
        plt.legend(loc='upper left', frameon=True)
        plt.ylabel('Regret')
        plt.xlabel('Horizon (log scale)')
        plt.show()
        #g.close()
    print("time taken:", time.time() - start)
Example #7
def generate_graph(todo):
    start = time.time()
    #jeeeezzz
    inst = 0
    for i in instances:
        print(i)
        print(todo)
        for epsilon in epsilons:
            regret = 0.0
            for seed in range(50):
                args = [i, todo, seed, epsilon, horizon]
                bandit_instance = bandit(args)
                REG = bandit_instance.run()
                regret += REG
                sys.stdout.write("\rseed: %i, time elapsed %.2f" %
                                 (seed, (time.time() - start)))
                sys.stdout.flush()
            regret /= 50.0
            print("\nepsilon:", epsilon, "Regret:", regret)
    print("time taken:", time.time() - start)
Example #8
#asks user for number of enemies
numEnemies = int(input('How many monsters will %s battle?\n' % Hero.getName()))
#list that contains enemies
enemyList = []
#list that contains random numbers that are associated with enemies
enemyNumList = []
#sets a random seed with value 0
random.seed(0)

#randomly puts enemies in enemyList, but does so with regards to varying enemy strengths
for i in range(numEnemies):
    enemyNumList.append(random.randint(1, 14))
for i in range(numEnemies):
    Y = enemyNumList[i]
    if Y <= 5:
        enemyList.append(bandit())
    elif Y <= 9:
        enemyList.append(soldier76())
    elif Y <= 12:
        enemyList.append(zergling())
    else:
        enemyList.append(illidan())

#sets maximum health for hero as starting health
startHealth = Hero.getHealth()

#hero and enemy combat, one by one
for enemy in enemyList:
    print('\nYou have encountered a %s!' % enemy)
    enemyHealth = enemy.getHealth()
    while True:
Example #9
import numpy as np
import bandit as b
import matplotlib.pyplot as plt

#constants
numArms = 10
excess = 0.2
neuronsPerArm = 1
epsilon = 0.08
tEpoch = 128
epochs = 2000

#setup and run bandit
p_reward = b.pick_weights(numArms, excess)
bestarm = np.argmax(p_reward)
btest = b.bandit(numArms, neuronsPerArm, tEpoch, epochs,
                 probabilities=p_reward, epsilon=epsilon, recordWeights=False)
(choices, rewards, spikes) = btest.run(epochs)
btest.stop()

print("Plotting results...")

#plot
fig = plt.figure()
plt.scatter(np.arange(epochs), choices, alpha=0.1)
plt.ylabel("Arm Chosen")
plt.xlabel("Time step")
fig.savefig("choices_over_time.png")

fig = plt.figure()
plt.plot(p_reward/100)
plt.hist(choices, bins=np.arange(numArms)-0.5, density=True)
Example #10
import numpy as np

from bandit import bandit
from epsi_greedyPolicy import epsi_greedyPolicy
from ucbPolicy import ucbPolicy

if __name__ == "__main__":
    k = 15  # Number of bandits
    NUM_CYCLES = 500  # Number of cycles

    # Initialize bandits
    bandits = []
    means = np.random.randint(-3, high=3, size=k)
    best_bandit = [0]
    best_bandit_val = means[0]
    for i in range(k):
        bandits.append(bandit(means[i], 1))
        if means[i] > best_bandit_val:
            best_bandit = [i]
            best_bandit_val = means[i]
        elif means[i] == best_bandit_val:
            best_bandit.append(i)

    INITIAL_VAL = 0
    policies = [
        epsi_greedyPolicy(0.5, range(k), INITIAL_VAL),
        epsi_greedyPolicy(0.1, range(k), INITIAL_VAL),
        epsi_greedyPolicy(0.01, range(k), INITIAL_VAL),
        epsi_greedyPolicy(0.0, range(k), INITIAL_VAL),
        ucbPolicy(1, range(k), INITIAL_VAL)
    ]
    num_policies = len(policies)
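Example #10 stops before the evaluation loop. A hypothetical continuation is sketched below; the choose(), pull(), and update() method names are assumptions made for illustration, not the actual API of the bandit, epsi_greedyPolicy, or ucbPolicy classes in that project.

    # Hypothetical continuation -- method names below are assumptions, not the
    # project's actual bandit/epsi_greedyPolicy/ucbPolicy interface.
    rewards = np.zeros((num_policies, NUM_CYCLES))
    for p, policy in enumerate(policies):
        for t in range(NUM_CYCLES):
            arm = policy.choose()            # policy picks an arm index
            reward = bandits[arm].pull()     # sample that arm's Gaussian reward
            policy.update(arm, reward)       # update the policy's value estimate
            rewards[p, t] = reward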