示例#1
0
def rollout(env, t_begin, t_end, taken, gamma=0.97):
    """Score one simulated continuation of ``env`` using ``greedy``.

    A snapshot of ``env`` taken at ``t_begin`` is populated from
    ``t_begin + 1`` up to ``t_end`` with a seed obtained from
    ``clock_seed()``. ``taken`` is added to the snapshot's removed
    container for ``t_begin``, and ``greedy`` is then run on the snapshot
    over the same ``t_begin + 1`` .. ``t_end`` range.

    Args:
        env: Environment handed to ``snapshot``.
        t_begin: Period at which the snapshot is taken.
        t_end: End-period argument forwarded to ``populate``, ``greedy``
            and ``get_n_matched``.
        taken: Collection merged into ``removed_container[t_begin]``; its
            length becomes entry 0 of the matched sequence.
        gamma: Forwarded unchanged to ``disc_mean``.

    Returns:
        The result of ``disc_mean`` applied to the matched sequence.
    """
    sim = snapshot(env, t_begin)
    next_t = t_begin + 1
    sim.populate(next_t, t_end, seed=clock_seed())
    sim.removed_container[t_begin].update(taken)

    greedy_result = greedy(sim, next_t, t_end)
    per_period = get_n_matched(greedy_result["matched"], t_begin, t_end)
    # Entry 0 is overwritten with the number of items taken at t_begin.
    per_period[0] = len(taken)
    return disc_mean(per_period, gamma)
def rollout(env, t_begin, t_end, taken, gamma):
    """Return the ``disc_mean`` score of a greedy run on a fresh snapshot.

    Steps, in order: snapshot ``env`` at ``t_begin``; populate the
    snapshot from ``t_begin + 1`` to ``t_end`` with a ``clock_seed()``
    seed; add ``taken`` to ``removed_container[t_begin]``; run ``greedy``
    from ``t_begin + 1`` to ``t_end``; overwrite entry 0 of the matched
    sequence with ``len(taken)``; pass the sequence and ``gamma`` to
    ``disc_mean``.

    An earlier, disabled variant computed the same quantity with
    ``optimal`` in place of ``greedy``; only the greedy value is returned.

    Args:
        env: Environment handed to ``snapshot``.
        t_begin: Period at which the snapshot is taken.
        t_end: End-period argument forwarded to the helpers.
        taken: Collection merged into ``removed_container[t_begin]``.
        gamma: Forwarded unchanged to ``disc_mean``.

    Returns:
        The value produced by ``disc_mean``.
    """
    world = snapshot(env, t_begin)
    start = t_begin + 1
    world.populate(start, t_end, seed=clock_seed())
    world.removed_container[t_begin].update(taken)

    outcome = greedy(world, start, t_end)
    counts = get_n_matched(outcome["matched"], t_begin, t_end)
    counts[0] = len(taken)

    return disc_mean(counts, gamma)
    # NOTE(review): these statements sit at function-body indentation
    # directly after a `return`, so as written they never execute. They
    # look like an experiment-script fragment -- confirm intended placement.

    # Random integer drawn from [0, 1e8), stored as a string.
    newseed = str(np.random.randint(1e8))
    train = True
    disc = 0.1

    # NOTE(review): torch.load unpickles the file; only load checkpoints
    # that come from a trusted source.
    net = torch.load("results/RNN_50-1-abo_4386504")

    #%%

    # Only a single value (2) is iterated; k is used as the environment seed.
    for k in [2]:

        print("Creating environment")
        env = ABOKidneyExchange(entry_rate, death_rate, time_length, seed=k)

        # Solve the same environment with both `optimal` and `greedy`.
        print("Solving environment")
        opt = optimal(env)
        gre = greedy(env)

        # Matched summaries over periods 0 .. env.time_length for each solver
        # (presumably per-period match counts -- confirm against get_n_matched).
        o = get_n_matched(opt["matched"], 0, env.time_length)
        g = get_n_matched(gre["matched"], 0, env.time_length)

        rewards = []
        actions = []
        # Placeholder; the loop below rebinds t on its first iteration.
        t = -1
        print("Beginning")
        #%%
        for t in range(env.time_length):

            living = np.array(env.get_living(t))
            # Skip periods with exactly one living entry.
            # NOTE(review): an empty `living` is NOT skipped by this test --
            # confirm that is intended.
            if len(living) == 1:
                continue