Example #1
def rollout(env, t_begin, t_end, taken, gamma=0.97):
    """Value ratio of taking the cycle `taken` at t_begin versus leaving it."""
    # Snapshot the environment at t_begin and simulate arrivals up to t_end.
    snap = snapshot(env, t_begin)
    snap.populate(t_begin + 1, t_end, seed=clock_seed())

    # Solve the offline matching problem twice over the simulated horizon:
    # once forced to take `taken` now, once forced to leave it in the pool.
    opt_take, opt_leave = compare_optimal(snap, t_begin + 1, t_end, set(taken))

    m_take = get_n_matched(opt_take["matched"], t_begin, t_end)
    m_leave = get_n_matched(opt_leave["matched"], t_begin, t_end)
    # Credit the vertices matched right now to the "take" trajectory.
    m_take[0] = len(taken)

    # Discounted mean number of matches along each trajectory.
    value_leave = disc_mean(m_leave, gamma)
    value_take = disc_mean(m_take, gamma)

    # Note: raises ZeroDivisionError if the "leave" trajectory matches nothing.
    return value_take / value_leave
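A caller compares the returned ratio against 1: values above 1 mean that matching the cycle now beats keeping its vertices in the pool. A minimal usage sketch; env, t, horizon, and cycle are assumed to exist in the caller's scope and are not names from the source.

take = rollout(env, t, t + horizon, cycle) > 1.0  # take the cycle only if it beats leaving it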
Example #2
def rollout(env, t_begin, t_end, taken, gamma=0.97):
    """Discounted value of taking `taken` at t_begin, completing the episode greedily."""
    # Snapshot, simulate arrivals up to t_end, and mark the taken vertices as removed.
    snap = snapshot(env, t_begin)
    snap.populate(t_begin + 1, t_end, seed=clock_seed())
    snap.removed_container[t_begin].update(taken)

    # Match the remainder of the horizon with the greedy policy.
    value = greedy(snap, t_begin + 1, t_end)
    matched = get_n_matched(value["matched"], t_begin, t_end)
    # Credit the vertices matched right now at t_begin.
    matched[0] = len(taken)

    return disc_mean(matched, gamma)
Example #3
def rollout(env, t_begin, t_end, taken, gamma):
    """Discounted value of taking `taken` at t_begin, completing the episode optimally."""
    # Snapshot, simulate arrivals up to t_end, and mark the taken vertices as removed.
    snap = snapshot(env, t_begin)
    snap.populate(t_begin + 1, t_end, seed=clock_seed())
    snap.removed_container[t_begin].update(taken)

    # Offline-optimal matching over the remainder of the horizon.
    opt = optimal(snap, t_begin + 1, t_end)
    opt_matched = get_n_matched(opt["matched"], t_begin, t_end)
    opt_matched[0] = len(taken)
    opt_value = disc_mean(opt_matched, gamma)

    # Alternative (disabled): subtract a greedy baseline from the optimal value.
    # g = greedy(snap, t_begin + 1, t_end)
    # g_matched = get_n_matched(g["matched"], t_begin, t_end)
    # g_matched[0] = len(taken)
    # g_value = disc_mean(g_matched, gamma)

    return opt_value  # - g_value
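Each call seeds the simulation with clock_seed(), so a single rollout is a noisy estimate and a caller would typically average several. A minimal sketch assuming the signature from Example #3; mc_value and n_iters are illustrative names, not from the source.

import numpy as np

def mc_value(env, t_begin, t_end, taken, gamma, n_iters=10):
    # Average several stochastic rollouts to reduce the variance of the estimate.
    return np.mean([rollout(env, t_begin, t_end, taken, gamma)
                    for _ in range(n_iters)])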
Example #4
net.optim.zero_grad()

# Train on the window that ends one full horizon before the current time t.
train_times = list(range(t - 2 * horizon, t - horizon))

# Copy the environment and undo the policy's removals at the training steps,
# so the offline-optimal baseline is computed on the same pool the policy saw.
env_opt = deepcopy(env)
for s in train_times:
    env_opt.removed_container[s].clear()

for s in train_times:
    # Offline-optimal value over a 2*horizon window serves as the baseline.
    opt_m = optimal(env_opt, t_begin=s, t_end=s + 2 * horizon)
    baseline = disc_mean(
        get_n_matched(opt_m["matched"], s, s + 2 * horizon),
        disc)

    # Discounted return the policy actually obtained over the next horizon.
    rs = disc_mean(rewards[s:s + horizon], disc)
    adv = rs - baseline

    # REINFORCE update with baseline (PyTorch 0.x stochastic-Tensor API).
    actions[s].reinforce(
        torch.FloatTensor(actions[s].size()).fill_(adv))

    # Re-apply the policy's actual removals before moving to the next step.
    env_opt.removed_container[s].update(env.removed_container[s])

autograd.backward([actions[s] for s in train_times])
net.optim.step()
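Tensor.reinforce() and stochastic Variables were removed after PyTorch 0.3, so this fragment only runs on old PyTorch. A hedged modernization sketch, not from the original repo: probs[s] (the policy output that produced actions[s]) and advantages[s] (the rs - baseline values computed above) are assumed placeholders; torch.distributions makes the same REINFORCE-with-baseline gradient explicit.

import torch
from torch.distributions import Bernoulli

loss = 0.0
for s in train_times:
    dist = Bernoulli(probs=probs[s])       # policy distribution that generated actions[s]
    log_prob = dist.log_prob(actions[s])   # actions[s]: the sampled 0/1 decisions
    loss = loss - (log_prob * advantages[s]).sum()  # REINFORCE: minimize -advantage * log pi

net.optim.zero_grad()
loss.backward()
net.optim.step()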