def rollout(env, t_begin, t_end, taken, gamma=0.97):
    """Monte-Carlo estimate of the value ratio of taking vs. leaving *taken*.

    Snapshots *env* at *t_begin*, simulates arrivals through *t_end*, and
    solves the offline-optimal matching both with and without *taken*
    committed. Returns value_take / value_leave under discount *gamma*
    (a ratio above 1 favors taking).
    """
    sim = snapshot(env, t_begin)
    sim.populate(t_begin + 1, t_end, seed=clock_seed())
    take_sol, leave_sol = compare_optimal(sim, t_begin + 1, t_end, set(taken))
    take_counts = get_n_matched(take_sol["matched"], t_begin, t_end)
    leave_counts = get_n_matched(leave_sol["matched"], t_begin, t_end)
    # Credit the already-committed matches at the first period of the window.
    take_counts[0] = len(taken)
    leave_value = disc_mean(leave_counts, gamma)
    take_value = disc_mean(take_counts, gamma)
    return take_value / leave_value
def rollout(env, t_begin, t_end, taken, gamma=0.97):
    """Greedy-rollout estimate of the discounted matching value after *taken*.

    Snapshots *env* at *t_begin*, simulates forward to *t_end*, marks the
    elements in *taken* as already removed, and runs the greedy policy over
    the remainder. Returns the gamma-discounted mean of per-period match
    counts, with period 0 credited with ``len(taken)``.
    """
    sim = snapshot(env, t_begin)
    sim.populate(t_begin + 1, t_end, seed=clock_seed())
    # Commit the chosen elements before the greedy continuation runs.
    sim.removed_container[t_begin].update(taken)
    result = greedy(sim, t_begin + 1, t_end)
    per_period = get_n_matched(result["matched"], t_begin, t_end)
    per_period[0] = len(taken)
    return disc_mean(per_period, gamma)
def rollout(env, t_begin, t_end, taken, gamma=0.97):
    """Optimal-rollout estimate of the discounted matching value after *taken*.

    Snapshots *env* at *t_begin*, simulates forward to *t_end*, marks the
    elements in *taken* as already removed, and solves the offline-optimal
    matching over the remainder. Returns the gamma-discounted mean of
    per-period match counts, with period 0 credited with ``len(taken)``.

    Changes vs. original: removed the commented-out greedy-baseline dead
    code and the redundant ``r`` intermediate; gave ``gamma`` the same
    0.97 default as the sibling rollout variants (backward-compatible).
    """
    snap = snapshot(env, t_begin)
    snap.populate(t_begin + 1, t_end, seed=clock_seed())
    # Commit the chosen elements before solving the continuation.
    snap.removed_container[t_begin].update(taken)
    opt = optimal(snap, t_begin + 1, t_end)
    opt_matched = get_n_matched(opt["matched"], t_begin, t_end)
    opt_matched[0] = len(taken)
    return disc_mean(opt_matched, gamma)
# REINFORCE-style policy-gradient update over the previous decision window.
# NOTE(review): relies on the legacy torch stochastic-node API
# (Variable.reinforce / autograd.backward on action nodes), removed in
# modern PyTorch — confirm the pinned torch version supports it.
net.optim.zero_grad()
# Train on the window [t - 2*horizon, t - horizon): old enough that the
# full 2*horizon lookahead needed for the baseline is available.
train_times = list(range(t - 2 * horizon, t - horizon))
env_opt = deepcopy(env)
# Clear the policy's removals in the window so the offline optimal can be
# recomputed from a clean slate, period by period.
for s in train_times:
    env_opt.removed_container[s].clear()
for s in train_times:
    # Baseline: discounted value of the offline-optimal matching over a
    # 2*horizon lookahead starting at s.
    opt_m = optimal(env_opt, t_begin=s, t_end=s + 2 * horizon)
    baseline = disc_mean(
        get_n_matched(opt_m["matched"], s, s + 2 * horizon),
        disc)
    # Empirical discounted return of the policy's own actions from s.
    rs = disc_mean(rewards[s:s + horizon], disc)
    # Advantage = actual return minus optimal baseline (typically <= 0).
    adv = rs - baseline
    # Scale the gradient of the sampled action node by the advantage.
    actions[s].reinforce(
        torch.FloatTensor(actions[s].size()).fill_(adv))
    # Re-apply the actually-taken removals at s so the next period's
    # baseline is conditioned on the true history up to s.
    env_opt.removed_container[s].update(
        env.removed_container[s])
# Backprop through all reinforced action nodes at once, then step.
autograd.backward([actions[s] for s in train_times])
net.optim.step()