Example #1
import torch.nn as nn
import torch.optim as optim

# value_net, target_value_net, soft_q_net1, soft_q_net2 and policy_net are
# assumed to be defined earlier in the example; the target value network is
# initialised as a hard copy of the value network.
for target_param, param in zip(target_value_net.parameters(), value_net.parameters()):
    target_param.data.copy_(param.data)


# MSE criteria for the value network and the two soft Q-networks
value_criterion = nn.MSELoss()
soft_q_criterion1 = nn.MSELoss()
soft_q_criterion2 = nn.MSELoss()

value_lr = 3e-4
soft_q_lr = 3e-4
policy_lr = 3e-4

value_optimizer = optim.Adam(value_net.parameters(), lr=value_lr)
soft_q_optimizer1 = optim.Adam(soft_q_net1.parameters(), lr=soft_q_lr)
soft_q_optimizer2 = optim.Adam(soft_q_net2.parameters(), lr=soft_q_lr)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)


# experience replay buffer (ReplayBuffer is assumed to be defined elsewhere in the example)
replay_buffer_size = 1000000
replay_buffer = ReplayBuffer(replay_buffer_size)


max_frames = 40000
max_steps = 500
frame_idx = 0
rewards = []
batch_size = 128

while frame_idx < max_frames:
    state = env.reset()
    episode_reward = 0
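    # --- The original snippet is cut off here. What follows is a hedged sketch
    # --- of how an episode loop like this typically continues; the helpers
    # --- policy_net.get_action(), replay_buffer.push()/len() and update() are
    # --- assumptions that do not appear anywhere in this example.
    for step in range(max_steps):
        # sample an action from the current policy (assumed helper)
        action = policy_net.get_action(state)
        next_state, reward, done, _ = env.step(action)

        # store the transition and advance the counters
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
        frame_idx += 1

        # once enough transitions are stored, run one gradient step; update()
        # is assumed to apply the value, soft-Q and policy losses built from
        # the criteria and optimizers defined above
        if len(replay_buffer) > batch_size:
            update(batch_size)

        if done:
            break

    rewards.append(episode_reward)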
Example #2
def run_iterations(args):
    # Init model; the default sizes (16 states, 4 actions) match FrozenLake-v0
    state_size = 16
    action_size = 4
    if args.env == "MountainCar-v0":
        state_size = 2
        action_size = 3
    if args.env == "Freeway-ram-v0":
        state_size = 128
        action_size = 3
    if args.env == "CartPole-v0":
        state_size = 4
        action_size = 2
    model = PolicyNetwork(state_size, action_size)
    optimizer = optim.Adam(model.parameters(), args.learning_rate)
    start_n = 4
    reward_per_iteration = []
    for i in range(args.max_iterations):
        # args.demo decides whether to reset normally or start from a demo state
        if not args.demo:
            state = to_tensor(ENV.reset(), state_size)
        else:
            # start_n: take the n-th state of the demo to use as the start state
            # how to choose this together with max_iterations: run each start state a few times or only once?
            start_state = get_start_state(ENV, args.env, start_n)
            # problem: one environment exposes ENV.env.s and the other ENV.env.state; maybe there is a more elegant solution?
            if args.env == "FrozenLake-v0":
                ENV.env.s = start_state
                state = to_tensor(ENV.env.s, state_size)
            else:
                ENV.env.state = start_state
                state = to_tensor(ENV.env.state, state_size)
        reward_per_episode = []
        episode_loss = 0
        for step in range(args.max_steps):
            if args.render: ENV.render()
            action = select_action(model, state, get_epsilon(i), action_size)
            next_state, reward, done, _ = ENV.step(action)  # take the selected action in the environment
            # compute the q value
            q_val = compute_q_val(model, state, action)


            with torch.no_grad():  # Don't compute gradient info for the target (semi-gradient)
                next_state = to_tensor(next_state, state_size)
                target = compute_target(model, reward, next_state, done, args.discount_factor)

            # loss is measured from error between current and newly expected Q values
            loss = F.smooth_l1_loss(q_val, target)

            # backpropagation of loss to Neural Network (PyTorch magic)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            episode_loss += loss.item()  # accumulate as a plain float (loss is a 0-dim tensor)
            state = next_state
            reward_per_episode.append(reward)
            if done: break

        if i % args.print_every == 0:
            print("Reward", reward, sum(reward_per_episode))
            print("Step {:6d} with loss: {:4f}".format(i, episode_loss))
        reward_per_iteration.append(reward_per_episode)
    return reward_per_iteration
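
run_iterations relies on several helpers that are not included in this snippet (to_tensor, select_action, get_epsilon, get_start_state, compute_q_val, compute_target, plus the module-level ENV, torch, optim and F imports). Purely as a hedged illustration, assuming PolicyNetwork maps a state tensor to a vector of Q-values, the two Q-value helpers might look roughly like this; the real implementations may differ:

import torch
import torch.nn.functional as F

def compute_q_val(model, state, action):
    # Q-value of the action that was actually taken (state is a 1-D tensor)
    q_values = model(state.unsqueeze(0))   # shape: (1, action_size)
    return q_values[0, action]

def compute_target(model, reward, next_state, done, discount_factor):
    # semi-gradient TD target: r + gamma * max_a Q(s', a), with no bootstrap at terminal states
    next_q = model(next_state.unsqueeze(0)).max(dim=1)[0].squeeze()
    return reward + discount_factor * next_q * (1 - int(done))

Because the training loop evaluates compute_target inside torch.no_grad(), gradients only flow through compute_q_val, which is exactly the semi-gradient behaviour mentioned in the comment above.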