Example #1

import random

import torch
import cherry as ch

# `env`, `net`, `replay`, and the hyperparameters MAX_STEPS, BATCH_SIZE,
# and UPDATE_INTERVAL are assumed to be defined earlier in the script.
state, _ = env.reset()
state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)

# Episode bookkeeping (unused in this excerpt).
r, d, ep_ret, ep_len = 0, False, 0, 0

update_times = 0

for step in range(1, MAX_STEPS + 1):

    # Sample an action from the current policy; no gradients are needed
    # while collecting experience.
    with torch.no_grad():
        action = net.get_action(state)
    action = action.squeeze(0)

    # Gymnasium-style step API, matching the (state, info) reset above.
    next_state, reward, terminated, truncated, _ = env.step(action.numpy())
    done = terminated or truncated

    # Store the transition and advance the environment state.
    replay.append(state, action, reward, next_state, done)
    state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
    if done:
        break

    # Sample a mini-batch from the replay buffer every UPDATE_INTERVAL steps.
    if step % UPDATE_INTERVAL == 0:
        sample = random.sample(replay, BATCH_SIZE)
        batch = ch.ExperienceReplay(sample)
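The excerpt stops right after the mini-batch is assembled. cherry's ExperienceReplay exposes the stacked transition tensors via `batch.state()`, `batch.action()`, `batch.reward()`, `batch.next_state()`, and `batch.done()`. The sketch below is only illustrative: the `q_net`, `target_net`, `optimizer`, and `GAMMA` names and the DQN-style TD target are assumptions, since the original excerpt ends before the update step.

# Hypothetical DQN-style update from the sampled batch; q_net, target_net,
# optimizer, and GAMMA are placeholders, not from the original example.
import torch.nn.functional as F

with torch.no_grad():
    # Bootstrap from the target network, masking terminal transitions.
    next_q = target_net(batch.next_state()).max(dim=1, keepdim=True)[0]
    targets = batch.reward() + GAMMA * (1.0 - batch.done()) * next_q

q_values = q_net(batch.state()).gather(1, batch.action().long())
loss = F.mse_loss(q_values, targets)

optimizer.zero_grad()
loss.backward()
optimizer.step()
update_times += 1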
Example #2
early_stop = False
# Checkpoint path for saving the trained model.
PATH = "saved_models/model_ppo_pendulum.pt"

# `envs` (vectorized environments) and `model` are assumed to be created
# earlier; reset once before collecting rollouts.
state = envs.reset()
frame_idx = 0

while not early_stop:

    # Rollout storage for one batch of NB_STEP environment steps.
    log_probs = []
    values = []
    states = []
    actions = []
    rewards = []
    masks = []

    for _ in range(NB_STEP):
        state = torch.FloatTensor(state)
        value = model.predict_value(state)
        action = model.get_action(state)
        action = action.squeeze(0)
        next_state, reward, done, _ = envs.step(action.cpu().numpy())
        log_prob = model.get_log_prob(state, action)

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1))
        # Masks zero out bootstrapped values at episode boundaries.
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1))

        states.append(state)
        actions.append(action)

        state = next_state
        frame_idx += 1
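The excerpt ends once the rollout is collected. In PPO the usual next step is to bootstrap from the value of the final state and compute returns and advantages; the generalized advantage estimation (GAE) sketch below is a standard formulation added for context, not code from the original, and the `gamma`/`lam` values are typical defaults.

# A standard GAE sketch (not part of the original excerpt): bootstrap from
# the value of the state following the rollout, then walk the rollout
# backwards accumulating discounted TD errors.
next_state = torch.FloatTensor(next_state)
next_value = model.predict_value(next_state)

def compute_gae(next_value, rewards, masks, values, gamma=0.99, lam=0.95):
    values = values + [next_value]
    gae = 0
    returns = []
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * lam * masks[t] * gae
        returns.insert(0, gae + values[t])
    return returns

returns = compute_gae(next_value, rewards, masks, values)
advantages = torch.cat(returns).detach() - torch.cat(values).detach()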