Exemplo n.º 1
0
    while episode < max_episodes:
        episode += 1
        total_reward = 0
        terminal = False
        step = 0
        state = convert(env.reset())
        history = History(history_depth, (1, 128))

        tmp_observations = []
        while not terminal:
            step += 1
            with t.no_grad():
                history.append(state)
                # agent model inference
                action = ppo.act({"mem": history.get()})[0]
                state, reward, terminal, _ = env.step(action.item())
                state = convert(state)
                total_reward += reward

                old_history = history.get()
                new_history = history.append(state).get()
                tmp_observations.append({
                    "state": {
                        "mem": old_history
                    },
                    "action": {
                        "action": action
                    },
                    "next_state": {
                        "mem": new_history
Exemplo n.º 2
0
        episode += 1
        total_reward = 0
        terminal = False
        step = 0
        hidden = t.zeros([1, 1, 256])
        state = convert(env.reset())

        tmp_observations = []
        while not terminal:
            step += 1
            with t.no_grad():
                old_state = state
                # agent model inference
                old_hidden = hidden
                action, _, _, hidden = rppo.act({
                    "mem": state,
                    "hidden": hidden
                })
                state, reward, terminal, _ = env.step(action.item())
                state = convert(state)
                total_reward += reward

                tmp_observations.append({
                    "state": {
                        "mem": old_state,
                        "hidden": old_hidden
                    },
                    "action": {
                        "action": action
                    },
                    "next_state": {
                        "mem": state,
Exemplo n.º 3
0
    # Running reward statistic; it is only initialised in this span
    # (presumably updated by code after the loop body shown here).
    smoothed_total_reward = 0

    # Outer loop: one iteration per episode, until `max_episodes` is reached.
    while episode < max_episodes:
        episode += 1
        total_reward = 0
        terminal = False
        step = 0
        # Reshape the initial observation into a (1, observe_dim) float32
        # tensor so it can be fed to the agent as a batch of one.
        state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

        # Transitions of the current episode, handed to the agent as a whole
        # once the episode ends.
        tmp_observations = []
        # Inner loop: run at most `max_steps` environment steps.
        # The bound is strict (`<`) so that the transition flagged terminal
        # below (step == max_steps) really is the last one stored; with `<=`
        # one extra step was executed and appended after it.
        while not terminal and step < max_steps:
            step += 1
            # Rollout only: no gradients are needed while collecting data.
            with t.no_grad():
                old_state = state
                # agent model inference; the first element of the result is
                # used as the action
                action = ppo.act({"state": old_state})[0]
                # `action.item()` hands the environment a plain Python scalar
                # (assumes `action` is a single-element tensor).
                state, reward, terminal, _ = env.step(action.item())
                state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
                total_reward += reward

                tmp_observations.append({
                    "state": {"state": old_state},
                    "action": {"action": action},
                    "next_state": {"state": state},
                    "reward": reward,
                    # Treat hitting the step limit as the end of the episode
                    # in addition to the environment's own terminal signal.
                    "terminal": terminal or step == max_steps
                })

        # update
        # Store the finished episode, then run one training update on it.
        ppo.store_episode(tmp_observations)
        ppo.update()