Example No. 1
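    # Attach keyboard handlers to the Gym viewer window so a human can drive
    # the agent while the loop below records transitions into `samples`.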
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release

    a = np.array([0])

    episode_rewards = []
    steps = 0
    while True:
        episode_reward = 0
        state = env.reset()
        state_img = env.render(
            mode="rgb_array")[::4, ::4, :]  # downsampling (every 4th pixel).

        while True:

            next_state, r, done, info = env.step(a[0])
            next_state_img = env.render(mode="rgb_array")[::4, ::4, :]

            episode_reward += r

            samples["state"].append(state)  # state has shape (8,)
            samples["state_img"].append(
                state_img)  # state_img has shape (100, 150, 3)
            samples["action"].append(np.array(a))
            samples["next_state"].append(next_state)
            samples["next_state_img"].append(next_state_img)
            samples["reward"].append(r)
            samples["terminal"].append(done)

            state = next_state
            state_img = next_state_img
Example No. 2
def plot_io_bounds(x, y, vx, vy, theta, omega, a, steps, discrete=True):
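    """Compare three lander trajectories over `steps` steps: the identified
    (center) model, the actual OpenAI Gym environment, and an interval
    overapproximation of the reachable states.

    x, y, vx, vy, theta and omega are [lower, upper] bounds on each state
    component; `a` is an int action for the discrete env, array-like otherwise.
    """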
    import matplotlib.pyplot as plt
    import numpy as np

    statebox = [x, y, vx, vy, theta, omega]
    centerstate = [box[0] + .5 * (box[1] - box[0]) for box in statebox]
    envstate = [i for i in centerstate]

    # Zero order hold on actions if needed
    if discrete and isinstance(a, int):
        a = a * np.ones(steps, dtype=np.int32)
    elif not discrete:
        a = [np.array(a) for i in range(steps)]

    # System IDed model trajectory
    centerstatehist = [centerstate]
    for i in range(steps):
        centerstate = lander_dynamics(*centerstate, a=a[i], discrete=discrete)
        centerstatehist.append(centerstate)

    # Actual OpenAI Gym model trajectory
    envstatehist = [envstate]
    if discrete:
        from lunar_lander import LunarLander
        env = LunarLander()
    else:
        from lunar_lander import LunarLanderContinuous
        env = LunarLanderContinuous()
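    # Note: this custom lunar_lander environment is assumed to accept an
    # initial state in reset(); the stock gym API takes no argument here.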
    s = env.reset(envstate)
    for i in range(steps):
        s, _, _, _ = env.step(a[i])
        envstatehist.append(s[0:6])

    # Overapproximated trajectory
    stateboxhist = [statebox]
    for i in range(steps):
        statebox = lander_box_dynamics(*statebox,
                                       a=a[i],
                                       steps=1,
                                       discrete=discrete)
        stateboxhist.append(statebox)

    centerstatehist = np.array(centerstatehist)
    envstatehist = np.array(envstatehist)
    stateboxhist = np.array(stateboxhist)

    t = np.linspace(0, steps, steps + 1)
    fig, axs = plt.subplots(6, 1, figsize=(4, 9))

    # fig.set_size_inches(5,7,forward=True)

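    # y-axis limits per state dimension: x, y, vx, vy, theta, omega.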
    limits = [[-1, 1], [0, 1], [-1, 1], [-1, 1], [-np.pi / 3, np.pi / 3],
              [-.5, .5]]
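    # One subplot per state dimension: shaded band for the interval bounds,
    # red line for the identified model, blue dots for the gym rollout.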
    for i in range(6):
        axs[i].fill_between(t,
                            stateboxhist[:, i, 0],
                            stateboxhist[:, i, 1],
                            alpha=0.3)
        axs[i].plot(centerstatehist[:, i], 'r')
        axs[i].plot(envstatehist[:, i], 'b.')
        axs[i].set_ylim(bottom=limits[i][0], top=limits[i][1])
        axs[i].set_yticks(np.linspace(limits[i][0], limits[i][1], 17),
                          minor=True)
        axs[i].grid(which='minor', alpha=.4)

    axs[0].set_title('Action {0}'.format(a))
    plt.show()
Example No. 3
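            # Map the E key to the LEFTFIRE action for human play.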
            if key == KEY.E:  _human_agent_action = ACTIONS.index('LEFTFIRE')


        def key_release(key, mod):
            global _human_agent_action
            _human_agent_action = ACTIONS.index('NOOP')


        env.render()
        env.unwrapped.viewer.window.on_key_press = key_press
        env.unwrapped.viewer.window.on_key_release = key_release

    # create models
    if RAM:
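        # Take one random step to discover the length of the RAM/state vector
        # that sizes the network input.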
        a = random.randrange(env.action_space.n)
        s, r, done, info = env.step(a)
        N_STATE = len(s)
        MODEL = LanderDQN if 'lunar' in opt.env else RamDQN
        policy_net = MODEL(N_STATE, N_ACTIONS).to(device)
        target_net = MODEL(N_STATE, N_ACTIONS).to(device)
    else:
        MODEL = DDQN if opt.dueling else DQN
        policy_net = MODEL(n_actions=N_ACTIONS).to(device)
        target_net = MODEL(n_actions=N_ACTIONS).to(device)

    # init target model
    target_net.load_state_dict(policy_net.state_dict())

    # setup optimizer
    optimizer = optim.RMSprop(policy_net.parameters(), lr=LR)
Example No. 4
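    # Roll out EPISODES episodes with the DQN agent calls commented out: the env
    # is stepped with a fixed action and rendered every 100th episode.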
    #agent = DQNAgent(state_size, action_size)
    done = False
    batch_size = 1
    if False:
        agent.load("model.dat")
    for e in range(EPISODES):
        #agent.load("../../Downloads/model_900.h5")
        #agent.epsilon = 0.0
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        tot_rew = 0
        for time in range(300):

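            # The learned policy is commented out; step with a fixed action (0).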
            #action = agent.act(state)
            action = 0
            next_state, reward, done, _ = env.step(action)
            tot_rew += reward
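            # Remember an early-episode state so it can be restored once the episode ends.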
            if time < 100:
                save_state = next_state
            #reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            #agent.remember(state, action, reward, next_state, done)
            state = next_state
            if e % 100 == 0:
                env.render()
                print(reward)
            if done:
                env.set_state(save_state)
                #print("episode: {}/{}, score: {}, time {}, e: {:.2}"
                #      .format(e, EPISODES, tot_rew/time, time, agent.epsilon))
                break