def main():
    env = gym.make("PongNoFrameskip-v4")
    # The ScaledFloatFrame wrapper can be removed here if it is not needed.
    from atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame
    env = ScaledFloatFrame(wrap_dqn(env))
    model = cnn_to_dist(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                        hiddens=[256],
                        num_atoms=50,
                        dueling=True)
Example #2
def main():
    env = gym.make("PongNoFrameskip-v4")
    # The ScaledFloatFrame wrapper can be removed here if it is not needed.
    from atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame
    env = ScaledFloatFrame(wrap_dqn(env))
    # model = cnn_to_mlp(
    #     convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
    #     hiddens=[256],
    #     dueling=True
    # )
    # act = learn(
    #     env,
    #     q_func=model,
    #     lr=1e-4,
    #     max_timesteps=2000000,
    #     buffer_size=10000,
    #     exploration_fraction=0.1,
    #     exploration_final_eps=0.01,
    #     train_freq=4,
    #     learning_starts=10000,
    #     target_network_update_freq=1000,
    #     gamma=0.99,
    #     prioritized_replay=False
    # )    
    num_atoms = 51
    model = cnn_to_dist(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[512],
        num_atoms=num_atoms,
        dueling=False
    )
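    # Categorical/C51-style settings: the return distribution is represented by
    # num_atoms support points, with V_max bounding the value support.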
    act = dist_learn(
        env,
        q_dist_func=model,
        num_atoms=num_atoms,
        V_max=10.0,
        lr=1e-4,
        max_timesteps=20000000,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=False
    )    

    act.save("pong_model.pkl")
    env.close()
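
A rollout with the saved policy might look like the sketch below. It assumes the object returned by dist_learn (and saved to "pong_model.pkl") follows the baselines deepq ActWrapper interface, i.e. it is callable on a batched observation; dist_load is a hypothetical loader used here by analogy with deepq.load.

def enjoy():
    # Rebuild the same observation pipeline the policy was trained on.
    env = gym.make("PongNoFrameskip-v4")
    env = ScaledFloatFrame(wrap_dqn(env))
    act = dist_load("pong_model.pkl")  # hypothetical, mirroring deepq.load
    while True:
        obs, done = env.reset(), False
        episode_rew = 0.0
        while not done:
            env.render()
            # act expects a batch dimension and returns a batch of actions.
            obs, rew, done, _ = env.step(act(np.array(obs)[None])[0])
            episode_rew += rew
        print("Episode reward:", episode_rew)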
Example #3
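This snippet omits its imports. A plausible set, assuming the 2017-era OpenAI Baselines deepq layout that this code appears to target (module paths may differ in other versions), is:

import gym
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

import baselines.common.tf_util as U  # make_session, initialize, BatchInput
from baselines.common.schedules import LinearSchedule
from baselines.deepq import models, build_graph
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
from atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame
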
def main():

    #    env = gym.make("CartPole-v0")
    #    env = gym.make("CartPole-v1")
    #    env = gym.make("Acrobot-v1")
    #    env = gym.make("MountainCar-v0")
    #    env = gym.make("FrozenLake-v0")
    #    env = gym.make("FrozenLake8x8-v0")
    env = gym.make("PongNoFrameskip-v4")
    env = ScaledFloatFrame(wrap_dqn(env))

    #    robShape = (2,)
    #    robShape = (3,)
    #    robShape = (200,)
    #    robShape = (16,)
    #    robShape = (64,)
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)
#        return U.BatchInput(robShape, name=name)

#    # these params are specific to mountaincar
#    def getOneHotObs(obs):
#        obsFraction = (obs[0] + 1.2) / 1.8
#        idx1 = np.int32(np.trunc(obsFraction*100))
#        obsFraction = (obs[1] + 0.07) / 0.14
#        idx2 = np.int32(np.trunc(obsFraction*100))
#        ident = np.identity(100)
#        return np.r_[ident[idx1,:],ident[idx2,:]]

# these params are specific to frozenlake

    def getOneHotObs(obs):
        #        ident = np.identity(16)
        ident = np.identity(64)
        return ident[obs, :]

#    model = models.mlp([32])
#    model = models.mlp([64])
#    model = models.mlp([64], layer_norm=True)
#    model = models.mlp([16, 16])

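    # Nature-DQN convolutional stack with a dueling head over a 256-unit hidden layer.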
    model = models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                              hiddens=[256],
                              dueling=True)

    # parameters
    q_func = model
    #    lr=1e-3
    lr = 1e-4
    max_timesteps = 2000000
    #    max_timesteps=100000
    #    max_timesteps=50000
    #    buffer_size=50000
    buffer_size = 100000
    exploration_fraction = 0.1
    #    exploration_fraction=0.3
    exploration_final_eps = 0.01
    #    exploration_final_eps=0.02
    #    exploration_final_eps=0.1
    #    train_freq=1
    train_freq = 4
    batch_size = 32
    print_freq = 10
    checkpoint_freq = 10000
    #    learning_starts=1000
    learning_starts = 10000
    #    gamma=1.0
    gamma = 0.99
    #    target_network_update_freq=500
    target_network_update_freq = 1000
    #    prioritized_replay=False
    prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

    #    # try mountaincar w/ different input dimensions
    #    inputDims = [50,2]

    sess = U.make_session(num_cpu)
    sess.__enter__()

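    # build_train returns callables: act (epsilon-greedy action selection),
    # train (one gradient step on the TD error), update_target (sync the target
    # network with the online weights), plus a dict of debug tensors.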
    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)

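    # Metadata an ActWrapper would need to reconstruct the policy when saving;
    # it is not used further in this snippet.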
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    #    obs = np.r_[env.reset(),0]
    #    obs = getOneHotObs(obs)

    #    with tempfile.TemporaryDirectory() as td:
    model_saved = False
    #        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):

        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        #        new_obs = getOneHotObs(new_obs)
        #        new_obs = np.r_[new_obs,0]

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            #            obs = getOneHotObs(obs)
            #            obs = np.r_[obs,0]
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size,
                                                  beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                              weights)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:

            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)

        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #        if done:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))))


#            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#                logger.record_tabular("steps", t)
#                logger.record_tabular("episodes", num_episodes)
#                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
#                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
#                logger.dump_tabular()
#        sess

    plt.plot(episode_rewards)
    plt.show()

    sess.close()
Example #4
#!/usr/bin/env python

import gym
import numpy as np
from atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame

def wrap_train(env):
    from atari_wrappers import (wrap_deepmind, FrameStack)
    env = wrap_deepmind(env, episode_life=False, clip_rewards=False)
    env = FrameStack(env, 4)
    return env

env = gym.make("PongNoFrameskip-v4")
env = ScaledFloatFrame(wrap_dqn(env))
# env = wrap_train(env)
obs = env.reset()

print(env.observation_space)
print(env.action_space)

print(len(obs), len(obs[0]), len(obs[0][0]))
action = env.action_space.sample()
print(action)

# print(len(observation))
# for _ in range(1000):
#     # env.render()
#     action = env.action_space.sample()  # your agent here (this takes random actions)
#     observation, reward, done, info = env.step(action)
#     print(action)
#     if done:
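
For reference, a runnable version of the commented-out random-agent loop above could look like this (resetting the environment whenever an episode ends):

for _ in range(1000):
    # env.render()
    action = env.action_space.sample()  # random agent
    obs, reward, done, info = env.step(action)
    print(action)
    if done:
        obs = env.reset()

env.close()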