def test_doubleqagent(self):

        env = gym.make('CartPole-v0')

        num_features = env.observation_space.shape[0]
        num_actions = env.action_space.n

        model = Sequential()
        model.add(Dense(16, input_dim=num_features, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(units=num_actions, activation='linear'))
        model.compile(loss='mse', optimizer=rmsprop(lr=1e-3))

        agent = DoubleDeepQAgent(env=env,
                                 model=model,
                                 policy=BoltzmannPolicy(),
                                 memory=PrioritizedMemory(maxlen=50000),
                                 metrics=[
                                     EpisodeReturn(),
                                     RollingEpisodeReturn(),
                                     CumulativeReward(),
                                     EpisodeTime()
                                 ],
                                 gamma=0.99,
                                 max_steps_per_episode=500)

        import pickle

        s = agent.__getstate__()
        t0 = pickle.dumps(agent)
        t1 = pickle.loads(t0)

        agent = DoubleDeepQAgent(env=env,
                                 model=model,
                                 policy=EpsilonGreedyPolicy(min=0.05,
                                                            max=0.5,
                                                            decay=0.999),
                                 memory=PrioritizedMemory(maxlen=50000),
                                 metrics=[
                                     EpisodeReturn(),
                                     RollingEpisodeReturn(),
                                     CumulativeReward(),
                                     EpisodeTime()
                                 ],
                                 gamma=0.99,
                                 max_steps_per_episode=1000)

        s = agent.__getstate__()
        t0 = pickle.dumps(agent)
        t1 = pickle.loads(t0)
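
        # Minimal round-trip check (a sketch): pickle.loads should return an agent
        # of the same class; comparing `gamma` assumes the constructor argument is
        # stored under an attribute of the same name.
        assert isinstance(t1, DoubleDeepQAgent)
        assert t1.gamma == agent.gamma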

        pass
# Example #2
def build_agent_with_shaping():
    env = gym.make('MountainCar-v0')

    num_features = DoubleDeepQAgent._get_space_size(env.observation_space)
    num_actions = DoubleDeepQAgent._get_space_size(env.action_space)

    model = Sequential()
    model.add(Dense(16, input_dim=num_features, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(units=num_actions, activation='linear'))
    model.compile(loss='mse', optimizer=adam())

    agent = DoubleDeepQAgent(name='AgentWithShaping',
                             env=env,
                             model=model,
                             policy=EpsilonGreedyPolicy(min=0.05,
                                                        max=0.5,
                                                        decay=0.999),
                             memory=PrioritizedMemory(maxlen=50000),
                             metrics=[
                                 EpisodeReturn(),
                                 RollingEpisodeReturn(),
                                 CumulativeReward(),
                                 EpisodeTime()
                             ],
                             gamma=0.99,
                             max_steps_per_episode=1000)
    agent.preprocess_state = shape_reward  # reward-shaping hook; see the sketch after this function

    return agent
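
# `shape_reward` above is not defined in this snippet. A minimal sketch for
# MountainCar-v0, assuming the hook receives the raw observation and reward and
# returns both (the actual deeprl hook signature may differ):
def shape_reward(state, reward):
    # Encourage progress toward the flag: add the car's position plus a small
    # speed bonus on top of the environment's -1 step penalty.
    position, velocity = state[0], state[1]
    return state, reward + position + 10.0 * abs(velocity)
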
def build_agent(name):
    env = gym.make('LunarLander-v2')

    num_features = DoubleDeepQAgent._get_space_size(env.observation_space)
    num_actions = DoubleDeepQAgent._get_space_size(env.action_space)

    model = Sequential([
        Dense(64, input_dim=num_features, activation='relu'),
        Dense(64, activation='relu'),
        Dense(units=num_actions, activation='linear')
    ])
    model.compile(loss='mse', optimizer='sgd')

    agent = DoubleDeepQAgent(name=name,
                             env=env,
                             model=model,
                             policy=EpsilonGreedyPolicy(min=0.05,
                                                        max=0.5,
                                                        decay=0.999),
                             memory=PrioritizedMemory(maxlen=50000),
                             metrics=[
                                 EpisodeReturn(),
                                 RollingEpisodeReturn(),
                                 CumulativeReward(),
                                 EpisodeTime()
                             ],
                             gamma=0.99,
                             max_steps_per_episode=500)

    return agent
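
# Usage sketch for the builder above (episode counts are illustrative; the train
# and test arguments mirror the calls used later in this file):
# agent = build_agent('LunarLanderAgent')
# agent.train(target_model_update=1e-3, max_episodes=500, render_every_n=50)
# agent.test(num_episodes=10, render_every_n=1)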
set_seed(0)
env = gym.make('CartPole-v0')

num_features = env.observation_space.shape[0]
num_actions = env.action_space.n

model = Sequential()
model.add(Dense(16, input_dim=num_features, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(units=num_actions, activation='linear'))
model.compile(loss='mse', optimizer=rmsprop(lr=1e-3))
model.summary()  # summary() prints the table itself; no print() wrapper needed

from deeprl.memories import Memory

# PrioritizedMemory is an alternative replay buffer; this run uses the plain one.
# memory = PrioritizedMemory(maxlen=50000)
memory = Memory(maxlen=50000)
policy = BoltzmannPolicy()

agent = DoubleDeepQAgent(env=env,
                         model=model,
                         policy=policy,
                         memory=memory,
                         gamma=0.99,
                         metrics=[
                             EpisodeReturn(),
                             RollingEpisodeReturn(),
                             CumulativeReward(),
                             EpisodeTime()
                         ])
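
# Note: the tanh actor output and NoisyPolicy(..., clip=env.action_space) below
# assume a continuous (Box) action space. CartPole-v0 above is discrete, so a
# continuous environment (e.g. 'Pendulum-v0', an assumption rather than part of
# the original snippet) would be needed for the actor-critic agent.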
actor = Sequential([
    Dense(128, input_dim=num_features, activation='relu'),
    Dense(128, activation='relu'),
    Dense(units=num_actions, activation='tanh')
])
actor.compile(loss='mse',
              optimizer=rmsprop(lr=1e-4))  #optimizer=sgd(lr=1e-13))

critic_state_input = Input(shape=(num_features, ), name='critic_state_input')
critic_action_input = Input(shape=(num_actions, ), name='critic_action_input')
critic_merged_input = concatenate([critic_state_input, critic_action_input])
critic_h1 = Dense(128, activation='relu',
                  name='critic_h1')(critic_merged_input)
critic_h2 = Dense(128, activation='relu', name='critic_h2')(critic_h1)
critic_out = Dense(1, activation='linear', name='CriticOut')(critic_h2)
critic = Model(inputs=[critic_state_input, critic_action_input],
               outputs=[critic_out])
critic.compile(sgd(lr=1e-3, clipnorm=5.), 'mse')

memory = PrioritizedMemory(maxlen=1000000, sample_size=32)  # maxlen as an int (1e6 is a float)
agent = ActorCriticAgent(env=env,
                         actor=actor,
                         critic=critic,
                         memory=memory,
                         policy=NoisyPolicy(0.15, 0.5, clip=env.action_space),
                         max_steps_per_episode=500,
                         tb_path='tensorboard')

agent.train(max_episodes=10000, render_every_n=50, target_model_update=1e-4)
import deeprl.utils.metrics as metrics

import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)
if logger.handlers:
    logger.handlers[0].setLevel(logging.INFO)
#set_seed(0)


env = gym.make('LunarLander-v2')

num_features = env.observation_space.shape[0]
num_actions = env.action_space.n

model = Sequential([
    Dense(64, input_dim=num_features, activation='relu'),
    Dense(64, activation='relu'),
    Dense(units=num_actions, activation='linear')
])
model.compile(loss='mse', optimizer=rmsprop(lr=0.0016, decay=0.000001, clipnorm=1.))

memory = PrioritizedMemory(maxlen=50000, sample_size=32)
policy = BoltzmannPolicy()

agent = DoubleDeepQAgent(env=env, model=model, policy=policy, memory=memory, gamma=0.99,
                         max_steps_per_episode=500, tb_path='tensorboard')

agent.train(target_model_update=1e-3, max_episodes=500, render_every_n=50)
agent.test(num_episodes=10, render_every_n=1)