Example No. 1
def test_multi_ddpg_input():
    nb_actions = 2

    actor_observation_input1 = Input(shape=(2, 3), name='actor_observation_input1')
    actor_observation_input2 = Input(shape=(2, 4), name='actor_observation_input2')
    # Build the actor as a functional Model over both observation inputs.
    x = Concatenate()([actor_observation_input1, actor_observation_input2])
    x = Flatten()(x)
    x = Dense(nb_actions)(x)
    actor = Model(inputs=[actor_observation_input1, actor_observation_input2], outputs=x)

    action_input = Input(shape=(nb_actions,), name='action_input')
    critic_observation_input1 = Input(shape=(2, 3), name='critic_observation_input1')
    critic_observation_input2 = Input(shape=(2, 4), name='critic_observation_input2')
    x = Concatenate()([critic_observation_input1, critic_observation_input2])
    x = Concatenate()([action_input, Flatten()(x)])
    x = Dense(1)(x)
    critic = Model(inputs=[action_input, critic_observation_input1, critic_observation_input2], outputs=x)

    # MultiInputProcessor splits the list-style observation into the two model inputs.
    processor = MultiInputProcessor(nb_inputs=2)
    memory = SequentialMemory(limit=10, window_length=2)
    agent = DDPGAgent(actor=actor, critic=critic, critic_action_input=action_input, memory=memory,
                      nb_actions=2, nb_steps_warmup_critic=5, nb_steps_warmup_actor=5, batch_size=4,
                      processor=processor)
    agent.compile('sgd')
    agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10)
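# Note: MultiInputTestEnv comes from keras-rl's test utilities, and MultiInputProcessor simply
# routes a list-of-arrays observation to the model's separate inputs. The sketch below shows the
# kind of environment the test expects; the class name and body are assumptions for illustration,
# not keras-rl's actual implementation.

import gym
import numpy as np


class TwoPartObservationEnv(gym.Env):
    """Hypothetical environment whose observation is a list of two arrays
    with shapes (3,) and (4,), matching MultiInputProcessor(nb_inputs=2)."""

    def __init__(self, shapes=((3,), (4,))):
        self.shapes = shapes
        self.action_space = gym.spaces.Box(low=-1., high=1., shape=(2,))

    def _observe(self):
        # One array per model input; the processor forwards them to the
        # corresponding Input layers of the actor and critic.
        return [np.random.random(shape) for shape in self.shapes]

    def reset(self):
        return self._observe()

    def step(self, action):
        reward, done, info = np.random.random(), False, {}
        return self._observe(), reward, done, info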
Example No. 2
class DDPG(BaseAgent):
  def __init__(self, actor, critic, critic_action_input, processor, random_process, num_actions):
    # Replay memory
    memory = SequentialMemory(limit=opt.ddpg_replay_memory_size,
                              window_length=opt.ddpg_window_length)
    self.agent = DDPGAgent(actor=actor,
                           critic=critic,
                           critic_action_input=critic_action_input,
                           memory=memory,
                           nb_actions=num_actions,
                           processor=processor,
                           batch_size=opt.ddpg_batch_size,
                           nb_steps_warmup_actor=opt.ddpg_nb_steps_warmup_actor,
                           nb_steps_warmup_critic=opt.ddpg_nb_steps_warmup_critic,
                           target_model_update=opt.ddpg_target_model_update,
                           random_process=random_process,
                           train_interval=opt.ddpg_train_interval)
    self.agent.compile([keras.optimizers.Adam(lr=opt.ddpg_learning_rate_actor),
                        keras.optimizers.Adam(lr=opt.ddpg_learning_rate_critic)],
                       metrics=['mae'])

  def fit(self, env, num_steps, weights_path=None, visualize=False):
    callbacks = []
    if weights_path is not None:
      callbacks += [ModelIntervalCheckpoint(weights_path, interval=50000, verbose=1)]
    self.agent.fit(env=env,
                   nb_steps=num_steps,
                   action_repetition=opt.ddpg_action_repetition,
                   callbacks=callbacks,
                   log_interval=opt.log_interval,
                   test_interval=opt.test_interval,
                   test_nb_episodes=opt.test_nb_episodes,
                   test_action_repetition=opt.ddpg_action_repetition,
                   visualize=visualize,
                   test_visualize=visualize,
                   verbose=2)

  def test(self, env, num_episodes, visualize=False):
    self.agent.test(env=env,
                    nb_episodes=num_episodes,
                    action_repetition=opt.ddpg_action_repetition,
                    verbose=2,
                    visualize=visualize)

  def save(self, out_dir):
    self.agent.save_weights(out_dir, overwrite=True)

  def load(self, out_dir):
    self.agent.load_weights(out_dir)
Example No. 3
def train_agent(env, args):
    from src.Agents import create_ddpg_actor, create_ddpg_critic, ddpg_controls, EnvironmentWrapper
    from keras.optimizers import Adam

    from rl.agents.ddpg import DDPGAgent
    from rl.policy import EpsGreedyQPolicy
    from rl.memory import SequentialMemory
    from rl.random import OrnsteinUhlenbeckProcess

    env = EnvironmentWrapper(ddpg_controls, env)

    nb_actions = 3
    actor = create_ddpg_actor(env)
    critic, action_input = create_ddpg_critic(env)

    # Finally, configure and compile the agent. Any built-in Keras optimizer and metrics can be used.
    memory = SequentialMemory(limit=50000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                              theta=.15,
                                              mu=0.,
                                              sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      nb_steps_warmup_critic=2000,
                      nb_steps_warmup_actor=2000,
                      random_process=random_process,
                      gamma=.99,
                      target_model_update=1e-3)
    agent.compile(Adam(lr=0.5e-2, clipnorm=1.), metrics=['mae'])

    # Resume from previously saved weights, if any exist.
    try:
        agent.load_weights(args.ai_in)
    except OSError:
        pass

    # Okay, now it's time to learn something! Visualization is disabled here because it slows down
    # training considerably. You can always safely abort the training prematurely using Ctrl + C.
    agent.fit(env, nb_steps=20000, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    agent.save_weights(args.ai_out, overwrite=True)

    # Finally, evaluate our algorithm for a single episode.
    agent.test(env, nb_episodes=1, visualize=False)
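# The helpers create_ddpg_actor and create_ddpg_critic are project-specific and not shown above.
# A plausible sketch, modelled on keras-rl's standard DDPG pendulum example, is given below; the
# layer sizes, activations, and the (1,) + observation_space.shape input (matching window_length=1)
# are assumptions for illustration.

from keras.layers import Concatenate, Dense, Flatten, Input
from keras.models import Model, Sequential


def create_ddpg_actor(env, nb_actions=3):
    # Maps the windowed observation to actions in [-1, 1].
    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    actor.add(Dense(32, activation='relu'))
    actor.add(Dense(32, activation='relu'))
    actor.add(Dense(nb_actions, activation='tanh'))
    return actor


def create_ddpg_critic(env, nb_actions=3):
    # Q(s, a) head; returns both the model and its action input, which
    # DDPGAgent needs as critic_action_input.
    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape,
                              name='observation_input')
    x = Concatenate()([action_input, Flatten()(observation_input)])
    x = Dense(64, activation='relu')(x)
    x = Dense(1, activation='linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    return critic, action_input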
Example No. 4
def test_single_ddpg_input():
    nb_actions = 2

    actor = Sequential()
    actor.add(Flatten(input_shape=(2, 3)))
    actor.add(Dense(nb_actions))

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(2, 3), name='observation_input')
    x = Concatenate()([action_input, Flatten()(observation_input)])
    x = Dense(1)(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)

    memory = SequentialMemory(limit=10, window_length=2)
    agent = DDPGAgent(actor=actor, critic=critic, critic_action_input=action_input, memory=memory,
                      nb_actions=2, nb_steps_warmup_critic=5, nb_steps_warmup_actor=5, batch_size=4)
    agent.compile('sgd')
    agent.fit(MultiInputTestEnv((3,)), nb_steps=10)
Example No. 5
)

total_steps = 50000

if mode == 'train':

  if test_batch > 0:
    agent.load_weights('weights/{}{}_batch_{}_x_{}_params.h5f'.format(ENV_NAME, label, test_batch, total_steps))

  max_steps = 300 * ((test_batch / 2) + 1)
  if max_steps > 1000:
    max_steps = 1000

  agent.fit(
    env, nb_steps=total_steps, visualize=True, verbose=0, callbacks=[
      EpisodeBatchCallback(
        total_steps=total_steps, current_batch=test_batch
      ),
      # VisualizerIntervalCallback(4)
      # ModelIntervalCheckpoint('weights/{}{}_{}_params.h5f'.format(ENV_NAME, label, 0), 100000)
    ],
    # nb_max_episode_steps=max_steps
  )
  agent.save_weights('weights/{}{}_batch_{}_x_{}_params.h5f'.format(ENV_NAME, label, test_batch + 1, total_steps), overwrite=True)

if mode == 'test':

  agent.load_weights('weights/{}{}_batch_{}_x_{}_params.h5f'.format(ENV_NAME, label, test_batch, total_steps))

  agent.test(env, nb_episodes=20, visualize=True)
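# EpisodeBatchCallback is a custom callback that is not shown in this snippet. A minimal keras-rl
# callback with a similar shape is sketched below; the logging behaviour is an assumption for
# illustration, not the original implementation.

from rl.callbacks import Callback


class EpisodeBatchCallback(Callback):
    """Hypothetical sketch: reports per-episode progress for the current training batch."""

    def __init__(self, total_steps, current_batch):
        super(EpisodeBatchCallback, self).__init__()
        self.total_steps = total_steps
        self.current_batch = current_batch

    def on_episode_end(self, episode, logs={}):
        # keras-rl passes 'episode_reward' and the running 'nb_steps' in logs.
        print('batch {} - episode {}: reward={:.2f}, steps {}/{}'.format(
            self.current_batch, episode, logs.get('episode_reward', 0.0),
            logs.get('nb_steps', 0), self.total_steps))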
Example No. 6
x = Concatenate()([action_input, flattened_observation])
x = Dense(1, activation='linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

memory = SequentialMemory(limit=1000, window_length=WINDOW_LENGTH)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                          theta=.15,
                                          mu=0.,
                                          sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  nb_steps_warmup_critic=100,
                  nb_steps_warmup_actor=100,
                  random_process=random_process,
                  gamma=.99,
                  target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

agent.fit(train_env,
          nb_steps=1000,
          visualize=False,
          verbose=2,
          nb_max_episode_steps=100)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format("abc"), overwrite=True)
Example No. 7
class DDPGLearner(BaseAgent):
    def __init__(self, name, env, grayscale, width, height):
        super(DDPGLearner, self).__init__(name=name, env=env)

        self.nb_actions = env.available_actions
        self.abs_max_reward = env.abs_max_reward
        self.mission_name = env.mission_name

        self.grayscale = grayscale
        self.width = width
        self.height = height

        self.recurrent = False  # Use LSTM
        self.batch_size = 32
        self.window_length = 4

        if tf:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
            tensorflow_backend.set_session(session=sess)

        if not self.recurrent:
            self.actor, self.critic, self.action_input = Minecraft_DDPG(
                self.window_length, self.grayscale, self.width, self.height,
                self.nb_actions)
        else:
            self.actor, self.critic, self.action_input = Minecraft_DDPG_LSTM(
                self.window_length, self.grayscale, self.width, self.height,
                self.nb_actions)

        # Replay memory
        self.memory = SequentialMemory(limit=1000000,
                                       window_length=self.window_length)

        # Add random noise for exploration
        self.random_process = GaussianWhiteNoiseProcess(mu=0.0,
                                                        sigma=0.5,
                                                        size=self.nb_actions)
        '''
        # We can also generate exploration noise with different parameters for each action, e.g. if we want
        # the agent to be more likely to explore moving forward than backward. In that case, a list or tuple of
        # random processes, one per action, must be passed to the agent.
        # For example:

        self.random_process = []
        self.random_process.append(GaussianWhiteNoiseProcess(mu=1.5, sigma=1.0))  # For moving
        self.random_process.append(GaussianWhiteNoiseProcess(mu=0.0, sigma=1.0))  # For turning
        '''

        self.processor = MalmoProcessor(self.grayscale, self.window_length,
                                        self.recurrent, self.abs_max_reward)
        self.agent = DDPGAgent(actor=self.actor,
                               critic=self.critic,
                               critic_action_input=self.action_input,
                               nb_actions=self.nb_actions,
                               memory=self.memory,
                               batch_size=self.batch_size,
                               processor=self.processor,
                               random_process=self.random_process,
                               gamma=0.99,
                               nb_steps_warmup_actor=10000,
                               nb_steps_warmup_critic=10000,
                               target_model_update=1e-3)
        self.agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])

    def fit(self, env, nb_steps):
        weights_dir = 'weights/{}'.format(self.mission_name)
        if not os.path.exists(weights_dir):
            os.makedirs(weights_dir)
        weights_path = os.path.join(weights_dir, '{}'.format(self.name))
        callbacks = [
            ModelIntervalCheckpoint(weights_path, interval=10000, verbose=1)
        ]
        self.agent.fit(env,
                       nb_steps,
                       action_repetition=4,
                       callbacks=callbacks,
                       verbose=1,
                       log_interval=10000,
                       test_interval=10000,
                       test_nb_episodes=10,
                       test_action_repetition=4,
                       test_visualize=False)

    def test(self, env, nb_episodes):
        self.agent.test(env,
                        nb_episodes,
                        action_repetition=4,
                        callbacks=None,
                        verbose=1,
                        visualize=False)

    def save(self, out_dir):
        self.agent.save_weights(out_dir, overwrite=True)

    def load(self, out_dir):
        self.agent.load_weights(out_dir)
Example No. 8
                 nb_steps_warmup_actor=5000,
                 random_process=random_process,
                 gamma=0.9,
                 target_model_update=5e-3)

# .compile() is used to configure the model with losses and metrics.
# The learning rates of the actor and critic are passed as arguments below, respectively.
ddpg.compile([Adam(lr=5e-4), Adam(lr=5e-3)], metrics=['mae'])

# show the metrics of the model that can be analysed in graphs
print(ddpg.metrics_names)

# .fit() is used to train the DDPG model
# 3000 max steps specified by Christos Kouppas
# Train the agent for 1 million total steps and print the output with verbose=2 logging.
history = ddpg.fit(env, nb_steps=1000000, visualize=False, verbose=2)

# set up variables to store agent data from training to be used in graphs
history_dict = history.history
print(history_dict.keys())
episode_rew = history.history['episode_reward']
episode_steps = history.history['nb_episode_steps']
number_steps = history.history['nb_steps']

# The number of episodes completed within the 1 million step limit varies from run to run,
# so build the x-axis range for the graphs from the length of the recorded history.
num_of_eps = range(0, len(episode_rew))

print(num_of_eps)
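# The plotting itself is not included in the snippet. A minimal sketch using matplotlib is shown
# below; the figure layout is an assumption for illustration.

import matplotlib.pyplot as plt

# Plot episode reward and episode length against episode index.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
ax1.plot(num_of_eps, episode_rew)
ax1.set_ylabel('episode reward')
ax2.plot(num_of_eps, episode_steps)
ax2.set_ylabel('steps per episode')
ax2.set_xlabel('episode')
plt.show()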
Example No. 9
# agent
# With the actor, critic, memory, and random process defined, we're now ready to create a DDPG agent and hand it those objects.
# Keras-RL provides an agent class called DDPGAgent that we can use for this, as shown in the following code:

# nb_steps_warmup: determines how long we wait before we start doing experience replay, which, if you recall, is when we actually start training the network.
# This lets us build up enough experience to fill a proper minibatch.
# If you choose a value for this parameter that's smaller than your batch size, Keras-RL will sample with replacement.

# target_model_update: the Q function is recursive, and when the agent updates its network for Q(s, a), that update also impacts the prediction it will make for
# Q(s', a). This can make for a very unstable network. Most deep Q-network implementations address this by using a target network, which is a copy of the
# network that isn't trained directly but is instead refreshed from the online network every so often. The target_model_update parameter controls how often this happens.
ddpg = DDPGAgent(nb_actions=num_actions,
                 actor=model,
                 critic=critic,
                 critic_action_input=critic_action_input,
                 memory=memory,
                 nb_steps_warmup_critic=100,
                 nb_steps_warmup_actor=100,
                 random_process=random_process,
                 gamma=.99,
                 target_model_update=1e-3)
ddpg.compile(Adam(lr=1e-3, clipnorm=1.), metrics=['mae'])

ddpg.fit(env,
         nb_steps=50000,
         visualize=True,
         verbose=1,
         nb_max_episode_steps=200)

ddpg.test(env, nb_episodes=5, visualize=True)