Example No. 1
class DDPG(BaseAgent):
  def __init__(self, actor, critic, critic_action_input, processor, random_process, num_actions):
    # Replay memory
    memory = SequentialMemory(limit=opt.ddpg_replay_memory_size,
                              window_length=opt.ddpg_window_length)
    self.agent = DDPGAgent(actor=actor,
                           critic=critic,
                           critic_action_input=critic_action_input,
                           memory=memory,
                           nb_actions=num_actions,
                           processor=processor,
                           batch_size=opt.ddpg_batch_size,
                           nb_steps_warmup_actor=opt.ddpg_nb_steps_warmup_actor,
                           nb_steps_warmup_critic=opt.ddpg_nb_steps_warmup_critic,
                           target_model_update=opt.ddpg_target_model_update,
                           random_process=random_process,
                           train_interval=opt.ddpg_train_interval)
    self.agent.compile([keras.optimizers.Adam(lr=opt.ddpg_learning_rate_actor),
                        keras.optimizers.Adam(lr=opt.ddpg_learning_rate_critic)],
                       metrics=['mae'])

  def fit(self, env, num_steps, weights_path=None, visualize=False):
    callbacks = []
    if weights_path is not None:
      callbacks += [ModelIntervalCheckpoint(weights_path, interval=50000, verbose=1)]
    self.agent.fit(env=env,
                   nb_steps=num_steps,
                   action_repetition=opt.ddpg_action_repetition,
                   callbacks=callbacks,
                   log_interval=opt.log_interval,
                   test_interval=opt.test_interval,
                   test_nb_episodes=opt.test_nb_episodes,
                   test_action_repetition=opt.ddpg_action_repetition,
                   visualize=visualize,
                   test_visualize=visualize,
                   verbose=2)

  def test(self, env, num_episodes, visualize=False):
    self.agent.test(env=env,
                    nb_episodes=num_episodes,
                    action_repetition=opt.ddpg_action_repetition,
                    verbose=2,
                    visualize=visualize)

  def save(self, out_dir):
    self.agent.save_weights(out_dir, overwrite=True)

  def load(self, out_dir):
    self.agent.load_weights(out_dir)
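
The actor and critic networks, the processor, and the global `opt` hyper-parameter namespace this wrapper relies on are defined elsewhere in the project and are not shown here; the fit() call also forwards test_* arguments, which suggests an extended keras-rl fork. As a rough sketch of how the class might be driven, with every value below a hypothetical placeholder rather than the project's actual configuration:

# Hypothetical stand-in for the project's shared `opt` config (the real module would normally
# provide this); actor, critic, critic_action_input, processor, and env are assumed to exist,
# e.g. built along the lines of Example No. 2 below.
from argparse import Namespace
from rl.random import OrnsteinUhlenbeckProcess

opt = Namespace(ddpg_replay_memory_size=1000000,
                ddpg_window_length=1,
                ddpg_batch_size=32,
                ddpg_nb_steps_warmup_actor=1000,
                ddpg_nb_steps_warmup_critic=1000,
                ddpg_target_model_update=1e-3,
                ddpg_train_interval=1,
                ddpg_learning_rate_actor=1e-4,
                ddpg_learning_rate_critic=1e-3,
                ddpg_action_repetition=1,
                log_interval=10000,
                test_interval=10000,
                test_nb_episodes=5)

num_actions = env.action_space.shape[0]
random_process = OrnsteinUhlenbeckProcess(size=num_actions, theta=.15, mu=0., sigma=.3)

agent = DDPG(actor, critic, critic_action_input, processor, random_process, num_actions)
agent.fit(env, num_steps=500000, weights_path='weights/ddpg_checkpoint.h5f')
agent.save('weights/ddpg_final.h5f')
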
Example No. 2
def train_agent(env, args):
    from src.Agents import create_ddpg_actor, create_ddpg_critic, ddpg_controls, EnvironmentWrapper
    from keras.optimizers import Adam

    from rl.agents.ddpg import DDPGAgent
    from rl.policy import EpsGreedyQPolicy
    from rl.memory import SequentialMemory
    from rl.random import OrnsteinUhlenbeckProcess

    env = EnvironmentWrapper(ddpg_controls, env)

    nb_actions = 3
    actor = create_ddpg_actor(env)
    critic, action_input = create_ddpg_critic(env)

    # Finally, we configure and compile our agent. You can use any built-in Keras optimizer
    # and any of the built-in metrics.
    memory = SequentialMemory(limit=50000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                              theta=.15,
                                              mu=0.,
                                              sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      nb_steps_warmup_critic=2000,
                      nb_steps_warmup_actor=2000,
                      random_process=random_process,
                      gamma=.99,
                      target_model_update=1e-3)
    agent.compile(Adam(lr=0.5e-2, clipnorm=1.), metrics=['mae'])

    try:
        agent.load_weights(args.ai_in)
    except OSError:
        pass

    # Okay, now it's time to learn something! Visualization is disabled here (visualize=False)
    # because rendering slows down training quite a lot. You can always safely abort training
    # prematurely with Ctrl + C.
    agent.fit(env, nb_steps=20000, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    agent.save_weights(args.ai_out, overwrite=True)

    # Finally, evaluate our algorithm for a single episode.
    agent.test(env, nb_episodes=1, visualize=False)
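
The create_ddpg_actor and create_ddpg_critic helpers come from the project's src.Agents module and are not shown in this excerpt. A minimal sketch of what they might look like, assuming the EnvironmentWrapper exposes a gym-style observation_space and the same window_length=1 used by the SequentialMemory above (layer sizes are illustrative only):

from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Input, Concatenate


def create_ddpg_actor(env, nb_actions=3):
    # Maps a window of observations to a continuous action vector in [-1, 1].
    return Sequential([
        Flatten(input_shape=(1,) + env.observation_space.shape),
        Dense(64, activation='relu'),
        Dense(64, activation='relu'),
        Dense(nb_actions, activation='tanh'),
    ])


def create_ddpg_critic(env, nb_actions=3):
    # Maps an (observation window, action) pair to a scalar Q-value, and also returns
    # the action input tensor, which DDPGAgent needs as critic_action_input.
    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
    x = Concatenate()([Flatten()(observation_input), action_input])
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    q_value = Dense(1, activation='linear')(x)
    return Model(inputs=[observation_input, action_input], outputs=q_value), action_input
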
Example No. 3
# (the agent construction preceding this closing parenthesis is truncated in this excerpt)
)

total_steps = 50000

if mode == 'train':

  if test_batch > 0:
    agent.load_weights('weights/{}{}_batch_{}_x_{}_params.h5f'.format(ENV_NAME, label, test_batch, total_steps))

  max_steps = 300 * ((test_batch / 2) + 1)
  if max_steps > 1000:
    max_steps = 1000

  agent.fit(
    env, nb_steps=total_steps, visualize=True, verbose=0, callbacks=[
      EpisodeBatchCallback(
        total_steps=total_steps, current_batch=test_batch
      ),
      # VisualizerIntervalCallback(4)
      # ModelIntervalCheckpoint('weights/{}{}_{}_params.h5f'.format(ENV_NAME, label, 0), 100000)
    ],
    # nb_max_episode_steps=max_steps
  )
  agent.save_weights('weights/{}{}_batch_{}_x_{}_params.h5f'.format(ENV_NAME, label, test_batch + 1, total_steps), overwrite=True)

if mode == 'test':

  agent.load_weights('weights/{}{}_batch_{}_x_{}_params.h5f'.format(ENV_NAME, label, test_batch, total_steps))

  agent.test(env, nb_episodes=20, visualize=True)
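
EpisodeBatchCallback is project-specific and not defined in this excerpt. A minimal keras-rl callback with a similar role might look like the sketch below; it relies on the 'episode_reward' and 'nb_episode_steps' keys that keras-rl passes to on_episode_end, while the constructor arguments simply mirror the call above and are otherwise assumptions:

from rl.callbacks import Callback


class EpisodeBatchCallback(Callback):
    def __init__(self, total_steps, current_batch):
        super(EpisodeBatchCallback, self).__init__()
        self.total_steps = total_steps
        self.current_batch = current_batch

    def on_episode_end(self, episode, logs={}):
        # Print a compact per-episode summary for the current training batch.
        print('batch {} | episode {} | reward {:.2f} | steps {}'.format(
            self.current_batch, episode,
            logs.get('episode_reward', 0.0),
            logs.get('nb_episode_steps', 0)))
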
Example No. 4
class DDPGLearner(BaseAgent):
    def __init__(self, name, env, grayscale, width, height):
        super(DDPGLearner, self).__init__(name=name, env=env)

        self.nb_actions = env.available_actions
        self.abs_max_reward = env.abs_max_reward
        self.mission_name = env.mission_name

        self.grayscale = grayscale
        self.width = width
        self.height = height

        self.recurrent = False  # Use LSTM
        self.batch_size = 32
        self.window_length = 4

        if tf:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
            tensorflow_backend.set_session(session=sess)

        if not self.recurrent:
            self.actor, self.critic, self.action_input = Minecraft_DDPG(
                self.window_length, self.grayscale, self.width, self.height,
                self.nb_actions)
        else:
            self.actor, self.critic, self.action_input = Minecraft_DDPG_LSTM(
                self.window_length, self.grayscale, self.width, self.height,
                self.nb_actions)

        # Replay memory
        self.memory = SequentialMemory(limit=1000000,
                                       window_length=self.window_length)

        # Add random noise for exploration
        self.random_process = GaussianWhiteNoiseProcess(mu=0.0,
                                                        sigma=0.5,
                                                        size=self.nb_actions)
        '''
        # We can also generate exploration noise with different parameters for each action. This is useful when we
        # want the agent to explore some actions more aggressively than others, e.g. to be more likely to move
        # forward than backward. In that case, a list or tuple of random processes, one per action, must be passed
        # to the agent.
        # For example:

        self.random_process = []
        self.random_process.append(GaussianWhiteNoiseProcess(mu=1.5, sigma=1.0))  # For moving
        self.random_process.append(GaussianWhiteNoiseProcess(mu=0.0, sigma=1.0))  # For turning
        '''

        self.processor = MalmoProcessor(self.grayscale, self.window_length,
                                        self.recurrent, self.abs_max_reward)
        self.agent = DDPGAgent(actor=self.actor,
                               critic=self.critic,
                               critic_action_input=self.action_input,
                               nb_actions=self.nb_actions,
                               memory=self.memory,
                               batch_size=self.batch_size,
                               processor=self.processor,
                               random_process=self.random_process,
                               gamma=0.99,
                               nb_steps_warmup_actor=10000,
                               nb_steps_warmup_critic=10000,
                               target_model_update=1e-3)
        self.agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])

    def fit(self, env, nb_steps):
        weights_dir = 'weights/{}'.format(self.mission_name)
        if not os.path.exists(weights_dir):
            os.makedirs(weights_dir)
        weights_path = os.path.join(weights_dir, '{}'.format(self.name))
        callbacks = [
            ModelIntervalCheckpoint(weights_path, interval=10000, verbose=1)
        ]
        self.agent.fit(env,
                       nb_steps,
                       action_repetition=4,
                       callbacks=callbacks,
                       verbose=1,
                       log_interval=10000,
                       test_interval=10000,
                       test_nb_episodes=10,
                       test_action_repetition=4,
                       test_visualize=False)

    def test(self, env, nb_episodes):
        self.agent.test(env,
                        nb_episodes,
                        action_repetition=4,
                        callbacks=None,
                        verbose=1,
                        visualize=False)

    def save(self, out_dir):
        self.agent.save_weights(out_dir, overwrite=True)

    def load(self, out_dir):
        self.agent.load_weights(out_dir)
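
MalmoProcessor is project-specific and not shown here. As a rough sketch (the real preprocessing almost certainly differs), a keras-rl Processor for this kind of pixel-based setup could resize frames, optionally convert them to grayscale, and scale rewards using the known reward bound:

import numpy as np
from PIL import Image
from rl.core import Processor


class SimpleFrameProcessor(Processor):
    def __init__(self, grayscale, width, height, abs_max_reward):
        self.grayscale = grayscale
        self.width = width
        self.height = height
        self.abs_max_reward = abs_max_reward

    def process_observation(self, observation):
        # Resize the raw frame and optionally convert it to grayscale.
        img = Image.fromarray(observation).resize((self.width, self.height))
        if self.grayscale:
            img = img.convert('L')
        return np.array(img, dtype='uint8')

    def process_reward(self, reward):
        # Scale rewards into [-1, 1] using the environment's known reward bound.
        if self.abs_max_reward:
            return np.clip(reward / float(self.abs_max_reward), -1.0, 1.0)
        return reward

    def process_state_batch(self, batch):
        # Convert stored uint8 frames to floats in [0, 1] just before training.
        return batch.astype('float32') / 255.0
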
Example No. 5
############################# Print a 2D scatter graph

plt.scatter(episode_rew, episode_steps, c='r', alpha=0.5)
plt.title('A scatter graph of total steps taken with corresponding reward.')
plt.xlabel('Episode Reward')
plt.ylabel('Episode Steps')
plt.show()

plt.scatter(num_of_eps, episode_rew, c='b', alpha=0.5)
plt.title('A scatter graph of reward per episode.')
plt.xlabel('Episode')
plt.ylabel('Episode Reward')
plt.show()

#####################################################
############## Testing the model that has been trained
#####################################################
# The trained model is now tested for 10 episodes to see how well the learned policy performs.

print("Online Testing")
env.destroy()
time.sleep(1)
env2 = CartPoleVREPEnv(headless=True)
time.sleep(1)
ddpg.test(env2, nb_episodes=10, visualize=False, nb_max_episode_steps=3000)

# end simulation
print('simulation ended. leaving in 1 second...')
time.sleep(1)
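
The episode_rew, episode_steps, and num_of_eps arrays plotted above are built earlier in the original script and are not part of this excerpt. One plausible way to obtain them (a sketch, assuming a keras-rl agent named ddpg trained on env) is the Keras History object that fit() returns:

history = ddpg.fit(env, nb_steps=50000, visualize=False, verbose=2)

episode_rew = history.history['episode_reward']       # total reward per training episode
episode_steps = history.history['nb_episode_steps']   # steps taken in each episode
num_of_eps = range(1, len(episode_rew) + 1)           # episode index for the x-axis
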
Example No. 6
# agent
# With the actor, critic, memory, and random process defined, we're now ready to create the agent and hand it
# those objects. Keras-RL provides an agent class called DDPGAgent that we can use for this, as shown in the
# following code:

# nb_steps_warmup: Determines how long we wait before we start doing experience replay, which, if you recall,
# is when we actually start training the network. This lets us build up enough experience to form a proper
# minibatch. If you choose a value for this parameter that is smaller than your batch size, Keras-RL will
# sample with replacement.

# target_model_update: The Q function is recursive, and when the agent updates its network for Q(s, a), that
# update also affects the prediction it will make for Q(s', a). This can make for a very unstable network. The
# way most deep Q-network implementations address this limitation is by using a target network: a copy of the
# online network that isn't trained directly, but is instead refreshed from it every so often. The
# target_model_update parameter controls how this happens: in Keras-RL, a value >= 1 means a hard copy every
# that many steps, while a value < 1 performs a soft (Polyak) update with that coefficient (see the short
# update-rule sketch after this example).
ddpg = DDPGAgent(nb_actions=num_actions,
                 actor=model,
                 critic=critic,
                 critic_action_input=critic_action_input,
                 memory=memory,
                 nb_steps_warmup_critic=100,
                 nb_steps_warmup_actor=100,
                 random_process=random_process,
                 gamma=.99,
                 target_model_update=1e-3)
ddpg.compile(Adam(lr=1e-3, clipnorm=1.), metrics=['mae'])

ddpg.fit(env,
         nb_steps=50000,
         visualize=True,
         verbose=1,
         nb_max_episode_steps=200)

ddpg.test(env, nb_episodes=5, visualize=True)
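
For reference, the two target_model_update regimes described in the comments above boil down to the following update rules (a plain-Python sketch of the idea, not keras-rl source code):

# hard update: every int(target_model_update) training steps, copy the online weights verbatim
#   target_weights = online_weights
# soft (Polyak) update: every step, blend with tau = target_model_update < 1
#   target_weights = tau * online_weights + (1 - tau) * target_weights
def soft_update(target_weights, online_weights, tau):
    return [tau * w + (1.0 - tau) * tw for tw, w in zip(target_weights, online_weights)]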