Example #1
def generate_insurance_model(env=None,
                             lr=.0001,
                             memory_len=100,
                             target_model_update=.09):
    ins_actor = Sequential()
    ins_actor.add(Flatten(input_shape=(1, ) + (env.NUM_INSURANCES, 21)))
    ins_actor.add(Dense(NUM_HIDDEN_UNITS))
    ins_actor.add(Activation('relu'))
    ins_actor.add(Dense(NUM_HIDDEN_UNITS))
    ins_actor.add(Activation('relu'))
    ins_actor.add(Dense(NUM_HIDDEN_UNITS))
    ins_actor.add(Activation('relu'))
    ins_actor.add(Dense(1))
    ins_actor.add(Activation('softsign'))
    # print(ins_actor.summary())
    # print(ins_actor.layers[-1].activation)

    action_input = Input(shape=(1, ), name='action_input')
    observation_input = Input(shape=(1, ) + (env.NUM_INSURANCES, 21),
                              name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(NUM_HIDDEN_UNITS)(x)
    x = Activation('relu')(x)
    x = Dense(NUM_HIDDEN_UNITS)(x)
    x = Activation('relu')(x)
    x = Dense(NUM_HIDDEN_UNITS)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('softsign')(x)
    ins_critic = Model(inputs=[action_input, observation_input], outputs=x)
    # print(ins_critic.summary())

    ins_memory = SequentialMemory(limit=memory_len, window_length=1)
    # ins_random_process = OrnsteinUhlenbeckProcess(size=1, theta=.15, mu=0, sigma=.3)
    ins_random_process = GaussianWhiteNoiseProcess(mu=0,
                                                   sigma=0.2,
                                                   sigma_min=0.005,
                                                   n_steps_annealing=5000)
    # ins_random_process = None
    ins_agent = DDPGAgent(nb_actions=1,
                          actor=ins_actor,
                          critic=ins_critic,
                          critic_action_input=action_input,
                          memory=ins_memory,
                          nb_steps_warmup_critic=100,
                          nb_steps_warmup_actor=100,
                          random_process=ins_random_process,
                          gamma=.99,
                          target_model_update=target_model_update)
    # ins_agent.processor = MultiInputProcessor(3)
    ins_agent.compile(Adam(lr=lr, clipnorm=1.), metrics=['mae'])

    print(type(ins_agent))

    return ins_agent
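A minimal usage sketch for the factory above (assumptions: `env` is a Gym-style environment exposing `NUM_INSURANCES`, and the module-level constant `NUM_HIDDEN_UNITS` is defined; the calls below are the standard keras-rl fit/test API, not code from the original project):

ins_agent = generate_insurance_model(env=env, lr=1e-4, memory_len=100)
ins_agent.fit(env, nb_steps=10000, visualize=False, verbose=2)  # fill the replay memory, then train actor and critic
ins_agent.test(env, nb_episodes=5, visualize=False)             # evaluation rolls out the learned policy without exploration noise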
Example #2
class DDPG(BaseAgent):
  def __init__(self, actor, critic, critic_action_input, processor, random_process, num_actions):
    # Replay memory
    memory = SequentialMemory(limit=opt.ddpg_replay_memory_size,
                              window_length=opt.ddpg_window_length)
    self.agent = DDPGAgent(actor=actor,
                           critic=critic,
                           critic_action_input=critic_action_input,
                           memory=memory,
                           nb_actions=num_actions,
                           processor=processor,
                           batch_size=opt.ddpg_batch_size,
                           nb_steps_warmup_actor=opt.ddpg_nb_steps_warmup_actor,
                           nb_steps_warmup_critic=opt.ddpg_nb_steps_warmup_critic,
                           target_model_update=opt.ddpg_target_model_update,
                           random_process=random_process,
                           train_interval=opt.ddpg_train_interval)
    self.agent.compile([keras.optimizers.Adam(lr=opt.ddpg_learning_rate_actor),
                        keras.optimizers.Adam(lr=opt.ddpg_learning_rate_critic)],
                       metrics=['mae'])

  def fit(self, env, num_steps, weights_path=None, visualize=False):
    callbacks = []
    if weights_path is not None:
      callbacks += [ModelIntervalCheckpoint(weights_path, interval=50000, verbose=1)]
    self.agent.fit(env=env,
                   nb_steps=num_steps,
                   action_repetition=opt.ddpg_action_repetition,
                   callbacks=callbacks,
                   log_interval=opt.log_interval,
                   test_interval=opt.test_interval,
                   test_nb_episodes=opt.test_nb_episodes,
                   test_action_repetition=opt.ddpg_action_repetition,
                   visualize=visualize,
                   test_visualize=visualize,
                   verbose=2)

  def test(self, env, num_episodes, visualize=False):
    self.agent.test(env=env,
                    nb_episodes=num_episodes,
                    action_repetition=opt.dqn_action_repetition,
                    verbose=2,
                    visualize=visualize)

  def save(self, out_dir):
    self.agent.save_weights(out_dir, overwrite=True)

  def load(self, out_dir):
    self.agent.load_weights(out_dir)
Example #3
    def __init__(self, nb_actions, actor1, actor2, critic1, critic2, critic_action_input1, critic_action_input2,
                 memory1, memory2,
                 gamma=.99, batch_size=32, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                 train_interval=1, memory_interval=1, delta_range=None, delta_clip=np.inf,
                 random_process1=None, random_process2=None, custom_model_objects={}, target_model_update=.001,
                 **kwargs):

        super(CoopDDPG, self).__init__()

        self.agent1 = DDPGAgent(nb_actions, actor1, critic1, critic_action_input1, memory1, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process1, custom_model_objects, target_model_update,
                                **kwargs)
        self.agent2 = DDPGAgent(nb_actions, actor2, critic2, critic_action_input2, memory2, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process2, custom_model_objects, target_model_update,
                                **kwargs)
Example #4
 def __init__(self, actor, critic, critic_action_input, processor, random_process, num_actions):
   # Replay memory
   memory = SequentialMemory(limit=opt.ddpg_replay_memory_size,
                             window_length=opt.ddpg_window_length)
   self.agent = DDPGAgent(actor=actor,
                          critic=critic,
                          critic_action_input=critic_action_input,
                          memory=memory,
                          nb_actions=num_actions,
                          processor=processor,
                          batch_size=opt.ddpg_batch_size,
                          nb_steps_warmup_actor=opt.ddpg_nb_steps_warmup_actor,
                          nb_steps_warmup_critic=opt.ddpg_nb_steps_warmup_critic,
                          target_model_update=opt.ddpg_target_model_update,
                          random_process=random_process,
                          train_interval=opt.ddpg_train_interval)
   self.agent.compile([keras.optimizers.Adam(lr=opt.ddpg_learning_rate_actor),
                       keras.optimizers.Adam(lr=opt.ddpg_learning_rate_critic)],
                      metrics=['mae'])
Example #5
def test_multi_ddpg_input():
    nb_actions = 2

    actor_observation_input1 = Input(shape=(2, 3), name='actor_observation_input1')
    actor_observation_input2 = Input(shape=(2, 4), name='actor_observation_input2')
    x = Concatenate()([actor_observation_input1, actor_observation_input2])
    x = Flatten()(x)
    x = Dense(nb_actions)(x)
    actor = Model(inputs=[actor_observation_input1, actor_observation_input2], outputs=x)

    action_input = Input(shape=(nb_actions,), name='action_input')
    critic_observation_input1 = Input(shape=(2, 3), name='critic_observation_input1')
    critic_observation_input2 = Input(shape=(2, 4), name='critic_observation_input2')
    x = Concatenate()([critic_observation_input1, critic_observation_input2])
    x = Concatenate()([action_input, Flatten()(x)])
    x = Dense(1)(x)
    critic = Model(inputs=[action_input, critic_observation_input1, critic_observation_input2], outputs=x)

    processor = MultiInputProcessor(nb_inputs=2)
    memory = SequentialMemory(limit=10, window_length=2)
    agent = DDPGAgent(actor=actor, critic=critic, critic_action_input=action_input, memory=memory,
                      nb_actions=2, nb_steps_warmup_critic=5, nb_steps_warmup_actor=5, batch_size=4,
                      processor=processor)
    agent.compile('sgd')
    agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10)
Example #6
def test_single_ddpg_input():
    nb_actions = 2

    actor = Sequential()
    actor.add(Flatten(input_shape=(2, 3)))
    actor.add(Dense(nb_actions))

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(2, 3), name='observation_input')
    x = Concatenate()([action_input, Flatten()(observation_input)])
    x = Dense(1)(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)

    memory = SequentialMemory(limit=10, window_length=2)
    agent = DDPGAgent(actor=actor, critic=critic, critic_action_input=action_input, memory=memory,
                      nb_actions=2, nb_steps_warmup_critic=5, nb_steps_warmup_actor=5, batch_size=4)
    agent.compile('sgd')
    agent.fit(MultiInputTestEnv((3,)), nb_steps=10)
Example #7
File: ddpg.py  Project: vzhuang/vinci
env = populate_env(gym.make("MountainCarContinuous-v0"))

# Build the actor and the critic
actor = simple_actor(env)
critic = simple_critic(env)

# Memory
memory = SimpleMemory(env=env, limit=1000000)

# Noise
random_process = OrnsteinUhlenbeckProcess(
    size=env.action_space.dim, theta=.15, mu=0., sigma=3.)

# Agent
agent = DDPGAgent(
    actor=actor,
    critic=critic,
    env=env,
    memory=memory,
    random_process=random_process
)
agent.compile()

agent.train(
    env=env,
    nb_episodes=1,
    visualize=False,
    verbose=2,
    nb_max_episode_steps=200,
    plots=False)
Example #8
    def __init__(self, name, env, grayscale, width, height):
        super(DDPGLearner, self).__init__(name=name, env=env)

        self.nb_actions = env.available_actions
        self.abs_max_reward = env.abs_max_reward
        self.mission_name = env.mission_name

        self.grayscale = grayscale
        self.width = width
        self.height = height

        self.recurrent = False  # Use LSTM
        self.batch_size = 32
        self.window_length = 4

        if tf:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
            tensorflow_backend.set_session(session=sess)

        if not self.recurrent:
            self.actor, self.critic, self.action_input = Minecraft_DDPG(
                self.window_length, self.grayscale, self.width, self.height,
                self.nb_actions)
        else:
            self.actor, self.critic, self.action_input = Minecraft_DDPG_LSTM(
                self.window_length, self.grayscale, self.width, self.height,
                self.nb_actions)

        # Replay memory
        self.memory = SequentialMemory(limit=1000000,
                                       window_length=self.window_length)

        # Add random noise for exploration
        self.random_process = GaussianWhiteNoiseProcess(mu=0.0,
                                                        sigma=0.5,
                                                        size=self.nb_actions)
        '''
        # We can also generate exploration noise with different parameters for each action, e.g. because we may
        # want the agent to be more likely to explore moving forward than backward. In that case, a list or tuple of
        # random processes, one for each action, must be passed to the agent.
        # For example:

        self.random_process = []
        self.random_process.append(GaussianWhiteNoiseProcess(mu=1.5, sigma=1.0))  # For moving
        self.random_process.append(GaussianWhiteNoiseProcess(mu=0.0, sigma=1.0))  # For turning
        '''

        self.processor = MalmoProcessor(self.grayscale, self.window_length,
                                        self.recurrent, self.abs_max_reward)
        self.agent = DDPGAgent(actor=self.actor,
                               critic=self.critic,
                               critic_action_input=self.action_input,
                               nb_actions=self.nb_actions,
                               memory=self.memory,
                               batch_size=self.batch_size,
                               processor=self.processor,
                               random_process=self.random_process,
                               gamma=0.99,
                               nb_steps_warmup_actor=10000,
                               nb_steps_warmup_critic=10000,
                               target_model_update=1e-3)
        self.agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])
Example #9
class DDPGLearner(BaseAgent):
    def __init__(self, name, env, grayscale, width, height):
        super(DDPGLearner, self).__init__(name=name, env=env)

        self.nb_actions = env.available_actions
        self.abs_max_reward = env.abs_max_reward
        self.mission_name = env.mission_name

        self.grayscale = grayscale
        self.width = width
        self.height = height

        self.recurrent = False  # Use LSTM
        self.batch_size = 32
        self.window_length = 4

        if tf:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
            tensorflow_backend.set_session(session=sess)

        if not self.recurrent:
            self.actor, self.critic, self.action_input = Minecraft_DDPG(
                self.window_length, self.grayscale, self.width, self.height,
                self.nb_actions)
        else:
            self.actor, self.critic, self.action_input = Minecraft_DDPG_LSTM(
                self.window_length, self.grayscale, self.width, self.height,
                self.nb_actions)

        # Replay memory
        self.memory = SequentialMemory(limit=1000000,
                                       window_length=self.window_length)

        # Add random noise for exploration
        self.random_process = GaussianWhiteNoiseProcess(mu=0.0,
                                                        sigma=0.5,
                                                        size=self.nb_actions)
        '''
        # We can also generate exploration noise with different parameters for each action, e.g. because we may
        # want the agent to be more likely to explore moving forward than backward. In that case, a list or tuple of
        # random processes, one for each action, must be passed to the agent.
        # For example:

        self.random_process = []
        self.random_process.append(GaussianWhiteNoiseProcess(mu=1.5, sigma=1.0))  # For moving
        self.random_process.append(GaussianWhiteNoiseProcess(mu=0.0, sigma=1.0))  # For turning
        '''

        self.processor = MalmoProcessor(self.grayscale, self.window_length,
                                        self.recurrent, self.abs_max_reward)
        self.agent = DDPGAgent(actor=self.actor,
                               critic=self.critic,
                               critic_action_input=self.action_input,
                               nb_actions=self.nb_actions,
                               memory=self.memory,
                               batch_size=self.batch_size,
                               processor=self.processor,
                               random_process=self.random_process,
                               gamma=0.99,
                               nb_steps_warmup_actor=10000,
                               nb_steps_warmup_critic=10000,
                               target_model_update=1e-3)
        self.agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])

    def fit(self, env, nb_steps):
        weights_dir = 'weights/{}'.format(self.mission_name)
        if not os.path.exists(weights_dir):
            os.makedirs(weights_dir)
        weights_path = os.path.join(weights_dir, '{}'.format(self.name))
        callbacks = [
            ModelIntervalCheckpoint(weights_path, interval=10000, verbose=1)
        ]
        self.agent.fit(env,
                       nb_steps,
                       action_repetition=4,
                       callbacks=callbacks,
                       verbose=1,
                       log_interval=10000,
                       test_interval=10000,
                       test_nb_episodes=10,
                       test_action_repetition=4,
                       test_visualize=False)

    def test(self, env, nb_episodes):
        self.agent.test(env,
                        nb_episodes,
                        action_repetition=4,
                        callbacks=None,
                        verbose=1,
                        visualize=False)

    def save(self, out_dir):
        self.agent.save_weights(out_dir, overwrite=True)

    def load(self, out_dir):
        self.agent.load_weights(out_dir)
Example #10
File: main.py  Project: idthanm/h-DDPG
    (step_length * plan_horizon) * 5
)  # episode length / (times per action * min v)

# turn left agent
left_processor = WhiteningNormalizerProcessor()
left_memory = SequentialMemory(limit=MEMORY_LIMIT, window_length=WINDOW_LENGTH)
left_random_process = OrnsteinUhlenbeckProcess(size=lower_nb_actions,
                                               theta=RANDOM_PROCESS_THETA,
                                               mu=RANDOM_PROCESS_MU,
                                               sigma=RANDOM_PROCESS_SIGMA)
left_agent = DDPGAgent(processor=left_processor,
                       nb_actions=lower_nb_actions,
                       actor=left_actor_model,
                       critic=left_critic_model,
                       critic_action_input=critic_action_input,
                       memory=left_memory,
                       nb_steps_warmup_critic=NB_STEPS_WARMUP_CRITIC,
                       nb_steps_warmup_actor=NB_STEPS_WARMUP_ACTOR,
                       random_process=left_random_process,
                       gamma=GAMMA,
                       target_model_update=TARGET_MODEL_UPDATE,
                       batch_size=BATCH_SIZE_LOWER)
left_agent.compile(Adam(lr=OPTIMIZER_LR, clipnorm=OPTIMIZER_CLIPNORM),
                   metrics=['mae'])

# go straight agent
straight_processor = WhiteningNormalizerProcessor()
straight_memory = SequentialMemory(limit=MEMORY_LIMIT,
                                   window_length=WINDOW_LENGTH)
straight_random_process = OrnsteinUhlenbeckProcess(size=lower_nb_actions,
                                                   theta=RANDOM_PROCESS_THETA,
                                                   mu=RANDOM_PROCESS_MU,
Example #11
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(32, activation='relu')(x)
x = Dense(32, activation='tanh')(x)
x = Dense(32, activation='relu')(x)
x = Dense(1, activation='linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# memory = EpisodeParameterMemory(limit=1000000, window_length=1)
memory = SequentialMemory(limit=1000000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=400, nb_steps_warmup_actor=400,
                  random_process=random_process, gamma=.99, target_model_update=1e-3)
agent.compile(
  # Adam(lr=.001, clipnorm=1.),
  RMSprop(centered=True),
  metrics=['mae']
)

total_steps = 50000

if mode == 'train':

  if test_batch > 0:
    agent.load_weights('weights/{}{}_batch_{}_x_{}_params.h5f'.format(ENV_NAME, label, test_batch, total_steps))

  max_steps = 300 * ((test_batch / 2) + 1)
Example #12
    for experiment in my_expe.experiments(5):
        # Get the environment
        # And populate it with useful metadata
        env = populate_env(gym.make("MountainCarContinuous-v0"))

        # Build the actor and the critic
        actor = simple_actor(env)
        critic = simple_critic(env)

        # Memory
        memory = SimpleMemory(env=env, limit=1000000)

        # Noise
        random_process = OrnsteinUhlenbeckProcess(size=env.action_space.dim,
                                                  theta=.15,
                                                  mu=0.,
                                                  sigma=3.)

        # Agent
        agent = DDPGAgent(
            actor=actor,
            critic=critic,
            env=env,
            memory=memory,
            random_process=random_process,
            experiment=experiment,
        )
        agent.compile()

        agent.train(episodes=1)
Example #13
class CoopActionOtherDDPG(Agent):  # Two agents that can each observe the other's output (based on the Keras-RL agent impl.)

    def forward(self, observation):
        raise NotImplementedError

    def backward(self, reward, terminal):
        raise NotImplementedError

    def load_weights(self, filepath):
        raise NotImplementedError

    def save_weights(self, filepath, overwrite=False):
        raise NotImplementedError

    @property
    def layers(self):
        raise NotImplementedError

    def __init__(self, nb_actions, actor1, actor2, critic1, critic2, critic_action_input1, critic_action_input2,
                 memory1, memory2,
                 gamma=.99, batch_size=32, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                 train_interval=1, memory_interval=1, delta_range=None, delta_clip=np.inf,
                 random_process1=None, random_process2=None, custom_model_objects={}, target_model_update=.001,
                 **kwargs):

        super(CoopActionOtherDDPG, self).__init__()

        self.agent1 = DDPGAgent(nb_actions, actor1, critic1, critic_action_input1, memory1, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process1, custom_model_objects, target_model_update,
                                **kwargs)
        self.agent2 = DDPGAgent(nb_actions, actor2, critic2, critic_action_input2, memory2, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process2, custom_model_objects, target_model_update,
                                **kwargs)

    def compile(self, optimizer, metrics=[]):
        self.agent1.compile(clone_optimizer(optimizer), deepcopy(metrics))
        self.agent2.compile(clone_optimizer(optimizer), deepcopy(metrics))

    def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
            nb_max_episode_steps=None):
        """Trains the agent on the given environment.
        # Arguments
            env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.
        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
        if not (self.agent1.compiled and self.agent2.compiled):
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        assert self.processor is None  # Removed processors here for simplification. Not needed anyway
        assert nb_max_start_steps == 0  # Removed here for simplification. Not needed anyway
        assert action_repetition == 1  # Removed here for simplification. Not needed anyway

        self.agent1.training = True
        self.agent2.training = True

        experience_for_plotting = deque()

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self.agent1._on_train_begin()
        self.agent2._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.agent1.step = np.int16(0)
        self.agent2.step = np.int16(0)
        observation1 = observation2 = None
        episode_reward1 = None
        episode_reward2 = None
        episode_step = None
        did_abort = False
        try:
            while self.agent1.step < nb_steps:  # not individual for now
                if observation1 is None or observation2 is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward1 = np.float32(0)
                    episode_reward2 = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.agent1.reset_states()
                    self.agent2.reset_states()
                    obs = env.reset()
                    observation1 = deepcopy(obs) + (0.,)
                    observation2 = deepcopy(obs) + (0.,)

                # At this point, we expect to be fully initialized.
                assert episode_reward1 is not None
                assert episode_reward2 is not None
                assert episode_step is not None
                assert observation1 is not None
                assert observation2 is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action1 = np.ndarray.item(self.agent1.forward(observation1))
                action2 = np.ndarray.item(self.agent2.forward(observation2))
                action = (action1, action2)
                reward1 = np.float32(0)
                reward2 = np.float32(0)
                accumulated_info = {}
                done = False

                callbacks.on_action_begin(action)  # Use only one of the actions? added actions?
                obs, r, done, info = env.step(action)
                if done:
                    raise AttributeError  # The episode was reset unexpectedly
                    # (see https://stackoverflow.com/questions/42787924/)

                observation1 = deepcopy(obs) + (info["u2_clipped"],)  # Add action other to the observation
                observation2 = deepcopy(obs) + (info["u1_clipped"],)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward1 += info["r1"]
                reward2 += info["r2"]

                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics1 = self.agent1.backward(reward1, terminal=done)
                metrics2 = self.agent2.backward(reward2, terminal=done)
                episode_reward1 += reward1
                episode_reward2 += reward2

                step_logs = {
                    'action': action[0] + action[1],
                    'observation': observation1,
                    'reward': reward1 + reward2,
                    'metrics': metrics1,  # not individual for now
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.agent1.step += 1
                self.agent2.step += 1

                if len(obs) == 2:
                    experience_for_plotting.append((info["t"], obs, (info["u1_clipped"], info["u2_clipped"]), (0., 0.),
                                                    r, (info["r1"], info["r2"])))

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.agent1.forward(observation1)
                    self.agent2.forward(observation2)
                    self.agent1.backward(0., terminal=False)
                    self.agent2.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward1 + episode_reward2,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.agent1.step,  # not individual for now
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation1 = None
                    observation2 = None
                    episode_step = None
                    episode_reward1 = None
                    episode_reward2 = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self.agent1._on_train_end()
        self.agent2._on_train_end()

        return experience_for_plotting
Example #14
class CoopDDPG(Agent):  # Two agents that cannot observe each other's output (based on the Keras-RL agent impl.)


    def forward(self, observation):
        raise NotImplementedError

    def backward(self, reward, terminal):
        raise NotImplementedError

    def load_weights(self, filepath):
        raise NotImplementedError

    def save_weights(self, filepath, overwrite=False):
        raise NotImplementedError

    @property
    def layers(self):
        raise NotImplementedError

    def __init__(self, nb_actions, actor1, actor2, critic1, critic2, critic_action_input1, critic_action_input2,
                 memory1, memory2,
                 gamma=.99, batch_size=32, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                 train_interval=1, memory_interval=1, delta_range=None, delta_clip=np.inf,
                 random_process1=None, random_process2=None, custom_model_objects={}, target_model_update=.001,
                 **kwargs):

        super(CoopDDPG, self).__init__()

        self.agent1 = DDPGAgent(nb_actions, actor1, critic1, critic_action_input1, memory1, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process1, custom_model_objects, target_model_update,
                                **kwargs)
        self.agent2 = DDPGAgent(nb_actions, actor2, critic2, critic_action_input2, memory2, gamma, batch_size,
                                nb_steps_warmup_critic, nb_steps_warmup_actor, train_interval, memory_interval,
                                delta_range, delta_clip, random_process2, custom_model_objects, target_model_update,
                                **kwargs)

    def compile(self, optimizer, metrics=[]):
        self.agent1.compile(clone_optimizer(optimizer), deepcopy(metrics))
        self.agent2.compile(clone_optimizer(optimizer), deepcopy(metrics))

    def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000,
            nb_max_episode_steps=None):

        if not (self.agent1.compiled and self.agent2.compiled):
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        self.agent1.training = True
        self.agent2.training = True

        callbacks = [] if not callbacks else callbacks[:]

        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self.agent1._on_train_begin()
        self.agent2._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.agent1.step = np.int16(0)
        self.agent2.step = np.int16(0)
        observation = None
        episode_reward1 = None
        episode_reward2 = None
        episode_step = None
        did_abort = False
        try:
            while self.agent1.step < nb_steps:  # not individual for now
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward1 = np.float32(0)
                    episode_reward2 = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.agent1.reset_states()
                    self.agent2.reset_states()
                    observation = deepcopy(env.reset())
                    if self.agent1.processor is not None:  # not individual for now
                        observation = self.agent1.processor.process_observation(observation)
                    assert observation is not None

                    # Perform random starts at beginning of episode and do not record them into the experience.
                    # This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.agent1.processor is not None:  # not individual for now. action is not from agent anyway
                            action = self.agent1.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.agent1.processor is not None:
                            observation, reward, done, info = self.agent1.processor.process_step(observation, reward,
                                                                                                 done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. '
                                'You should probably lower the `nb_max_start_steps` parameter.'.format(
                                    nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.agent1.processor is not None:
                                observation = self.agent1.processor.process_observation(observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward1 is not None
                assert episode_reward2 is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action1 = self.agent1.forward(observation)
                action2 = self.agent2.forward(observation)
                if self.agent1.processor is not None:
                    action1 = self.agent1.processor.process_action(action1)
                if self.agent2.processor is not None:
                    action2 = self.agent2.processor.process_action(action2)
                action = (np.ndarray.item(action1), np.ndarray.item(action2))
                reward1 = np.float32(0)
                reward2 = np.float32(0)
                reward = np.float32(0)
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)  # Use only one of the actions? added actions?
                    observation, r, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.agent1.processor is not None:
                        observation, r, done, info = self.agent1.processor.process_step(observation, r, done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward1 += info["r1"]
                    reward2 += info["r2"]
                    reward += info["r1"] + info["r2"]
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics1 = self.agent1.backward(reward1, terminal=done)
                metrics2 = self.agent2.backward(reward2, terminal=done)
                episode_reward1 += reward1
                episode_reward2 += reward2

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics1,  # not individual for now
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.agent1.step += 1
                self.agent2.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.agent1.forward(observation)
                    self.agent2.forward(observation)
                    self.agent1.backward(0., terminal=False)
                    self.agent2.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward1 + episode_reward2,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.agent1.step,  # not individual for now
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward1 = None
                    episode_reward2 = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self.agent1._on_train_end()
        self.agent2._on_train_end()


        return history
Example #15
x = Activation('tanh')(x)

critic = Model(inputs=[action_input, observation_input], outputs=x)

print(critic.summary())

#######################CRITIC-------END######################################

# Create the DDPG agent from the actor and critic models defined above, setting the update rate for the target networks and the discount factor gamma.
# gamma = 0.9 is the discount factor applied to future rewards.
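# For intuition: a reward received k steps in the future is weighted by gamma**k, so with
# gamma = 0.9 a reward arriving 10 steps ahead contributes only about 0.9**10 ≈ 0.35 of its
# immediate value to the return.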
ddpg = DDPGAgent(nb_actions=nb_actions,
                 actor=actor,
                 critic=critic,
                 critic_action_input=action_input,
                 memory=memory,
                 batch_size=32,
                 nb_steps_warmup_critic=5000,
                 nb_steps_warmup_actor=5000,
                 random_process=random_process,
                 gamma=0.9,
                 target_model_update=5e-3)

# .compile() configures the model with its losses and metrics.
# The learning rates of the actor and the critic are passed as arguments below, respectively.
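# (Keras-RL unpacks a two-element optimizer list as [actor_optimizer, critic_optimizer],
# i.e. the first optimizer drives the actor and the second drives the critic.)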
ddpg.compile([Adam(lr=5e-4), Adam(lr=5e-3)], metrics=['mae'])

# show the metrics of the model that can be analysed in graphs
print(ddpg.metrics_names)

# .fit() is used to train the DDPG model
# 3000 max steps specified by Christos Kouppas
Example #16
x = Concatenate()([action_input, flattened_observation])
x = Dense(1, activation='linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

memory = SequentialMemory(limit=1000, window_length=WINDOW_LENGTH)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                          theta=.15,
                                          mu=0.,
                                          sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  nb_steps_warmup_critic=100,
                  nb_steps_warmup_actor=100,
                  random_process=random_process,
                  gamma=.99,
                  target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

agent.fit(train_env,
          nb_steps=1000,
          visualize=False,
          verbose=2,
          nb_max_episode_steps=100)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format("abc"), overwrite=True)
Example #17
def train_agent(env, args):
    from src.Agents import create_ddpg_actor, create_ddpg_critic, ddpg_controls, EnvironmentWrapper
    from keras.optimizers import Adam

    from rl.agents.ddpg import DDPGAgent
    from rl.memory import SequentialMemory
    from rl.random import OrnsteinUhlenbeckProcess

    env = EnvironmentWrapper(ddpg_controls, env)

    nb_actions = 3
    actor = create_ddpg_actor(env)
    critic, action_input = create_ddpg_critic(env)

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=50000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                              theta=.15,
                                              mu=0.,
                                              sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      nb_steps_warmup_critic=2000,
                      nb_steps_warmup_actor=2000,
                      random_process=random_process,
                      gamma=.99,
                      target_model_update=1e-3)
    agent.compile(Adam(lr=0.5e-2, clipnorm=1.), metrics=['mae'])

    try:
        agent.load_weights(args.ai_in)
    except OSError:
        pass

    # Okay, now it's time to learn something! Visualizing the training would slow it down quite a lot,
    # so we keep visualize=False here. You can always safely abort the training prematurely using
    # Ctrl + C.
    agent.fit(env, nb_steps=20000, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    agent.save_weights(args.ai_out, overwrite=True)

    # Finally, evaluate our algorithm for a single episode.
    agent.test(env, nb_episodes=1, visualize=False)
Example #18
    env = populate_env(gym.make("MountainCarContinuous-v0"))

    # Build the actor and the critic
    actor = simple_actor(env)
    critic = simple_critic(env)

    # Memory
    memory = SimpleMemory(env=env, limit=1000000)

    # Noise
    random_process = OrnsteinUhlenbeckProcess(size=env.action_space.dim,
                                              theta=.15,
                                              mu=0.,
                                              sigma=3.)

    # Agent
    agent = DDPGAgent(experiment=experiment,
                      actor=actor,
                      critic=critic,
                      env=env,
                      memory=memory,
                      random_process=random_process)
    agent.compile()

    agent.train(env=env,
                episodes=10,
                render=True,
                verbosity=2,
                nb_max_episode_steps=1000,
                plots=False)
Example #19
# agent
# With the actor, critic, memory, and random process defined, we're now ready to create the agent and hand it those objects.
# Keras-RL provides an agent class called DDPGAgent that we can use for this, as shown in the following code:

# nb_steps_warmup: Determines how long we wait before we start doing experience replay, which, if you recall, is when we actually start training the network.
# This lets us build up enough experience to assemble a proper minibatch.
# If you choose a value for this parameter that's smaller than your batch size, Keras-RL will sample with replacement.

# target_model_update: The Q function is recursive, and when the agent updates its network for Q(s, a), that update also affects the prediction it will make for
# Q(s', a). This can make training very unstable. The way most implementations address this is by using a target network: a copy of the online network that isn't
# trained directly, but is instead refreshed with the online weights every so often. The target_model_update parameter controls how (and how often) this happens.
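# Concretely, Keras-RL interprets the value in two ways (the numbers below are only illustrative):
#   target_model_update = 1e-3   -> soft update on every training step: target_w <- 1e-3 * online_w + (1 - 1e-3) * target_w
#   target_model_update = 10000  -> hard update: the online weights are copied into the target network every 10000 steps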
ddpg = DDPGAgent(nb_actions=num_actions,
                 actor=model,
                 critic=critic,
                 critic_action_input=critic_action_input,
                 memory=memory,
                 nb_steps_warmup_critic=100,
                 nb_steps_warmup_actor=100,
                 random_process=random_process,
                 gamma=.99,
                 target_model_update=1e-3)
ddpg.compile(Adam(lr=1e-3, clipnorm=1.), metrics=['mae'])

ddpg.fit(env,
         nb_steps=50000,
         visualize=True,
         verbose=1,
         nb_max_episode_steps=200)

ddpg.test(env, nb_episodes=5, visualize=True)