def compile(self, optimizer, metrics=None):
        if metrics is None:
            metrics = []
        metrics += [mean_q]  # register default metrics

        def clipped_masked_error(args):
            y_true, y_pred, mask = args
            loss = huber_loss(y_true, y_pred, self.delta_clip)
            loss *= mask  # apply element-wise mask
            return K.sum(loss, axis=-1)

        # Create trainable model. The problem is that we need to mask the output since we only
        # ever want to update the Q values for a certain action. The way we achieve this is by
        # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility
        # to mask out certain parameters by passing in multiple inputs to the Lambda layer.
        y_pred = self.model.output
        y_true = Input(name='y_true', shape=(self.nb_actions,))
        mask = Input(name='mask', shape=(self.nb_actions,))
        loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_pred, y_true, mask])
        ins = [self.model.input] if not isinstance(self.model.input, list) else self.model.input
        trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss_out, y_pred])
        assert len(trainable_model.output_names) == 2
        combined_metrics = {trainable_model.output_names[1]: metrics}
        losses = [
            lambda y_true, y_pred: y_pred,  # loss is computed in Lambda layer
            lambda y_true, y_pred: K.zeros_like(y_pred),  # we only include this for the metrics
        ]
        trainable_model.compile(optimizer=optimizer, loss=losses, metrics=combined_metrics)
        self.trainable_model = trainable_model

        self.compiled = True
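The `mask` input is what makes the Lambda-layer trick work at training time: it is a one-hot batch that zeroes the Huber loss for every action except the one that was actually taken. A minimal sketch of how such a batch could be assembled (variable names here are illustrative, not part of keras-rl's public API):

import numpy as np

batch_size, nb_actions = 32, 4
actions = np.random.randint(nb_actions, size=batch_size)  # actions taken
Rs = np.random.random(batch_size)                         # Bellman targets

targets = np.zeros((batch_size, nb_actions), dtype='float32')
masks = np.zeros((batch_size, nb_actions), dtype='float32')
for idx, (action, R) in enumerate(zip(actions, Rs)):
    targets[idx][action] = R  # only the chosen action's Q value gets a target
    masks[idx][action] = 1.   # the loss is zeroed everywhere else

# The Lambda output is the loss itself, so its "target" is a dummy:
dummy_targets = np.zeros((batch_size,), dtype='float32')
# trainable_model.train_on_batch(ins_batch + [targets, masks],
#                                [dummy_targets, targets])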
Example #2
def test_naf_layer_diag():
    batch_size = 2
    for nb_actions in (1, 3):
        # Construct single model with NAF as the only layer, hence it is fully deterministic
        # since no weights are used, which would be randomly initialized.
        L_flat_input = Input(shape=(nb_actions, ))
        mu_input = Input(shape=(nb_actions, ))
        action_input = Input(shape=(nb_actions, ))
        x = NAFLayer(nb_actions,
                     mode='diag')([L_flat_input, mu_input, action_input])
        model = Model(inputs=[L_flat_input, mu_input, action_input], outputs=x)
        model.compile(loss='mse', optimizer='sgd')

        # Create random test data.
        L_flat = np.random.random((batch_size, nb_actions)).astype('float32')
        mu = np.random.random((batch_size, nb_actions)).astype('float32')
        action = np.random.random((batch_size, nb_actions)).astype('float32')

        # Perform reference computations in numpy since these are much easier to verify.
        P = np.zeros((batch_size, nb_actions, nb_actions)).astype('float32')
        for p, l_flat in zip(P, L_flat):
            p[np.diag_indices(nb_actions)] = l_flat
        A_ref = np.array([
            np.dot(np.dot(a - m, p), a - m) for a, m, p in zip(action, mu, P)
        ]).astype('float32')
        A_ref *= -.5

        # Finally, compute the output of the net, which should be identical to the previously
        # computed reference.
        A_net = model.predict([L_flat, mu, action]).flatten()
        assert_allclose(A_net, A_ref, rtol=1e-5)
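        # Equivalent vectorized check: the per-sample loop above is the same
        # A = -0.5 * (a - mu)^T P (a - mu) computation, collapsed into one einsum.
        diff = action - mu
        A_vec = -.5 * np.einsum('bi,bij,bj->b', diff, P, diff)
        assert_allclose(A_vec, A_ref, rtol=1e-5)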
Example #3
    def compile(self, optimizer, metrics=None):
        if metrics is None:
            metrics = []
        metrics += [mean_q]  # register default metrics

        # Create target V model. We don't need targets for mu or L.
        self.target_V_model = clone_model(self.V_model, self.custom_model_objects)
        self.target_V_model.compile(optimizer='sgd', loss='mse')

        # Build combined model.
        a_in = Input(shape=(self.nb_actions,), name='action_input')
        if isinstance(self.V_model.input, list):
            observation_shapes = [i._keras_shape[1:] for i in self.V_model.input]
        else:
            observation_shapes = [self.V_model.input._keras_shape[1:]]
        os_in = [Input(shape=shape, name='observation_input_{}'.format(idx)) for idx, shape in enumerate(observation_shapes)]
        L_out = self.L_model([a_in] + os_in)
        V_out = self.V_model(os_in)

        mu_out = self.mu_model(os_in)
        A_out = NAFLayer(self.nb_actions, mode=self.covariance_mode)([L_out, mu_out, a_in])
        combined_out = Lambda(lambda x: x[0] + x[1], output_shape=lambda x: x[0])([A_out, V_out])
        combined = Model(inputs=[a_in] + os_in, outputs=[combined_out])
        # Compile combined model.
        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            updates = get_soft_target_model_updates(self.target_V_model, self.V_model, self.target_model_update)
            optimizer = AdditionalUpdatesOptimizer(optimizer, updates)

        def clipped_error(y_true, y_pred):
            return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

        combined.compile(loss=clipped_error, optimizer=optimizer, metrics=metrics)
        self.combined_model = combined

        self.compiled = True
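When `target_model_update < 1.`, keras-rl interprets it as a soft-update rate tau and folds the target update into the optimizer step via `AdditionalUpdatesOptimizer`. The update rule itself is simple Polyak averaging; a minimal numpy sketch (illustrative only, the real helper is `get_soft_target_model_updates`):

import numpy as np

def soft_update(target_weights, source_weights, tau=1e-3):
    """target <- tau * source + (1 - tau) * target, applied per weight tensor."""
    return [tau * w + (1. - tau) * tw
            for tw, w in zip(target_weights, source_weights)]

With a small tau such as 1e-3, the target network trails the online network slowly, which keeps the bootstrapped targets stable.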
Example #4
    def compile(self, optimizer, metrics=None):
        if metrics is None:
            metrics = []
        metrics += [mean_q]  # register default metrics

        # We never train the target model, hence we can set the optimizer and loss arbitrarily.
        self.target_model = clone_model(self.model, self.custom_model_objects)
        self.target_model.compile(optimizer='sgd', loss='mse')
        self.model.compile(optimizer='sgd', loss='mse')

        # Compile model.
        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            updates = get_soft_target_model_updates(self.target_model,
                                                    self.model,
                                                    self.target_model_update)
            optimizer = AdditionalUpdatesOptimizer(optimizer, updates)

        def clipped_masked_error(args):
            y_true, y_pred, mask = args
            loss = huber_loss(y_true, y_pred, self.delta_clip)
            loss *= mask  # apply element-wise mask
            return K.sum(loss, axis=-1)

        # Create trainable model. The problem is that we need to mask the output since we only
        # ever want to update the Q values for a certain action. The way we achieve this is by
        # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility
        # to mask out certain parameters by passing in multiple inputs to the Lambda layer.
        y_pred = self.model.output
        y_true = Input(name='y_true', shape=(self.nb_actions, ))
        mask = Input(name='mask', shape=(self.nb_actions, ))
        loss_out = Lambda(clipped_masked_error,
                          output_shape=(1, ),
                          name='loss')([y_pred, y_true, mask])
        ins = [self.model.input] if not isinstance(
            self.model.input, list) else self.model.input
        trainable_model = Model(inputs=ins + [y_true, mask],
                                outputs=[loss_out, y_pred])
        assert len(trainable_model.output_names) == 2
        combined_metrics = {trainable_model.output_names[1]: metrics}
        losses = [
            lambda y_true, y_pred: y_pred,  # loss is computed in Lambda layer
            lambda y_true, y_pred: K.zeros_like(y_pred),  # we only include this for the metrics
        ]
        trainable_model.compile(optimizer=optimizer,
                                loss=losses,
                                metrics=combined_metrics)
        self.trainable_model = trainable_model

        self.compiled = True
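The targets fed to this model come from the usual Bellman backup; a self-contained numpy sketch of the vanilla (non-double) DQN target computation, assuming the target network's next-state Q values are already predicted:

import numpy as np

gamma = .99
rewards = np.array([1., 0., -1.], dtype='float32')
terminals = np.array([0., 0., 1.], dtype='float32')   # 1 = episode ended
q_next = np.random.random((3, 4)).astype('float32')   # target-model output
# Terminal transitions get no bootstrapped value:
Rs = rewards + gamma * np.max(q_next, axis=1) * (1. - terminals)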
Example #5
def test_single_continuous_dqn_input():
    nb_actions = 2

    V_model = Sequential()
    V_model.add(Flatten(input_shape=(2, 3)))
    V_model.add(Dense(1))

    mu_model = Sequential()
    mu_model.add(Flatten(input_shape=(2, 3)))
    mu_model.add(Dense(nb_actions))

    L_input = Input(shape=(2, 3))
    L_input_action = Input(shape=(nb_actions, ))
    x = concatenate([Flatten()(L_input), L_input_action])
    x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x)
    L_model = Model(inputs=[L_input_action, L_input], outputs=x)

    memory = SequentialMemory(limit=10, window_length=2)
    agent = NAFAgent(nb_actions=nb_actions,
                     V_model=V_model,
                     L_model=L_model,
                     mu_model=mu_model,
                     memory=memory,
                     nb_steps_warmup=5,
                     batch_size=4)
    agent.compile('sgd')
    agent.fit(MultiInputTestEnv((3, )), nb_steps=10)
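The `(nb_actions * nb_actions + nb_actions) // 2` output size is not arbitrary: it is the number of entries in a lower-triangular nb_actions x nb_actions matrix, which the NAF layer consumes as the Cholesky factor L of P = L L^T in 'full' covariance mode. A quick sanity check:

import numpy as np

nb_actions = 2
n_tril = (nb_actions * nb_actions + nb_actions) // 2
assert n_tril == len(np.tril_indices(nb_actions)[0])  # 3 entries for n = 2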
Example #6
    def __init__(self, model, policy=None, test_policy=None, enable_double_dqn=True, enable_dueling_network=False,
                 dueling_type='avg', *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        # Validate (important) input.
        if hasattr(model.output, '__len__') and len(model.output) > 1:
            raise ValueError('Model "{}" has more than one output. DQN expects a model that has a single output.'.format(model))
        if model.output._keras_shape != (None, self.nb_actions):
            raise ValueError('Model output "{}" has invalid shape. DQN expects a model that has one dimension for each action, in this case {}.'.format(model.output, self.nb_actions))

        # Parameters.
        self.enable_double_dqn = enable_double_dqn
        self.enable_dueling_network = enable_dueling_network
        self.dueling_type = dueling_type
        if self.enable_dueling_network:
            # Get the second-to-last layer of the model; the final layer is discarded.
            layer = model.layers[-2]
            nb_action = model.output._keras_shape[-1]
            # Layer y has shape (nb_action + 1,):
            #   y[:, 0]  represents V(s; theta)
            #   y[:, 1:] represents A(s, a; theta)
            y = Dense(nb_action + 1, activation='linear')(layer.output)
            # Calculate Q(s, a; theta):
            # dueling_type == 'avg'
            # Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-Avg_a(A(s,a;theta)))
            # dueling_type == 'max'
            # Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-max_a(A(s,a;theta)))
            # dueling_type == 'naive'
            # Q(s,a;theta) = V(s;theta) + A(s,a;theta)
            if self.dueling_type == 'avg':
                outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.mean(a[:, 1:], axis=1, keepdims=True), output_shape=(nb_action,))(y)
            elif self.dueling_type == 'max':
                outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.max(a[:, 1:], axis=1, keepdims=True), output_shape=(nb_action,))(y)
            elif self.dueling_type == 'naive':
                outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:], output_shape=(nb_action,))(y)
            else:
                raise ValueError("dueling_type must be one of {'avg', 'max', 'naive'}")

            model = Model(inputs=model.input, outputs=outputlayer)

        # Related objects.
        self.model = model
        if policy is None:
            policy = EpsGreedyQPolicy()
        if test_policy is None:
            test_policy = GreedyQPolicy()
        self.policy = policy
        self.test_policy = test_policy
        
        # State.
        self.reset_states()

        self.X = []
        self.Y = []
        self.x = []
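The three dueling variants differ only in how the advantage stream is centered before being added to the state value. A small numpy sketch of the aggregation (with the per-sample mean over actions, matching the axis=1 fix above):

import numpy as np

y = np.random.random((2, 5)).astype('float32')    # columns: [V, A_1..A_4]
V, A = y[:, :1], y[:, 1:]
Q_avg = V + (A - A.mean(axis=1, keepdims=True))   # dueling_type == 'avg'
Q_max = V + (A - A.max(axis=1, keepdims=True))    # dueling_type == 'max'
Q_naive = V + A                                   # dueling_type == 'naive'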
Example #7
def test_cdqn():
    # TODO: replace this with a simpler environment where we can actually test if it finds a solution
    env = gym.make('Pendulum-v0')
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.shape[0]

    V_model = Sequential()
    V_model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    V_model.add(Dense(16))
    V_model.add(Activation('relu'))
    V_model.add(Dense(1))

    mu_model = Sequential()
    mu_model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    mu_model.add(Dense(16))
    mu_model.add(Activation('relu'))
    mu_model.add(Dense(nb_actions))

    action_input = Input(shape=(nb_actions, ), name='action_input')
    observation_input = Input(shape=(1, ) + env.observation_space.shape,
                              name='observation_input')
    x = concatenate([action_input, Flatten()(observation_input)])
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x)
    L_model = Model(inputs=[action_input, observation_input], outputs=x)

    memory = SequentialMemory(limit=1000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                              mu=0.,
                                              sigma=.3,
                                              size=nb_actions)
    agent = NAFAgent(nb_actions=nb_actions,
                     V_model=V_model,
                     L_model=L_model,
                     mu_model=mu_model,
                     memory=memory,
                     nb_steps_warmup=50,
                     random_process=random_process,
                     gamma=.99,
                     target_model_update=1e-3)
    agent.compile(Adam(lr=1e-3))

    agent.fit(env,
              nb_steps=400,
              visualize=False,
              verbose=0,
              nb_max_episode_steps=100)
    h = agent.test(env,
                   nb_episodes=2,
                   visualize=False,
                   nb_max_episode_steps=100)
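The Ornstein-Uhlenbeck process used here produces temporally correlated, mean-reverting exploration noise, which suits continuous controls better than white noise. A minimal sketch of one step of the process as it is commonly discretized (keras-rl's implementation additionally supports sigma annealing):

import numpy as np

theta, mu, sigma, dt = .15, 0., .3, 1e-2
def ou_step(x_prev):
    drift = theta * (mu - x_prev) * dt
    diffusion = sigma * np.sqrt(dt) * np.random.randn(*x_prev.shape)
    return x_prev + drift + diffusion

x = ou_step(np.zeros(1))  # successive samples stay close to their predecessor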
Example #8
def test_multi_continuous_dqn_input():
    nb_actions = 2

    V_input1 = Input(shape=(2, 3))
    V_input2 = Input(shape=(2, 4))
    x = concatenate([V_input1, V_input2])
    x = Flatten()(x)
    x = Dense(1)(x)
    V_model = Model(inputs=[V_input1, V_input2], outputs=x)

    mu_input1 = Input(shape=(2, 3))
    mu_input2 = Input(shape=(2, 4))
    x = concatenate([mu_input1, mu_input2])
    x = Flatten()(x)
    x = Dense(nb_actions)(x)
    mu_model = Model(inputs=[mu_input1, mu_input2], outputs=x)

    L_input1 = Input(shape=(2, 3))
    L_input2 = Input(shape=(2, 4))
    L_input_action = Input(shape=(nb_actions, ))
    x = concatenate([L_input1, L_input2])
    x = concatenate([Flatten()(x), L_input_action])
    x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x)
    L_model = Model(inputs=[L_input_action, L_input1, L_input2], outputs=x)

    memory = SequentialMemory(limit=10, window_length=2)
    processor = MultiInputProcessor(nb_inputs=2)
    agent = NAFAgent(nb_actions=nb_actions,
                     V_model=V_model,
                     L_model=L_model,
                     mu_model=mu_model,
                     memory=memory,
                     nb_steps_warmup=5,
                     batch_size=4,
                     processor=processor)
    agent.compile('sgd')
    agent.fit(MultiInputTestEnv([(3, ), (4, )]), nb_steps=10)
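`MultiInputProcessor` exists because the replay memory stores each sample as a window of per-step observation tuples, while the model wants one batched array per input. Roughly, it regroups the nesting like this (a hedged sketch matching the shapes in the test above):

import numpy as np

# 4 samples, window of 2, two observation components of shapes (3,) and (4,):
batch = [[(np.zeros(3), np.zeros(4)) for _ in range(2)] for _ in range(4)]
per_input = [np.array([[step[i] for step in sample] for sample in batch])
             for i in range(2)]
assert per_input[0].shape == (4, 2, 3) and per_input[1].shape == (4, 2, 4)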
Example #9
def test_ddpg():
    # TODO: replace this with a simpler environment where we can actually test if it finds a solution
    env = gym.make('Pendulum-v0')
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.shape[0]

    actor = Sequential()
    actor.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))

    action_input = Input(shape=(nb_actions, ), name='action_input')
    observation_input = Input(shape=(1, ) + env.observation_space.shape,
                              name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = concatenate([action_input, flattened_observation])
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)

    memory = SequentialMemory(limit=1000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      nb_steps_warmup_critic=50,
                      nb_steps_warmup_actor=50,
                      random_process=random_process,
                      gamma=.99,
                      target_model_update=1e-3)
    agent.compile([Adam(lr=1e-3), Adam(lr=1e-3)])

    agent.fit(env,
              nb_steps=400,
              visualize=False,
              verbose=0,
              nb_max_episode_steps=100)
    h = agent.test(env,
                   nb_episodes=2,
                   visualize=False,
                   nb_max_episode_steps=100)
Example #10
def build_model(env, config: Config):
    """Build a simple fully connected Q network for the given environment.

    :param gym.wrappers.time_limit.TimeLimit env:
    :param Config config:
    :return: an uncompiled Keras model mapping observations to Q values
    """
    in_x = x = Input(shape=(config.window_length, ) +
                     env.observation_space.shape)
    x = Flatten()(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(env.action_space.n, activation="linear")(x)
    model = Model(inputs=in_x, outputs=x)
    return model
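A model built this way plugs straight into a DQNAgent, mirroring the earlier examples. A hedged usage sketch (the `config.window_length` field is taken from the signature above; everything else is illustrative):

model = build_model(env, config)
memory = SequentialMemory(limit=10000, window_length=config.window_length)
agent = DQNAgent(model=model, nb_actions=env.action_space.n, memory=memory,
                 nb_steps_warmup=100, target_model_update=1e-2)
agent.compile(Adam(lr=1e-3), metrics=['mae'])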
Example #11
def build_model(env, config):
    """Build a fully connected Q network with configurable hidden layer sizes.

    :param gym.core.Env env:
    :param Config config:
    :return: an uncompiled Keras model mapping observations to Q values
    """
    n_dims = [64, 32]

    in_x = x = Input(shape=(config.window_length, ) +
                     env.observation_space.shape)
    x = Flatten()(x)
    for n_dim in n_dims:
        x = Dense(n_dim, activation="relu")(x)
    x = Dense(env.action_space.n, activation="linear")(x)
    model = Model(inputs=in_x, outputs=x)
    return model
Example #12
def build_model(env, config):
    """Build a Q network from stacked residual Dense blocks.

    :param gym.core.Env env:
    :param Config config:
    :return: an uncompiled Keras model mapping observations to Q values
    """
    n_dim = 64
    n_layer = 2

    in_x = x = Input(shape=(config.window_length, ) +
                     env.observation_space.shape)
    x = Flatten()(x)
    x = Dense(n_dim, activation="relu")(x)
    for _ in range(n_layer):
        x = add_resnet(x, n_dim)
    x = Dense(env.action_space.n, activation="linear")(x)
    model = Model(inputs=in_x, outputs=x)
    return model
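`add_resnet` is not defined in this snippet. A plausible shape for it, purely as a hypothetical sketch, is a two-layer Dense block with an additive skip connection:

from keras.layers import Dense, Activation, add

def add_resnet(x, n_dim):
    """Hypothetical residual block: two Dense layers plus a skip connection."""
    shortcut = x
    x = Dense(n_dim, activation="relu")(x)
    x = Dense(n_dim)(x)          # no activation before the merge
    x = add([shortcut, x])
    return Activation("relu")(x)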
Example #13
def test_multi_dqn_input():
    input1 = Input(shape=(2, 3))
    input2 = Input(shape=(2, 4))
    x = concatenate([input1, input2])
    x = Flatten()(x)
    x = Dense(2)(x)
    model = Model(inputs=[input1, input2], outputs=x)

    memory = SequentialMemory(limit=10, window_length=2)
    processor = MultiInputProcessor(nb_inputs=2)
    for double_dqn in (True, False):
        agent = DQNAgent(model,
                         memory=memory,
                         nb_actions=2,
                         nb_steps_warmup=5,
                         batch_size=4,
                         processor=processor,
                         enable_double_dqn=double_dqn)
        agent.compile('sgd')
        agent.fit(MultiInputTestEnv([(3, ), (4, )]), nb_steps=10)
mu_model.add(Dense(nb_actions))
mu_model.add(Activation('linear'))
print(mu_model.summary())

action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
x = concatenate([action_input, Flatten()(observation_input)])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense((nb_actions * nb_actions + nb_actions) // 2)(x)
x = Activation('linear')(x)
L_model = Model(inputs=[action_input, observation_input], outputs=x)
print(L_model.summary())

# Finally, we configure and compile our agent. You can use any built-in Keras
# optimizer and any built-in Keras metrics.
processor = PendulumProcessor()
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3, size=nb_actions)
agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model,
                 memory=memory, nb_steps_warmup=100, random_process=random_process,
                 gamma=.99, target_model_update=1e-3, processor=processor)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
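# (Illustrative continuation, following keras-rl's pendulum NAF example;
# the exact step counts are assumptions.)
agent.fit(env, nb_steps=150000, visualize=True, verbose=1,
          nb_max_episode_steps=200)

# After training, save the weights and evaluate for a few episodes.
agent.save_weights('naf_pendulum_weights.h5f', overwrite=True)
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)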