def compile(self, optimizer, metrics=[]):
    metrics += [mean_q]  # register default metrics

    def clipped_masked_error(args):
        y_true, y_pred, mask = args
        loss = huber_loss(y_true, y_pred, self.delta_clip)
        loss *= mask  # apply element-wise mask
        return K.sum(loss, axis=-1)

    # Create trainable model. The problem is that we need to mask the output since we only
    # ever want to update the Q values for a certain action. The way we achieve this is by
    # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility
    # to mask out certain parameters by passing in multiple inputs to the Lambda layer.
    y_pred = self.model.output
    y_true = Input(name='y_true', shape=(self.nb_actions,))
    mask = Input(name='mask', shape=(self.nb_actions,))
    loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_pred, y_true, mask])
    ins = [self.model.input] if type(self.model.input) is not list else self.model.input
    trainable_model = Model(input=ins + [y_true, mask], output=[loss_out, y_pred])
    assert len(trainable_model.output_names) == 2
    combined_metrics = {trainable_model.output_names[1]: metrics}
    losses = [
        lambda y_true, y_pred: y_pred,  # loss is computed in Lambda layer
        lambda y_true, y_pred: K.zeros_like(y_pred),  # we only include this for the metrics
    ]
    trainable_model.compile(optimizer=optimizer, loss=losses, metrics=combined_metrics)
    self.trainable_model = trainable_model

    self.compiled = True
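For orientation, here is a minimal numpy sketch of how a trainable model built this way is typically fed during a training step. It is an illustration only, not the agent's actual backward() code; the batch size, target values and the commented train_on_batch call are assumptions.

import numpy as np

batch_size, nb_actions = 32, 4
actions = np.random.randint(nb_actions, size=batch_size)   # actions taken in the sampled batch
Rs = np.random.random(batch_size)                          # bootstrapped targets, e.g. r + gamma * max_a' Q_target(s', a')

targets = np.zeros((batch_size, nb_actions), dtype='float32')  # fed into the `y_true` input
masks = np.zeros((batch_size, nb_actions), dtype='float32')    # fed into the `mask` input
for idx, (action, R) in enumerate(zip(actions, Rs)):
    targets[idx][action] = R   # only the taken action contributes to the loss
    masks[idx][action] = 1.    # every other Q value is masked out

dummy_targets = np.zeros((batch_size,), dtype='float32')
# The real loss is computed inside the Lambda layer, so both outputs only receive dummies:
# trainable_model.train_on_batch(state_batch + [targets, masks], [dummy_targets, targets])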
def test_naf_layer_diag():
    batch_size = 2
    for nb_actions in (1, 3):
        # Construct single model with NAF as the only layer, hence it is fully deterministic
        # since no weights are used, which would be randomly initialized.
        L_flat_input = Input(shape=(nb_actions,))
        mu_input = Input(shape=(nb_actions,))
        action_input = Input(shape=(nb_actions,))
        x = NAFLayer(nb_actions, mode='diag')([L_flat_input, mu_input, action_input])
        model = Model(input=[L_flat_input, mu_input, action_input], output=x)
        model.compile(loss='mse', optimizer='sgd')

        # Create random test data.
        L_flat = np.random.random((batch_size, nb_actions)).astype('float32')
        mu = np.random.random((batch_size, nb_actions)).astype('float32')
        action = np.random.random((batch_size, nb_actions)).astype('float32')

        # Perform reference computations in numpy since these are much easier to verify.
        P = np.zeros((batch_size, nb_actions, nb_actions)).astype('float32')
        for p, l_flat in zip(P, L_flat):
            p[np.diag_indices(nb_actions)] = l_flat
        A_ref = np.array([np.dot(np.dot(a - m, p), a - m) for a, m, p in zip(action, mu, P)]).astype('float32')
        A_ref *= -.5

        # Finally, compute the output of the net, which should be identical to the previously
        # computed reference.
        A_net = model.predict([L_flat, mu, action]).flatten()
        assert_allclose(A_net, A_ref, rtol=1e-5)
def compile(self, optimizer, metrics=[]):
    metrics += [mean_q]  # register default metrics

    # Create target V model. We don't need targets for mu or L.
    self.target_V_model = clone_model(self.V_model, self.custom_model_objects)
    self.target_V_model.compile(optimizer='sgd', loss='mse')

    # Build combined model.
    a_in = Input(shape=(self.nb_actions,), name='action_input')
    if type(self.V_model.input) is list:
        observation_shapes = [i._keras_shape[1:] for i in self.V_model.input]
    else:
        observation_shapes = [self.V_model.input._keras_shape[1:]]
    os_in = [Input(shape=shape, name='observation_input_{}'.format(idx)) for idx, shape in enumerate(observation_shapes)]
    L_out = self.L_model([a_in] + os_in)
    V_out = self.V_model(os_in)
    mu_out = self.mu_model(os_in)
    A_out = NAFLayer(self.nb_actions, mode=self.covariance_mode)([L_out, mu_out, a_in])
    combined_out = Lambda(lambda x: x[0] + x[1], output_shape=lambda x: x[0])([A_out, V_out])
    combined = Model(input=[a_in] + os_in, output=[combined_out])

    # Compile combined model.
    if self.target_model_update < 1.:
        # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
        updates = get_soft_target_model_updates(self.target_V_model, self.V_model, self.target_model_update)
        optimizer = AdditionalUpdatesOptimizer(optimizer, updates)

    def clipped_error(y_true, y_pred):
        return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

    combined.compile(loss=clipped_error, optimizer=optimizer, metrics=metrics)
    self.combined_model = combined

    self.compiled = True
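A small numpy illustration of what the combined model computes for the diagonal covariance mode: Q(s, a) = A(s, a) + V(s) with A(s, a) = -0.5 * (a - mu)^T P (a - mu), mirroring the reference computation in test_naf_layer_diag above. The numbers are made up.

import numpy as np

nb_actions = 3
l_flat = np.array([0.5, 1.0, 2.0])   # diagonal entries of P produced by the L network
mu = np.array([0.1, -0.2, 0.3])      # greedy action proposed by the mu network
V = 1.25                             # state value from the V network
a = np.array([0.0, 0.0, 0.0])        # action whose Q value we want

P = np.diag(l_flat)
A = -0.5 * np.dot(np.dot(a - mu, P), a - mu)
Q = A + V                            # this is what combined_out returns for (s, a)
print(A, Q)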
def compile(self, optimizer, metrics=None):
    metrics = [] if metrics is None else metrics

    # We never train the target model, hence we can set the optimizer and loss arbitrarily.
    self.target_model = clone_model(self.model, self.custom_model_objects)
    self.target_model.compile(optimizer='sgd', loss='mse')
    self.model.compile(optimizer='sgd', loss='mse')

    # Compile model.
    if self.target_model_update < 1.:
        # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
        updates = get_soft_target_model_updates(self.target_model, self.model, self.target_model_update)
        optimizer = AdditionalUpdatesOptimizer(optimizer, updates)

    def clipped_masked_error(args):
        y_true, y_pred, mask = args
        loss = huber_loss(y_true, y_pred, self.delta_clip)
        loss *= mask  # apply element-wise mask
        return K.sum(loss, axis=-1)

    # Create trainable model. The problem is that we need to mask the output since we only
    # ever want to update the Q values for a certain action. The way we achieve this is by
    # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility
    # to mask out certain parameters by passing in multiple inputs to the Lambda layer.
    y_pred = self.model.output
    y_true = Input(name='y_true', shape=(self.nb_actions,))
    mask = Input(name='mask', shape=(self.nb_actions,))
    loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_pred, y_true, mask])
    ins = [self.model.input] if type(self.model.input) is not list else self.model.input
    trainable_model = Model(input=ins + [y_true, mask], output=[loss_out, y_pred])
    assert len(trainable_model.output_names) == 2
    combined_metrics = {trainable_model.output_names[1]: metrics}
    losses = [
        lambda y_true, y_pred: y_pred,  # loss is computed in Lambda layer
        lambda y_true, y_pred: K.zeros_like(y_pred),  # we only include this for the metrics
    ]
    trainable_model.compile(optimizer=optimizer, loss=losses, metrics=combined_metrics)
    self.trainable_model = trainable_model

    self.compiled = True
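A back-of-the-envelope numpy version of the soft target update that get_soft_target_model_updates and AdditionalUpdatesOptimizer attach to the optimizer when target_model_update < 1: every target weight is pulled towards the online weight by a factor tau. This is only a sketch; the real updates run inside the backend graph alongside each gradient step.

import numpy as np

tau = 1e-3
online_w = np.array([1.0, 2.0, 3.0])
target_w = np.array([0.0, 0.0, 0.0])

# One training step later, the target weights have moved a tiny bit towards the online weights.
target_w = tau * online_w + (1. - tau) * target_w
print(target_w)   # [0.001, 0.002, 0.003]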
def test_single_continuous_dqn_input():
    nb_actions = 2

    V_model = Sequential()
    V_model.add(Flatten(input_shape=(2, 3)))
    V_model.add(Dense(1))

    mu_model = Sequential()
    mu_model.add(Flatten(input_shape=(2, 3)))
    mu_model.add(Dense(nb_actions))

    L_input = Input(shape=(2, 3))
    L_input_action = Input(shape=(nb_actions,))
    x = concatenate([Flatten()(L_input), L_input_action])
    x = Dense((nb_actions * nb_actions + nb_actions) // 2)(x)
    L_model = Model(input=[L_input_action, L_input], output=x)

    memory = SequentialMemory(limit=10, window_length=2)
    agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model,
                     memory=memory, nb_steps_warmup=5, batch_size=4)
    agent.compile('sgd')
    agent.fit(MultiInputTestEnv((3,)), nb_steps=10)
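Why the L head has (nb_actions ** 2 + nb_actions) // 2 outputs: NAF builds the positive-definite matrix P(s) = L(s) L(s)^T from a lower-triangular L(s), which has n * (n + 1) / 2 free entries for n actions. The numpy snippet below only illustrates that construction; keras-rl's exact parameterization (e.g. how the diagonal is kept positive) may differ.

import numpy as np

n = 2
l_flat = np.array([1.0, 0.5, 2.0])   # (n * n + n) // 2 == 3 entries
L = np.zeros((n, n))
L[np.tril_indices(n)] = l_flat       # fill the lower triangle row by row
P = L.dot(L.T)                       # symmetric and positive semi-definite
print(L)
print(P)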
def __init__(self, model, policy=None, test_policy=None, enable_double_dqn=True,
             enable_dueling_network=False, dueling_type='avg', *args, **kwargs):
    super(DQNAgent, self).__init__(*args, **kwargs)

    # Validate (important) input.
    if hasattr(model.output, '__len__') and len(model.output) > 1:
        raise ValueError('Model "{}" has more than one output. DQN expects a model that has a single output.'.format(model))
    if model.output._keras_shape != (None, self.nb_actions):
        raise ValueError('Model output "{}" has invalid shape. DQN expects a model that has one dimension for each action, in this case {}.'.format(model.output, self.nb_actions))

    # Parameters.
    self.enable_double_dqn = enable_double_dqn
    self.enable_dueling_network = enable_dueling_network
    self.dueling_type = dueling_type
    if self.enable_dueling_network:
        # Get the second to last layer of the model and abandon the last layer.
        layer = model.layers[-2]
        nb_action = model.output._keras_shape[-1]
        # Layer y has shape (nb_action + 1,):
        # y[:, 0] represents V(s; theta)
        # y[:, 1:] represents A(s, a; theta)
        y = Dense(nb_action + 1, activation='linear')(layer.output)
        # Calculate Q(s, a; theta):
        # dueling_type == 'avg':   Q(s,a;theta) = V(s;theta) + (A(s,a;theta) - Avg_a(A(s,a;theta)))
        # dueling_type == 'max':   Q(s,a;theta) = V(s;theta) + (A(s,a;theta) - max_a(A(s,a;theta)))
        # dueling_type == 'naive': Q(s,a;theta) = V(s;theta) + A(s,a;theta)
        # The mean/max is taken over the action dimension (axis=1), per sample.
        if self.dueling_type == 'avg':
            outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.mean(a[:, 1:], axis=1, keepdims=True), output_shape=(nb_action,))(y)
        elif self.dueling_type == 'max':
            outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.max(a[:, 1:], axis=1, keepdims=True), output_shape=(nb_action,))(y)
        elif self.dueling_type == 'naive':
            outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:], output_shape=(nb_action,))(y)
        else:
            assert False, "dueling_type must be one of {'avg','max','naive'}"
        model = Model(input=model.input, output=outputlayer)

    # Related objects.
    self.model = model
    if policy is None:
        policy = EpsGreedyQPolicy()
    if test_policy is None:
        test_policy = GreedyQPolicy()
    self.policy = policy
    self.test_policy = test_policy

    # State.
    self.reset_states()
    self.X = []
    self.Y = []
    self.x = []
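A numpy sketch of the dueling aggregation implemented by the Lambda layers above: the first unit of y is read as V(s) and the remaining nb_action units as A(s, .), then combined per mode. The sample values are arbitrary.

import numpy as np

y = np.array([[2.0, 1.0, -1.0, 0.5]])          # one sample: V(s) = 2.0, A(s, .) = [1.0, -1.0, 0.5]
V = y[:, 0:1]
A = y[:, 1:]
Q_avg = V + A - A.mean(axis=1, keepdims=True)   # 'avg' mode
Q_max = V + A - A.max(axis=1, keepdims=True)    # 'max' mode
Q_naive = V + A                                 # 'naive' mode
print(Q_avg, Q_max, Q_naive)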
def test_cdqn():
    # TODO: replace this with a simpler environment where we can actually test if it finds a solution
    env = gym.make('Pendulum-v0')
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.shape[0]

    V_model = Sequential()
    V_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    V_model.add(Dense(16))
    V_model.add(Activation('relu'))
    V_model.add(Dense(1))

    mu_model = Sequential()
    mu_model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    mu_model.add(Dense(16))
    mu_model.add(Activation('relu'))
    mu_model.add(Dense(nb_actions))

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
    x = concatenate([action_input, Flatten()(observation_input)])
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense((nb_actions * nb_actions + nb_actions) // 2)(x)
    L_model = Model(input=[action_input, observation_input], output=x)

    memory = SequentialMemory(limit=1000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3, size=nb_actions)
    agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model,
                     memory=memory, nb_steps_warmup=50, random_process=random_process,
                     gamma=.99, target_model_update=1e-3)
    agent.compile(Adam(lr=1e-3))

    agent.fit(env, nb_steps=400, visualize=False, verbose=0, nb_max_episode_steps=100)
    h = agent.test(env, nb_episodes=2, visualize=False, nb_max_episode_steps=100)
def test_multi_continuous_dqn_input():
    nb_actions = 2

    V_input1 = Input(shape=(2, 3))
    V_input2 = Input(shape=(2, 4))
    x = concatenate([V_input1, V_input2])
    x = Flatten()(x)
    x = Dense(1)(x)
    V_model = Model(input=[V_input1, V_input2], output=x)

    mu_input1 = Input(shape=(2, 3))
    mu_input2 = Input(shape=(2, 4))
    x = concatenate([mu_input1, mu_input2])
    x = Flatten()(x)
    x = Dense(nb_actions)(x)
    mu_model = Model(input=[mu_input1, mu_input2], output=x)

    L_input1 = Input(shape=(2, 3))
    L_input2 = Input(shape=(2, 4))
    L_input_action = Input(shape=(nb_actions,))
    x = concatenate([L_input1, L_input2])
    x = concatenate([Flatten()(x), L_input_action])
    x = Dense((nb_actions * nb_actions + nb_actions) // 2)(x)
    L_model = Model(input=[L_input_action, L_input1, L_input2], output=x)

    memory = SequentialMemory(limit=10, window_length=2)
    processor = MultiInputProcessor(nb_inputs=2)
    agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model,
                     memory=memory, nb_steps_warmup=5, batch_size=4, processor=processor)
    agent.compile('sgd')
    agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10)
def test_ddpg():
    # TODO: replace this with a simpler environment where we can actually test if it finds a solution
    env = gym.make('Pendulum-v0')
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.shape[0]

    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = concatenate([action_input, flattened_observation])
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(input=[action_input, observation_input], output=x)

    memory = SequentialMemory(limit=1000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                      memory=memory, nb_steps_warmup_critic=50, nb_steps_warmup_actor=50,
                      random_process=random_process, gamma=.99, target_model_update=1e-3)
    agent.compile([Adam(lr=1e-3), Adam(lr=1e-3)])

    agent.fit(env, nb_steps=400, visualize=False, verbose=0, nb_max_episode_steps=100)
    h = agent.test(env, nb_episodes=2, visualize=False, nb_max_episode_steps=100)
def build_model(env, config: Config):
    """
    :param gym.wrappers.time_limit.TimeLimit env:
    :param Config config:
    :return:
    """
    in_x = x = Input(shape=(config.window_length,) + env.observation_space.shape)
    x = Flatten()(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(16, activation="relu")(x)
    x = Dense(env.action_space.n, activation="linear")(x)
    model = Model(input=in_x, output=x)
    return model
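A hedged usage sketch for the builder above. The Config stand-in below only carries the single attribute the function reads, and CartPole-v0 is just a convenient discrete-action environment; neither is taken from the original code.

import gym

class Config:
    window_length = 4

env = gym.make('CartPole-v0')
model = build_model(env, Config())
print(model.summary())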
def build_model(env, config):
    """
    :param Config config:
    :param gym.core.Env env:
    :return:
    """
    n_dims = [64, 32]
    in_x = x = Input(shape=(config.window_length,) + env.observation_space.shape)
    x = Flatten()(x)
    for n_dim in n_dims:
        x = Dense(n_dim, activation="relu")(x)
    x = Dense(env.action_space.n, activation="linear")(x)
    model = Model(input=in_x, output=x)
    return model
def build_model(env, config):
    """
    :param Config config:
    :param gym.core.Env env:
    :return:
    """
    n_dim = 64
    n_layer = 2
    in_x = x = Input(shape=(config.window_length,) + env.observation_space.shape)
    x = Flatten()(x)
    x = Dense(n_dim, activation="relu")(x)
    for _ in range(n_layer):
        x = add_resnet(x, n_dim)
    x = Dense(env.action_space.n, activation="linear")(x)
    model = Model(input=in_x, output=x)
    return model
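The add_resnet helper used above is not shown in this excerpt. A plausible, minimal version is sketched below; it is an assumption, not the author's definition, and simply stacks two Dense layers with a skip connection, which is the usual fully-connected residual block.

from keras.layers import Dense, add

def add_resnet(x, n_dim):
    # Hypothetical helper: residual block of two Dense layers added back onto the input.
    y = Dense(n_dim, activation="relu")(x)
    y = Dense(n_dim, activation="relu")(y)
    return add([x, y])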
def test_multi_dqn_input():
    input1 = Input(shape=(2, 3))
    input2 = Input(shape=(2, 4))
    x = merge([input1, input2], mode='concat')
    x = Flatten()(x)
    x = Dense(2)(x)
    model = Model(input=[input1, input2], output=x)

    memory = SequentialMemory(limit=10, window_length=2)
    processor = MultiInputProcessor(nb_inputs=2)
    for double_dqn in (True, False):
        agent = DQNAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4,
                         processor=processor, enable_double_dqn=double_dqn)
        agent.compile('sgd')
        agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10)
mu_model.add(Dense(nb_actions))
mu_model.add(Activation('linear'))
print(mu_model.summary())

action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
x = concatenate([action_input, Flatten()(observation_input)])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense((nb_actions * nb_actions + nb_actions) // 2)(x)
x = Activation('linear')(x)
L_model = Model(input=[action_input, observation_input], output=x)
print(L_model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
processor = PendulumProcessor()
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3, size=nb_actions)
agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model,
                 memory=memory, nb_steps_warmup=100, random_process=random_process,
                 gamma=.99, target_model_update=1e-3, processor=processor)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
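The excerpt stops at the comment above. A hedged sketch of how such a script typically continues is shown below; the step counts, episode cap, and weights filename are placeholders rather than the original script's values.

agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200)

# After training is done, save the final weights and evaluate for a few episodes.
agent.save_weights('cdqn_pendulum_weights.h5f', overwrite=True)
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)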