def _build_model(graph, state_size, skip_frames, action_size, learning_rate):
    # `graph` is accepted for the caller's convenience but is not used inside this function.
    import keras
    from keras import layers
    from keras import backend as K
    from keras.models import Model
    from keras.optimizers import RMSprop

    ATARI_SHAPE = (state_size[0], state_size[1], skip_frames)  # input image size to the model
    ACTION_SIZE = action_size

    # With the functional API we need to define the inputs.
    frames_input = layers.Input(ATARI_SHAPE, name='frames')
    #actions_input = layers.Input((ACTION_SIZE,), name='action_mask')

    # Assuming that the input frames are still encoded from 0 to 255. Transforming to [0, 1].
    normalized = layers.Lambda(lambda x: x / 255.0, name='normalization')(frames_input)

    # "The first hidden layer convolves 16 8x8 filters with stride 4 with the input image
    # and applies a rectifier nonlinearity."
    conv_1 = layers.Conv2D(
        16, (8, 8), strides=(4, 4), activation='relu',
        kernel_initializer='random_uniform',
        bias_initializer='random_uniform')(normalized)

    # "The second hidden layer convolves 32 4x4 filters with stride 2, again followed by
    # a rectifier nonlinearity."
    conv_2 = layers.Conv2D(
        32, (4, 4), strides=(2, 2), activation='relu',
        kernel_initializer='random_uniform',
        bias_initializer='random_uniform')(conv_1)

    # Flatten the second convolutional layer.
    conv_flattened = layers.Flatten()(conv_2)

    # "The final hidden layer is fully-connected and consists of 256 rectifier units."
    shared = layers.Dense(
        256, activation='relu',
        kernel_initializer='random_uniform',
        bias_initializer='random_uniform')(conv_flattened)

    # Unlike the DQN quote ("a single output for each valid action"), this actor-critic model
    # has two heads: a softmax policy over the actions and a scalar state value.
    output_actions = layers.Dense(
        ACTION_SIZE, activation='softmax', name='out1',
        kernel_initializer='random_uniform',
        bias_initializer='random_uniform')(shared)
    output_value = layers.Dense(
        1, activation='linear', name='out2',
        kernel_initializer='random_uniform',
        bias_initializer='random_uniform')(shared)

    # Note: this initializer is created but never passed to any layer, so it has no effect;
    # the layers above fall back to the default 'random_uniform' configuration.
    keras.initializers.RandomUniform(minval=-0.5, maxval=0.5, seed=None)

    pmodel = Model(inputs=[frames_input], outputs=[output_actions, output_value])
    rms = RMSprop(lr=learning_rate, rho=0.99, epsilon=0.1)
    #pmodel.compile(rms, loss={'out1': 'categorical_crossentropy', 'out2': 'mse'})

    # Extra inputs fed at training time: the one-hot action mask, the advantages and the
    # discounted returns. They arrive as 1-D arrays and are expanded to (batch, 1) so that
    # the elementwise products below broadcast per sample instead of across the batch.
    action_pl = K.placeholder(shape=(None, action_size))
    advantages_pl = K.placeholder(shape=(None,))
    discounted_r = K.placeholder(shape=(None,))
    advantages = K.expand_dims(advantages_pl, axis=-1)   # (batch, 1)
    returns = K.expand_dims(discounted_r, axis=-1)       # (batch, 1)

    # Probability of the action that was actually taken.
    weighted_actions = K.max(action_pl * output_actions, axis=1, keepdims=True)
    # Policy-gradient term: log pi(a|s) * advantage, with the advantage treated as a constant.
    eligibility = K.log(weighted_actions + 1e-10) * K.stop_gradient(advantages)
    # sum(p * log p) is the *negative* entropy, so adding it to the loss rewards exploration.
    entropy = K.sum(output_actions * K.log(output_actions + 1e-10), axis=1, keepdims=True)

    ploss = 0.001 * entropy - eligibility         # policy loss
    closs = K.square(returns - output_value)      # critic (value) loss
    total_loss = K.mean(ploss + 0.5 * closs)

    input_tensors = pmodel.inputs + [action_pl, advantages_pl, discounted_r, K.learning_phase()]
    gradients = rms.get_gradients(total_loss, pmodel.trainable_weights)
    get_gradients = K.function(inputs=input_tensors, outputs=gradients)
    get_loss = K.function(inputs=input_tensors, outputs=[closs, ploss])

    return pmodel, get_gradients, get_loss
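
# ---------------------------------------------------------------------------
# Minimal usage sketch (not from the original source): how a caller might feed
# the K.function objects returned above. The frame size, frame stack, action
# count, learning rate, batch size and the random dummy data are illustrative
# assumptions only; a real worker would use environment rollouts instead.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import numpy as np

    state_size = (84, 84)   # assumed preprocessed frame size
    skip_frames = 4         # assumed number of stacked frames
    action_size = 4         # assumed size of the discrete action space

    pmodel, get_gradients, get_loss = _build_model(
        graph=None, state_size=state_size, skip_frames=skip_frames,
        action_size=action_size, learning_rate=7e-4)

    batch = 8
    frames = np.random.randint(0, 256, size=(batch,) + state_size + (skip_frames,)).astype('float32')
    actions = np.eye(action_size)[np.random.randint(action_size, size=batch)]  # one-hot action mask
    advantages = np.random.randn(batch).astype('float32')
    returns = np.random.randn(batch).astype('float32')

    # Inputs follow the order of `input_tensors`: frames, action mask,
    # advantages, discounted returns, learning phase (0 = test, 1 = train).
    grads = get_gradients([frames, actions, advantages, returns, 1])
    closs, ploss = get_loss([frames, actions, advantages, returns, 1])
    print(len(grads), closs.shape, ploss.shape)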