def test_sample(self):
    inputs = get_inputs(seq_len=512)
    # `get_embedding` returns both the embedding output and the token embedding
    # weights (see the other snippets below); only the output tensor is needed here.
    embed_layer, _ = get_embedding(inputs, token_num=12, embed_dim=768, pos_num=512)
    model = keras.models.Model(inputs=inputs, outputs=embed_layer)
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics={},
    )
    model.summary(line_length=120)
    self.assertEqual((None, 512, 768), model.layers[-1].output_shape)
def test_sample(self):
    inputs = get_inputs(seq_len=512)
    embed_layer, _ = get_embedding(inputs, token_num=12, embed_dim=768, pos_num=512)
    masked_layer = Masked(name='Masked')([embed_layer, inputs[-1]])
    model = keras.models.Model(inputs=inputs, outputs=masked_layer)
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics={},
    )
    model.summary()
    model.predict([
        np.asarray([[1] + [0] * 511]),
        np.asarray([[0] * 512]),
        np.asarray([[1] + [0] * 511]),
    ])
    self.assertEqual((None, 512, 768), model.layers[-1].output_shape)
def get_checkpoint_model(token_num,
                         pos_num=512,
                         seq_len=512,
                         embed_dim=768,
                         transformer_num=12,
                         head_num=12,
                         feed_forward_dim=3072,
                         dropout_rate=0.1,
                         attention_activation=None,
                         feed_forward_activation='gelu',
                         training=True,
                         finetuned=False,
                         output_dim=2,
                         trainable=None,
                         output_layer_num=1,
                         retention_configuration=None,
                         LAMBDA=None,
                         FLAG_EXTRACT_LAYER=None,
                         TASK=None,
                         ):
    """Get BERT model.

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: Default for `trainable` when `trainable` is None.
    :param finetuned: Reserved flag; not used inside this function.
    :param output_dim: Number of units in the final prediction layer.
    :param trainable: Whether the model is trainable.
    :param output_layer_num: The number of layers whose outputs will be
                             concatenated as a single output.
                             Only available when `training` is `False`.
    :param retention_configuration: Per-layer retention settings passed to the encoders.
    :param LAMBDA: Scaling factor passed to the encoders.
    :param FLAG_EXTRACT_LAYER: Extraction flag passed to the encoders.
    :param TASK: Downstream task name; 'sts-b' switches the head to regression.
    :return: The built model.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    inputs = get_inputs(seq_len=seq_len)
    attention_mask = inputs[2]
    embed_layer, embed_weights = get_embedding(
        inputs,
        token_num=token_num,
        embed_dim=embed_dim,
        pos_num=pos_num,
        dropout_rate=dropout_rate,
    )
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)
    transformed = get_encoders(
        encoder_num=transformer_num,
        input_layer=embed_layer,
        head_num=head_num,
        hidden_dim=feed_forward_dim,
        attention_activation=attention_activation,
        feed_forward_activation=feed_forward_activation,
        dropout_rate=dropout_rate,
        attention_mask=attention_mask,
        SEQ_LEN=seq_len,
        retention_configuration=retention_configuration,
        LAMBDA=LAMBDA,
        FLAG_EXTRACT_LAYER=FLAG_EXTRACT_LAYER,
    )
    # Take the [CLS] position and run it through the pooler / prediction head.
    extract_layer = Extract(index=0, name='Extract')(transformed)
    nsp_dense_layer = keras.layers.Dense(
        units=embed_dim,
        activation='tanh',
        name='NSP-Dense',
    )(extract_layer)
    if TASK == 'sts-b':
        # Regression head (no activation) for the STS-B task.
        nsp_pred_layer = keras.layers.Dense(
            units=output_dim,
            name='NSP',
        )(nsp_dense_layer)
    else:
        # Classification head for all other tasks.
        nsp_pred_layer = keras.layers.Dense(
            units=output_dim,
            activation='softmax',
            name='NSP',
        )(nsp_dense_layer)
    model = keras.models.Model(inputs=inputs, outputs=nsp_pred_layer)
    for layer in model.layers:
        layer.trainable = _trainable(layer)
    return model
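# Minimal usage sketch for `get_checkpoint_model`. All parameter values below are
# illustrative assumptions, not values taken from this repository; the encoder-specific
# arguments (`retention_configuration`, `LAMBDA`, `FLAG_EXTRACT_LAYER`) are left at their
# defaults and may need repo-specific values depending on `get_encoders`.

model = get_checkpoint_model(
    token_num=30522,   # vocabulary size (assumed)
    seq_len=128,
    pos_num=128,
    transformer_num=12,
    head_num=12,
    output_dim=2,      # binary classification head
    TASK='mrpc',       # any task other than 'sts-b' uses the softmax head
)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary(line_length=120)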