def _feed_forward_builder(x):
    return FeedForward(
        units=hidden_dim,
        activation=activation,
        trainable=trainable,
        name=name,
    )(x)
def _wrap_layer(name,
                input_layer,
                build_func,
                dropout_rate=0.0,
                trainable=True,
                use_adapter=False,
                adapter_units=None,
                adapter_activation='relu'):
    build_output = build_func(input_layer)
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='%s-Dropout' % name,
        )(build_output)
    else:
        dropout_layer = build_output
    if isinstance(input_layer, list):
        input_layer = input_layer[0]
    if use_adapter:
        adapter = FeedForward(
            units=adapter_units,
            activation=adapter_activation,
            kernel_initializer=keras.initializers.TruncatedNormal(mean=0.0, stddev=0.001),
            name='%s-Adapter' % name,
        )(dropout_layer)
        dropout_layer = keras.layers.Add(name='%s-Adapter-Add' % name)([dropout_layer, adapter])
    add_layer = keras.layers.Add(name='%s-Add' % name)([input_layer, dropout_layer])
    normal_layer = LayerNormalization(
        trainable=trainable,
        name='%s-Norm' % name,
    )(add_layer)
    return normal_layer
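# Usage sketch (illustrative, not from the original sources): a builder closure such as
# `_feed_forward_builder` above is the kind of `build_func` that `_wrap_layer` expects.
# The names `hidden_dim`, `activation`, `trainable`, `name` and `hidden_input` below are
# assumptions introduced only for this example.
hidden_dim, activation, trainable, name = 3072, 'relu', True, 'Encoder-1-FeedForward'
hidden_input = keras.layers.Input(shape=(None, 768))
wrapped = _wrap_layer(
    name=name,
    input_layer=hidden_input,
    build_func=_feed_forward_builder,  # feed-forward block wrapped with dropout, residual and layer norm
    dropout_rate=0.1,
    trainable=trainable,
)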
def build_model(emb_cid, emb_advid):
    inp1 = layers.Input(shape=(max_len, ))
    inp2 = layers.Input(shape=(max_len, ))
    emb1 = layers.Embedding(input_dim=emb_cid.shape[0],
                            output_dim=emb_cid.shape[1],
                            input_length=max_len,
                            weights=[emb_cid],
                            trainable=False)(inp1)
    emb2 = layers.Embedding(input_dim=emb_advid.shape[0],
                            output_dim=emb_advid.shape[1],
                            input_length=max_len,
                            weights=[emb_advid],
                            trainable=False)(inp2)
    sdrop = layers.SpatialDropout1D(rate=0.2)
    emb1 = sdrop(emb1)
    emb2 = sdrop(emb2)
    content = layers.Concatenate()([emb1, emb2])
    mha = MultiHeadAttention(head_num=16)(content)
    mha = layers.Dropout(0.01)(mha)
    mha = layers.Add()([content, mha])
    mha = LayerNormalization()(mha)
    mha = layers.Dropout(0.01)(mha)
    mha_ff = FeedForward(256)(mha)
    mha_out = layers.Add()([mha, mha_ff])
    mha_out = LayerNormalization()(mha_out)
    lstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(mha_out)
    avg_pool = layers.GlobalAveragePooling1D()(lstm)
    max_pool = layers.GlobalMaxPool1D()(lstm)
    x = layers.Concatenate()([avg_pool, max_pool])
    x = layers.Dense(128, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.1)(x)
    out = layers.Dense(10, activation='softmax')(x)
    model = keras.Model(inputs=[inp1, inp2], outputs=out)
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(1e-3),
                  metrics=['accuracy'])
    return model
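# Usage sketch (illustrative, not from the original sources): `max_len` is assumed to be a
# module-level constant, and the embedding matrices below are random stand-ins.
max_len = 100
emb_cid = np.random.random((5000, 128)).astype('float32')
emb_advid = np.random.random((500, 128)).astype('float32')
model = build_model(emb_cid, emb_advid)
x1 = np.random.randint(1, 5000, size=(256, max_len))
x2 = np.random.randint(1, 500, size=(256, max_len))
y = keras.utils.to_categorical(np.random.randint(0, 10, size=(256,)), num_classes=10)
model.fit([x1, x2], y, batch_size=32, epochs=1)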
def test_sample(self):
    input_layer = keras.layers.Input(
        shape=(1, 3),
        name='Input',
    )
    feed_forward_layer = FeedForward(
        units=4,
        activation=self._leaky_relu,
        weights=[
            np.asarray([
                [0.1, 0.2, 0.3, 0.4],
                [-0.1, 0.2, -0.3, 0.4],
                [0.1, -0.2, 0.3, -0.4],
            ]),
            np.asarray([
                0.0, -0.1, 0.2, -0.3,
            ]),
            np.asarray([
                [0.1, 0.2, 0.3],
                [-0.1, 0.2, -0.3],
                [0.1, -0.2, 0.3],
                [-0.1, 0.2, 0.3],
            ]),
            np.asarray([
                0.0, 0.1, -0.2,
            ]),
        ],
        name='FeedForward',
    )(input_layer)
    model = keras.models.Model(
        inputs=input_layer,
        outputs=feed_forward_layer,
    )
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics={},
    )
    model.summary()
    inputs = np.array([[[0.2, 0.1, 0.3]]])
    predict = model.predict(inputs)
    expected = np.asarray([[[0.0364, 0.0432, -0.0926]]])
    self.assertTrue(np.allclose(expected, predict), predict)
def encoder(seq_len, m_features, d_model, n_heads, dff, rate=0.1, encoder=None):
    """Basic attention encoder.

    It can be stacked on top of a previous encoder by passing that encoder as an argument.
    """
    if encoder is None:
        in_seq = keras.layers.Input(shape=(seq_len, m_features))
        norm_0 = LayerNormalization()(in_seq)
    else:
        in_seq = encoder.output
        norm_0 = in_seq
    linear = keras.layers.Dense(units=d_model)(norm_0)
    pos = TrigPosEmbedding(mode=TrigPosEmbedding.MODE_ADD)(linear)
    mha = MultiHeadAttention(head_num=n_heads)(pos)
    mha_drop = keras.layers.Dropout(rate=rate)(mha)
    add_1 = keras.layers.Add()([pos, mha_drop])
    norm_1 = LayerNormalization()(add_1)
    ff = FeedForward(dff)(norm_1)
    ff_drop = keras.layers.Dropout(rate=rate)(ff)
    add_2 = keras.layers.Add()([ff_drop, norm_1])
    out = LayerNormalization()(add_2)
    if encoder is None:
        return keras.Model(in_seq, out)
    return keras.Model(encoder.input, out)
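# Usage sketch (illustrative): stacking two of the encoders defined above; all shapes and
# hyper-parameters below are assumptions.
enc_1 = encoder(seq_len=64, m_features=16, d_model=128, n_heads=8, dff=256)
enc_2 = encoder(seq_len=64, m_features=16, d_model=128, n_heads=8, dff=256, encoder=enc_1)
enc_2.summary()  # enc_2 reuses enc_1's input and appends a second attention block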
def _wrap_layer(name,
                input_layer,
                build_func,
                dropout_rate=0.0,
                trainable=True,
                use_adapter=False,
                adapter_units=None,
                adapter_activation='relu'):
    """Wrap layers with residual, normalization and dropout.

    :param name: Prefix of names for internal layers.
    :param input_layer: Input layer.
    :param build_func: A callable that takes the input tensor and generates the output tensor.
    :param dropout_rate: Dropout rate.
    :param trainable: Whether the layers are trainable.
    :param use_adapter: Whether to use feed-forward adapters before each residual connection.
    :param adapter_units: The dimension of the first transformation in the feed-forward adapter.
    :param adapter_activation: The activation after the first transformation in the feed-forward adapter.
    :return: Output layer.
    """
    build_output = build_func(input_layer)
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='%s-Dropout' % name,
        )(build_output)
    else:
        dropout_layer = build_output
    if isinstance(input_layer, list):
        input_layer = input_layer[0]
    if use_adapter:
        adapter = FeedForward(
            units=adapter_units,
            activation=adapter_activation,
            kernel_initializer=keras.initializers.TruncatedNormal(mean=0.0, stddev=0.001),
            name='%s-Adapter' % name,
        )(dropout_layer)
        dropout_layer = keras.layers.Add(name='%s-Adapter-Add' % name)([dropout_layer, adapter])
    add_layer = keras.layers.Add(name='%s-Add' % name)([input_layer, dropout_layer])
    normal_layer = LayerNormalization(
        trainable=trainable,
        name='%s-Norm' % name,
    )(add_layer)
    return normal_layer
def block(attention_input, head_num: int, feed_forward_units: int, dropout_rate: float) -> Tensor:
    attention_x = MultiHeadAttention(
        head_num=head_num,
        activation=None,
        use_bias=False,
        history_only=True,
        trainable=True,
    )(attention_input)
    attention_x = Dropout(dropout_rate)(attention_x)
    attention_x = Add()([attention_input, attention_x])
    feed_forward_input = LayerNormalization(trainable=True)(attention_x)
    feed_forward_x = FeedForward(units=feed_forward_units, activation='relu', trainable=True)(feed_forward_input)
    feed_forward_x = Dropout(dropout_rate)(feed_forward_x)
    feed_forward_x = Add()([feed_forward_input, feed_forward_x])
    block_output = LayerNormalization(trainable=True)(feed_forward_x)
    return block_output
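# Usage sketch (illustrative): chaining a few of the decoder-style blocks above
# (`history_only=True` masks future positions); vocabulary size and dimensions are assumptions.
tokens = keras.layers.Input(shape=(128,))
x = keras.layers.Embedding(input_dim=10000, output_dim=256)(tokens)
for _ in range(4):
    x = block(x, head_num=8, feed_forward_units=1024, dropout_rate=0.1)
decoder = keras.models.Model(inputs=tokens, outputs=x)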
def build_albert(token_num,
                 pos_num=512,
                 seq_len=512,
                 embed_dim=128,
                 hidden_dim=768,
                 transformer_num=12,
                 head_num=12,
                 feed_forward_dim=3072,
                 dropout_rate=0.1,
                 attention_activation=None,
                 feed_forward_activation='gelu',
                 training=True,
                 trainable=None,
                 output_layers=None):
    """Get ALBERT model.

    See: https://arxiv.org/pdf/1909.11942.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param hidden_dim: Dimensions of hidden layers.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned if it is `True`,
                     otherwise the input layers and the last feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layers: A list of indices of output layers.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    # Build inputs
    input_token = keras.layers.Input(shape=(seq_len,), name='Input-Token')
    input_segment = keras.layers.Input(shape=(seq_len,), name='Input-Segment')
    inputs = [input_token, input_segment]

    # Build embeddings
    embed_token, embed_weights, embed_projection = AdaptiveEmbedding(
        input_dim=token_num,
        output_dim=hidden_dim,
        embed_dim=embed_dim,
        mask_zero=True,
        trainable=trainable,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(input_token)
    embed_segment = keras.layers.Embedding(
        input_dim=2,
        output_dim=hidden_dim,
        trainable=trainable,
        name='Embed-Segment',
    )(input_segment)
    embed_layer = keras.layers.Add(name='Embed-Token-Segment')([embed_token, embed_segment])
    embed_layer = PositionEmbedding(
        input_dim=pos_num,
        output_dim=hidden_dim,
        mode=PositionEmbedding.MODE_ADD,
        trainable=trainable,
        name='Embedding-Position',
    )(embed_layer)
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)

    # Build shared transformer
    attention_layer = MultiHeadAttention(
        head_num=head_num,
        activation=attention_activation,
        name='Attention',
    )
    attention_normal = LayerNormalization(name='Attention-Normal')
    feed_forward_layer = FeedForward(
        units=feed_forward_dim,
        activation=feed_forward_activation,
        name='Feed-Forward',
    )
    feed_forward_normal = LayerNormalization(name='Feed-Forward-Normal')

    transformed = embed_layer
    transformed_layers = []
    for i in range(transformer_num):
        attention_input = transformed
        transformed = attention_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Attention-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Attention-Add-{}'.format(i + 1),
        )([attention_input, transformed])
        transformed = attention_normal(transformed)
        feed_forward_input = transformed
        transformed = feed_forward_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Feed-Forward-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Feed-Forward-Add-{}'.format(i + 1),
        )([feed_forward_input, transformed])
        transformed = feed_forward_normal(transformed)
        transformed_layers.append(transformed)

    if training:
        # Build tasks
        mlm_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = AdaptiveSoftmax(
            input_dim=hidden_dim,
            output_dim=token_num,
            embed_dim=embed_dim,
            bind_embeddings=True,
            bind_projections=True,
            name='MLM-Sim',
        )([mlm_norm_layer, embed_weights, embed_projection])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation='tanh',
            name='SOP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='SOP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs, outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    if output_layers is not None:
        if isinstance(output_layers, list):
            output_layers = [transformed_layers[index] for index in output_layers]
            output = keras.layers.Concatenate(name='Output')(output_layers)
        else:
            output = transformed_layers[output_layers]
        model = keras.models.Model(inputs=inputs, outputs=output)
        return model
    model = keras.models.Model(inputs=inputs, outputs=transformed)
    for layer in model.layers:
        layer.trainable = _trainable(layer)
    return inputs, transformed
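# Usage sketch (illustrative, not from the original sources): building a small ALBERT for
# feature extraction; all sizes below are assumptions. With `training=False` and
# `output_layers=None` the function returns the input layers and the last feature tensor.
inputs, output = build_albert(
    token_num=30000,
    seq_len=128,
    embed_dim=128,
    hidden_dim=256,
    transformer_num=4,
    head_num=4,
    feed_forward_dim=1024,
    training=False,
    trainable=False,
)
albert = keras.models.Model(inputs=inputs, outputs=output)
albert.summary()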
def build_transformer_xl(units,
                         embed_dim,
                         hidden_dim,
                         num_token,
                         num_block,
                         num_head,
                         batch_size,
                         memory_len,
                         target_len,
                         dropout=0.0,
                         attention_dropout=0.0,
                         cutoffs=None,
                         div_val=1,
                         force_projection=None,
                         bind_embeddings=True,
                         bind_projections=True,
                         clamp_len=None,
                         share_biases=True):
    """Build Transformer-XL model.

    :param units: Units inside the transformer.
    :param embed_dim: Dimension of embeddings.
    :param hidden_dim: Dimension inside position-wise feed-forward layer.
    :param num_token: Number of distinct input tokens.
    :param num_block: Number of basic encoder blocks.
    :param num_head: Number of heads for attention.
    :param batch_size: Maximum batch size.
    :param memory_len: The maximum length of memories.
    :param target_len: The length of prediction block.
    :param dropout: General dropout rate.
    :param attention_dropout: Dropout rate inside attention layer.
    :param cutoffs: Cutoffs of adaptive embedding.
    :param div_val: Scale factor of adaptive embedding.
    :param force_projection: Add projection when the dimensions are equal.
    :param bind_embeddings: Whether to bind embeddings to adaptive softmax.
    :param bind_projections: Whether to bind projections to adaptive softmax.
    :param clamp_len: The maximum value of relative position.
    :param share_biases: Whether to use the same biases for all layers.
    :return: The built model.
    """
    token_input = keras.layers.Input(shape=(target_len,), name='Input-Token')
    memory_length_input = keras.layers.Input(shape=(1,), name='Input-Memory-Length')
    inputs = [token_input, memory_length_input]
    results = AdaptiveEmbedding(
        input_dim=num_token,
        output_dim=units,
        embed_dim=embed_dim,
        cutoffs=cutoffs,
        div_val=div_val,
        mask_zero=True,
        force_projection=force_projection,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(token_input)
    token_embed, embedding_weights = results[0], results[1:]
    token_embed = Scale(scale=np.sqrt(units), name='Embed-Token-Scaled')(token_embed)
    last_memory = Memory(
        batch_size=batch_size,
        memory_len=memory_len,
        target_len=target_len,
        output_dim=units,
        name='Memory-0',
    )([token_embed, memory_length_input])
    position_embed = PositionalEmbedding(
        output_dim=units,
        clamp_len=clamp_len,
        name='Embed-Position',
    )([token_input, last_memory])
    if 0.0 < dropout < 1.0:
        token_embed = keras.layers.Dropout(rate=dropout, name='Embed-Token-Dropped')(token_embed)
        position_embed = keras.layers.Dropout(rate=dropout, name='Embed-Position-Dropped')(position_embed)
    context_bias, relative_bias = None, None
    if share_biases:
        context_bias, relative_bias = RelativeBias(units=units, name='Biases')(last_memory)
    outputs = [token_embed]
    for i in range(num_block):
        block_input, block_output = outputs[-1], outputs[-1]
        if not share_biases:
            context_bias, relative_bias = RelativeBias(units=units, name='Biases-{}'.format(i + 1))(last_memory)
        block_output = RelativePartialMultiHeadSelfAttention(
            units=units,
            num_head=num_head,
            use_bias=False,
            attention_dropout=attention_dropout,
            name='Attention-{}'.format(i + 1),
        )([block_output, position_embed, last_memory, context_bias, relative_bias])
        if 0.0 < dropout < 1.0:
            block_output = keras.layers.Dropout(rate=dropout, name='Attention-Dropped-{}'.format(i + 1))(block_output)
        block_output = keras.layers.Add(name='Attention-Res-{}'.format(i + 1))([block_input, block_output])
        block_output = LayerNormalization(name='Attention-Norm-{}'.format(i + 1))(block_output)
        block_input = block_output
        block_output = FeedForward(
            units=hidden_dim,
            dropout_rate=dropout,
            name='FeedForward-{}'.format(i + 1),
        )(block_output)
        if 0.0 < dropout < 1.0:
            block_output = keras.layers.Dropout(rate=dropout, name='FeedForward-Dropped-{}'.format(i + 1))(block_output)
        block_output = keras.layers.Add(name='FeedForward-Res-{}'.format(i + 1))([block_input, block_output])
        block_output = LayerNormalization(name='FeedForward-Norm-{}'.format(i + 1))(block_output)
        if i < num_block - 1:
            last_memory = Memory(
                batch_size=batch_size,
                memory_len=memory_len,
                target_len=target_len,
                output_dim=units,
                name='Memory-{}'.format(i + 1),
            )([block_output, memory_length_input])
        outputs.append(block_output)
    if 0.0 < dropout < 1.0:
        outputs[-1] = keras.layers.Dropout(rate=dropout, name='Output-Dropped')(outputs[-1])
    softmax = AdaptiveSoftmax(
        input_dim=units,
        output_dim=num_token,
        embed_dim=embed_dim,
        cutoffs=cutoffs,
        div_val=div_val,
        force_projection=force_projection,
        bind_embeddings=bind_embeddings,
        bind_projections=bind_projections,
        name='Softmax',
    )(outputs[-1:] + embedding_weights)
    model = keras.models.Model(inputs=inputs, outputs=softmax)
    return model
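# Usage sketch (illustrative): all hyper-parameters below are assumptions, not values
# from the original sources.
model = build_transformer_xl(
    units=256,
    embed_dim=256,
    hidden_dim=1024,
    num_token=12000,
    num_block=6,
    num_head=8,
    batch_size=16,
    memory_len=64,
    target_len=32,
    dropout=0.1,
    attention_dropout=0.1,
)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()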
def _wrap_layer(name,
                input_layer,
                build_func,
                dropout_rate=0.0,
                trainable=True,
                use_adapter=False,
                adapter_units=None,
                adapter_activation='relu',
                attention_mask=None,
                SEQ_LEN=None,
                retention_configuration=None,
                LAMBDA=None,
                FLAG_EXTRACT_LAYER=None,
                layer_idx=None,
                word_vector_elimination=None):
    """Wrap layers with residual, normalization and dropout.

    :param name: Prefix of names for internal layers.
    :param input_layer: Input layer.
    :param build_func: A callable that takes the input tensor and generates the output tensor.
    :param dropout_rate: Dropout rate.
    :param trainable: Whether the layers are trainable.
    :param use_adapter: Whether to use feed-forward adapters before each residual connection.
    :param adapter_units: The dimension of the first transformation in the feed-forward adapter.
    :param adapter_activation: The activation after the first transformation in the feed-forward adapter.
    :return: Output layer.
    """
    if word_vector_elimination:
        [build_output, atten] = build_func(input_layer)
    else:
        build_output = build_func(input_layer)
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='%s-Dropout' % name,
        )(build_output)
    else:
        dropout_layer = build_output
    if isinstance(input_layer, list):
        input_layer = input_layer[0]
    if use_adapter:
        adapter = FeedForward(
            units=adapter_units,
            activation=adapter_activation,
            kernel_initializer=keras.initializers.TruncatedNormal(mean=0.0, stddev=0.001),
            name='%s-Adapter' % name,
        )(dropout_layer)
        dropout_layer = keras.layers.Add(name='%s-Adapter-Add' % name)([dropout_layer, adapter])
    add_layer = keras.layers.Add(name='%s-Add' % name)([input_layer, dropout_layer])
    normal_layer = LayerNormalization(
        trainable=trainable,
        name='%s-Norm' % name,
    )(add_layer)
    if word_vector_elimination:
        if FLAG_EXTRACT_LAYER == 1:
            extract_layer = Soft_Extract(
                atten=atten,
                LAMBDA=LAMBDA * (layer_idx ** 1.0),
                name='%s-Soft-Extract' % name,
            )(normal_layer)
            return extract_layer, attention_mask
        elif FLAG_EXTRACT_LAYER == 2:
            extract_layer = Hard_Extract(
                atten=atten,
                index=retention_configuration[layer_idx - 1],
                name='%s-Extract' % name,
            )(normal_layer)
            attention_mask = attention_mask[:, :retention_configuration[layer_idx - 1]]
            return extract_layer, attention_mask
    return normal_layer, attention_mask
def build_model(emb_cid, emb_advid, emb_aid):
    inp1 = layers.Input(shape=(max_len, ))
    inp2 = layers.Input(shape=(max_len, ))
    inp3 = layers.Input(shape=(max_len, ))
    emb1 = layers.Embedding(input_dim=emb_cid.shape[0],
                            output_dim=emb_cid.shape[1],
                            input_length=max_len,
                            weights=[emb_cid],
                            trainable=False)(inp1)
    emb2 = layers.Embedding(input_dim=emb_advid.shape[0],
                            output_dim=emb_advid.shape[1],
                            input_length=max_len,
                            weights=[emb_advid],
                            trainable=False)(inp2)
    emb3 = layers.Embedding(input_dim=emb_aid.shape[0],
                            output_dim=emb_aid.shape[1],
                            input_length=max_len,
                            weights=[emb_aid],
                            trainable=False)(inp3)
    sdrop = layers.SpatialDropout1D(rate=0.2)
    emb1 = sdrop(emb1)
    emb2 = sdrop(emb2)
    emb3 = sdrop(emb3)
    content = layers.Concatenate()([emb1, emb2, emb3])
    mha1 = MultiHeadAttention(head_num=32)(content)
    mha1 = layers.Dropout(0.01)(mha1)
    mha1 = layers.Add()([content, mha1])
    mha1 = LayerNormalization()(mha1)
    mha1 = layers.Dropout(0.01)(mha1)
    mha1_ff = FeedForward(256)(mha1)
    mha1_out = layers.Add()([mha1, mha1_ff])
    mha1_out = LayerNormalization()(mha1_out)
    mha2 = MultiHeadAttention(head_num=32)(mha1_out)
    mha2 = layers.Dropout(0.01)(mha2)
    mha2 = layers.Add()([mha1_out, mha2])
    mha2 = LayerNormalization()(mha2)
    mha2 = layers.Dropout(0.01)(mha2)
    mha2_ff = FeedForward(256)(mha2)
    mha2_out = layers.Add()([mha2, mha2_ff])
    mha2_out = LayerNormalization()(mha2_out)
    mha3 = MultiHeadAttention(head_num=32)(mha2_out)
    mha3 = layers.Dropout(0.01)(mha3)
    mha3 = layers.Add()([mha2_out, mha3])
    mha3 = LayerNormalization()(mha3)
    mha3 = layers.Dropout(0.01)(mha3)
    mha3_ff = FeedForward(256)(mha3)
    mha3_out = layers.Add()([mha3, mha3_ff])
    mha3_out = LayerNormalization()(mha3_out)
    avg_pool = layers.GlobalAveragePooling1D()(mha3_out)
    max_pool = layers.GlobalMaxPool1D()(mha3_out)
    x = layers.Concatenate()([avg_pool, max_pool])
    x = layers.Dense(256)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(128)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(64)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.2)(x)
    out = layers.Dense(1, activation='sigmoid')(x)
    model = keras.Model(inputs=[inp1, inp2, inp3], outputs=out)
    model.compile(loss='binary_crossentropy',
                  optimizer=keras.optimizers.Adam(2e-4),
                  metrics=['accuracy'])
    return model
def test_fit(self):
    input_layer = keras.layers.Input(
        shape=(1, 3),
        name='Input',
    )
    att_layer = MultiHeadAttention(
        head_num=3,
        activation=self._leaky_relu,
        name='Multi-Head-Attention-1',
    )(input_layer)
    normal_layer = LayerNormalization(
        name='Layer-Normalization-1',
    )(att_layer)
    feed_forward_layer = FeedForward(
        units=12,
        activation=self._leaky_relu,
        name='FeedForward',
    )(normal_layer)
    normal_layer = LayerNormalization(
        name='Layer-Normalization-2',
    )(feed_forward_layer)
    output_layer = keras.layers.Add(name='Add')([input_layer, normal_layer])
    model = keras.models.Model(
        inputs=input_layer,
        outputs=output_layer,
    )
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics={},
    )

    def _generator(batch_size=32):
        while True:
            batch_inputs = np.random.random((batch_size, 1, 3))
            batch_outputs = batch_inputs + 0.2
            yield batch_inputs, batch_outputs

    model.fit_generator(
        generator=_generator(),
        steps_per_epoch=1000,
        epochs=10,
        validation_data=_generator(),
        validation_steps=100,
        callbacks=[
            keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
        ],
    )
    model_path = os.path.join(
        tempfile.gettempdir(),
        'keras_feed_forward_%f.h5' % np.random.random())
    model.save(model_path)
    model = keras.models.load_model(
        model_path,
        custom_objects={
            '_leaky_relu': self._leaky_relu,
            'MultiHeadAttention': MultiHeadAttention,
            'LayerNormalization': LayerNormalization,
            'FeedForward': FeedForward,
        },
    )
    for inputs, _ in _generator(batch_size=3):
        predicts = model.predict(inputs)
        expect = inputs + 0.2
        for i in range(3):
            for j in range(3):
                self.assertTrue(
                    np.abs(expect[i, 0, j] - predicts[i, 0, j]) < 0.1,
                    (expect, predicts))
        break
def build_xlnet(units,
                training,
                num_token,
                num_block,
                num_head,
                hidden_dim,
                batch_size,
                memory_len,
                target_len,
                permute=None,
                mask_index=Tokenizer.SYM_PAD,
                dropout=0.0,
                attention_dropout=0.0,
                attention_type=ATTENTION_TYPE_BI,
                clamp_len=None,
                shared_biases=True):
    """Build XLNet.

    :param units: Hidden dimensions throughout the model.
    :param training: Whether in training mode.
    :param num_token: Number of distinct tokens.
    :param num_block: Number of basic encoder blocks.
    :param num_head: Number of heads for attention.
    :param hidden_dim: Dimension inside position-wise feed-forward layer.
    :param batch_size: Maximum batch size.
    :param memory_len: The maximum length of memories.
    :param target_len: The length of prediction block.
    :param permute: Whether to enable permutation.
    :param mask_index: The index of padding.
    :param dropout: General dropout rate.
    :param attention_dropout: Dropout rate inside attention layer.
    :param attention_type: 'uni' or 'bi'.
    :param clamp_len: The maximum value of relative position.
    :param shared_biases: Whether to use the same biases for all layers.
    :return: The built model.
    """
    if permute is None:
        permute = training
    token_input = keras.layers.Input(
        shape=(target_len,),
        name='Input-Token',
    )
    seg_input = keras.layers.Input(
        shape=(target_len,),
        name='Input-Segment',
    )
    memory_length_input = keras.layers.Input(
        shape=(1,),
        name='Input-Memory-Length',
    )
    inputs = [token_input, seg_input, memory_length_input]
    if training:
        query_input = keras.layers.Input(
            shape=(target_len,),
            name='Input-Mask',
        )
        inputs.append(query_input)
    else:
        query_input = None
    token_embed, embed_weights = EmbeddingRet(
        input_dim=num_token,
        output_dim=units,
        mask_zero=mask_index == 0,
        name='Embed-Token',
    )(token_input)
    if mask_index is not None and mask_index != 0:
        masking = CreateMask(
            mask_value=mask_index,
            name='Masking',
        )(token_input)
        token_embed = RestoreMask(name='Embed-Token-Masked')([token_embed, masking])
    if training:
        mask_embed = MaskEmbedding(
            units=units,
            name='Embed-Mask',
        )([token_embed, query_input])
    else:
        mask_embed = None
    if 0.0 < dropout < 1.0:
        token_embed = keras.layers.Dropout(
            rate=dropout,
            name='Embed-Token-Dropout',
        )(token_embed)
        if training:
            mask_embed = keras.layers.Dropout(
                rate=dropout,
                name='Embed-Mask-Dropout',
            )(mask_embed)
    memories = [Memory(
        batch_size=batch_size,
        memory_len=memory_len,
        target_len=target_len,
        output_dim=units,
        name='Memory-0',
    )([token_embed, memory_length_input])]
    pos_embed = PositionalEmbedding(
        output_dim=units,
        clamp_len=clamp_len,
        directional=attention_type == 'uni',
        name='Embed-Pos',
    )([token_embed, memories[0]])
    content_mask, query_mask = PermutationMask(
        enabled=permute,
        directional=attention_type == 'uni',
        name='Permutation',
    )([token_embed, memories[0]])
    context_bias, relative_bias, segment_bias = None, None, None
    if shared_biases:
        context_bias, relative_bias = RelativeBias(
            units,
            name='Relative-Bias',
        )(memories[0])
        segment_bias = SegmentBias(
            units,
            name='Segment-Bias',
        )(memories[0])
    content_output, query_output = token_embed, None
    if training:
        query_output = mask_embed
    for i in range(num_block):
        if not shared_biases:
            context_bias, relative_bias = RelativeBias(
                units,
                name='Relative-Bias-{}'.format(i + 1),
            )(memories[i])
            segment_bias = SegmentBias(
                units,
                name='Segment-Bias-{}'.format(i + 1),
            )(memories[i])
        segment_mat, segment_embed = RelativeSegmentEmbedding(
            units=units,
            name='Embed-Segment-{}'.format(i + 1),
        )([seg_input, memories[i]])
        attention = Attention(
            units=units,
            num_head=num_head,
            use_bias=False,
            attention_dropout=attention_dropout,
            name='Attention-{}'.format(i + 1),
        )
        if 0.0 < dropout < 1.0:
            attention_dropout_layer = keras.layers.Dropout(
                rate=dropout,
                name='Attention-Dropout-{}'.format(i + 1),
            )
        else:
            attention_dropout_layer = None
        attention_add = keras.layers.Add(name='Attention-Residual-{}'.format(i + 1))
        attention_layer_norm = LayerNormalization(name='Attention-Normal-{}'.format(i + 1))
        feed_forward = FeedForward(
            units=hidden_dim,
            dropout_rate=dropout,
            activation=gelu,
            name='FeedForward-{}'.format(i + 1),
        )
        if 0.0 < dropout < 1.0:
            feed_forward_dropout = keras.layers.Dropout(
                rate=dropout,
                name='FeedForward-Dropout-{}'.format(i + 1),
            )
        else:
            feed_forward_dropout = None
        feed_forward_add = keras.layers.Add(name='FeedForward-Residual-{}'.format(i + 1))
        feed_forward_layer_norm = LayerNormalization(name='FeedForward-Normal-{}'.format(i + 1))
        content = content_output

        def _build_output(query, mask):
            attention_input = query
            _output = attention([
                query, content, memories[i], segment_mat, segment_embed, pos_embed,
                context_bias, relative_bias, segment_bias, mask,
            ])
            if attention_dropout_layer is not None:
                _output = attention_dropout_layer(_output)
            _output = attention_add([attention_input, _output])
            _output = attention_layer_norm(_output)
            feed_forward_input = _output
            _output = feed_forward(_output)
            if feed_forward_dropout is not None:
                _output = feed_forward_dropout(_output)
            _output = feed_forward_add([feed_forward_input, _output])
            _output = feed_forward_layer_norm(_output)
            return _output

        content_output = _build_output(content_output, content_mask)
        if training:
            query_output = _build_output(query_output, query_mask)
        if i < num_block - 1:
            memories.append(Memory(
                batch_size=batch_size,
                memory_len=memory_len,
                target_len=target_len,
                output_dim=units,
                name='Memory-{}'.format(i + 1),
            )([content_output, memory_length_input]))
    if training:
        output = EmbeddingSim(name='Softmax')([query_output, embed_weights])
    else:
        output = content_output
    model = keras.models.Model(
        inputs=inputs,
        outputs=output,
    )
    return model
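# Usage sketch (illustrative): building XLNet for feature extraction (no permutation,
# no MLM head); all sizes below are assumptions, not values from the original sources.
model = build_xlnet(
    units=256,
    training=False,
    num_token=32000,
    num_block=6,
    num_head=8,
    hidden_dim=1024,
    batch_size=16,
    memory_len=0,
    target_len=64,
    attention_type=ATTENTION_TYPE_BI,
)
model.summary()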
def build_model(emb_cid, emb_advid, emb_aid):
    inp1 = layers.Input(shape=(max_len, ))
    inp2 = layers.Input(shape=(max_len, ))
    inp3 = layers.Input(shape=(max_len, ))
    emb1 = layers.Embedding(input_dim=emb_cid.shape[0],
                            output_dim=emb_cid.shape[1],
                            input_length=max_len,
                            weights=[emb_cid],
                            trainable=False)(inp1)
    emb2 = layers.Embedding(input_dim=emb_advid.shape[0],
                            output_dim=emb_advid.shape[1],
                            input_length=max_len,
                            weights=[emb_advid],
                            trainable=False)(inp2)
    emb3 = layers.Embedding(input_dim=emb_aid.shape[0],
                            output_dim=emb_aid.shape[1],
                            input_length=max_len,
                            weights=[emb_aid],
                            trainable=False)(inp3)
    sdrop = layers.SpatialDropout1D(rate=0.2)
    emb1 = sdrop(emb1)
    emb2 = sdrop(emb2)
    emb3 = sdrop(emb3)
    id_c = emb1
    id_adv_ad = layers.Concatenate()([emb2, emb3])
    mha1 = MultiHeadAttention(head_num=16)(id_adv_ad)
    mha1 = layers.Dropout(0.01)(mha1)
    mha1 = layers.Add()([id_adv_ad, mha1])
    mha1 = LayerNormalization()(mha1)
    mha1 = layers.Dropout(0.01)(mha1)
    mha1_ff = FeedForward(128)(mha1)
    mha1_out = layers.Add()([mha1, mha1_ff])
    mha1_out = LayerNormalization()(mha1_out)
    id_adv_ad_lstm = layers.Bidirectional(
        layers.LSTM(200, return_sequences=True))(mha1_out)
    id_adv_ad_max_pool = layers.GlobalMaxPool1D()(id_adv_ad_lstm)
    mha2 = MultiHeadAttention(head_num=16)(id_c)
    mha2 = layers.Dropout(0.01)(mha2)
    mha2 = layers.Add()([id_c, mha2])
    mha2 = LayerNormalization()(mha2)
    mha2 = layers.Dropout(0.01)(mha2)
    mha2_ff = FeedForward(128)(mha2)
    mha2_out = layers.Add()([mha2, mha2_ff])
    mha2_out = LayerNormalization()(mha2_out)
    id_c_lstm = layers.Bidirectional(
        layers.LSTM(200, return_sequences=True))(mha2_out)
    id_c_max_pool = layers.GlobalMaxPool1D()(id_c_lstm)
    x = layers.Add()([id_c_max_pool, id_adv_ad_max_pool])
    x = layers.Dense(256, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.15)(x)
    out = layers.Dense(10, activation='softmax')(x)
    model = keras.Model(inputs=[inp1, inp2, inp3], outputs=out)
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(1e-3),
                  metrics=['accuracy'])
    return model
def build_model(emb_cid, emb_advid):
    inp1 = layers.Input(shape=(max_len, ))
    inp2 = layers.Input(shape=(max_len, ))
    inp_stacking = layers.Input(shape=(stacking_shape, ))
    emb1 = layers.Embedding(input_dim=emb_cid.shape[0],
                            output_dim=emb_cid.shape[1],
                            input_length=max_len,
                            weights=[emb_cid],
                            trainable=False)(inp1)
    emb2 = layers.Embedding(input_dim=emb_advid.shape[0],
                            output_dim=emb_advid.shape[1],
                            input_length=max_len,
                            weights=[emb_advid],
                            trainable=False)(inp2)
    sdrop = layers.SpatialDropout1D(rate=0.1)
    emb1 = sdrop(emb1)
    emb2 = sdrop(emb2)
    content = layers.Concatenate()([emb1, emb2])
    mha1 = MultiHeadAttention(head_num=8)(content)
    # mha1 = layers.Dropout(0.01)(mha1)
    mha1 = layers.Add()([content, mha1])
    mha1 = LayerNormalization()(mha1)
    # mha1 = layers.Dropout(0.01)(mha1)
    mha1_ff = FeedForward(128)(mha1)
    mha1_out = layers.Add()([mha1, mha1_ff])
    mha1_out = LayerNormalization()(mha1_out)
    # mha2 = MultiHeadAttention(head_num=8)(mha1_out)
    # mha2 = layers.Dropout(0.01)(mha2)
    # mha2 = layers.Add()([mha1_out, mha2])
    # mha2 = LayerNormalization()(mha2)
    # mha2 = layers.Dropout(0.01)(mha2)
    # mha2_ff = FeedForward(128)(mha2)
    # mha2_out = layers.Add()([mha2, mha2_ff])
    # mha2_out = LayerNormalization()(mha2_out)
    # mha3 = MultiHeadAttention(head_num=8)(mha2_out)
    # mha3 = layers.Dropout(0.01)(mha3)
    # mha3 = layers.Add()([mha2_out, mha3])
    # mha3 = LayerNormalization()(mha3)
    # mha3 = layers.Dropout(0.01)(mha3)
    # mha3_ff = FeedForward(128)(mha3)
    # mha3_out = layers.Add()([mha3, mha3_ff])
    # mha3_out = LayerNormalization()(mha3_out)
    # avg_pool = layers.GlobalAveragePooling1D()(mha3_out)
    max_pool = layers.GlobalMaxPool1D()(mha1_out)
    x = layers.Concatenate()([max_pool, inp_stacking])
    x = layers.Dense(128, activation='relu')(x)
    # x = layers.BatchNormalization()(x)
    x = layers.Dense(64, activation='relu')(x)
    # x = layers.BatchNormalization()(x)
    x = layers.Dense(32, activation='relu')(x)
    # x = layers.BatchNormalization()(x)
    # x = layers.Dropout(0.1)(x)
    out = layers.Dense(10, activation='softmax')(x)
    model = keras.Model(inputs=[inp1, inp2, inp_stacking], outputs=out)
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(5e-4),
                  metrics=['accuracy'])
    return model
import keras
from keras_position_wise_feed_forward import FeedForward
import keras2onnx

input_layer = keras.layers.Input(shape=(None, 32))
feed_forward_layer = FeedForward(units=128)(input_layer)
model = keras.models.Model(inputs=input_layer, outputs=feed_forward_layer)
model.compile(optimizer='adam', loss='mse')
model.summary()

# keras.backend.set_learning_phase(0)
onnx_model = keras2onnx.convert_keras(model, 'feed_forward', debug_mode=1)
keras2onnx.save_model(onnx_model, 'foo.onnx')
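# Optional check (illustrative, assumes onnxruntime is installed): run the exported model
# and compare its output with the Keras prediction.
import numpy as np
import onnxruntime as ort

data = np.random.random((1, 10, 32)).astype(np.float32)
session = ort.InferenceSession('foo.onnx')
onnx_output = session.run(None, {session.get_inputs()[0].name: data})[0]
keras_output = model.predict(data)
print(np.allclose(onnx_output, keras_output, atol=1e-5))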