def __init__(self,
             num_hidden: int,
             prefix: str = 'lngru_',
             params: Optional[mx.rnn.RNNParams] = None,
             norm_scale: float = 1.0,
             norm_shift: float = 0.0) -> None:
    super(LayerNormGRUCell, self).__init__(num_hidden, prefix, params)
    self._iN = LayerNormalization(
        num_hidden=num_hidden * 3,
        prefix="%si2h" % self._prefix,
        scale=self.params.get('i2h_scale', shape=(num_hidden * 3,), init=mx.init.Constant(value=norm_scale)),
        shift=self.params.get('i2h_shift', shape=(num_hidden * 3,), init=mx.init.Constant(value=norm_shift)))
    self._hN = LayerNormalization(
        num_hidden=num_hidden * 3,
        prefix="%sh2h" % self._prefix,
        scale=self.params.get('h2h_scale', shape=(num_hidden * 3,), init=mx.init.Constant(value=norm_scale)),
        shift=self.params.get('h2h_shift', shape=(num_hidden * 3,), init=mx.init.Constant(value=norm_shift)))
    self._shape_fix = None
def encoder(self, inputs):
    if K.dtype(inputs) != 'int32':
        inputs = K.cast(inputs, 'int32')
    masks = K.equal(inputs, 0)
    # Embeddings
    embeddings = K.gather(self.embeddings, inputs)
    embeddings *= self._model_dim ** 0.5  # Scale
    # Position encodings
    position_encodings = PositionEncoding(self._model_dim)(embeddings)
    # Embeddings + position encodings
    encodings = embeddings + position_encodings
    # Dropout
    encodings = K.dropout(encodings, self._dropout_rate)

    for i in range(self._encoder_stack):
        # Multi-head attention
        attention = MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
        attention_input = [encodings, encodings, encodings, masks]
        attention_out = attention(attention_input)
        # Add & Norm
        attention_out += encodings
        attention_out = LayerNormalization()(attention_out)
        # Feed-forward
        ff = PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        ff_out = ff(attention_out)
        # Add & Norm
        ff_out += attention_out
        encodings = LayerNormalization()(ff_out)

    return encodings, masks
def decoder(self, inputs):
    decoder_inputs, encoder_encodings, encoder_masks = inputs
    if K.dtype(decoder_inputs) != 'int32':
        decoder_inputs = K.cast(decoder_inputs, 'int32')
    decoder_masks = K.equal(decoder_inputs, 0)
    # Embeddings
    embeddings = K.gather(self.embeddings, decoder_inputs)
    embeddings *= self._model_dim ** 0.5  # Scale
    # Position encodings
    position_encodings = PositionEncoding(self._model_dim)(embeddings)
    # Embeddings + position encodings
    encodings = embeddings + position_encodings
    # Dropout
    encodings = K.dropout(encodings, self._dropout_rate)

    for i in range(self._decoder_stack):
        # Masked multi-head self-attention
        masked_attention = MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads, future=True)
        masked_attention_input = [encodings, encodings, encodings, decoder_masks]
        masked_attention_out = masked_attention(masked_attention_input)
        # Add & Norm
        masked_attention_out += encodings
        masked_attention_out = LayerNormalization()(masked_attention_out)
        # Multi-head attention over the encoder outputs
        attention = MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
        attention_input = [masked_attention_out, encoder_encodings, encoder_encodings, encoder_masks]
        attention_out = attention(attention_input)
        # Add & Norm
        attention_out += masked_attention_out
        attention_out = LayerNormalization()(attention_out)
        # Feed-forward
        ff = PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        ff_out = ff(attention_out)
        # Add & Norm
        ff_out += attention_out
        encodings = LayerNormalization()(ff_out)

    # Pre-softmax projection shares its parameters with the embeddings
    linear_projection = K.dot(encodings, K.transpose(self.embeddings))
    outputs = K.softmax(linear_projection)
    return outputs
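# How encoder() and decoder() above are typically combined in the enclosing layer's call().
# This is a minimal sketch, not the original call() implementation; it assumes the layer
# receives a pair of (encoder_inputs, decoder_inputs).
def call(self, inputs):
    encoder_inputs, decoder_inputs = inputs
    encoder_encodings, encoder_masks = self.encoder(encoder_inputs)
    return self.decoder([decoder_inputs, encoder_encodings, encoder_masks])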
def dense_model(timesteps, n_class, n_features, classifier_architecture, dropout):
    inputs = Input((timesteps, n_features))
    x = Dense(128, activation=Mish())(inputs)
    x = LayerNormalization()(x)
    x, a = attention_simple(x, timesteps)
    for d, dr in zip(classifier_architecture, dropout):
        x = Dropout(dr)(x)
        x = Dense(d, activation=Mish())(x)
        x = LayerNormalization()(x)
    outputs = Dense(n_class, activation="softmax")(x)
    model = Model(inputs, outputs)
    return model
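# Example invocation of dense_model. The shapes and hyperparameters are illustrative only;
# Mish, attention_simple and LayerNormalization are assumed to be defined as in the snippet above.
model = dense_model(timesteps=100,
                    n_class=5,
                    n_features=12,
                    classifier_architecture=[64, 32],
                    dropout=[0.3, 0.2])
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()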
def __init__(self, h=8, d_model=512, d_ff=2048, p_dropout=0.1, max_len=128):
    super().__init__()
    self.attn = MultiHeadAttention(h, d_model)
    self.dropout1 = Dropout(p_dropout)
    self.norm1 = LayerNormalization()
    self.ff = FFN(d_model, d_ff)
    self.dropout2 = Dropout(p_dropout)
    self.norm2 = LayerNormalization()
def __init__(self, n_feat, n_hid, n_latent, adj, dropout):
    super(GAE, self).__init__()
    self.gc1 = MyGraphConvolution(n_feat, n_hid, adj)
    # self.gc1 = InnerProductGraphConvolution(n_feat, n_hid, adj)
    self.ln1 = LayerNormalization(n_hid)
    self.gc2_mu = MyGraphConvolution(n_hid, n_latent, adj)
    # self.gc2_mu = InnerProductGraphConvolution(n_hid, n_latent, adj)
    self.ln2 = LayerNormalization(n_latent)
    self.gc2_var = MyGraphConvolution(n_hid, n_latent, adj)
    self.dropout = dropout
    self.sigmoid = nn.Sigmoid()
    self.fudge = 1e-7
def __init__(self, d_hid, d_inner_hid, dropout=0.1):
    super(PositionwiseFeedForward, self).__init__()
    self.w_1 = nn.Conv1d(d_hid, d_inner_hid, 1)  # position-wise
    self.w_2 = nn.Conv1d(d_inner_hid, d_hid, 1)  # position-wise
    self.layer_norm = LayerNormalization(d_hid)
    self.dropout = nn.Dropout(dropout)
    self.relu = nn.ReLU()
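# A self-contained sketch of the forward pass that this constructor implies: two 1x1
# convolutions over the feature axis with a residual connection and layer normalization.
# torch.nn.LayerNorm stands in here for the custom LayerNormalization used above; this is
# an assumption for illustration, not the original module's forward().
import torch
import torch.nn as nn


class PositionwiseFeedForwardSketch(nn.Module):
    def __init__(self, d_hid, d_inner_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Conv1d(d_hid, d_inner_hid, 1)
        self.w_2 = nn.Conv1d(d_inner_hid, d_hid, 1)
        self.layer_norm = nn.LayerNorm(d_hid)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):                                  # x: (batch, seq_len, d_hid)
        residual = x
        output = self.relu(self.w_1(x.transpose(1, 2)))    # convolve over the feature axis
        output = self.w_2(output).transpose(1, 2)          # back to (batch, seq_len, d_hid)
        output = self.dropout(output)
        return self.layer_norm(output + residual)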
def __init__(self, d_model, seq_len, kernel_initializer='normal', **kwargs):
    super(XLnetLoss, self).__init__(**kwargs)
    self.supports_masking = True
    self.initializer = keras.initializers.get(kernel_initializer)
    self.max_seq_length = seq_len
    self.d_model = d_model
    self.dense = keras.layers.Dense(1, kernel_initializer=self.initializer)
    self.dense_0 = keras.layers.Dense(units=self.d_model,
                                      kernel_initializer=self.initializer,
                                      activation=keras.activations.tanh,
                                      name="dense_0")
    self.layer_norm = LayerNormalization()
    self.dense_1 = keras.layers.Dense(1,
                                      kernel_initializer=self.initializer,
                                      name="dense_1")
    self.dense_0_1 = keras.layers.Dense(self.d_model,
                                        activation=keras.activations.tanh,
                                        kernel_initializer=self.initializer,
                                        name="dense_0")
    self.dense_1_1 = keras.layers.Dense(1,
                                        kernel_initializer=self.initializer,
                                        name="dense_1",
                                        use_bias=False)
def build(self, input_shape):
    self.embeddings = self.add_weight(shape=(self._vocab_size, self._model_dim),
                                      initializer='glorot_uniform',
                                      trainable=True,
                                      name="embeddings")
    self.EncoderPositionEncoding = PositionEncoding(self._model_dim)
    self.EncoderMultiHeadAttetions = [
        MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
        for _ in range(self._encoder_stack)
    ]
    self.EncoderLayerNorms0 = [
        LayerNormalization() for _ in range(self._encoder_stack)
    ]
    self.EncoderPositionWiseFeedForwards = [
        PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        for _ in range(self._encoder_stack)
    ]
    self.EncoderLayerNorms1 = [
        LayerNormalization() for _ in range(self._encoder_stack)
    ]
    self.DecoderPositionEncoding = PositionEncoding(self._model_dim)
    self.DecoderMultiHeadAttetions0 = [
        MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads, future=True)
        for _ in range(self._decoder_stack)
    ]
    self.DecoderLayerNorms0 = [
        LayerNormalization() for _ in range(self._decoder_stack)
    ]
    self.DecoderMultiHeadAttetions1 = [
        MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
        for _ in range(self._decoder_stack)
    ]
    self.DecoderLayerNorms1 = [
        LayerNormalization() for _ in range(self._decoder_stack)
    ]
    self.DecoderPositionWiseFeedForwards = [
        PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        for _ in range(self._decoder_stack)
    ]
    self.DecoderLayerNorms2 = [
        LayerNormalization() for _ in range(self._decoder_stack)
    ]
    super(Transformer, self).build(input_shape)
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super(MultiHeadAttention, self).__init__()
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v
    self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
    self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
    self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v))
    self.attention = ScaledDotProductAttention(d_model)
    self.layer_norm = LayerNormalization(d_model)
    self.proj = nn.Linear(n_head * d_v, d_model)
    self.dropout = nn.Dropout(dropout)
    init.xavier_normal(self.w_qs)
    init.xavier_normal(self.w_ks)
    init.xavier_normal(self.w_vs)
def __init__(self,
             num_hidden: int,
             prefix: str = 'lnggru_',
             params: Optional[mx.rnn.RNNParams] = None,
             norm_scale: float = 1.0,
             norm_shift: float = 0.0) -> None:
    super(LayerNormPerGateGRUCell, self).__init__(num_hidden, prefix, params)
    self._norm_layers = list()  # type: List[LayerNormalization]
    for name in ['r', 'z', 'o']:
        scale = self.params.get('%s_shift' % name, shape=(num_hidden,), init=mx.init.Constant(value=norm_shift))
        shift = self.params.get('%s_scale' % name, shape=(num_hidden,), init=mx.init.Constant(value=norm_scale))
        self._norm_layers.append(
            LayerNormalization(num_hidden,
                               prefix="%s%s" % (self._prefix, name),
                               scale=scale,
                               shift=shift))
def __init__(self,
             num_hidden: int,
             prefix: str = 'lnglstm_',
             params: Optional[mx.rnn.RNNParams] = None,
             forget_bias: float = 1.0,
             norm_scale: float = 1.0,
             norm_shift: float = 0.0) -> None:
    super(LayerNormPerGateLSTMCell, self).__init__(num_hidden, prefix, params, forget_bias)
    self._norm_layers = list()  # type: List[LayerNormalization]
    for name in ['i', 'f', 'c', 'o', 's']:
        scale = self.params.get('%s_shift' % name, shape=(num_hidden,), init=mx.init.Constant(value=norm_shift))
        shift = self.params.get('%s_scale' % name, shape=(num_hidden,),
                                init=mx.init.Constant(value=norm_scale if name != "f" else forget_bias))
        self._norm_layers.append(
            LayerNormalization(num_hidden,
                               prefix="%s%s" % (self._prefix, name),
                               scale=scale,
                               shift=shift))
def Smi2Smi():
    # product
    l_in = layers.Input(shape=(None,))
    l_mask = layers.Input(shape=(None,))

    # reagents
    l_dec = layers.Input(shape=(None,))
    l_dmask = layers.Input(shape=(None,))

    # positional encodings for product and reagents, respectively
    l_pos = PositionLayer(EMBEDDING_SIZE)(l_mask)
    l_dpos = PositionLayer(EMBEDDING_SIZE)(l_dmask)

    l_emask = MaskLayerRight()([l_dmask, l_mask])
    l_right_mask = MaskLayerTriangular()(l_dmask)
    l_left_mask = MaskLayerLeft()(l_mask)

    # encoder
    l_voc = layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_SIZE, input_length=None)
    l_embed = layers.Add()([l_voc(l_in), l_pos])
    l_embed = layers.Dropout(rate=0.1)(l_embed)

    for layer in range(n_block):
        # self-attention
        l_o = [
            SelfLayer(EMBEDDING_SIZE, KEY_SIZE)([l_embed, l_embed, l_embed, l_left_mask])
            for i in range(n_self)
        ]
        l_con = layers.Concatenate()(l_o)
        l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE))(l_con)
        l_drop = layers.Dropout(rate=0.1)(l_dense)
        l_add = layers.Add()([l_drop, l_embed])
        l_att = LayerNormalization()(l_add)

        # position-wise
        l_c1 = layers.Conv1D(N_HIDDEN, 1, activation='relu')(l_att)
        l_c2 = layers.Conv1D(EMBEDDING_SIZE, 1)(l_c1)
        l_drop = layers.Dropout(rate=0.1)(l_c2)
        l_ff = layers.Add()([l_att, l_drop])
        l_embed = LayerNormalization()(l_ff)

    # bottleneck
    l_encoder = l_embed

    l_embed = layers.Add()([l_voc(l_dec), l_dpos])
    l_embed = layers.Dropout(rate=0.1)(l_embed)

    for layer in range(n_block):
        # self-attention
        l_o = [
            SelfLayer(EMBEDDING_SIZE, KEY_SIZE)([l_embed, l_embed, l_embed, l_right_mask])
            for i in range(n_self)
        ]
        l_con = layers.Concatenate()(l_o)
        l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE))(l_con)
        l_drop = layers.Dropout(rate=0.1)(l_dense)
        l_add = layers.Add()([l_drop, l_embed])
        l_att = LayerNormalization()(l_add)

        # attention to the encoder
        l_o = [
            SelfLayer(EMBEDDING_SIZE, KEY_SIZE)([l_att, l_encoder, l_encoder, l_emask])
            for i in range(n_self)
        ]
        l_con = layers.Concatenate()(l_o)
        l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE))(l_con)
        l_drop = layers.Dropout(rate=0.1)(l_dense)
        l_add = layers.Add()([l_drop, l_att])
        l_att = LayerNormalization()(l_add)

        # position-wise
        l_c1 = layers.Conv1D(N_HIDDEN, 1, activation='relu')(l_att)
        l_c2 = layers.Conv1D(EMBEDDING_SIZE, 1)(l_c1)
        l_drop = layers.Dropout(rate=0.1)(l_c2)
        l_ff = layers.Add()([l_att, l_drop])
        l_embed = LayerNormalization()(l_ff)

    l_out = layers.TimeDistributed(layers.Dense(vocab_size, use_bias=False))(l_embed)

    mdl = tf.keras.Model([l_in, l_mask, l_dec, l_dmask], l_out)

    def masked_loss(y_true, y_pred):
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_true, logits=y_pred)
        mask = tf.cast(tf.not_equal(tf.reduce_sum(y_true, -1), 0), 'float32')
        loss = tf.reduce_sum(loss * mask, -1) / tf.reduce_sum(mask, -1)
        loss = K.mean(loss)
        return loss

    def masked_acc(y_true, y_pred):
        mask = tf.cast(tf.not_equal(tf.reduce_sum(y_true, -1), 0), 'float32')
        eq = K.cast(K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), 'float32')
        eq = tf.reduce_sum(eq * mask, -1) / tf.reduce_sum(mask, -1)
        eq = K.mean(eq)
        return eq

    mdl.compile(optimizer='adam', loss=masked_loss, metrics=['accuracy', masked_acc])

    mdl_enc = tf.keras.Model([l_in, l_mask], l_encoder)
    mdl_enc.compile(optimizer="adam", loss="categorical_crossentropy")

    # mdl.summary()
    return mdl, mdl_enc
def buildNetwork():
    unfreeze = False

    l_in = layers.Input(shape=(None,))
    l_mask = layers.Input(shape=(None,))

    l_ymask = []
    for i in range(len(props)):
        l_ymask.append(layers.Input(shape=(1,)))

    # transformer part
    # positional encodings for product and reagents, respectively
    l_pos = PositionLayer(EMBEDDING_SIZE)(l_mask)
    l_left_mask = MaskLayerLeft()(l_mask)

    # encoder
    l_voc = layers.Embedding(input_dim=vocab_size,
                             output_dim=EMBEDDING_SIZE,
                             input_length=None,
                             trainable=unfreeze)
    l_embed = layers.Add()([l_voc(l_in), l_pos])

    for layer in range(n_block):
        # self-attention
        l_o = [
            SelfLayer(EMBEDDING_SIZE, KEY_SIZE, trainable=unfreeze)(
                [l_embed, l_embed, l_embed, l_left_mask]) for i in range(n_self)
        ]
        l_con = layers.Concatenate()(l_o)
        l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE, trainable=unfreeze),
                                         trainable=unfreeze)(l_con)
        if unfreeze == True:
            l_dense = layers.Dropout(rate=0.1)(l_dense)
        l_add = layers.Add()([l_dense, l_embed])
        l_att = LayerNormalization(trainable=unfreeze)(l_add)

        # position-wise
        l_c1 = layers.Conv1D(N_HIDDEN, 1, activation='relu', trainable=unfreeze)(l_att)
        l_c2 = layers.Conv1D(EMBEDDING_SIZE, 1, trainable=unfreeze)(l_c1)
        if unfreeze == True:
            l_c2 = layers.Dropout(rate=0.1)(l_c2)
        l_ff = layers.Add()([l_att, l_c2])
        l_embed = LayerNormalization(trainable=unfreeze)(l_ff)

    # end of Transformer's part
    l_encoder = l_embed

    # text-cnn part
    # https://github.com/deepchem/deepchem/blob/b7a6d3d759145d238eb8abaf76183e9dbd7b683c/deepchem/models/tensorgraph/models/text_cnn.py
    l_in2 = layers.Input(shape=(None, EMBEDDING_SIZE))

    kernel_sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20]
    num_filters = [100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160, 160]

    l_pool = []
    for i in range(len(kernel_sizes)):
        l_conv = layers.Conv1D(num_filters[i],
                               kernel_size=kernel_sizes[i],
                               padding='valid',
                               kernel_initializer='normal',
                               activation='relu')(l_in2)
        l_maxpool = layers.Lambda(lambda x: tf.reduce_max(x, axis=1))(l_conv)
        l_pool.append(l_maxpool)

    l_cnn = layers.Concatenate(axis=1)(l_pool)
    l_cnn_drop = layers.Dropout(rate=0.25)(l_cnn)

    # dense part
    l_dense = layers.Dense(N_HIDDEN_CNN, activation='relu')(l_cnn_drop)

    # https://github.com/ParikhKadam/Highway-Layer-Keras
    transform_gate = layers.Dense(units=N_HIDDEN_CNN,
                                  activation="sigmoid",
                                  bias_initializer=tf.keras.initializers.Constant(-1))(l_dense)
    carry_gate = layers.Lambda(lambda x: 1.0 - x, output_shape=(N_HIDDEN_CNN,))(transform_gate)
    transformed_data = layers.Dense(units=N_HIDDEN_CNN, activation="relu")(l_dense)
    transformed_gated = layers.Multiply()([transform_gate, transformed_data])
    identity_gated = layers.Multiply()([carry_gate, l_dense])
    l_highway = layers.Add()([transformed_gated, identity_gated])

    # Because of the multitask setting there are several outputs and a custom loss per property.
    def mse_loss(prop):
        def loss(y_true, y_pred):
            y2 = y_true * l_ymask[prop] + y_pred * (1 - l_ymask[prop])
            return tf.keras.losses.mse(y2, y_pred)
        return loss

    def binary_loss(prop):
        def loss(y_true, y_pred):
            y_pred = tf.clip_by_value(y_pred, K.epsilon(), 1.0 - K.epsilon())
            r = y_true * K.log(y_pred) + (1.0 - y_true) * K.log(1.0 - y_pred)
            r = -tf.reduce_mean(r * l_ymask[prop])
            return r
        return loss

    l_out = []
    losses = []
    for prop in props:
        if props[prop][2] == "regression":
            l_out.append(layers.Dense(1, activation='linear',
                                      name="Regression-" + props[prop][1])(l_highway))
            losses.append(mse_loss(prop))
        else:
            l_out.append(layers.Dense(1, activation='sigmoid',
                                      name="Classification-" + props[prop][1])(l_highway))
            losses.append(binary_loss(prop))

    l_input = [l_in2]
    l_input.extend(l_ymask)

    mdl = tf.keras.Model(l_input, l_out)
    mdl.compile(optimizer='adam', loss=losses)
    # mdl.summary()

    K.set_value(mdl.optimizer.lr, 1.0e-4)

    # So far we do not train the encoder part of the model.
    encoder = tf.keras.Model([l_in, l_mask], l_encoder)
    encoder.compile(optimizer='adam', loss='mse')
    encoder.set_weights(np.load("embeddings.npy", allow_pickle=True))
    # encoder.summary()

    return mdl, encoder
def get_age_model(DATA):
    feed_forward_size = 2048
    max_seq_len = 150
    model_dim = 256 + 256 + 64 + 32 + 8 + 16

    input_creative_id = Input(shape=(max_seq_len,), name='creative_id')
    x1 = Embedding(input_dim=NUM_creative_id + 1,
                   output_dim=256,
                   weights=[DATA['creative_id_emb']],
                   trainable=args.not_train_embedding,
                   # trainable=False,
                   input_length=150,
                   mask_zero=True)(input_creative_id)
    # encodings = PositionEncoding(model_dim)(x1)
    # encodings = Add()([embeddings, encodings])

    input_ad_id = Input(shape=(max_seq_len,), name='ad_id')
    x2 = Embedding(input_dim=NUM_ad_id + 1,
                   output_dim=256,
                   weights=[DATA['ad_id_emb']],
                   trainable=args.not_train_embedding,
                   # trainable=False,
                   input_length=150,
                   mask_zero=True)(input_ad_id)

    input_product_id = Input(shape=(max_seq_len,), name='product_id')
    x3 = Embedding(input_dim=NUM_product_id + 1,
                   output_dim=32,
                   weights=[DATA['product_id_emb']],
                   trainable=args.not_train_embedding,
                   # trainable=False,
                   input_length=150,
                   mask_zero=True)(input_product_id)

    input_advertiser_id = Input(shape=(max_seq_len,), name='advertiser_id')
    x4 = Embedding(input_dim=NUM_advertiser_id + 1,
                   output_dim=64,
                   weights=[DATA['advertiser_id_emb']],
                   trainable=args.not_train_embedding,
                   # trainable=False,
                   input_length=150,
                   mask_zero=True)(input_advertiser_id)

    input_industry = Input(shape=(max_seq_len,), name='industry')
    x5 = Embedding(input_dim=NUM_industry + 1,
                   output_dim=16,
                   weights=[DATA['industry_emb']],
                   trainable=True,
                   # trainable=False,
                   input_length=150,
                   mask_zero=True)(input_industry)

    input_product_category = Input(shape=(max_seq_len,), name='product_category')
    x6 = Embedding(input_dim=NUM_product_category + 1,
                   output_dim=8,
                   weights=[DATA['product_category_emb']],
                   trainable=True,
                   # trainable=False,
                   input_length=150,
                   mask_zero=True)(input_product_category)

    # (bs, 150, 632): concatenation of the six embeddings, 632 = model_dim
    encodings = layers.Concatenate(axis=2)([x1, x2, x3, x4, x5, x6])
    # (bs, 150)
    masks = tf.equal(input_creative_id, 0)

    # (bs, 150, 632); 8 heads * 79 dims per head = 632
    attention_out = MultiHeadAttention(8, 79)([encodings, encodings, encodings, masks])

    # Add & Norm
    attention_out += encodings
    attention_out = LayerNormalization()(attention_out)

    # Feed-forward
    ff = PositionWiseFeedForward(model_dim, feed_forward_size)
    ff_out = ff(attention_out)

    # Add & Norm: ff_out and attention_out are both (bs, 150, model_dim)
    ff_out += attention_out
    encodings = LayerNormalization()(ff_out)
    encodings = GlobalMaxPooling1D()(encodings)
    encodings = Dropout(0.2)(encodings)

    # output_gender = Dense(2, activation='softmax', name='gender')(encodings)
    output_age = Dense(10, activation='softmax', name='age')(encodings)

    model = Model(inputs=[
        input_creative_id, input_ad_id, input_product_id,
        input_advertiser_id, input_industry, input_product_category
    ], outputs=[output_age])

    model.compile(optimizer=optimizers.Adam(2.5e-4),
                  loss={
                      # 'gender': losses.CategoricalCrossentropy(from_logits=False),
                      'age': losses.CategoricalCrossentropy(from_logits=False)
                  },
                  # loss_weights=[0.4, 0.6],
                  metrics=['accuracy'])
    return model
def build_xlnet_for_tf_estimator(inputs,
                                 num_token,
                                 num_layer,
                                 num_head,
                                 embedding_dim,
                                 attention_head_dim,
                                 feed_forward_dim,
                                 target_len,
                                 is_training,
                                 memory_len=None,
                                 dropout=0.0,
                                 attention_dropout=0.0,
                                 attention_type=None,
                                 shared_biases=True):
    input_ids, input_mask, segment_ids, cls_index, \
        p_mask, start_positions, end_positions, is_impossible = inputs

    attn_mask = get_attn_mask(input_mask)

    input_ids_trans = keras.layers.Lambda(lambda x: K.transpose(x))(input_ids)
    token_embed = keras.layers.Embedding(input_dim=num_token,
                                         output_dim=embedding_dim,
                                         name='Embed-Token')(input_ids_trans)
    token_embed_dropout = keras.layers.Dropout(rate=dropout,
                                               name='Embed-Token-Dropout')(token_embed,
                                                                           training=is_training)
    pos_emb = get_pos_emb([input_ids_trans, token_embed])
    pos_emb = keras.layers.Dropout(rate=dropout)(pos_emb, training=is_training)

    initializer = keras.initializers.get('normal')
    initializer.__setattr__("stddev", 0.02)

    segment_ids_trans = keras.layers.Lambda(lambda x: K.transpose(x))(segment_ids)
    segment_mat, segment_embed = RelativeSegmentEmbedding(
        num_layer=num_layer,
        num_head=num_head,
        attention_dim=attention_head_dim,
        initializer=initializer,
        name='Embed-Segment',
    )(segment_ids_trans)

    r_w_bias, r_r_bias, r_s_bias = RelativeBias(
        num_layer=num_layer,
        num_head=num_head,
        attention_head_dim=attention_head_dim,
        bias_initializer=initializer,
        name='Relative-Bias',
    )(input_ids_trans)

    content_output = token_embed_dropout

    if FLAGS.short_cut_fake:
        attn_mask = tf.constant(1.0, shape=[512, 512, 1, 1], dtype=np.float32)
        segment_mat = tf.constant(1.0, shape=[512, 512, 1, 2], dtype=np.float32)
        pos_emb = tf.constant(1.0, shape=[1024, 1, 1024], dtype=np.float32)

    if FLAGS.short_cut_fuse:
        attn_mask_flat = tf.reshape(attn_mask, [-1])
        segment_mat_flat = tf.reshape(segment_mat, [-1])
        segment_embed_flat = tf.reshape(segment_embed, [-1])
        pos_emb_flat = tf.reshape(pos_emb, [-1])
        r_w_bias_flat = tf.reshape(r_w_bias, [-1])
        r_r_bias_flat = tf.reshape(r_r_bias, [-1])
        r_s_bias_flat = tf.reshape(r_s_bias, [-1])
        fused = tf.concat([attn_mask_flat, segment_mat_flat, segment_embed_flat,
                           pos_emb_flat, r_w_bias_flat, r_r_bias_flat, r_s_bias_flat], 0)

    for i in range(num_layer):
        attention = RelativeMultiHeadAttention(
            num_head=num_head,
            attention_head_dim=attention_head_dim,
            embedding_dim=embedding_dim,
            dropout=dropout,
            dropatt=attention_dropout,
            is_training=is_training,
            initializer=initializer,
            name='Attention-{}'.format(i + 1),
        )
        attention_add = tf.keras.layers.Add(name='Attention-Residual-{}'.format(i + 1))
        attention_layer_norm = LayerNormalization(name='Attention-Normal-{}'.format(i + 1))
        feed_forward = FeedForward(feed_forward_dim=feed_forward_dim,
                                   embedding_dim=embedding_dim,
                                   dropout_rate=dropout,
                                   kernel_initializer=initializer,
                                   activation=gelu,
                                   name='FeedForward-{}'.format(i + 1))
        feed_forward_add = tf.keras.layers.Add(name='FeedForward-Residual-{}'.format(i + 1))
        feed_forward_layer_norm = LayerNormalization(name='FeedForward-Normal-{}'.format(i + 1))

        segment_embed_i = keras.layers.Lambda(lambda x: x[i])(segment_embed)
        r_w_bias_i = keras.layers.Lambda(lambda x: x[i])(r_w_bias)
        r_r_bias_i = keras.layers.Lambda(lambda x: x[i])(r_r_bias)
        r_s_bias_i = keras.layers.Lambda(lambda x: x[i])(r_s_bias)

        if FLAGS.short_cut_fuse:
            attn_mask_flat, segment_mat_flat, segment_embed_flat, \
                pos_emb_flat, r_w_bias_flat, r_r_bias_flat, r_s_bias_flat = \
                tf.split(fused, [512 * 512 * 1, 512 * 512 * 2, 24 * 2 * 1024,
                                 1024 * 1024, 24 * 1024, 24 * 1024, 24 * 1024], 0)
            attn_mask = tf.reshape(attn_mask_flat, [512, 512, 1, 1])
            segment_mat = tf.reshape(segment_mat_flat, [512, 512, 1, 2])
            segment_embed = tf.reshape(segment_embed_flat, [24, 2, 16, 64])
            pos_emb = tf.reshape(pos_emb_flat, [1024, 1, 1024])
            r_w_bias = tf.reshape(r_w_bias_flat, [24, 16, 64])
            r_r_bias = tf.reshape(r_r_bias_flat, [24, 16, 64])
            r_s_bias = tf.reshape(r_s_bias_flat, [24, 16, 64])
            print(attn_mask, segment_mat, segment_embed, pos_emb, r_w_bias, r_r_bias, r_s_bias)

        def _build_output(query):
            attention_input = query
            _output = attention([
                query, pos_emb, segment_embed_i, segment_mat,
                r_w_bias_i, r_r_bias_i, r_s_bias_i, attn_mask
            ])
            _output = attention_add([attention_input, _output])
            _output = attention_layer_norm(_output)
            feed_forward_input = keras.layers.Lambda(lambda x: K.identity(x))(_output)
            _output = feed_forward(_output, training=is_training)
            _output = feed_forward_add([feed_forward_input, _output])
            _output = feed_forward_layer_norm(_output)
            return _output

        content_output = _build_output(content_output)

    output = keras.layers.Dropout(rate=dropout)(content_output, training=is_training)

    xlnet_loss = XLnetLoss(d_model=embedding_dim,
                           seq_len=target_len,
                           kernel_initializer=initializer,
                           name="XLNET_LOSS")([
                               cls_index, start_positions, end_positions,
                               is_impossible, p_mask, output
                           ])
    return xlnet_loss
def buildNetwork(nettype):
    unfreeze = False
    n_block, n_self = 3, 10

    l_in = layers.Input(shape=(None,))
    l_mask = layers.Input(shape=(None,))

    # transformer part
    # positional encodings for product and reagents, respectively
    l_pos = PositionLayer(EMBEDDING_SIZE)(l_mask)
    l_left_mask = MaskLayerLeft()(l_mask)

    # encoder
    l_voc = layers.Embedding(input_dim=vocab_size,
                             output_dim=EMBEDDING_SIZE,
                             input_length=None,
                             trainable=unfreeze)
    l_embed = layers.Add()([l_voc(l_in), l_pos])

    for layer in range(n_block):
        # self-attention
        l_o = [
            SelfLayer(EMBEDDING_SIZE, KEY_SIZE, trainable=unfreeze)(
                [l_embed, l_embed, l_embed, l_left_mask]) for i in range(n_self)
        ]
        l_con = layers.Concatenate()(l_o)
        l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE, trainable=unfreeze),
                                         trainable=unfreeze)(l_con)
        if unfreeze == True:
            l_dense = layers.Dropout(rate=0.1)(l_dense)
        l_add = layers.Add()([l_dense, l_embed])
        l_att = LayerNormalization(trainable=unfreeze)(l_add)

        # position-wise
        l_c1 = layers.Conv1D(N_HIDDEN, 1, activation='relu', trainable=unfreeze)(l_att)
        l_c2 = layers.Conv1D(EMBEDDING_SIZE, 1, trainable=unfreeze)(l_c1)
        if unfreeze == True:
            l_c2 = layers.Dropout(rate=0.1)(l_c2)
        l_ff = layers.Add()([l_att, l_c2])
        l_embed = LayerNormalization(trainable=unfreeze)(l_ff)

    # end of Transformer's part
    l_encoder = l_embed

    # text-cnn part
    # https://github.com/deepchem/deepchem/blob/b7a6d3d759145d238eb8abaf76183e9dbd7b683c/deepchem/models/tensorgraph/models/text_cnn.py
    l_in2 = layers.Input(shape=(None, EMBEDDING_SIZE))

    kernel_sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20]
    num_filters = [100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160, 160]

    l_pool = []
    for i in range(len(kernel_sizes)):
        l_conv = layers.Conv1D(num_filters[i],
                               kernel_size=kernel_sizes[i],
                               padding='valid',
                               kernel_initializer='normal',
                               activation='relu')(l_in2)
        l_maxpool = layers.Lambda(lambda x: tf.reduce_max(x, axis=1))(l_conv)
        l_pool.append(l_maxpool)

    l_cnn = layers.Concatenate(axis=1)(l_pool)
    l_cnn_drop = layers.Dropout(rate=0.25)(l_cnn)

    # dense part
    l_dense = layers.Dense(N_HIDDEN_CNN, activation='relu')(l_cnn_drop)

    # https://github.com/ParikhKadam/Highway-Layer-Keras
    transform_gate = layers.Dense(units=N_HIDDEN_CNN,
                                  activation="sigmoid",
                                  bias_initializer=tf.keras.initializers.Constant(-1))(l_dense)
    carry_gate = layers.Lambda(lambda x: 1.0 - x, output_shape=(N_HIDDEN_CNN,))(transform_gate)
    transformed_data = layers.Dense(units=N_HIDDEN_CNN, activation="relu")(l_dense)
    transformed_gated = layers.Multiply()([transform_gate, transformed_data])
    identity_gated = layers.Multiply()([carry_gate, l_dense])
    l_highway = layers.Add()([transformed_gated, identity_gated])

    if nettype == "regression":
        l_out = layers.Dense(1, activation='linear', name="Regression")(l_highway)
        mdl = tf.keras.Model([l_in2], l_out)
        mdl.compile(optimizer='adam', loss='mse', metrics=['mse'])
    else:
        l_out = layers.Dense(2, activation='softmax', name="Classification")(l_highway)
        mdl = tf.keras.Model([l_in2], l_out)
        mdl.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    K.set_value(mdl.optimizer.lr, 1.0e-4)

    encoder = tf.keras.Model([l_in, l_mask], l_encoder)
    encoder.compile(optimizer='adam', loss='mse')
    encoder.set_weights(np.load("embeddings.npy", allow_pickle=True))

    return mdl, encoder
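# Hypothetical two-stage use of the models returned above: the frozen transformer encoder
# maps tokenised SMILES plus a padding mask to per-token embeddings, which are then fed to
# the CNN head. The dummy arrays below are placeholders for illustration; real inputs come
# from the project's tokeniser.
mdl, encoder = buildNetwork("regression")
x_tokens = np.zeros((8, 120))                      # (batch, seq_len) integer token ids
x_mask = np.ones((8, 120))                         # (batch, seq_len) 0/1 padding mask
features = encoder.predict([x_tokens, x_mask])     # (batch, seq_len, EMBEDDING_SIZE)
predictions = mdl.predict([features])              # (batch, 1) regression output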
def buildNetwork(n_block, n_self):
    print("Building network ...")

    # product
    l_in = layers.Input(shape=(None,))
    l_mask = layers.Input(shape=(None,))

    # reagents
    l_dec = layers.Input(shape=(None,))
    l_dmask = layers.Input(shape=(None,))

    # positional encodings for product and reagents, respectively
    l_pos = PositionLayer(EMBEDDING_SIZE)(l_mask)
    l_dpos = PositionLayer(EMBEDDING_SIZE)(l_dmask)

    l_emask = MaskLayerRight()([l_dmask, l_mask])
    l_right_mask = MaskLayerTriangular()(l_dmask)
    l_left_mask = MaskLayerLeft()(l_mask)

    # encoder
    l_voc = layers.Embedding(input_dim=vocab_size,
                             output_dim=EMBEDDING_SIZE,
                             input_length=None)
    l_embed = layers.Add()([l_voc(l_in), l_pos])
    l_embed = layers.Dropout(rate=0.1)(l_embed)

    for layer in range(n_block):
        # self-attention
        l_o = [
            SelfLayer(EMBEDDING_SIZE, KEY_SIZE)([l_embed, l_embed, l_embed, l_left_mask])
            for _ in range(n_self)
        ]
        l_con = layers.Concatenate()(l_o)
        l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE))(l_con)
        l_drop = layers.Dropout(rate=0.1)(l_dense)
        l_add = layers.Add()([l_drop, l_embed])
        l_att = LayerNormalization()(l_add)

        # position-wise
        l_c1 = layers.Conv1D(N_HIDDEN, 1, activation='relu')(l_att)
        l_c2 = layers.Conv1D(EMBEDDING_SIZE, 1)(l_c1)
        l_drop = layers.Dropout(rate=0.1)(l_c2)
        l_ff = layers.Add()([l_att, l_drop])
        l_embed = LayerNormalization()(l_ff)

    # bottleneck
    l_encoder = l_embed

    l_embed = layers.Add()([l_voc(l_dec), l_dpos])
    l_embed = layers.Dropout(rate=0.1)(l_embed)

    for layer in range(n_block):
        # self-attention
        l_o = [
            SelfLayer(EMBEDDING_SIZE, KEY_SIZE)([l_embed, l_embed, l_embed, l_right_mask])
            for i in range(n_self)
        ]
        l_con = layers.Concatenate()(l_o)
        l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE))(l_con)
        l_drop = layers.Dropout(rate=0.1)(l_dense)
        l_add = layers.Add()([l_drop, l_embed])
        l_att = LayerNormalization()(l_add)

        # attention to the encoder
        l_o = [
            SelfLayer(EMBEDDING_SIZE, KEY_SIZE)([l_att, l_encoder, l_encoder, l_emask])
            for i in range(n_self)
        ]
        l_con = layers.Concatenate()(l_o)
        l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE))(l_con)
        l_drop = layers.Dropout(rate=0.1)(l_dense)
        l_add = layers.Add()([l_drop, l_att])
        l_att = LayerNormalization()(l_add)

        # position-wise
        l_c1 = layers.Conv1D(N_HIDDEN, 1, activation='relu')(l_att)
        l_c2 = layers.Conv1D(EMBEDDING_SIZE, 1)(l_c1)
        l_drop = layers.Dropout(rate=0.1)(l_c2)
        l_ff = layers.Add()([l_att, l_drop])
        l_embed = LayerNormalization()(l_ff)

    l_out = layers.TimeDistributed(layers.Dense(vocab_size, use_bias=False))(l_embed)

    mdl = tf.keras.Model([l_in, l_mask, l_dec, l_dmask], l_out)

    def masked_loss(y_true, y_pred):
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_true, logits=y_pred)
        mask = tf.cast(tf.not_equal(tf.reduce_sum(y_true, -1), 0), 'float32')
        loss = tf.reduce_sum(loss * mask, -1) / tf.reduce_sum(mask, -1)
        loss = K.mean(loss)
        return loss

    def masked_acc(y_true, y_pred):
        mask = tf.cast(tf.not_equal(tf.reduce_sum(y_true, -1), 0), 'float32')
        eq = K.cast(K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), 'float32')
        eq = tf.reduce_sum(eq * mask, -1) / tf.reduce_sum(mask, -1)
        eq = K.mean(eq)
        return eq

    mdl.compile(optimizer='adam', loss=masked_loss, metrics=['accuracy', masked_acc])
    mdl.summary()

    # Divide the graph for faster execution. First, we calculate the encoder's values.
    # Then we use the encoder's values and the product mask as additional decoder inputs.
    def mdl_encoder(product):
        v = gen_left([product])
        enc = l_encoder.eval(feed_dict={l_in: v[0], l_mask: v[1], l_pos: v[2]})
        return enc, v[1]

    # And the decoder.
    def mdl_decoder(res, product_encoded, product_mask, t=1.0):
        v = gen_right([res])
        d = l_out.eval(feed_dict={
            l_encoder: product_encoded,
            l_dec: v[0],
            l_dmask: v[1],
            l_mask: product_mask,
            l_dpos: v[2]
        })
        prob = d[0, len(res), :] / t
        prob = np.exp(prob) / np.sum(np.exp(prob))
        return prob

    return mdl, mdl_encoder, mdl_decoder
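# A hypothetical greedy-decoding loop built on the two closures returned above. The
# vocabulary table `chars` (index -> character) and `end_token_id` are illustrative
# assumptions, not part of the original code; only the closure signatures are taken from it.
def greedy_decode(mdl_encoder, mdl_decoder, product, chars, end_token_id, max_len=100):
    enc, product_mask = mdl_encoder(product)
    res = ""
    for _ in range(max_len):
        prob = mdl_decoder(res, enc, product_mask, t=1.0)
        next_id = int(np.argmax(prob))        # greedy: pick the most probable next token
        if next_id == end_token_id:
            break
        res += chars[next_id]
    return res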
class LayerNormLSTMCell(mx.rnn.LSTMCell):
    """
    Long-Short Term Memory (LSTM) network cell with layer normalization across gates.
    Based on Jimmy Lei Ba et al: Layer Normalization (https://arxiv.org/pdf/1607.06450.pdf)

    :param num_hidden: number of RNN hidden units. Number of units in output symbol.
    :param prefix: prefix for name of layers (and name of weight if params is None).
    :param params: RNNParams or None. Container for weight sharing between cells. Created if None.
    :param forget_bias: bias added to forget gate, default 1.0. Jozefowicz et al. 2015 recommends setting this to 1.0.
    :param norm_scale: scale/gain for layer normalization.
    :param norm_shift: shift/bias after layer normalization.
    """

    def __init__(self,
                 num_hidden: int,
                 prefix: str = 'lnlstm_',
                 params: Optional[mx.rnn.RNNParams] = None,
                 forget_bias: float = 1.0,
                 norm_scale: float = 1.0,
                 norm_shift: float = 0.0) -> None:
        super(LayerNormLSTMCell, self).__init__(num_hidden, prefix, params, forget_bias)
        self._iN = LayerNormalization(
            num_hidden=num_hidden * 4,
            prefix="%si2h" % self._prefix,
            scale=self.params.get('i2h_scale', shape=(num_hidden * 4,), init=mx.init.Constant(value=norm_scale)),
            shift=self.params.get('i2h_shift', shape=(num_hidden * 4,), init=mx.init.Constant(value=norm_shift)))
        self._hN = LayerNormalization(
            num_hidden=num_hidden * 4,
            prefix="%sh2h" % self._prefix,
            scale=self.params.get('h2h_scale', shape=(num_hidden * 4,), init=mx.init.Constant(value=norm_scale)),
            shift=self.params.get('h2h_shift', shape=(num_hidden * 4,), init=mx.init.Constant(value=norm_shift)))
        self._cN = LayerNormalization(
            num_hidden=num_hidden,
            prefix="%sc" % self._prefix,
            scale=self.params.get('c_scale', shape=(num_hidden,), init=mx.init.Constant(value=norm_scale)),
            shift=self.params.get('c_shift', shape=(num_hidden,), init=mx.init.Constant(value=norm_shift)))
        self._shape_fix = None

    def __call__(self, inputs, states):
        self._counter += 1
        name = '%st%d_' % (self._prefix, self._counter)
        i2h = mx.sym.FullyConnected(data=inputs,
                                    weight=self._iW,
                                    bias=self._iB,
                                    num_hidden=self._num_hidden * 4,
                                    name='%si2h' % name)
        if self._counter == 0:
            self._shape_fix = mx.sym.zeros_like(i2h)
        else:
            assert self._shape_fix is not None
        h2h = mx.sym.FullyConnected(data=states[0],
                                    weight=self._hW,
                                    bias=self._hB,
                                    num_hidden=self._num_hidden * 4,
                                    name='%sh2h' % name)
        gates = self._iN.normalize(i2h) + self._hN.normalize(self._shape_fix + h2h)
        # pylint: disable=unbalanced-tuple-unpacking
        in_gate, forget_gate, in_transform, out_gate = mx.sym.split(
            gates, num_outputs=4, axis=1, name="%sslice" % name)
        in_gate = mx.sym.Activation(in_gate, act_type="sigmoid", name='%si' % name)
        forget_gate = mx.sym.Activation(forget_gate, act_type="sigmoid", name='%sf' % name)
        in_transform = mx.sym.Activation(in_transform, act_type="tanh", name='%sc' % name)
        out_gate = mx.sym.Activation(out_gate, act_type="sigmoid", name='%so' % name)
        next_c = mx.sym._internal._plus(forget_gate * states[1],
                                        in_gate * in_transform,
                                        name='%sstate' % name)
        next_h = mx.sym._internal._mul(out_gate,
                                       mx.sym.Activation(self._cN.normalize(next_c), act_type="tanh"),
                                       name='%sout' % name)
        return next_h, [next_h, next_c]
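# A minimal usage sketch for the cell above, assuming MXNet 1.x with the legacy mx.rnn API
# and that the LayerNormalization class this cell relies on is importable from the same
# module. It unrolls the cell over a short dummy sequence and binds the resulting symbol.
import mxnet as mx

cell = LayerNormLSTMCell(num_hidden=64, prefix='lnlstm_')
data = mx.sym.Variable('data')                         # (batch, seq_len, input_dim), layout 'NTC'
outputs, states = cell.unroll(length=10, inputs=data,
                              merge_outputs=True, layout='NTC')
# `outputs`: (batch, 10, 64); `states`: [h, c] after the last step.
mod = mx.mod.Module(symbol=outputs, data_names=['data'], label_names=None)
mod.bind(data_shapes=[('data', (2, 10, 32))])
mod.init_params()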
def __init__(self,
             num_token,
             num_layer,
             num_head,
             embedding_dim,
             attention_head_dim,
             feed_forward_dim,
             target_len,
             is_training,
             memory_len=None,
             dropout=0.0,
             attention_dropout=0.0,
             attention_type=None,
             shared_biases=True):
    self.num_layer = num_layer
    self.dropout = dropout
    self.attention_dropout = attention_dropout
    self.token_embed = keras.layers.Embedding(input_dim=num_token,
                                              output_dim=embedding_dim,
                                              name='Embed-Token')
    initializer = keras.initializers.get('normal')
    initializer.__setattr__("stddev", 0.02)
    self.segment_ids_trans = keras.layers.Lambda(lambda x: K.transpose(x))
    self.segment_mat_embed = RelativeSegmentEmbedding(num_layer=num_layer,
                                                      num_head=num_head,
                                                      attention_dim=attention_head_dim,
                                                      initializer=initializer,
                                                      name='Embed-Segment')
    self.relative_bias = RelativeBias(num_layer=num_layer,
                                      num_head=num_head,
                                      attention_head_dim=attention_head_dim,
                                      bias_initializer=initializer,
                                      name='Relative-Bias')
    self.attention = []
    self.attention_add = []
    self.attention_layer_norm = []
    self.feed_forward = []
    self.feed_forward_add = []
    self.feed_forward_layer_norm = []
    for i in range(num_layer):
        self.attention.append(
            RelativeMultiHeadAttention(
                num_head=num_head,
                attention_head_dim=attention_head_dim,
                embedding_dim=embedding_dim,
                dropout=dropout,
                dropatt=attention_dropout,
                is_training=is_training,
                initializer=initializer,
                name='Attention-{}'.format(i + 1),
            ))
        self.attention_add.append(
            tf.keras.layers.Add(name='Attention-Residual-{}'.format(i + 1)))
        self.attention_layer_norm.append(
            LayerNormalization(name='Attention-Normal-{}'.format(i + 1)))
        self.feed_forward.append(
            FeedForward(feed_forward_dim=feed_forward_dim,
                        embedding_dim=embedding_dim,
                        dropout_rate=dropout,
                        kernel_initializer=initializer,
                        activation=gelu,
                        name='FeedForward-{}'.format(i + 1)))
        self.feed_forward_add.append(
            tf.keras.layers.Add(name='FeedForward-Residual-{}'.format(i + 1)))
        self.feed_forward_layer_norm.append(
            LayerNormalization(name='FeedForward-Normal-{}'.format(i + 1)))
    self.xlnet_loss = XLnetLoss(d_model=embedding_dim,
                                seq_len=target_len,
                                kernel_initializer=initializer,
                                name="XLNET_LOSS")
def build_transformer_xl(units,
                         embed_dim,
                         hidden_dim,
                         num_token,
                         num_block,
                         num_head,
                         batch_size,
                         memory_len,
                         target_len,
                         dropout=0.0,
                         attention_dropout=0.0,
                         cutoffs=None,
                         div_val=1,
                         force_projection=None,
                         bind_embeddings=True,
                         bind_projections=True,
                         clamp_len=None,
                         share_biases=True):
    """Build a Transformer-XL model.

    :param units: Units inside the transformer.
    :param embed_dim: Dimension of embeddings.
    :param hidden_dim: Dimension inside position-wise feed-forward layer.
    :param num_token: Number of distinct input tokens.
    :param num_block: Number of basic encoder blocks.
    :param num_head: Number of heads for attention.
    :param batch_size: Maximum batch size.
    :param memory_len: The maximum length of memories.
    :param target_len: The length of prediction block.
    :param dropout: General dropout rate.
    :param attention_dropout: Dropout rate inside attention layer.
    :param cutoffs: Cutoffs of adaptive embedding.
    :param div_val: Scale factor of adaptive embedding.
    :param force_projection: Add projection even when the dimensions are equal.
    :param bind_embeddings: Whether to bind embeddings to adaptive softmax.
    :param bind_projections: Whether to bind projections to adaptive softmax.
    :param clamp_len: The maximum value of relative position.
    :param share_biases: Whether to use the same biases for all layers.
    :return: The built model.
    """
    token_input = keras.layers.Input(shape=(target_len,), name='Input-Token')
    memory_length_input = keras.layers.Input(shape=(1,), name='Input-Memory-Length')
    inputs = [token_input, memory_length_input]

    results = AdaptiveEmbedding(
        input_dim=num_token,
        output_dim=units,
        embed_dim=embed_dim,
        cutoffs=cutoffs,
        div_val=div_val,
        mask_zero=True,
        force_projection=force_projection,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(token_input)
    token_embed, embedding_weights = results[0], results[1:]
    token_embed = Scale(scale=np.sqrt(units), name='Embed-Token-Scaled')(token_embed)

    last_memory = Memory(
        batch_size=batch_size,
        memory_len=memory_len,
        target_len=target_len,
        output_dim=units,
        name='Memory-0',
    )([token_embed, memory_length_input])

    position_embed = PositionalEmbedding(
        output_dim=units,
        clamp_len=clamp_len,
        name='Embed-Position',
    )([token_input, last_memory])

    if 0.0 < dropout < 1.0:
        token_embed = keras.layers.Dropout(rate=dropout, name='Embed-Token-Dropped')(token_embed)
        position_embed = keras.layers.Dropout(rate=dropout, name='Embed-Position-Dropped')(position_embed)

    context_bias, relative_bias = None, None
    if share_biases:
        context_bias, relative_bias = RelativeBias(units=units, name='Biases')(last_memory)

    outputs = [token_embed]
    for i in range(num_block):
        block_input, block_output = outputs[-1], outputs[-1]
        if not share_biases:
            context_bias, relative_bias = RelativeBias(
                units=units, name='Biases-{}'.format(i + 1))(last_memory)
        block_output = RelativePartialMultiHeadSelfAttention(
            units=units,
            num_head=num_head,
            use_bias=False,
            attention_dropout=attention_dropout,
            name='Attention-{}'.format(i + 1),
        )([block_output, position_embed, last_memory, context_bias, relative_bias])
        block_output = keras.layers.Add(name='Attention-Res-{}'.format(i + 1))([block_input, block_output])
        if 0.0 < dropout < 1.0:
            block_output = keras.layers.Dropout(
                rate=dropout, name='Attention-Dropped-{}'.format(i + 1))(block_output)
        block_output = LayerNormalization(name='Attention-Norm-{}'.format(i + 1))(block_output)

        block_input = block_output
        block_output = FeedForward(
            units=hidden_dim,
            dropout_rate=dropout,
            name='FeedForward-{}'.format(i + 1),
        )(block_output)
        block_output = keras.layers.Add(name='FeedForward-Res-{}'.format(i + 1))([block_input, block_output])
        if 0.0 < dropout < 1.0:
            block_output = keras.layers.Dropout(
                rate=dropout, name='FeedForward-Dropped-{}'.format(i + 1))(block_output)
        block_output = LayerNormalization(name='FeedForward-Norm-{}'.format(i + 1))(block_output)

        if i < num_block - 1:
            last_memory = Memory(
                batch_size=batch_size,
                memory_len=memory_len,
                target_len=target_len,
                output_dim=units,
                name='Memory-{}'.format(i + 1),
            )([block_output, memory_length_input])

        outputs.append(block_output)

    softmax = AdaptiveSoftmax(
        input_dim=units,
        output_dim=num_token,
        embed_dim=embed_dim,
        cutoffs=cutoffs,
        div_val=div_val,
        force_projection=force_projection,
        bind_embeddings=bind_embeddings,
        bind_projections=bind_projections,
        name='Softmax',
    )(outputs[-1:] + embedding_weights)

    model = keras.models.Model(inputs=inputs, outputs=softmax)
    return model
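# Illustrative call of build_transformer_xl. The hyperparameters are arbitrary and chosen
# only to show the shapes involved; they are not taken from the original configuration.
model = build_transformer_xl(
    units=128,
    embed_dim=128,
    hidden_dim=512,
    num_token=10000,
    num_block=6,
    num_head=8,
    batch_size=16,
    memory_len=64,
    target_len=32,
    dropout=0.1,
    attention_dropout=0.1,
)
model.summary()
# Inputs: 'Input-Token' with shape (target_len,) and 'Input-Memory-Length' with shape (1,);
# the output is the adaptive-softmax distribution over num_token symbols at each target position.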
class LayerNormGRUCell(mx.rnn.GRUCell):
    """
    Gated Recurrent Unit (GRU) network cell with layer normalization across gates.
    Based on Jimmy Lei Ba et al: Layer Normalization (https://arxiv.org/pdf/1607.06450.pdf)

    :param num_hidden: number of RNN hidden units. Number of units in output symbol.
    :param prefix: prefix for name of layers (and name of weight if params is None).
    :param params: RNNParams or None. Container for weight sharing between cells. Created if None.
    :param norm_scale: scale/gain for layer normalization.
    :param norm_shift: shift/bias after layer normalization.
    """

    def __init__(self,
                 num_hidden: int,
                 prefix: str = 'lngru_',
                 params: Optional[mx.rnn.RNNParams] = None,
                 norm_scale: float = 1.0,
                 norm_shift: float = 0.0) -> None:
        super(LayerNormGRUCell, self).__init__(num_hidden, prefix, params)
        self._iN = LayerNormalization(
            num_hidden=num_hidden * 3,
            prefix="%si2h" % self._prefix,
            scale=self.params.get('i2h_scale', shape=(num_hidden * 3,), init=mx.init.Constant(value=norm_scale)),
            shift=self.params.get('i2h_shift', shape=(num_hidden * 3,), init=mx.init.Constant(value=norm_shift)))
        self._hN = LayerNormalization(
            num_hidden=num_hidden * 3,
            prefix="%sh2h" % self._prefix,
            scale=self.params.get('h2h_scale', shape=(num_hidden * 3,), init=mx.init.Constant(value=norm_scale)),
            shift=self.params.get('h2h_shift', shape=(num_hidden * 3,), init=mx.init.Constant(value=norm_shift)))
        self._shape_fix = None

    def __call__(self, inputs, states):
        self._counter += 1
        seq_idx = self._counter
        name = '%st%d_' % (self._prefix, seq_idx)
        prev_state_h = states[0]
        i2h = mx.sym.FullyConnected(data=inputs,
                                    weight=self._iW,
                                    bias=self._iB,
                                    num_hidden=self._num_hidden * 3,
                                    name="%s_i2h" % name)
        h2h = mx.sym.FullyConnected(data=prev_state_h,
                                    weight=self._hW,
                                    bias=self._hB,
                                    num_hidden=self._num_hidden * 3,
                                    name="%s_h2h" % name)
        if self._counter == 0:
            self._shape_fix = mx.sym.zeros_like(i2h)
        else:
            assert self._shape_fix is not None
        i2h = self._iN.normalize(i2h)
        h2h = self._hN.normalize(self._shape_fix + h2h)
        # pylint: disable=unbalanced-tuple-unpacking
        i2h_r, i2h_z, i2h = mx.sym.split(i2h, num_outputs=3, name="%s_i2h_slice" % name)
        h2h_r, h2h_z, h2h = mx.sym.split(h2h, num_outputs=3, name="%s_h2h_slice" % name)
        reset_gate = mx.sym.Activation(i2h_r + h2h_r, act_type="sigmoid", name="%s_r_act" % name)
        update_gate = mx.sym.Activation(i2h_z + h2h_z, act_type="sigmoid", name="%s_z_act" % name)
        next_h_tmp = mx.sym.Activation(i2h + reset_gate * h2h, act_type="tanh", name="%s_h_act" % name)
        next_h = mx.sym._internal._plus((1. - update_gate) * next_h_tmp,
                                        update_gate * prev_state_h,
                                        name='%sout' % name)
        return next_h, [next_h]
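# Minimal usage sketch for the GRU variant, under the same assumptions as the LSTM example
# above (MXNet 1.x legacy mx.rnn API, compatible LayerNormalization class in scope).
import mxnet as mx

gru_cell = LayerNormGRUCell(num_hidden=64, prefix='lngru_')
seq = mx.sym.Variable('seq')                           # (batch, seq_len, input_dim), layout 'NTC'
outputs, states = gru_cell.unroll(length=10, inputs=seq, merge_outputs=True, layout='NTC')
# `outputs`: (batch, 10, 64); `states`: [h] after the final step.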