Example #1
 def __init__(self,
              num_hidden: int,
              prefix: str = 'lngru_',
              params: Optional[mx.rnn.RNNParams] = None,
              norm_scale: float = 1.0,
              norm_shift: float = 0.0) -> None:
     super(LayerNormGRUCell, self).__init__(num_hidden, prefix, params)
     self._iN = LayerNormalization(
         num_hidden=num_hidden * 3,
         prefix="%si2h" % self._prefix,
         scale=self.params.get('i2h_scale',
                               shape=(num_hidden * 3, ),
                               init=mx.init.Constant(value=norm_scale)),
         shift=self.params.get('i2h_shift',
                               shape=(num_hidden * 3, ),
                               init=mx.init.Constant(value=norm_shift)))
     self._hN = LayerNormalization(
         num_hidden=num_hidden * 3,
         prefix="%sh2h" % self._prefix,
         scale=self.params.get('h2h_scale',
                               shape=(num_hidden * 3, ),
                               init=mx.init.Constant(value=norm_scale)),
         shift=self.params.get('h2h_shift',
                               shape=(num_hidden * 3, ),
                               init=mx.init.Constant(value=norm_shift)))
     self._shape_fix = None
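
Examples #1, #11, #12, #19 and #22 all hand pre-created scale/shift parameter symbols to a project-defined LayerNormalization helper that exposes a normalize() method. The class below is only a minimal sketch of such a helper, not the project's own code; it assumes MXNet >= 1.3 so the built-in mx.sym.LayerNorm operator is available.

import mxnet as mx


class LayerNormalization:
    """Hypothetical stand-in: wraps mx.sym.LayerNorm around externally
    provided scale (gamma) and shift (beta) symbols."""

    def __init__(self, num_hidden, prefix='ln_', scale=None, shift=None, eps=1e-06):
        self.prefix = prefix
        self.eps = eps
        self.scale = scale if scale is not None else mx.sym.Variable(
            '%sgamma' % prefix, shape=(num_hidden,))
        self.shift = shift if shift is not None else mx.sym.Variable(
            '%sbeta' % prefix, shape=(num_hidden,))

    def normalize(self, data):
        # Standardize over the last axis, then apply the learned gain and bias.
        return mx.sym.LayerNorm(data=data, gamma=self.scale, beta=self.shift,
                                axis=-1, eps=self.eps,
                                name='%snorm' % self.prefix)
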
Example #2
    def encoder(self, inputs):
        if K.dtype(inputs) != 'int32':
            inputs = K.cast(inputs, 'int32')

        masks = K.equal(inputs, 0)
        # Embeddings
        embeddings = K.gather(self.embeddings, inputs)
        embeddings *= self._model_dim**0.5  # Scale
        # Position Encodings
        position_encodings = PositionEncoding(self._model_dim)(embeddings)
        # Embeddings + Position-encodings
        encodings = embeddings + position_encodings
        # Dropout
        encodings = K.dropout(encodings, self._dropout_rate)

        for i in range(self._encoder_stack):
            # Multi-head-Attention
            attention = MultiHeadAttention(self._n_heads,
                                           self._model_dim // self._n_heads)
            attention_input = [encodings, encodings, encodings, masks]
            attention_out = attention(attention_input)
            # Add & Norm
            attention_out += encodings
            attention_out = LayerNormalization()(attention_out)
            # Feed-Forward
            ff = PositionWiseFeedForward(self._model_dim,
                                         self._feed_forward_size)
            ff_out = ff(attention_out)
            # Add & Norm
            ff_out += attention_out
            encodings = LayerNormalization()(ff_out)

        return encodings, masks
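
The encoder above (like the other Keras-based examples in this listing) calls a project-defined LayerNormalization layer rather than tf.keras.layers.LayerNormalization. A minimal sketch of such a layer is shown below as an assumption, not the original implementation: it normalizes over the last axis, applies a learned gain (gamma) and bias (beta), and targets TensorFlow 2.x.

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer


class LayerNormalization(Layer):
    """Hypothetical stand-in for the custom layer these examples assume."""

    def __init__(self, epsilon=1e-8, **kwargs):
        super(LayerNormalization, self).__init__(**kwargs)
        self._epsilon = epsilon

    def build(self, input_shape):
        self.gamma = self.add_weight(name='gamma', shape=(input_shape[-1],),
                                     initializer='ones', trainable=True)
        self.beta = self.add_weight(name='beta', shape=(input_shape[-1],),
                                    initializer='zeros', trainable=True)
        super(LayerNormalization, self).build(input_shape)

    def call(self, inputs):
        # Normalize over the feature axis, then scale and shift.
        mean, variance = tf.nn.moments(inputs, axes=[-1], keepdims=True)
        normalized = (inputs - mean) / K.sqrt(variance + self._epsilon)
        return self.gamma * normalized + self.beta

    def compute_output_shape(self, input_shape):
        return input_shape
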
Example #3
    def decoder(self, inputs):
        decoder_inputs, encoder_encodings, encoder_masks = inputs
        if K.dtype(decoder_inputs) != 'int32':
            decoder_inputs = K.cast(decoder_inputs, 'int32')

        decoder_masks = K.equal(decoder_inputs, 0)
        # Embeddings
        embeddings = K.gather(self.embeddings, decoder_inputs)
        embeddings *= self._model_dim**0.5  # Scale
        # Position Encodings
        position_encodings = PositionEncoding(self._model_dim)(embeddings)
        # Embeddings + Position-encodings
        encodings = embeddings + position_encodings
        # Dropout
        encodings = K.dropout(encodings, self._dropout_rate)

        for i in range(self._decoder_stack):
            # Masked-Multi-head-Attention
            masked_attention = MultiHeadAttention(self._n_heads,
                                                  self._model_dim //
                                                  self._n_heads,
                                                  future=True)
            masked_attention_input = [
                encodings, encodings, encodings, decoder_masks
            ]
            masked_attention_out = masked_attention(masked_attention_input)
            # Add & Norm
            masked_attention_out += encodings
            masked_attention_out = LayerNormalization()(masked_attention_out)

            # Multi-head-Attention
            attention = MultiHeadAttention(self._n_heads,
                                           self._model_dim // self._n_heads)
            attention_input = [
                masked_attention_out, encoder_encodings, encoder_encodings,
                encoder_masks
            ]
            attention_out = attention(attention_input)
            # Add & Norm
            attention_out += masked_attention_out
            attention_out = LayerNormalization()(attention_out)

            # Feed-Forward
            ff = PositionWiseFeedForward(self._model_dim,
                                         self._feed_forward_size)
            ff_out = ff(attention_out)
            # Add & Norm
            ff_out += attention_out
            encodings = LayerNormalization()(ff_out)

        # Pre-Softmax shares parameters with the Embeddings
        linear_projection = K.dot(encodings, K.transpose(self.embeddings))
        outputs = K.softmax(linear_projection)
        return outputs
Example #4
def dense_model(timesteps, n_class, n_features, classifier_architecture,
                dropout):
    inputs = Input((timesteps, n_features))
    x = Dense(128, activation=Mish())(inputs)
    x = LayerNormalization()(x)
    x, a = attention_simple(x, timesteps)
    for d, dr in zip(classifier_architecture, dropout):
        x = Dropout(dr)(x)
        x = Dense(d, activation=Mish())(x)
        x = LayerNormalization()(x)
    outputs = Dense(n_class, activation="softmax")(x)
    model = Model(inputs, outputs)
    return model
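
A hypothetical way to instantiate and compile the model above; the shape and architecture values are illustrative only, and Mish, attention_simple and LayerNormalization are assumed to be defined as in the surrounding project.

# Hypothetical usage with made-up dimensions.
model = dense_model(timesteps=60, n_class=5, n_features=12,
                    classifier_architecture=[64, 32], dropout=[0.3, 0.2])
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()
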
Example #5
 def __init__(self,
              h=8,
              d_model=512,
              d_ff=2048,
              p_dropout=0.1,
              max_len=128):
     super().__init__()
     self.attn = MultiHeadAttention(h, d_model)
     self.dropout1 = Dropout(p_dropout)
     self.norm1 = LayerNormalization()
     self.ff = FFN(d_model, d_ff)
     self.dropout2 = Dropout(p_dropout)
     self.norm2 = LayerNormalization()
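
The constructor above only wires up the sub-layers. A hedged sketch of the matching call/forward pass of a post-norm Transformer encoder block (attention, dropout, residual add and norm, then feed-forward, dropout, residual add and norm) follows; the argument names and the MultiHeadAttention call signature are assumptions, not taken from the original class.

def call(self, x, mask=None):
    # Self-attention sub-layer with residual connection and layer normalization.
    # The (q, k, v, mask) signature of self.attn is assumed.
    h = self.attn(x, x, x, mask=mask)
    h = self.norm1(x + self.dropout1(h))
    # Position-wise feed-forward sub-layer, again with residual + layer norm.
    y = self.ff(h)
    return self.norm2(h + self.dropout2(y))
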
Example #6
    def __init__(self, n_feat, n_hid, n_latent, adj, dropout):
        super(GAE, self).__init__()

        #pdb.set_trace()
        self.gc1 = MyGraphConvolution(n_feat, n_hid, adj)
        #self.gc1 = InnerProductGraphConvolution(n_feat, n_hid, adj)
        self.ln1 = LayerNormalization(n_hid)
        self.gc2_mu = MyGraphConvolution(n_hid, n_latent, adj)
        #self.gc2_mu = InnerProductGraphConvolution(n_hid, n_latent, adj)
        self.ln2 = LayerNormalization(n_latent)
        self.gc2_var = MyGraphConvolution(n_hid, n_latent, adj)
        self.dropout = dropout
        self.sigmoid = nn.Sigmoid()
        self.fudge = 1e-7
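
The forward/encode pass of this graph autoencoder is not shown. One plausible sketch consistent with the modules above (graph convolution, layer norm, ReLU, dropout, then separate mean and log-variance heads) is given below; it assumes torch.nn.functional is imported as F, and the exact ordering in the original project may differ.

def encode(self, x):
    # Hypothetical encode pass: first GCN layer with layer norm and ReLU.
    h = F.relu(self.ln1(self.gc1(x)))
    h = F.dropout(h, self.dropout, training=self.training)
    # Separate heads for the latent mean and log-variance.
    mu = self.ln2(self.gc2_mu(h))
    logvar = self.gc2_var(h)
    return mu, logvar
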
Example #7
 def __init__(self, d_hid, d_inner_hid, dropout=0.1):
     super(PositionwiseFeedForward, self).__init__()
     self.w_1 = nn.Conv1d(d_hid, d_inner_hid, 1)  # position-wise
     self.w_2 = nn.Conv1d(d_inner_hid, d_hid, 1)  # position-wise
     self.layer_norm = LayerNormalization(d_hid)
     self.dropout = nn.Dropout(dropout)
     self.relu = nn.ReLU()
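
Because w_1 and w_2 are Conv1d layers, the forward pass has to move the feature dimension into the channel position. The sketch below is one forward pass consistent with the constructor above (residual connection followed by the layer norm); it is an assumption about the missing method, not the project's verbatim code.

def forward(self, x):
    # x: (batch, seq_len, d_hid); Conv1d expects (batch, channels, seq_len).
    residual = x
    out = self.relu(self.w_1(x.transpose(1, 2)))
    out = self.w_2(out).transpose(1, 2)
    out = self.dropout(out)
    return self.layer_norm(out + residual)
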
Example #8
 def __init__(self,
              d_model,
              seq_len,
              kernel_initializer='normal',
              **kwargs):
     super(XLnetLoss, self).__init__(**kwargs)
     self.supports_masking = True
     self.initializer = keras.initializers.get(kernel_initializer)
     self.max_seq_length = seq_len
     self.d_model = d_model
     self.dense = keras.layers.Dense(1, kernel_initializer=self.initializer)
     self.dense_0 = keras.layers.Dense(units=self.d_model,
                                     kernel_initializer=self.initializer,
                                     activation=keras.activations.tanh,
                                     name="dense_0")
     self.layer_norm = LayerNormalization()
     self.dense_1 = keras.layers.Dense(1, kernel_initializer=self.initializer, name="dense_1")
     self.dense_0_1 = keras.layers.Dense(
         self.d_model,
         activation=keras.activations.tanh,
         kernel_initializer=self.initializer, name="dense_0")
     self.dense_1_1 = keras.layers.Dense(
         1,
         kernel_initializer=self.initializer,
         name="dense_1",
         use_bias=False)
Example #9
 def build(self, input_shape):
     self.embeddings = self.add_weight(shape=(self._vocab_size,
                                              self._model_dim),
                                       initializer='glorot_uniform',
                                       trainable=True,
                                       name="embeddings")
     self.EncoderPositionEncoding = PositionEncoding(self._model_dim)
     self.EncoderMultiHeadAttetions = [
         MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
         for _ in range(self._encoder_stack)
     ]
     self.EncoderLayerNorms0 = [
         LayerNormalization() for _ in range(self._encoder_stack)
     ]
     self.EncoderPositionWiseFeedForwards = [
         PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
         for _ in range(self._encoder_stack)
     ]
     self.EncoderLayerNorms1 = [
         LayerNormalization() for _ in range(self._encoder_stack)
     ]
     self.DecoderPositionEncoding = PositionEncoding(self._model_dim)
     self.DecoderMultiHeadAttetions0 = [
         MultiHeadAttention(self._n_heads,
                            self._model_dim // self._n_heads,
                            future=True) for _ in range(self._decoder_stack)
     ]
     self.DecoderLayerNorms0 = [
         LayerNormalization() for _ in range(self._decoder_stack)
     ]
     self.DecoderMultiHeadAttetions1 = [
         MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
         for _ in range(self._decoder_stack)
     ]
     self.DecoderLayerNorms1 = [
         LayerNormalization() for _ in range(self._decoder_stack)
     ]
     self.DecoderPositionWiseFeedForwards = [
         PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
         for _ in range(self._decoder_stack)
     ]
     self.DecoderLayerNorms2 = [
         LayerNormalization() for _ in range(self._decoder_stack)
     ]
     super(Transformer, self).build(input_shape)
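
build() above only creates the per-layer sub-modules. A hedged sketch of a call method that routes token ids through the encoder and decoder shown in Examples #2 and #3 might look like this; the assumption that inputs is a pair of [encoder_token_ids, decoder_token_ids] is not taken from the original class.

def call(self, inputs):
    # Assumed input convention: [encoder_token_ids, decoder_token_ids].
    encoder_inputs, decoder_inputs = inputs
    encoder_encodings, encoder_masks = self.encoder(encoder_inputs)
    return self.decoder([decoder_inputs, encoder_encodings, encoder_masks])
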
Example #10
    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
        self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
        self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v))

        self.attention = ScaledDotProductAttention(d_model)
        self.layer_norm = LayerNormalization(d_model)
        self.proj = nn.Linear(n_head * d_v, d_model)

        self.dropout = nn.Dropout(dropout)

        init.xavier_normal(self.w_qs)
        init.xavier_normal(self.w_ks)
        init.xavier_normal(self.w_vs)
Example #11
 def __init__(self,
              num_hidden: int,
              prefix: str = 'lnggru_',
              params: Optional[mx.rnn.RNNParams] = None,
              norm_scale: float = 1.0,
              norm_shift: float = 0.0) -> None:
     super(LayerNormPerGateGRUCell, self).__init__(num_hidden, prefix,
                                                   params)
     self._norm_layers = list()  # type: List[LayerNormalization]
     for name in ['r', 'z', 'o']:
          scale = self.params.get('%s_scale' % name,
                                  shape=(num_hidden, ),
                                  init=mx.init.Constant(value=norm_scale))
          shift = self.params.get('%s_shift' % name,
                                  shape=(num_hidden, ),
                                  init=mx.init.Constant(value=norm_shift))
         self._norm_layers.append(
             LayerNormalization(num_hidden,
                                prefix="%s%s" % (self._prefix, name),
                                scale=scale,
                                shift=shift))
Example #12
 def __init__(self,
              num_hidden: int,
              prefix: str = 'lnglstm_',
              params: Optional[mx.rnn.RNNParams] = None,
              forget_bias: float = 1.0,
              norm_scale: float = 1.0,
              norm_shift: float = 0.0) -> None:
     super(LayerNormPerGateLSTMCell, self).__init__(num_hidden, prefix,
                                                    params, forget_bias)
     self._norm_layers = list()  # type: List[LayerNormalization]
     for name in ['i', 'f', 'c', 'o', 's']:
          scale = self.params.get('%s_scale' % name,
                                  shape=(num_hidden, ),
                                  init=mx.init.Constant(value=norm_scale))
          shift = self.params.get(
              '%s_shift' % name,
              shape=(num_hidden, ),
              init=mx.init.Constant(
                  value=norm_shift if name != "f" else forget_bias))
         self._norm_layers.append(
             LayerNormalization(num_hidden,
                                prefix="%s%s" % (self._prefix, name),
                                scale=scale,
                                shift=shift))
Example #13
def Smi2Smi():

    #product
    l_in = layers.Input( shape= (None,));
    l_mask = layers.Input( shape= (None,));

    #reagents
    l_dec = layers.Input(shape =(None,)) ;
    l_dmask = layers.Input(shape =(None,));

    #positional encodings for product and reagents, respectively
    l_pos = PositionLayer(EMBEDDING_SIZE)(l_mask);
    l_dpos = PositionLayer(EMBEDDING_SIZE)(l_dmask);

    l_emask = MaskLayerRight()([l_dmask, l_mask]);
    l_right_mask = MaskLayerTriangular()(l_dmask);
    l_left_mask = MaskLayerLeft()(l_mask);

    #encoder
    l_voc = layers.Embedding(input_dim = vocab_size, output_dim = EMBEDDING_SIZE, input_length = None);

    l_embed = layers.Add()([ l_voc(l_in), l_pos]);
    l_embed = layers.Dropout(rate = 0.1)(l_embed);

    for layer in range(n_block):

       #self attention
       l_o = [ SelfLayer(EMBEDDING_SIZE, KEY_SIZE) ([l_embed, l_embed, l_embed, l_left_mask]) for i in range(n_self)];

       l_con = layers.Concatenate()(l_o);
       l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE)) (l_con);
       l_drop = layers.Dropout(rate=0.1)(l_dense);
       l_add = layers.Add()( [l_drop, l_embed]);
       l_att = LayerNormalization()(l_add);

       #position-wise
       l_c1 = layers.Conv1D(N_HIDDEN, 1, activation='relu')(l_att);
       l_c2 = layers.Conv1D(EMBEDDING_SIZE, 1)(l_c1);
       l_drop = layers.Dropout(rate = 0.1)(l_c2);
       l_ff = layers.Add()([l_att, l_drop]);
       l_embed = LayerNormalization()(l_ff);

    #bottleneck
    l_encoder = l_embed;

    l_embed = layers.Add()([l_voc(l_dec), l_dpos]);
    l_embed = layers.Dropout(rate = 0.1)(l_embed);

    for layer in range(n_block):

       #self attention
       l_o = [ SelfLayer(EMBEDDING_SIZE, KEY_SIZE)([l_embed, l_embed, l_embed, l_right_mask]) for i in range(n_self)];

       l_con = layers.Concatenate()(l_o);
       l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE)) (l_con);
       l_drop = layers.Dropout(rate=0.1)(l_dense);
       l_add = layers.Add()( [l_drop, l_embed]);
       l_att = LayerNormalization()(l_add);

       #attention to the encoder
       l_o = [ SelfLayer(EMBEDDING_SIZE, KEY_SIZE)([l_att, l_encoder, l_encoder, l_emask]) for i in range(n_self)];
       l_con = layers.Concatenate()(l_o);
       l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE)) (l_con);
       l_drop = layers.Dropout(rate=0.1)(l_dense);
       l_add = layers.Add()( [l_drop, l_att]);
       l_att = LayerNormalization()(l_add);

       #position-wise
       l_c1 = layers.Conv1D(N_HIDDEN, 1, activation='relu')(l_att);
       l_c2 = layers.Conv1D(EMBEDDING_SIZE, 1)(l_c1);
       l_drop = layers.Dropout(rate = 0.1)(l_c2);
       l_ff = layers.Add()([l_att, l_drop]);
       l_embed = LayerNormalization()(l_ff);

    l_out = layers.TimeDistributed(layers.Dense(vocab_size,
                                          use_bias=False)) (l_embed);

    mdl = tf.keras.Model([l_in, l_mask, l_dec, l_dmask], l_out);

    def masked_loss(y_true, y_pred):
       loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_true, logits=y_pred);
       mask = tf.cast(tf.not_equal(tf.reduce_sum(y_true, -1), 0), 'float32');
       loss = tf.reduce_sum(loss * mask, -1) / tf.reduce_sum(mask, -1);
       loss = K.mean(loss);
       return loss;

    def masked_acc(y_true, y_pred):
       mask = tf.cast(tf.not_equal(tf.reduce_sum(y_true, -1), 0), 'float32');
       eq = K.cast(K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis = -1)), 'float32');
       eq = tf.reduce_sum(eq * mask, -1) / tf.reduce_sum(mask, -1);
       eq = K.mean(eq);
       return eq;

    mdl.compile(optimizer = 'adam', loss = masked_loss, metrics=['accuracy', masked_acc]);

    mdl_enc = tf.keras.Model([l_in, l_mask], l_encoder);
    mdl_enc.compile(optimizer="adam", loss="categorical_crossentropy");

    #mdl.summary();

    return mdl, mdl_enc;
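
A hypothetical call site for the two returned models; the batch tensors below (tokenized, padded product/reagent sequences and one-hot targets) are placeholder names, not variables from the original project.

# Hypothetical usage; x_product, x_product_mask, x_reagent, x_reagent_mask
# and y_one_hot are placeholder arrays prepared elsewhere.
mdl, mdl_enc = Smi2Smi()
mdl.fit([x_product, x_product_mask, x_reagent, x_reagent_mask],
        y_one_hot, batch_size=64, epochs=10)
product_vectors = mdl_enc.predict([x_product, x_product_mask])
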
Example #14
def buildNetwork():

    unfreeze = False;

    l_in = layers.Input( shape= (None,));
    l_mask = layers.Input( shape= (None,));

    l_ymask = [];
    for i in range(len(props)):
       l_ymask.append( layers.Input( shape=(1, )));

    #transformer part
    #positional encodings for the input sequence
    l_pos = PositionLayer(EMBEDDING_SIZE)(l_mask);
    l_left_mask = MaskLayerLeft()(l_mask);

    #encoder
    l_voc = layers.Embedding(input_dim = vocab_size, output_dim = EMBEDDING_SIZE, input_length = None, trainable = unfreeze);
    l_embed = layers.Add()([ l_voc(l_in), l_pos]);

    for layer in range(n_block):

       #self attention
       l_o = [ SelfLayer(EMBEDDING_SIZE, KEY_SIZE, trainable= unfreeze) ([l_embed, l_embed, l_embed, l_left_mask]) for i in range(n_self)];

       l_con = layers.Concatenate()(l_o);
       l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE, trainable = unfreeze), trainable = unfreeze) (l_con);
       if unfreeze == True: l_dense = layers.Dropout(rate=0.1)(l_dense);
       l_add = layers.Add()( [l_dense, l_embed]);
       l_att = LayerNormalization(trainable = unfreeze)(l_add);

       #position-wise
       l_c1 = layers.Conv1D(N_HIDDEN, 1, activation='relu', trainable = unfreeze)(l_att);
       l_c2 = layers.Conv1D(EMBEDDING_SIZE, 1, trainable = unfreeze)(l_c1);
       if unfreeze == True: l_c2 = layers.Dropout(rate=0.1)(l_c2);
       l_ff = layers.Add()([l_att, l_c2]);
       l_embed = LayerNormalization(trainable = unfreeze)(l_ff);

    #end of Transformer's part
    l_encoder = l_embed;

    #text-cnn part
    #https://github.com/deepchem/deepchem/blob/b7a6d3d759145d238eb8abaf76183e9dbd7b683c/deepchem/models/tensorgraph/models/text_cnn.py

    l_in2 =  layers.Input( shape= (None,EMBEDDING_SIZE));

    kernel_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20];
    num_filters=[100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160, 160];

    l_pool = [];
    for i in range(len(kernel_sizes)):
       l_conv = layers.Conv1D(num_filters[i], kernel_size=kernel_sizes[i], padding='valid',
                              kernel_initializer='normal', activation='relu')(l_in2);
       l_maxpool = layers.Lambda(lambda x: tf.reduce_max(x, axis=1))(l_conv);
       l_pool.append(l_maxpool);

    l_cnn = layers.Concatenate(axis=1)(l_pool);
    l_cnn_drop = layers.Dropout(rate = 0.25)(l_cnn);

    #dense part
    l_dense =layers.Dense(N_HIDDEN_CNN, activation='relu') (l_cnn_drop);

    #https://github.com/ParikhKadam/Highway-Layer-Keras
    transform_gate = layers.Dense(units= N_HIDDEN_CNN, activation="sigmoid",
                     bias_initializer=tf.keras.initializers.Constant(-1))(l_dense);

    carry_gate = layers.Lambda(lambda x: 1.0 - x, output_shape=(N_HIDDEN_CNN,))(transform_gate);
    transformed_data = layers.Dense(units= N_HIDDEN_CNN, activation="relu")(l_dense);
    transformed_gated = layers.Multiply()([transform_gate, transformed_data]);
    identity_gated = layers.Multiply()([carry_gate, l_dense]);

    l_highway = layers.Add()([transformed_gated, identity_gated]);

    #Because this is a multi-task model, we have several outputs and a custom loss for each.

    def mse_loss(prop):
       def loss(y_true, y_pred):
          y2 = y_true * l_ymask[prop] + y_pred * (1 - l_ymask[prop]);
          return tf.keras.losses.mse(y2, y_pred);
       return loss;

    def binary_loss(prop):
       def loss(y_true, y_pred):
           y_pred = tf.clip_by_value(y_pred, K.epsilon(), 1.0 - K.epsilon() );
           r = y_true * K.log(y_pred) + (1.0 - y_true) * K.log(1.0 - y_pred);
           r = -tf.reduce_mean(r * l_ymask[prop] );
           return r;
       return loss;

    l_out = [];
    losses = [];
    for prop in props:
       if props[prop][2] == "regression":
          l_out.append(layers.Dense(1, activation='linear', name="Regression-" + props[prop][1]) (l_highway));
          losses.append(mse_loss(prop));
       else:
          l_out.append(layers.Dense(1, activation='sigmoid', name="Classification-" + props[prop][1]) (l_highway));
          losses.append(binary_loss(prop));

    l_input = [l_in2];
    l_input.extend(l_ymask);

    mdl = tf.keras.Model(l_input, l_out);
    mdl.compile (optimizer = 'adam', loss = losses);

    #mdl.summary();

    K.set_value(mdl.optimizer.lr, 1.0e-4);

    #so far we do not train the encoder part of the model.
    encoder = tf.keras.Model([l_in, l_mask], l_encoder);
    encoder.compile(optimizer = 'adam', loss = 'mse');
    encoder.set_weights(np.load("embeddings.npy", allow_pickle = True));

    #encoder.summary();

    return mdl, encoder;
Example #15
File: ans.py  Project: wyj-fps/TAAC2020
def get_age_model(DATA):

    feed_forward_size = 2048
    max_seq_len = 150
    model_dim = 256 + 256 + 64 + 32 + 8 + 16

    input_creative_id = Input(shape=(max_seq_len, ), name='creative_id')
    x1 = Embedding(
        input_dim=NUM_creative_id + 1,
        output_dim=256,
        weights=[DATA['creative_id_emb']],
        trainable=args.not_train_embedding,
        #    trainable=False,
        input_length=150,
        mask_zero=True)(input_creative_id)
    # encodings = PositionEncoding(model_dim)(x1)
    # encodings = Add()([embeddings, encodings])

    input_ad_id = Input(shape=(max_seq_len, ), name='ad_id')
    x2 = Embedding(
        input_dim=NUM_ad_id + 1,
        output_dim=256,
        weights=[DATA['ad_id_emb']],
        trainable=args.not_train_embedding,
        #    trainable=False,
        input_length=150,
        mask_zero=True)(input_ad_id)

    input_product_id = Input(shape=(max_seq_len, ), name='product_id')
    x3 = Embedding(
        input_dim=NUM_product_id + 1,
        output_dim=32,
        weights=[DATA['product_id_emb']],
        trainable=args.not_train_embedding,
        #    trainable=False,
        input_length=150,
        mask_zero=True)(input_product_id)

    input_advertiser_id = Input(shape=(max_seq_len, ), name='advertiser_id')
    x4 = Embedding(
        input_dim=NUM_advertiser_id + 1,
        output_dim=64,
        weights=[DATA['advertiser_id_emb']],
        trainable=args.not_train_embedding,
        #    trainable=False,
        input_length=150,
        mask_zero=True)(input_advertiser_id)

    input_industry = Input(shape=(max_seq_len, ), name='industry')
    x5 = Embedding(
        input_dim=NUM_industry + 1,
        output_dim=16,
        weights=[DATA['industry_emb']],
        trainable=True,
        #    trainable=False,
        input_length=150,
        mask_zero=True)(input_industry)

    input_product_category = Input(shape=(max_seq_len, ),
                                   name='product_category')
    x6 = Embedding(
        input_dim=NUM_product_category + 1,
        output_dim=8,
        weights=[DATA['product_category_emb']],
        trainable=True,
        #    trainable=False,
        input_length=150,
        mask_zero=True)(input_product_category)

    # (bs, max_seq_len, model_dim)
    encodings = layers.Concatenate(axis=2)([x1, x2, x3, x4, x5, x6])
    # (bs, max_seq_len)
    masks = tf.equal(input_creative_id, 0)

    # (bs, max_seq_len, model_dim)
    attention_out = MultiHeadAttention(
        8, 79)([encodings, encodings, encodings, masks])

    # Add & Norm
    attention_out += encodings
    attention_out = LayerNormalization()(attention_out)
    # Feed-Forward
    ff = PositionWiseFeedForward(model_dim, feed_forward_size)
    ff_out = ff(attention_out)
    # Add & Norm
    # ff_out is (bs, 100, 128), but attention_out is (bs, 100, 256)
    ff_out += attention_out
    encodings = LayerNormalization()(ff_out)
    encodings = GlobalMaxPooling1D()(encodings)
    encodings = Dropout(0.2)(encodings)

    # output_gender = Dense(2, activation='softmax', name='gender')(encodings)
    output_age = Dense(10, activation='softmax', name='age')(encodings)

    model = Model(inputs=[
        input_creative_id, input_ad_id, input_product_id, input_advertiser_id,
        input_industry, input_product_category
    ],
                  outputs=[output_age])

    model.compile(
        optimizer=optimizers.Adam(2.5e-4),
        loss={
            # 'gender': losses.CategoricalCrossentropy(from_logits=False),
            'age': losses.CategoricalCrossentropy(from_logits=False)
        },
        # loss_weights=[0.4, 0.6],
        metrics=['accuracy'])
    return model
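
A hypothetical call site for the model above; DATA is assumed to carry the pretrained embedding matrices referenced in the embedding layers (e.g. DATA['creative_id_emb']), the six inputs are padded id sequences of length max_seq_len, and age_one_hot holds one-hot age labels. All names below are placeholders.

# Hypothetical usage with placeholder arrays.
model = get_age_model(DATA)
model.fit([creative_id_seq, ad_id_seq, product_id_seq,
           advertiser_id_seq, industry_seq, product_category_seq],
          age_one_hot, batch_size=256, epochs=5, validation_split=0.1)
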
Example #16
def build_xlnet_for_tf_estimator(inputs,
                                 num_token,
                                 num_layer,
                                 num_head,
                                 embedding_dim,
                                 attention_head_dim,
                                 feed_forward_dim,
                                 target_len,
                                 is_training,
                                 memory_len=None,
                                 dropout=0.0,
                                 attention_dropout=0.0,
                                 attention_type=None,
                                 shared_biases=True):
    input_ids, input_mask, segment_ids, cls_index, \
    p_mask, start_positions, end_positions, is_impossible = inputs

    attn_mask = get_attn_mask(input_mask)

    input_ids_trans = keras.layers.Lambda(lambda x: K.transpose(x))(input_ids)
    token_embed = keras.layers.Embedding(input_dim=num_token,
                                         output_dim=embedding_dim,
                                         name='Embed-Token')(input_ids_trans)
    token_embed_dropout = keras.layers.Dropout(rate=dropout,
                                               name='Embed-Token-Dropout')(
                                                   token_embed,
                                                   training=is_training)

    pos_emb = get_pos_emb([input_ids_trans, token_embed])
    pos_emb = keras.layers.Dropout(rate=dropout)(pos_emb, training=is_training)

    initializer = keras.initializers.get('normal')
    initializer.__setattr__("stddev", 0.02)

    segment_ids_trans = keras.layers.Lambda(lambda x: K.transpose(x))(
        segment_ids)
    segment_mat, segment_embed = RelativeSegmentEmbedding(
        num_layer=num_layer,
        num_head=num_head,
        attention_dim=attention_head_dim,
        initializer=initializer,
        name='Embed-Segment',
    )(segment_ids_trans)

    r_w_bias, r_r_bias, r_s_bias = RelativeBias(
        num_layer=num_layer,
        num_head=num_head,
        attention_head_dim=attention_head_dim,
        bias_initializer=initializer,
        name='Relative-Bias',
    )(input_ids_trans)

    content_output = token_embed_dropout
    if FLAGS.short_cut_fake:
        attn_mask = tf.constant(1.0, shape=[512, 512, 1, 1], dtype=np.float32)
        segment_mat = tf.constant(1.0,
                                  shape=[512, 512, 1, 2],
                                  dtype=np.float32)
        pos_emb = tf.constant(1.0, shape=[1024, 1, 1024], dtype=np.float32)
    if FLAGS.short_cut_fuse:
        attn_mask_flat = tf.reshape(attn_mask, [-1])
        segment_mat_flat = tf.reshape(segment_mat, [-1])
        segment_embed_flat = tf.reshape(segment_embed, [-1])
        pos_emb_flat = tf.reshape(pos_emb, [-1])
        r_w_bias_flat = tf.reshape(r_w_bias, [-1])
        r_r_bias_flat = tf.reshape(r_r_bias, [-1])
        r_s_bias_flat = tf.reshape(r_s_bias, [-1])
        fused = tf.concat([attn_mask_flat, segment_mat_flat, segment_embed_flat, \
                  pos_emb_flat, r_w_bias_flat, r_r_bias_flat, r_s_bias_flat], 0)

    for i in range(num_layer):
        attention = RelativeMultiHeadAttention(
            num_head=num_head,
            attention_head_dim=attention_head_dim,
            embedding_dim=embedding_dim,
            dropout=dropout,
            dropatt=attention_dropout,
            is_training=is_training,
            initializer=initializer,
            name='Attention-{}'.format(i + 1),
        )

        attention_add = tf.keras.layers.Add(
            name='Attention-Residual-{}'.format(i + 1))
        attention_layer_norm = LayerNormalization(
            name='Attention-Normal-{}'.format(i + 1))

        feed_forward = FeedForward(feed_forward_dim=feed_forward_dim,
                                   embedding_dim=embedding_dim,
                                   dropout_rate=dropout,
                                   kernel_initializer=initializer,
                                   activation=gelu,
                                   name='FeedForward-{}'.format(i + 1))
        feed_forward_add = tf.keras.layers.Add(
            name='FeedForward-Residual-{}'.format(i + 1))
        feed_forward_layer_norm = LayerNormalization(
            name='FeedForward-Normal-{}'.format(i + 1))

        segment_embed_i = keras.layers.Lambda(lambda x: x[i])(segment_embed)
        r_w_bias_i = keras.layers.Lambda(lambda x: x[i])(r_w_bias)
        r_r_bias_i = keras.layers.Lambda(lambda x: x[i])(r_r_bias)
        r_s_bias_i = keras.layers.Lambda(lambda x: x[i])(r_s_bias)
        if FLAGS.short_cut_fuse:
            attn_mask_flat, segment_mat_flat, segment_embed_flat, \
              pos_emb_flat, r_w_bias_flat, r_r_bias_flat, r_s_bias_flat = \
                tf.split(fused, [512*512*1, 512*512*2, 24*2*1024, \
                         1024*1024, 24*1024, 24*1024, 24*1024], 0)
            attn_mask = tf.reshape(attn_mask_flat, [512, 512, 1, 1])
            segment_mat = tf.reshape(segment_mat_flat, [512, 512, 1, 2])
            segment_embed = tf.reshape(segment_embed_flat, [24, 2, 16, 64])
            pos_emb = tf.reshape(pos_emb_flat, [1024, 1, 1024])
            r_w_bias = tf.reshape(r_w_bias_flat, [24, 16, 64])
            r_r_bias = tf.reshape(r_r_bias_flat, [24, 16, 64])
            r_s_bias = tf.reshape(r_s_bias_flat, [24, 16, 64])
            print(attn_mask, segment_mat, segment_embed, pos_emb, r_w_bias,
                  r_r_bias, r_s_bias)

        def _build_output(query):
            attention_input = query
            _output = attention([
                query, pos_emb, segment_embed_i, segment_mat, r_w_bias_i,
                r_r_bias_i, r_s_bias_i, attn_mask
            ])
            _output = attention_add([attention_input, _output])
            _output = attention_layer_norm(_output)
            feed_forward_input = keras.layers.Lambda(lambda x: K.identity(x))(
                _output)
            _output = feed_forward(_output, training=is_training)
            _output = feed_forward_add([feed_forward_input, _output])
            _output = feed_forward_layer_norm(_output)
            return _output

        content_output = _build_output(content_output)

    output = keras.layers.Dropout(rate=dropout)(content_output,
                                                training=is_training)

    xlnet_loss = XLnetLoss(d_model=embedding_dim,
                           seq_len=target_len,
                           kernel_initializer=initializer,
                           name="XLNET_LOSS")([
                               cls_index, start_positions, end_positions,
                               is_impossible, p_mask, output
                           ])

    return xlnet_loss
Example #17
def buildNetwork(nettype):

    unfreeze = False
    n_block, n_self = 3, 10

    l_in = layers.Input(shape=(None, ))
    l_mask = layers.Input(shape=(None, ))

    #transformer part
    #positional encodings for the input sequence
    l_pos = PositionLayer(EMBEDDING_SIZE)(l_mask)
    l_left_mask = MaskLayerLeft()(l_mask)

    #encoder
    l_voc = layers.Embedding(input_dim=vocab_size,
                             output_dim=EMBEDDING_SIZE,
                             input_length=None,
                             trainable=unfreeze)
    l_embed = layers.Add()([l_voc(l_in), l_pos])

    for layer in range(n_block):

        #self attention
        l_o = [
            SelfLayer(EMBEDDING_SIZE, KEY_SIZE, trainable=unfreeze)(
                [l_embed, l_embed, l_embed, l_left_mask])
            for i in range(n_self)
        ]

        l_con = layers.Concatenate()(l_o)
        l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE,
                                                      trainable=unfreeze),
                                         trainable=unfreeze)(l_con)
        if unfreeze == True: l_dense = layers.Dropout(rate=0.1)(l_dense)
        l_add = layers.Add()([l_dense, l_embed])
        l_att = LayerNormalization(trainable=unfreeze)(l_add)

        #position-wise
        l_c1 = layers.Conv1D(N_HIDDEN,
                             1,
                             activation='relu',
                             trainable=unfreeze)(l_att)
        l_c2 = layers.Conv1D(EMBEDDING_SIZE, 1, trainable=unfreeze)(l_c1)
        if unfreeze == True: l_c2 = layers.Dropout(rate=0.1)(l_c2)
        l_ff = layers.Add()([l_att, l_c2])
        l_embed = LayerNormalization(trainable=unfreeze)(l_ff)

    #end of Transformer's part
    l_encoder = l_embed

    #text-cnn part
    #https://github.com/deepchem/deepchem/blob/b7a6d3d759145d238eb8abaf76183e9dbd7b683c/deepchem/models/tensorgraph/models/text_cnn.py

    l_in2 = layers.Input(shape=(None, EMBEDDING_SIZE))

    kernel_sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20]
    num_filters = [100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160, 160]

    l_pool = []
    for i in range(len(kernel_sizes)):
        l_conv = layers.Conv1D(num_filters[i],
                               kernel_size=kernel_sizes[i],
                               padding='valid',
                               kernel_initializer='normal',
                               activation='relu')(l_in2)
        l_maxpool = layers.Lambda(lambda x: tf.reduce_max(x, axis=1))(l_conv)
        l_pool.append(l_maxpool)

    l_cnn = layers.Concatenate(axis=1)(l_pool)
    l_cnn_drop = layers.Dropout(rate=0.25)(l_cnn)

    #dense part
    l_dense = layers.Dense(N_HIDDEN_CNN, activation='relu')(l_cnn_drop)

    #https://github.com/ParikhKadam/Highway-Layer-Keras
    transform_gate = layers.Dense(
        units=N_HIDDEN_CNN,
        activation="sigmoid",
        bias_initializer=tf.keras.initializers.Constant(-1))(l_dense)

    carry_gate = layers.Lambda(lambda x: 1.0 - x,
                               output_shape=(N_HIDDEN_CNN, ))(transform_gate)
    transformed_data = layers.Dense(units=N_HIDDEN_CNN,
                                    activation="relu")(l_dense)
    transformed_gated = layers.Multiply()([transform_gate, transformed_data])
    identity_gated = layers.Multiply()([carry_gate, l_dense])

    l_highway = layers.Add()([transformed_gated, identity_gated])

    if nettype == "regression":
        l_out = layers.Dense(1, activation='linear',
                             name="Regression")(l_highway)
        mdl = tf.keras.Model([l_in2], l_out)
        mdl.compile(optimizer='adam', loss='mse', metrics=['mse'])
    else:
        l_out = layers.Dense(2, activation='softmax',
                             name="Classification")(l_highway)
        mdl = tf.keras.Model([l_in2], l_out)
        mdl.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=['acc'])

    K.set_value(mdl.optimizer.lr, 1.0e-4)

    encoder = tf.keras.Model([l_in, l_mask], l_encoder)
    encoder.compile(optimizer='adam', loss='mse')
    encoder.set_weights(np.load("embeddings.npy", allow_pickle=True))

    return mdl, encoder
Example #18
def buildNetwork(n_block, n_self):
    print("Building network ...")

    # product
    l_in = layers.Input(shape=(None, ))
    l_mask = layers.Input(shape=(None, ))

    # reagents
    l_dec = layers.Input(shape=(None, ))
    l_dmask = layers.Input(shape=(None, ))

    # positional encodings for product and reagents, respectively
    l_pos = PositionLayer(EMBEDDING_SIZE)(l_mask)
    l_dpos = PositionLayer(EMBEDDING_SIZE)(l_dmask)

    l_emask = MaskLayerRight()([l_dmask, l_mask])
    l_right_mask = MaskLayerTriangular()(l_dmask)
    l_left_mask = MaskLayerLeft()(l_mask)

    # encoder
    l_voc = layers.Embedding(input_dim=vocab_size,
                             output_dim=EMBEDDING_SIZE,
                             input_length=None)

    l_embed = layers.Add()([l_voc(l_in), l_pos])
    l_embed = layers.Dropout(rate=0.1)(l_embed)

    for layer in range(n_block):
        # self attention
        l_o = [
            SelfLayer(EMBEDDING_SIZE,
                      KEY_SIZE)([l_embed, l_embed, l_embed, l_left_mask])
            for _ in range(n_self)
        ]

        l_con = layers.Concatenate()(l_o)
        l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE))(l_con)
        l_drop = layers.Dropout(rate=0.1)(l_dense)
        l_add = layers.Add()([l_drop, l_embed])
        l_att = LayerNormalization()(l_add)

        # position-wise
        l_c1 = layers.Conv1D(N_HIDDEN, 1, activation='relu')(l_att)
        l_c2 = layers.Conv1D(EMBEDDING_SIZE, 1)(l_c1)
        l_drop = layers.Dropout(rate=0.1)(l_c2)
        l_ff = layers.Add()([l_att, l_drop])
        l_embed = LayerNormalization()(l_ff)

    # bottleneck
    l_encoder = l_embed

    l_embed = layers.Add()([l_voc(l_dec), l_dpos])
    l_embed = layers.Dropout(rate=0.1)(l_embed)

    for layer in range(n_block):
        # self attention
        l_o = [
            SelfLayer(EMBEDDING_SIZE,
                      KEY_SIZE)([l_embed, l_embed, l_embed, l_right_mask])
            for i in range(n_self)
        ]

        l_con = layers.Concatenate()(l_o)
        l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE))(l_con)
        l_drop = layers.Dropout(rate=0.1)(l_dense)
        l_add = layers.Add()([l_drop, l_embed])
        l_att = LayerNormalization()(l_add)

        # attention to the encoder
        l_o = [
            SelfLayer(EMBEDDING_SIZE,
                      KEY_SIZE)([l_att, l_encoder, l_encoder, l_emask])
            for i in range(n_self)
        ]
        l_con = layers.Concatenate()(l_o)
        l_dense = layers.TimeDistributed(layers.Dense(EMBEDDING_SIZE))(l_con)
        l_drop = layers.Dropout(rate=0.1)(l_dense)
        l_add = layers.Add()([l_drop, l_att])
        l_att = LayerNormalization()(l_add)

        # position-wise
        l_c1 = layers.Conv1D(N_HIDDEN, 1, activation='relu')(l_att)
        l_c2 = layers.Conv1D(EMBEDDING_SIZE, 1)(l_c1)
        l_drop = layers.Dropout(rate=0.1)(l_c2)
        l_ff = layers.Add()([l_att, l_drop])
        l_embed = LayerNormalization()(l_ff)

    l_out = layers.TimeDistributed(layers.Dense(vocab_size,
                                                use_bias=False))(l_embed)

    mdl = tf.keras.Model([l_in, l_mask, l_dec, l_dmask], l_out)

    def masked_loss(y_true, y_pred):
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_true,
                                                          logits=y_pred)
        mask = tf.cast(tf.not_equal(tf.reduce_sum(y_true, -1), 0), 'float32')
        loss = tf.reduce_sum(loss * mask, -1) / tf.reduce_sum(mask, -1)
        loss = K.mean(loss)
        return loss

    def masked_acc(y_true, y_pred):
        mask = tf.cast(tf.not_equal(tf.reduce_sum(y_true, -1), 0), 'float32')
        eq = K.cast(
            K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)),
            'float32')
        eq = tf.reduce_sum(eq * mask, -1) / tf.reduce_sum(mask, -1)
        eq = K.mean(eq)
        return eq

    mdl.compile(optimizer='adam',
                loss=masked_loss,
                metrics=['accuracy', masked_acc])
    mdl.summary()

    # Split the graph for faster execution: first compute the encoder's values,
    # then feed them, together with the product mask, to the decoder as additional inputs.
    def mdl_encoder(product):
        v = gen_left([product])
        enc = l_encoder.eval(feed_dict={l_in: v[0], l_mask: v[1], l_pos: v[2]})
        return enc, v[1]

    # And the decoder
    def mdl_decoder(res, product_encoded, product_mask, t=1.0):
        v = gen_right([res])
        d = l_out.eval(
            feed_dict={
                l_encoder: product_encoded,
                l_dec: v[0],
                l_dmask: v[1],
                l_mask: product_mask,
                l_dpos: v[2]
            })
        prob = d[0, len(res), :] / t
        prob = np.exp(prob) / np.sum(np.exp(prob))
        return prob

    return mdl, mdl_encoder, mdl_decoder
Example #19
class LayerNormLSTMCell(mx.rnn.LSTMCell):
    """
    Long-Short Term Memory (LSTM) network cell with layer normalization across gates.
    Based on Jimmy Lei Ba et al: Layer Normalization (https://arxiv.org/pdf/1607.06450.pdf)

    :param num_hidden: number of RNN hidden units. Number of units in output symbol.
    :param prefix: prefix for name of layers (and name of weight if params is None).
    :param params: RNNParams or None. Container for weight sharing between cells. Created if None.
    :param forget_bias: bias added to forget gate, default 1.0. Jozefowicz et al. 2015 recommends setting this to 1.0.
    :param norm_scale: scale/gain for layer normalization.
    :param norm_shift: shift/bias after layer normalization.
    """
    def __init__(self,
                 num_hidden: int,
                 prefix: str = 'lnlstm_',
                 params: Optional[mx.rnn.RNNParams] = None,
                 forget_bias: float = 1.0,
                 norm_scale: float = 1.0,
                 norm_shift: float = 0.0) -> None:
        super(LayerNormLSTMCell, self).__init__(num_hidden, prefix, params,
                                                forget_bias)
        self._iN = LayerNormalization(
            num_hidden=num_hidden * 4,
            prefix="%si2h" % self._prefix,
            scale=self.params.get('i2h_scale',
                                  shape=(num_hidden * 4, ),
                                  init=mx.init.Constant(value=norm_scale)),
            shift=self.params.get('i2h_shift',
                                  shape=(num_hidden * 4, ),
                                  init=mx.init.Constant(value=norm_shift)))
        self._hN = LayerNormalization(
            num_hidden=num_hidden * 4,
            prefix="%sh2h" % self._prefix,
            scale=self.params.get('h2h_scale',
                                  shape=(num_hidden * 4, ),
                                  init=mx.init.Constant(value=norm_scale)),
            shift=self.params.get('h2h_shift',
                                  shape=(num_hidden * 4, ),
                                  init=mx.init.Constant(value=norm_shift)))
        self._cN = LayerNormalization(
            num_hidden=num_hidden,
            prefix="%sc" % self._prefix,
            scale=self.params.get('c_scale',
                                  shape=(num_hidden, ),
                                  init=mx.init.Constant(value=norm_scale)),
            shift=self.params.get('c_shift',
                                  shape=(num_hidden, ),
                                  init=mx.init.Constant(value=norm_shift)))
        self._shape_fix = None

    def __call__(self, inputs, states):
        self._counter += 1
        name = '%st%d_' % (self._prefix, self._counter)
        i2h = mx.sym.FullyConnected(data=inputs,
                                    weight=self._iW,
                                    bias=self._iB,
                                    num_hidden=self._num_hidden * 4,
                                    name='%si2h' % name)
        if self._counter == 0:
            self._shape_fix = mx.sym.zeros_like(i2h)
        else:
            assert self._shape_fix is not None
        h2h = mx.sym.FullyConnected(data=states[0],
                                    weight=self._hW,
                                    bias=self._hB,
                                    num_hidden=self._num_hidden * 4,
                                    name='%sh2h' % name)
        gates = self._iN.normalize(i2h) + self._hN.normalize(self._shape_fix +
                                                             h2h)
        # pylint: disable=unbalanced-tuple-unpacking
        in_gate, forget_gate, in_transform, out_gate = mx.sym.split(
            gates, num_outputs=4, axis=1, name="%sslice" % name)
        in_gate = mx.sym.Activation(in_gate,
                                    act_type="sigmoid",
                                    name='%si' % name)
        forget_gate = mx.sym.Activation(forget_gate,
                                        act_type="sigmoid",
                                        name='%sf' % name)
        in_transform = mx.sym.Activation(in_transform,
                                         act_type="tanh",
                                         name='%sc' % name)
        out_gate = mx.sym.Activation(out_gate,
                                     act_type="sigmoid",
                                     name='%so' % name)
        next_c = mx.sym._internal._plus(forget_gate * states[1],
                                        in_gate * in_transform,
                                        name='%sstate' % name)
        next_h = mx.sym._internal._mul(out_gate,
                                       mx.sym.Activation(
                                           self._cN.normalize(next_c),
                                           act_type="tanh"),
                                       name='%sout' % name)
        return next_h, [next_h, next_c]
Example #20
    def __init__(self,
                 num_token,
                 num_layer,
                 num_head,
                 embedding_dim,
                 attention_head_dim,
                 feed_forward_dim,
                 target_len,
                 is_training,
                 memory_len=None,
                 dropout=0.0,
                 attention_dropout=0.0,
                 attention_type=None,
                 shared_biases=True):
        self.num_layer = num_layer
        self.dropout = dropout
        self.attention_dropout = attention_dropout

        self.token_embed = keras.layers.Embedding(input_dim=num_token,
                                                  output_dim=embedding_dim,
                                                  name='Embed-Token')

        initializer = keras.initializers.get('normal')
        initializer.__setattr__("stddev", 0.02)

        self.segment_ids_trans = keras.layers.Lambda(lambda x: K.transpose(x))
        self.segment_mat_embed = RelativeSegmentEmbedding(
            num_layer=num_layer,
            num_head=num_head,
            attention_dim=attention_head_dim,
            initializer=initializer,
            name='Embed-Segment')

        self.relative_bias = RelativeBias(
            num_layer=num_layer,
            num_head=num_head,
            attention_head_dim=attention_head_dim,
            bias_initializer=initializer,
            name='Relative-Bias')

        self.attention = []
        self.attention_add = []
        self.attention_layer_norm = []
        self.feed_forward = []
        self.feed_forward_add = []
        self.feed_forward_layer_norm = []
        for i in range(num_layer):
            self.attention.append(
                RelativeMultiHeadAttention(
                    num_head=num_head,
                    attention_head_dim=attention_head_dim,
                    embedding_dim=embedding_dim,
                    dropout=dropout,
                    dropatt=attention_dropout,
                    is_training=is_training,
                    initializer=initializer,
                    name='Attention-{}'.format(i + 1),
                ))

            self.attention_add.append(
                tf.keras.layers.Add(name='Attention-Residual-{}'.format(i +
                                                                        1)))
            self.attention_layer_norm.append(
                LayerNormalization(name='Attention-Normal-{}'.format(i + 1)))

            self.feed_forward.append(
                FeedForward(feed_forward_dim=feed_forward_dim,
                            embedding_dim=embedding_dim,
                            dropout_rate=dropout,
                            kernel_initializer=initializer,
                            activation=gelu,
                            name='FeedForward-{}'.format(i + 1)))
            self.feed_forward_add.append(
                tf.keras.layers.Add(name='FeedForward-Residual-{}'.format(i +
                                                                          1)))
            self.feed_forward_layer_norm.append(
                LayerNormalization(name='FeedForward-Normal-{}'.format(i + 1)))
        self.xlnet_loss = XLnetLoss(d_model=embedding_dim,
                                    seq_len=target_len,
                                    kernel_initializer=initializer,
                                    name="XLNET_LOSS")
Example #21
def build_transformer_xl(units,
                         embed_dim,
                         hidden_dim,
                         num_token,
                         num_block,
                         num_head,
                         batch_size,
                         memory_len,
                         target_len,
                         dropout=0.0,
                         attention_dropout=0.0,
                         cutoffs=None,
                         div_val=1,
                         force_projection=None,
                         bind_embeddings=True,
                         bind_projections=True,
                         clamp_len=None,
                         share_biases=True):
    """Build transformer-XL model.

    :param units: Units inside the transformer.
    :param embed_dim: Dimension of embeddings.
    :param hidden_dim: Dimension inside position-wise feed-forward layer.
    :param num_token: Number of distinct input tokens.
    :param num_block: Number of basic encoder blocks.
    :param num_head: Number of heads for attention.
    :param batch_size: Maximum batch size.
    :param memory_len: The maximum length of memories.
    :param target_len: The length of prediction block.
    :param dropout: General dropout rate.
    :param attention_dropout: Dropout rate inside attention layer.
    :param cutoffs: Cutoffs of adaptive embedding.
    :param div_val: Scale factor of adaptive embedding.
    :param force_projection: Add the projection even when the dimensions are equal.
    :param bind_embeddings: Whether to bind embeddings to adaptive softmax.
    :param bind_projections: Whether to bind projections to adaptive softmax.
    :param clamp_len: The maximum value of relative position.
    :param share_biases: Whether to use the same biases for all layers.
    :return: The built model.
    """
    token_input = keras.layers.Input(shape=(target_len, ), name='Input-Token')
    memory_length_input = keras.layers.Input(shape=(1, ),
                                             name='Input-Memory-Length')
    inputs = [token_input, memory_length_input]

    results = AdaptiveEmbedding(
        input_dim=num_token,
        output_dim=units,
        embed_dim=embed_dim,
        cutoffs=cutoffs,
        div_val=div_val,
        mask_zero=True,
        force_projection=force_projection,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(token_input)
    token_embed, embedding_weights = results[0], results[1:]
    token_embed = Scale(scale=np.sqrt(units),
                        name='Embed-Token-Scaled')(token_embed)
    last_memory = Memory(
        batch_size=batch_size,
        memory_len=memory_len,
        target_len=target_len,
        output_dim=units,
        name='Memory-0',
    )([token_embed, memory_length_input])

    position_embed = PositionalEmbedding(
        output_dim=units,
        clamp_len=clamp_len,
        name='Embed-Position',
    )([token_input, last_memory])

    if 0.0 < dropout < 1.0:
        token_embed = keras.layers.Dropout(
            rate=dropout, name='Embed-Token-Dropped')(token_embed)
        position_embed = keras.layers.Dropout(
            rate=dropout, name='Embed-Position-Dropped')(position_embed)

    context_bias, relative_bias = None, None
    if share_biases:
        context_bias, relative_bias = RelativeBias(units=units,
                                                   name='Biases')(last_memory)

    outputs = [token_embed]
    for i in range(num_block):
        block_input, block_output = outputs[-1], outputs[-1]
        if not share_biases:
            context_bias, relative_bias = RelativeBias(
                units=units, name='Biases-{}'.format(i + 1))(last_memory)
        block_output = RelativePartialMultiHeadSelfAttention(
            units=units,
            num_head=num_head,
            use_bias=False,
            attention_dropout=attention_dropout,
            name='Attention-{}'.format(i + 1),
        )([
            block_output, position_embed, last_memory, context_bias,
            relative_bias
        ])
        block_output = keras.layers.Add(name='Attention-Res-{}'.format(i + 1))(
            [block_input, block_output])
        if 0.0 < dropout < 1.0:
            block_output = keras.layers.Dropout(
                rate=dropout,
                name='Attention-Dropped-{}'.format(i + 1))(block_output)
        block_output = LayerNormalization(
            name='Attention-Norm-{}'.format(i + 1))(block_output)

        block_input = block_output
        block_output = FeedForward(
            units=hidden_dim,
            dropout_rate=dropout,
            name='FeedForward-{}'.format(i + 1),
        )(block_output)
        block_output = keras.layers.Add(
            name='FeedForward-Res-{}'.format(i +
                                             1))([block_input, block_output])
        if 0.0 < dropout < 1.0:
            block_output = keras.layers.Dropout(
                rate=dropout,
                name='FeedForward-Dropped-{}'.format(i + 1))(block_output)
        block_output = LayerNormalization(
            name='FeedForward-Norm-{}'.format(i + 1))(block_output)

        if i < num_block - 1:
            last_memory = Memory(
                batch_size=batch_size,
                memory_len=memory_len,
                target_len=target_len,
                output_dim=units,
                name='Memory-{}'.format(i + 1),
            )([block_output, memory_length_input])

        outputs.append(block_output)

    softmax = AdaptiveSoftmax(
        input_dim=units,
        output_dim=num_token,
        embed_dim=embed_dim,
        cutoffs=cutoffs,
        div_val=div_val,
        force_projection=force_projection,
        bind_embeddings=bind_embeddings,
        bind_projections=bind_projections,
        name='Softmax',
    )(outputs[-1:] + embedding_weights)

    model = keras.models.Model(inputs=inputs, outputs=softmax)
    return model
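
A hypothetical configuration for a small Transformer-XL built with the function above; the hyperparameter values and the choice of loss are illustrative only.

# Hypothetical usage with illustrative hyperparameters.
model = build_transformer_xl(units=256, embed_dim=256, hidden_dim=1024,
                             num_token=10000, num_block=6, num_head=8,
                             batch_size=32, memory_len=128, target_len=64,
                             dropout=0.1, attention_dropout=0.1)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()
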
Example #22
class LayerNormGRUCell(mx.rnn.GRUCell):
    """
    Gated Recurrent Unit (GRU) network cell with layer normalization across gates.
    Based on Jimmy Lei Ba et al: Layer Normalization (https://arxiv.org/pdf/1607.06450.pdf)

    :param num_hidden: number of RNN hidden units. Number of units in output symbol.
    :param prefix: prefix for name of layers (and name of weight if params is None).
    :param params: RNNParams or None. Container for weight sharing between cells. Created if None.
    :param norm_scale: scale/gain for layer normalization.
    :param norm_shift: shift/bias after layer normalization.
    """
    def __init__(self,
                 num_hidden: int,
                 prefix: str = 'lngru_',
                 params: Optional[mx.rnn.RNNParams] = None,
                 norm_scale: float = 1.0,
                 norm_shift: float = 0.0) -> None:
        super(LayerNormGRUCell, self).__init__(num_hidden, prefix, params)
        self._iN = LayerNormalization(
            num_hidden=num_hidden * 3,
            prefix="%si2h" % self._prefix,
            scale=self.params.get('i2h_scale',
                                  shape=(num_hidden * 3, ),
                                  init=mx.init.Constant(value=norm_scale)),
            shift=self.params.get('i2h_shift',
                                  shape=(num_hidden * 3, ),
                                  init=mx.init.Constant(value=norm_shift)))
        self._hN = LayerNormalization(
            num_hidden=num_hidden * 3,
            prefix="%sh2h" % self._prefix,
            scale=self.params.get('h2h_scale',
                                  shape=(num_hidden * 3, ),
                                  init=mx.init.Constant(value=norm_scale)),
            shift=self.params.get('h2h_shift',
                                  shape=(num_hidden * 3, ),
                                  init=mx.init.Constant(value=norm_shift)))
        self._shape_fix = None

    def __call__(self, inputs, states):
        self._counter += 1

        seq_idx = self._counter
        name = '%st%d_' % (self._prefix, seq_idx)
        prev_state_h = states[0]

        i2h = mx.sym.FullyConnected(data=inputs,
                                    weight=self._iW,
                                    bias=self._iB,
                                    num_hidden=self._num_hidden * 3,
                                    name="%s_i2h" % name)
        h2h = mx.sym.FullyConnected(data=prev_state_h,
                                    weight=self._hW,
                                    bias=self._hB,
                                    num_hidden=self._num_hidden * 3,
                                    name="%s_h2h" % name)
        if self._counter == 0:
            self._shape_fix = mx.sym.zeros_like(i2h)
        else:
            assert self._shape_fix is not None

        i2h = self._iN.normalize(i2h)
        h2h = self._hN.normalize(self._shape_fix + h2h)

        # pylint: disable=unbalanced-tuple-unpacking
        i2h_r, i2h_z, i2h = mx.sym.split(i2h,
                                         num_outputs=3,
                                         name="%s_i2h_slice" % name)
        h2h_r, h2h_z, h2h = mx.sym.split(h2h,
                                         num_outputs=3,
                                         name="%s_h2h_slice" % name)

        reset_gate = mx.sym.Activation(i2h_r + h2h_r,
                                       act_type="sigmoid",
                                       name="%s_r_act" % name)
        update_gate = mx.sym.Activation(i2h_z + h2h_z,
                                        act_type="sigmoid",
                                        name="%s_z_act" % name)

        next_h_tmp = mx.sym.Activation(i2h + reset_gate * h2h,
                                       act_type="tanh",
                                       name="%s_h_act" % name)

        next_h = mx.sym._internal._plus((1. - update_gate) * next_h_tmp,
                                        update_gate * prev_state_h,
                                        name='%sout' % name)

        return next_h, [next_h]