def __init__(self, params, is_train, mode=None):
    """Build embedding layers, encoder/decoder stacks and the initializer.

    Args:
      params: hyperparameter object defining layer sizes, dropout values, etc.
      is_train: bool, whether the model is being built for training.
      mode: optional explicit ModeKeys value; when None it is derived
        from `is_train` (TRAIN if training, PREDICT otherwise).
    """
    self.is_train = is_train
    self.params = params
    # An explicitly supplied mode takes precedence over the training flag.
    if mode is not None:
        self.mode = mode
    else:
        self.mode = ModeKeys.TRAIN if self.is_train else ModeKeys.PREDICT
    if params.shared_embedding_softmax_weights:
        print("sharing embedding!!!")
        # A single weight matrix doubles as encoder embedding, decoder
        # embedding and output softmax projection.
        shared = embedding_layer.EmbeddingSharedWeights(
            params.vocab_size, params.hidden_size)
        self.embedding_softmax_layer = shared
        self.encoder_embedding_layer = shared
        self.decoder_embedding_layer = shared
        self.decoder_softmax_layer = shared
    else:
        print("not sharing embedding!!!")
        # Independent weight matrices for source/target embeddings and the
        # output softmax.
        self.encoder_embedding_layer = embedding_layer.EmbeddingWeights(
            params.source_vocab_size, params.hidden_size, "source_embedding")
        self.decoder_embedding_layer = embedding_layer.EmbeddingWeights(
            params.target_vocab_size, params.hidden_size, "target_embedding")
        self.decoder_softmax_layer = embedding_layer.EmbeddingWeights(
            params.target_vocab_size, params.hidden_size, 'soft_max')
    # done
    self.encoder_stack = EncoderDecoder.EncoderStack(params, is_train, self.mode)
    self.decoder_stack = EncoderDecoder.DecoderStack(params, is_train, self.mode)
    self._initializer = tf.variance_scaling_initializer(
        self.params.initializer_gain, mode="fan_avg", distribution="uniform")
def __init__(self, params, is_train, mode=None, scope=None):
    """Initialize layers to build Transformer model.

    Args:
      params: hyperparameter object defining layer sizes, dropout values, etc.
      is_train: boolean indicating whether the model is in training mode.
        Used to determine if dropout layers should be added.
      mode: optional explicit ModeKeys value; when None it is derived
        from `is_train` (TRAIN if training, PREDICT otherwise).
      scope: optional variable-scope name under which the embedding
        variables are created.
    """
    # Dropout is routed through a placeholder so a single graph can run
    # with rate 0.0 at inference and a nonzero rate (e.g. 0.1) in training.
    self.dropout_rate = tf.placeholder_with_default(0.0, shape=[], name="dropout_rate")
    self.is_train = is_train
    self.params = params
    self.name_scope = scope
    # reset dropout rate using placeholder,
    # when inference, the dropout_rate is 0.0, when training is 0.1
    self.params.layer_postprocess_dropout = self.dropout_rate
    self.params.attention_dropout = self.dropout_rate
    self.params.relu_dropout = self.dropout_rate
    if mode is not None:
        self.mode = mode
    elif self.is_train:
        self.mode = ModeKeys.TRAIN
    else:
        self.mode = ModeKeys.PREDICT
    self.initializer = tf.variance_scaling_initializer(
        self.params.initializer_gain, mode="fan_avg", distribution="uniform")
    # done
    self.encoder_stack = EncoderStack(params, is_train, self.mode)
    self.decoder_stack = DecoderStack(params, is_train, self.mode)
    with tf.variable_scope(self.name_scope):
        if params.shared_embedding_softmax_weights:
            # One weight matrix shared by embeddings and the output softmax.
            self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
                params.vocab_size, params.hidden_size)
            self.encoder_embedding_layer = self.embedding_softmax_layer
            self.decoder_embedding_layer = self.embedding_softmax_layer
            self.decoder_softmax_layer = self.embedding_softmax_layer
        else:
            self.encoder_embedding_layer = embedding_layer.EmbeddingWeights(
                params.source_vocab_size, params.hidden_size, "source_embedding")
            self.decoder_embedding_layer = embedding_layer.EmbeddingWeights(
                params.target_vocab_size, params.hidden_size, "target_embedding")
            # Fixed typo: this scope was 'sot_max' while every sibling
            # constructor uses 'soft_max'. NOTE(review): this renames the
            # variable scope — checkpoints saved under the old 'sot_max'
            # name will not restore without a variable-name mapping.
            self.decoder_softmax_layer = embedding_layer.EmbeddingWeights(
                params.target_vocab_size, params.hidden_size, 'soft_max')
def init_embed(self, name_scope):
    """Create the embedding/softmax layers under `name_scope`.

    Uses tf.AUTO_REUSE so repeated calls reuse the same variables.

    Args:
      name_scope: variable-scope name wrapping the embedding variables.
    """
    with tf.variable_scope(name_scope, initializer=self._initializer, reuse=tf.AUTO_REUSE):
        params = self.params
        if params.shared_embedding_softmax_weights:
            print("sharing embedding!!!")
            # Single matrix reused for both embeddings and the softmax.
            shared = embedding_layer.EmbeddingSharedWeights(
                params.vocab_size, params.hidden_size)
            self.embedding_softmax_layer = shared
            self.encoder_embedding_layer = shared
            self.decoder_embedding_layer = shared
            self.decoder_softmax_layer = shared
        else:
            print("not sharing embedding!!!")
            # Distinct matrices for source/target embeddings and softmax.
            self.encoder_embedding_layer = embedding_layer.EmbeddingWeights(
                params.source_vocab_size, params.hidden_size, "source_embedding")
            self.decoder_embedding_layer = embedding_layer.EmbeddingWeights(
                params.target_vocab_size, params.hidden_size, "target_embedding")
            self.decoder_softmax_layer = embedding_layer.EmbeddingWeights(
                params.target_vocab_size, params.hidden_size, 'soft_max')
def __init__(self, params, is_train, mode=None):
    """Build embedding layers, encoder/decoder stacks and the initializer.

    Args:
      params: hyperparameter object defining layer sizes, dropout values, etc.
      is_train: bool, whether the model is being built for training.
      mode: optional explicit ModeKeys value; when None it is derived
        from `is_train` (TRAIN if training, PREDICT otherwise).
    """
    self.is_train = is_train
    self.params = params
    # An explicitly supplied mode takes precedence over the training flag.
    if mode is not None:
        self.mode = mode
    elif self.is_train:
        self.mode = ModeKeys.TRAIN
    else:
        self.mode = ModeKeys.PREDICT
    # Removed a commented-out '/cpu:0' dropout-placeholder experiment here;
    # it was dead code and contained a bug (assigned `self.relu_dropout`
    # instead of `self.params.relu_dropout`). See the placeholder-based
    # constructor variant in this file for the working approach.
    if params.shared_embedding_softmax_weights:
        print("sharing embedding!!!")
        # One weight matrix shared by embeddings and the output softmax.
        self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
            params.vocab_size, params.hidden_size)
        self.encoder_embedding_layer = self.embedding_softmax_layer
        self.decoder_embedding_layer = self.embedding_softmax_layer
        self.decoder_softmax_layer = self.embedding_softmax_layer
    else:
        print("not sharing embedding!!!")
        # Independent weight matrices for source/target embeddings and the
        # output softmax.
        self.encoder_embedding_layer = embedding_layer.EmbeddingWeights(
            params.source_vocab_size, params.hidden_size, "source_embedding")
        self.decoder_embedding_layer = embedding_layer.EmbeddingWeights(
            params.target_vocab_size, params.hidden_size, "target_embedding")
        self.decoder_softmax_layer = embedding_layer.EmbeddingWeights(
            params.target_vocab_size, params.hidden_size, 'soft_max')
    # done
    self.encoder_stack = EncoderDecoder.EncoderStack(params, is_train, self.mode)
    self.decoder_stack = EncoderDecoder.DecoderStack(params, is_train, self.mode)
    self._initializer = tf.variance_scaling_initializer(
        self.params.initializer_gain, mode="fan_avg", distribution="uniform")
variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True) norm_x = (x - mean) * tf.rsqrt(variance + epsilon) return norm_x * self.scale + self.bias if __name__ == "__main__": import os tf.enable_eager_execution() os.environ["CUDA_VISIBLE_DEVICES"] = "0" params = model_params.TransformerBaseParams() x_inputs = tf.constant([[1, 2, 3, 0, 0], [3, 4, 5, 6, 8]], dtype=tf.int32) Enc_Embedding = embedding_layer.EmbeddingWeights(params.source_vocab_size, params.hidden_size, "source_embedding") embedded_inputs = Enc_Embedding( x_inputs, not ModeKeys.is_predict_one(ModeKeys.TRAIN)) print(embedded_inputs.shape) attention_bias = model_utils.get_padding_bias(x_inputs) print(attention_bias.shape) encoder_stack = EncoderStack(params, is_train=True, mode=ModeKeys.TRAIN) enc_out = encoder_stack(embedded_inputs, attention_bias, None) print(enc_out.shape) decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias( 10) self_attention_bias = decoder_self_attention_bias[:, :, 0:1, :1] print(self_attention_bias) attention_bias = model_utils.get_padding_bias(x_inputs) cache = {