def logits(self) -> tf.Tensor:
    """Project the encoder temporal states to output logits.

    The projection has ``len(self.vocabulary) + 1`` output units.

    Returns:
        The logits transposed with ``perm=[1, 0, 2]`` (time major).
    """
    output_size = len(self.vocabulary) + 1
    states = self.encoder.temporal_states

    projection_w = get_variable(
        name="state_to_word_W",
        shape=[states.shape[2], output_size],
        initializer=tf.random_uniform_initializer(-0.5, 0.5))
    projection_b = get_variable(
        name="state_to_word_b",
        shape=[output_size],
        initializer=tf.zeros_initializer())

    # The 3-D encoder hidden states are multiplied by the 2-D weight matrix
    # by means of a 1-by-1 convolution (a similar trick can be found in the
    # attention computation).
    states_4d = tf.expand_dims(states, 2)
    kernel = tf.expand_dims(tf.expand_dims(projection_w, 0), 0)
    projected_4d = tf.nn.conv2d(states_4d, kernel, [1, 1, 1, 1], "SAME")
    projected = tf.squeeze(projected_4d, axis=2)

    bias_3d = tf.expand_dims(tf.expand_dims(projection_b, 0), 0)
    biased = projected + bias_3d

    return tf.transpose(biased, perm=[1, 0, 2])  # time major
def __init__(self,
             name: str,
             dimension: int,
             data_id: str,
             output_shape: int = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Create the input vector placeholder and its optional projection.

    Args:
        name: Name of the model part.
        dimension: Size of the input vector; must be positive.
        data_id: Identifier stored as ``self.data_id``.
        output_shape: Optional size of the projected vector; must be
            positive when given. The projection is only built when it
            differs from ``dimension``.
        save_checkpoint: Forwarded to ``ModelPart.__init__``.
        load_checkpoint: Forwarded to ``ModelPart.__init__``.
        initializers: Forwarded to ``ModelPart.__init__``.

    Raises:
        ValueError: If ``dimension`` or a given ``output_shape`` is not
            positive.
    """
    ModelPart.__init__(self, name, save_checkpoint, load_checkpoint,
                       initializers)
    check_argument_types()

    if dimension <= 0:
        raise ValueError("Input vector dimension must be positive.")
    if output_shape is not None and output_shape <= 0:
        raise ValueError("Output vector dimension must be positive.")

    self.vector = tf.placeholder(tf.float32, shape=[None, dimension])
    self.data_id = data_id

    with self.use_scope():
        if output_shape is not None and dimension != output_shape:
            # Linear projection of the input to the requested size.
            project_w = get_variable(
                shape=[dimension, output_shape],
                name="img_init_proj_W")
            project_b = get_variable(
                name="img_init_b",
                shape=[output_shape],
                initializer=tf.zeros_initializer())

            self._encoded = tf.matmul(self.vector, project_w) + project_b
        else:
            # No projection requested (or sizes match): pass through.
            self._encoded = self.vector
def output(self) -> tf.Tensor:
    """Convolve the embedded inputs with every filter and max-pool them.

    For each ``(filter_size, num_filters)`` pair in ``self.filters`` a 1D
    convolution followed by a bias and ReLU is applied to
    ``self.embedded_inputs`` and reduced by maximum over axis 1.

    Returns:
        The pooled features of all filters concatenated along axis 1.
    """
    features = []

    for width, channels in self.filters:
        with tf.variable_scope("conv-maxpool-%s" % width):
            # Convolution kernel and its bias
            kernel = get_variable(
                "conv_W",
                [width, self.embedding_size, channels],
                initializer=tf.variance_scaling_initializer(
                    mode="fan_avg", distribution="uniform"))
            kernel_bias = get_variable(
                "conv_bias",
                [channels],
                initializer=tf.zeros_initializer())

            convolved = tf.nn.conv1d(
                self.embedded_inputs,
                kernel,
                stride=1,
                padding="VALID",
                name="conv")

            # Nonlinearity followed by max-pooling over axis 1
            activated = tf.nn.relu(tf.nn.bias_add(convolved, kernel_bias))
            features.append(tf.reduce_max(activated, 1))

    # Join the pooled features of all filters
    return tf.concat(features, axis=1)
def __init__(self,
             name: str,
             input_shape: List[int],
             output_shape: int,
             data_id: str,
             save_checkpoint: Optional[str] = None,
             load_checkpoint: Optional[str] = None,
             initializers: InitializerSpecs = None) -> None:
    """Create the image feature placeholder and projection variables.

    Args:
        name: Name of the model part.
        input_shape: Shape of a single input item (without the batch
            dimension); must have exactly three entries.
        output_shape: Size of the projection output; must be positive.
        data_id: Identifier stored as ``self.data_id``.
        save_checkpoint: Forwarded to ``ModelPart.__init__``.
        load_checkpoint: Forwarded to ``ModelPart.__init__``.
        initializers: Forwarded to ``ModelPart.__init__``.

    Raises:
        ValueError: If ``output_shape`` is not positive.
    """
    check_argument_types()
    ModelPart.__init__(self, name, save_checkpoint, load_checkpoint,
                       initializers)

    assert len(input_shape) == 3
    if output_shape <= 0:
        raise ValueError("Output vector dimension must be positive.")

    self.data_id = data_id

    with self.use_scope():
        # Prepend an unknown batch dimension to the item shape.
        features_shape = [None] + input_shape  # type: ignore
        self.image_features = tf.placeholder(
            tf.float32, shape=features_shape, name="image_input")

        # Average over axes 1 and 2 of the feature tensor.
        self.flat = tf.reduce_mean(
            self.image_features, axis=[1, 2], name="average_image")

        self.project_w = get_variable(
            name="img_init_proj_W",
            shape=[input_shape[2], output_shape],
            initializer=tf.glorot_normal_initializer())
        self.project_b = get_variable(
            name="img_init_b",
            shape=[output_shape],
            initializer=tf.zeros_initializer())
def get_encoder_projections(self, scope):
    """Linearly project every encoder tensor to the attention state size.

    Args:
        scope: Name of the variable scope in which the projection
            variables are created.

    Returns:
        A list with one projected tensor per item of
        ``self._encoders_tensors``. Each result keeps the first two
        (dynamic) dimensions of its source and has
        ``self.attention_state_size`` as the last dimension.
    """
    projections = []

    with tf.variable_scope(scope):
        for index, states in enumerate(self._encoders_tensors):
            input_size = states.get_shape()[2].value
            dynamic_shape = tf.shape(states)

            weights = get_variable(
                "proj_matrix_{}".format(index),
                [input_size, self.attention_state_size],
                initializer=tf.random_normal_initializer(stddev=0.001))
            bias = get_variable(
                "proj_bias_{}".format(index),
                shape=[self.attention_state_size],
                initializer=tf.zeros_initializer())

            # The projection is computed on a 2-D view of the states and
            # reshaped back afterwards.
            flat_states = tf.reshape(states, [-1, input_size])
            flat_projected = tf.matmul(flat_states, weights) + bias
            assert_shape(flat_projected, [-1, self.attention_state_size])

            target_shape = [dynamic_shape[0],
                            dynamic_shape[1],
                            self.attention_state_size]
            projections.append(tf.reshape(flat_projected, target_shape))

        return projections
def get_encoder_projections(self, scope: str) -> List[tf.Tensor]:
    """Project each encoder tensor to ``self.attention_state_size``.

    Args:
        scope: Name of the variable scope that holds the projection
            matrices and biases.

    Returns:
        One projected tensor for every item of
        ``self._encoders_tensors``; the first two dimensions are taken
        from the dynamic shape of the source tensor and the last one is
        ``self.attention_state_size``.
    """
    results = []

    with tf.variable_scope(scope):
        for idx, tensor in enumerate(self._encoders_tensors):
            state_size = tensor.get_shape()[2].value
            tensor_shape = tf.shape(tensor)

            matrix = get_variable(
                "proj_matrix_{}".format(idx),
                [state_size, self.attention_state_size],
                initializer=tf.random_normal_initializer(stddev=0.001))
            offset = get_variable(
                "proj_bias_{}".format(idx),
                shape=[self.attention_state_size],
                initializer=tf.zeros_initializer())

            # Flatten to 2-D, apply the affine map, restore the 3-D shape.
            tensor_2d = tf.reshape(tensor, [-1, state_size])
            mapped_2d = tf.matmul(tensor_2d, matrix) + offset
            assert_shape(mapped_2d, [-1, self.attention_state_size])

            mapped = tf.reshape(
                mapped_2d,
                [tensor_shape[0],
                 tensor_shape[1],
                 self.attention_state_size])
            results.append(mapped)

        return results
def logits(self) -> tf.Tensor:
    """Project the encoder temporal states to output logits.

    The projection has ``len(self.vocabulary) + 1`` output units.

    Returns:
        The logits transposed with ``perm=[1, 0, 2]`` (time major).
    """
    vocabulary_size = len(self.vocabulary)
    encoder_states = self.encoder.temporal_states

    weights = get_variable(
        name="state_to_word_W",
        shape=[encoder_states.shape[2], vocabulary_size + 1],
        initializer=tf.random_uniform_initializer(-0.5, 0.5))

    biases = get_variable(
        name="state_to_word_b",
        shape=[vocabulary_size + 1],
        initializer=tf.zeros_initializer())

    # To multiply 3-D matrix (encoder hidden states) by a 2-D matrix
    # (weights), we use 1-by-1 convolution (similar trick can be found in
    # attention computation)
    encoder_states = tf.expand_dims(encoder_states, 2)
    weights_4d = tf.expand_dims(tf.expand_dims(weights, 0), 0)

    multiplication = tf.nn.conv2d(
        encoder_states, weights_4d, [1, 1, 1, 1], "SAME")
    # `axis` replaces the deprecated `squeeze_dims` argument of tf.squeeze.
    multiplication_3d = tf.squeeze(multiplication, axis=[2])

    biases_3d = tf.expand_dims(tf.expand_dims(biases, 0), 0)

    logits = multiplication_3d + biases_3d
    return tf.transpose(logits, perm=[1, 0, 2])  # time major
def highway(inputs, activation=tf.nn.relu, scope="HighwayNetwork"):
    """Build one highway layer.

    The layer computes

        y = H(x, Wh) * T(x, Wt) + x * C(x, Wc)

    where the carry gate is C(x, Wc) = 1 - T(x, Wt).

    Arguments:
        inputs: A tensor or a list of tensors. The tensors should be 2D
            and agree in the first dimension (batch size). A list is
            concatenated along the last dimension.
        activation: Activation function applied to the linear part
            H(x, Wh) of the formula.
        scope: Name of the variable scope holding the layer variables.

    Returns:
        A tensor of shape tf.shape(inputs)
    """
    with tf.variable_scope(scope):
        if isinstance(inputs, list):
            # A list of tensors is joined along the last dimension first.
            inputs = tf.concat(inputs, axis=-1)

        width = inputs.get_shape().as_list()[-1]  # pylint: disable=no-member
        matrix_shape = [width, width]
        vector_shape = [width]

        weight_h = get_variable(
            "weight_H",
            shape=matrix_shape,
            initializer=tf.glorot_normal_initializer())
        bias_h = get_variable(
            "bias_H",
            shape=vector_shape,
            initializer=tf.constant_initializer(-1.0))
        weight_t = get_variable(
            "weight_T",
            shape=matrix_shape,
            initializer=tf.glorot_normal_initializer())
        bias_t = get_variable(
            "bias_T",
            shape=vector_shape,
            initializer=tf.constant_initializer(-1.0))

        transform_pre = tf.add(tf.matmul(inputs, weight_t), bias_t)
        transform = tf.sigmoid(transform_pre, name="transform_gate")

        hidden_pre = tf.add(tf.matmul(inputs, weight_h), bias_h)
        hidden = activation(hidden_pre, name="activation")

        carry = tf.subtract(1.0, transform, name="carry_gate")

        gated_hidden = tf.multiply(hidden, transform)
        carried_input = tf.multiply(inputs, carry)
        return tf.add(gated_hidden, carried_input, "y")
def highway(inputs, activation=tf.nn.relu, scope="HighwayNetwork"):
    """Build one highway layer.

    The layer computes

        y = H(x, Wh) * T(x, Wt) + x * C(x, Wc)

    where the carry gate is C(x, Wc) = 1 - T(x, Wt).

    Arguments:
        inputs: A tensor or a list of tensors. The tensors should be 2D
            and agree in the first dimension (batch size). A list is
            concatenated along the last dimension.
        activation: Activation function applied to the linear part
            H(x, Wh) of the formula.
        scope: Name of the variable scope holding the layer variables.

    Returns:
        A tensor of shape tf.shape(inputs)
    """
    with tf.variable_scope(scope):
        if isinstance(inputs, list):
            # A list of tensors is joined along the last dimension first.
            inputs = tf.concat(inputs, axis=-1)

        size = inputs.get_shape().as_list()[-1]
        square_shape = [size, size]
        bias_shape = [size]

        # The weight matrices rely on the default initializer of
        # get_variable; only the biases are set explicitly.
        h_weights = get_variable("weight_H", shape=square_shape)
        h_bias = get_variable(
            "bias_H",
            shape=bias_shape,
            initializer=tf.constant_initializer(-1.0))
        t_weights = get_variable("weight_T", shape=square_shape)
        t_bias = get_variable(
            "bias_T",
            shape=bias_shape,
            initializer=tf.constant_initializer(-1.0))

        t_linear = tf.add(tf.matmul(inputs, t_weights), t_bias)
        gate = tf.sigmoid(t_linear, name="transform_gate")

        h_linear = tf.add(tf.matmul(inputs, h_weights), h_bias)
        candidate = activation(h_linear, name="activation")

        carry = tf.subtract(1.0, gate, name="carry_gate")

        transformed = tf.multiply(candidate, gate)
        carried = tf.multiply(inputs, carry)
        return tf.add(transformed, carried, "y")
def order_embeddings(self) -> tf.Tensor:
    """Return the ``order_embeddings`` variable.

    The variable lives in the ``input_projection`` scope and has the shape
    ``[self.max_input_length, self.input_sequence.embedding_sizes[0]]``.
    """
    # The initialization follows the original CS2S implementation.
    with tf.variable_scope("input_projection"):
        shape = [self.max_input_length,
                 self.input_sequence.embedding_sizes[0]]
        return get_variable(
            "order_embeddings",
            shape,
            initializer=tf.glorot_normal_initializer())
def _vector_logit(self,
                  projected_decoder_state: tf.Tensor,
                  vector_value: tf.Tensor,
                  scope: str) -> tf.Tensor:
    """Compute the logit of a single vector, e.g., a sentinel vector.

    Args:
        projected_decoder_state: Tensor checked against the shape
            ``[-1, 1, -1]``.
        vector_value: Tensor checked against the shape ``[-1, -1]``.
        scope: Prefix of the variable scope (``"<scope>_logit"``).

    Returns:
        The pair ``(proj_vector_for_ctx, vector_logit)``: the projection
        of the vector used for the context and its logit, the latter
        checked against the shape ``[-1, 1]``.
        NOTE(review): the return annotation says ``tf.Tensor`` although a
        pair is returned.
    """
    assert_shape(projected_decoder_state, [-1, 1, -1])
    assert_shape(vector_value, [-1, -1])

    with tf.variable_scope("{}_logit".format(scope)):
        bias = get_variable(
            "vector_bias", [], initializer=tf.zeros_initializer())

        logit_dense = tf.layers.dense(
            vector_value,
            self.attention_state_size,
            name="vector_projection")
        logit_projection = tf.expand_dims(logit_dense, 1)

        if not self._share_projections:
            # A separate projection is used for the context vector.
            ctx_dense = tf.layers.dense(
                vector_value,
                self.attention_state_size,
                name="vector_ctx_proj")
            ctx_projection = tf.expand_dims(ctx_dense, 1)
        else:
            ctx_projection = logit_projection

        energies = self.attn_v * tf.tanh(
            projected_decoder_state + logit_projection)
        logit = tf.reduce_sum(energies, [2]) + bias
        assert_shape(logit, [-1, 1])

        return ctx_projection, logit
def decoding_b(self) -> Optional[tf.Variable]:
    """Return the bias of the output projection.

    With ``self.tie_embeddings`` set, a zero tensor of vocabulary size is
    returned; otherwise the zero-initialized ``state_to_word_b`` variable
    is obtained from ``get_variable``.
    """
    if not self.tie_embeddings:
        with tf.name_scope("output_projection"):
            return get_variable(
                name="state_to_word_b",
                shape=[len(self.vocabulary)],
                initializer=tf.zeros_initializer())

    return tf.zeros(len(self.vocabulary))
def _vector_logit(self,
                  projected_decoder_state: tf.Tensor,
                  vector_value: tf.Tensor,
                  scope: str) -> tf.Tensor:
    """Compute the logit of a single vector, e.g., a sentinel vector.

    Args:
        projected_decoder_state: Tensor checked against the shape
            ``[-1, 1, -1]``.
        vector_value: Tensor checked against the shape ``[-1, -1]``.
        scope: Prefix of the variable scope (``"<scope>_logit"``).

    Returns:
        The pair ``(proj_vector_for_ctx, vector_logit)``: the projection
        of the vector used for the context and its logit, the latter
        checked against the shape ``[-1, 1]``.
        NOTE(review): the return annotation says ``tf.Tensor`` although a
        pair is returned.
    """
    assert_shape(projected_decoder_state, [-1, 1, -1])
    assert_shape(vector_value, [-1, -1])

    with tf.variable_scope("{}_logit".format(scope)):
        logit_bias = get_variable(
            name="vector_bias", shape=[], initializer=tf.zeros_initializer())

        projected_for_logit = tf.expand_dims(
            tf.layers.dense(
                vector_value,
                self.attention_state_size,
                name="vector_projection"),
            1)

        # Either reuse the logit projection for the context or build a
        # dedicated one.
        projected_for_ctx = projected_for_logit
        if not self._share_projections:
            projected_for_ctx = tf.expand_dims(
                tf.layers.dense(
                    vector_value,
                    self.attention_state_size,
                    name="vector_ctx_proj"),
                1)

        activated = tf.tanh(projected_decoder_state + projected_for_logit)
        result_logit = tf.reduce_sum(self.attn_v * activated, [2])
        result_logit = result_logit + logit_bias
        assert_shape(result_logit, [-1, 1])

        return projected_for_ctx, result_logit
def encoder_attn_biases(self) -> List[tf.Variable]:
    """Return one scalar ``attn_bias_<i>`` variable per encoder tensor.

    The variables are zero-initialized and numbered by the position of the
    tensor in ``self._encoders_tensors``.
    """
    biases = []
    for index in range(len(self._encoders_tensors)):
        bias = get_variable(
            "attn_bias_{}".format(index),
            [],
            initializer=tf.zeros_initializer())
        biases.append(bias)
    return biases
def embedded_inputs(self) -> tf.Tensor:
    """Look up the embeddings of ``self.inputs`` and apply dropout.

    The ``word_embeddings`` matrix of shape
    ``[len(self.vocabulary), self.embedding_size]`` is obtained in the
    ``input_projection`` scope.
    """
    with tf.variable_scope("input_projection"):
        matrix_shape = [len(self.vocabulary), self.embedding_size]
        word_embeddings = get_variable(
            name="word_embeddings",
            shape=matrix_shape,
            initializer=tf.glorot_uniform_initializer())

        looked_up = tf.nn.embedding_lookup(word_embeddings, self.inputs)
        return dropout(looked_up, self.dropout_keep_prob, self.train_mode)
def __init__(self,
             name: str,
             encoders: List[Attendable],
             attention_state_size: int,
             share_attn_projections: bool = False,
             use_sentinels: bool = False,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Set up the attention over several encoders.

    Args:
        name: Name of the model part.
        encoders: Objects whose attention states and masks are attended.
        attention_state_size: Size the encoder states are projected to.
        share_attn_projections: Forwarded to ``MultiAttention.__init__``;
            when ``self._share_projections`` is set, the projections for
            the logits are reused for the context.
        use_sentinels: Forwarded to ``MultiAttention.__init__``; when
            ``self._use_sentinels`` is set, an extra column of ones is
            appended to the masks.
        reuse: Forwarded to ``MultiAttention.__init__``.
        save_checkpoint: Forwarded to ``MultiAttention.__init__``.
        load_checkpoint: Forwarded to ``MultiAttention.__init__``.
        initializers: Forwarded to ``MultiAttention.__init__``.
    """
    check_argument_types()
    MultiAttention.__init__(
        self,
        name=name,
        attention_state_size=attention_state_size,
        share_attn_projections=share_attn_projections,
        use_sentinels=use_sentinels,
        reuse=reuse,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint,
        initializers=initializers)

    self._encoders = encoders

    # pylint: disable=protected-access
    self._encoders_tensors = [
        get_attention_states(enc) for enc in self._encoders]
    self._encoders_masks = [
        get_attention_mask(enc) for enc in self._encoders]
    # pylint: enable=protected-access

    for mask in self._encoders_masks:
        assert_shape(mask, [-1, -1])
    for states in self._encoders_tensors:
        assert_shape(states, [-1, -1, -1])

    with self.use_scope():
        logit_projections = self.get_encoder_projections(
            "logits_projections")
        self.encoder_projections_for_logits = logit_projections

        self.encoder_attn_biases = [
            get_variable(
                name="attn_bias_{}".format(index),
                shape=[],
                initializer=tf.zeros_initializer())
            for index in range(len(self._encoders_tensors))]

        if not self._share_projections:
            self.encoder_projections_for_ctx = \
                self.get_encoder_projections("context_projections")
        else:
            self.encoder_projections_for_ctx = \
                self.encoder_projections_for_logits

        if self._use_sentinels:
            # One additional mask column of ones, sized by the first
            # dimension of the first encoder mask.
            batch_size = tf.shape(self._encoders_masks[0])[0]
            self._encoders_masks.append(tf.ones([batch_size, 1]))

        self.masks_concat = tf.concat(self._encoders_masks, 1)
def decoding_b(self) -> Optional[tf.Variable]:
    """Return the bias of the output projection.

    When the embeddings are tied, a zero tensor with one entry per
    vocabulary item is returned. Otherwise the zero-initialized
    ``state_to_word_b`` variable is obtained from ``get_variable``.
    """
    if self.tie_embeddings:
        zero_bias = tf.zeros(len(self.vocabulary))
        return zero_bias

    with tf.name_scope("output_projection"):
        bias_shape = [len(self.vocabulary)]
        return get_variable(
            name="state_to_word_b",
            shape=bias_shape,
            initializer=tf.zeros_initializer())
def _residual_conv(self, input_signals: tf.Tensor, name: str):
    """Apply a 1D convolution with GLU and add the input back.

    Args:
        input_signals: Tensor that is convolved and added to the result.
        name: Name of the variable scope for the layer variables.

    Returns:
        ``glu`` of the biased convolution plus ``input_signals``.
    """
    with tf.variable_scope(name):
        # The initialization follows the paper.
        # Note: this should be equivalent to tf.glorot_normal_initializer
        stddev = np.sqrt(4 / self.conv_features)
        doubled_features = 2 * self.conv_features

        kernel = get_variable(
            name="convolution_filters",
            shape=[self.kernel_width, self.conv_features, doubled_features],
            initializer=tf.random_normal_initializer(stddev=stddev))
        kernel_bias = get_variable(
            "conv_bias",
            [doubled_features],
            initializer=tf.zeros_initializer())

        convolved = tf.nn.conv1d(input_signals, kernel, 1, "SAME")
        biased = convolved + kernel_bias

        return glu(biased) + input_signals
def modality_matrix(self) -> tf.Tensor:
    """Create an embedding matrix for varying target modalities.

    It is used to embed different target space modalities in the
    tensor2tensor models (e.g. during the zero-shot translation).

    The matrix has 32 rows; its width equals the last dimension of
    ``self.input_sequence.temporal_states``.
    """
    states_shape = self.input_sequence.temporal_states.shape.as_list()
    width = states_shape[-1]

    return get_variable(
        name="target_modality_embedding_matrix",
        shape=[32, width],
        dtype=tf.float32,
        initializer=tf.glorot_uniform_initializer())
def embedded_inputs(self) -> tf.Tensor:
    """Look up the embeddings of ``self.inputs`` and apply dropout.

    The ``word_embeddings`` matrix of shape
    ``[len(self.vocabulary), self.embedding_size]`` is obtained in the
    ``input_projection`` scope with a uniform variance-scaling
    initializer in ``fan_avg`` mode.
    """
    with tf.variable_scope("input_projection"):
        shape = [len(self.vocabulary), self.embedding_size]
        initializer = tf.variance_scaling_initializer(
            mode="fan_avg", distribution="uniform")
        embeddings = get_variable(
            name="word_embeddings", shape=shape, initializer=initializer)

        embedded = tf.nn.embedding_lookup(embeddings, self.inputs)
        return dropout(embedded, self.dropout_keep_prob, self.train_mode)
def embedding_matrix(self) -> tf.Variable:
    """Return the matrix used for embedding the input words.

    When ``self.embeddings_source`` is set, its embedding matrix is
    reused. Otherwise the ``word_embeddings`` variable of shape
    ``[len(self.vocabulary), self.embedding_size]`` is obtained.
    """
    source = self.embeddings_source
    if source is None:
        return get_variable(
            "word_embeddings",
            [len(self.vocabulary), self.embedding_size],
            initializer=tf.glorot_uniform_initializer())

    return source.embedding_matrix
def cnn_encoded(self) -> tf.Tensor:
    """1D convolution with max-pooling that processes characters.

    Dropout is applied to ``self.input_sequence.temporal_states``. For
    every ``(filter_size, num_filters)`` pair in ``self.filters`` the
    result is convolved, passed through a bias and ReLU and max-pooled
    with window and stride ``self.segment_size`` along axis 1.

    Returns:
        The pooled outputs concatenated along axis 2, with axis 3
        squeezed out.
    """
    inputs = dropout(
        self.input_sequence.temporal_states,
        self.dropout_keep_prob,
        self.train_mode)

    segments = []
    for width, channels in self.filters:
        with tf.variable_scope("conv-maxpool-%s" % width):
            kernel_shape = [width, self.input_sequence.dimension, channels]

            kernel = get_variable(
                "conv_W",
                kernel_shape,
                initializer=tf.variance_scaling_initializer(
                    mode="fan_avg", distribution="uniform"))
            kernel_bias = get_variable(
                "conv_bias",
                [channels],
                initializer=tf.zeros_initializer())

            convolved = tf.nn.conv1d(
                inputs,
                kernel,
                stride=1,
                padding="SAME",
                name="conv")

            # Nonlinearity
            activated = tf.nn.relu(tf.nn.bias_add(convolved, kernel_bias))

            # Max-pooling over segments; tf.nn.max_pool needs a 4-D input,
            # hence the extra trailing dimension.
            window = [1, self.segment_size, 1, 1]
            activated_4d = tf.expand_dims(activated, -1)
            segments.append(
                tf.nn.max_pool(
                    activated_4d,
                    ksize=window,
                    strides=[1, self.segment_size, 1, 1],
                    padding="SAME",
                    name="maxpool"))

    # Join the pooled features of all filters
    joined = tf.concat(segments, axis=2)
    return tf.squeeze(joined, [3])
def __init__(self,
             name: str,
             dimension: int,
             data_id: str,
             output_shape: int = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Instantiate StatefulFiller.

    Args:
        name: Name of the model part.
        dimension: Dimensionality of the input.
        data_id: Series containing the numpy objects.
        output_shape: Dimension of optional state projection.

    Raises:
        ValueError: If ``dimension`` or a given ``output_shape`` is not
            positive.
    """
    ModelPart.__init__(
        self, name, save_checkpoint, load_checkpoint, initializers)
    check_argument_types()

    if dimension <= 0:
        raise ValueError("Input vector dimension must be positive.")

    has_output_shape = output_shape is not None
    if has_output_shape and output_shape <= 0:
        raise ValueError("Output vector dimension must be positive.")

    self.vector = tf.placeholder(tf.float32, shape=[None, dimension])
    self.data_id = data_id

    with self.use_scope():
        if not has_output_shape or dimension == output_shape:
            # Nothing to project: the input is used directly.
            self._encoded = self.vector
        else:
            weights = get_variable(
                shape=[dimension, output_shape],
                name="img_init_proj_W")
            bias = get_variable(
                name="img_init_b",
                shape=[output_shape],
                initializer=tf.zeros_initializer())

            self._encoded = tf.matmul(self.vector, weights) + bias
def modality_matrix(self) -> tf.Tensor:
    """Create an embedding matrix for varying target modalities.

    It is used to embed different target space modalities in the
    tensor2tensor models (e.g. during the zero-shot translation).

    The matrix has 32 rows; its width equals the last dimension of
    ``self.input_sequence.temporal_states``.
    """
    temporal_states = self.input_sequence.temporal_states
    embedding_width = temporal_states.shape.as_list()[-1]

    uniform_scaling = tf.variance_scaling_initializer(
        mode="fan_avg", distribution="uniform")
    return get_variable(
        name="target_modality_embedding_matrix",
        shape=[32, embedding_width],
        dtype=tf.float32,
        initializer=uniform_scaling)
def embedding_matrices(self) -> List[tf.Tensor]:
    """Return a list of embedding matrices, one for each factor."""
    # Note: The matrices are numbered instead of being named after the
    # data id, so the data_id string may differ across experiments.
    factors = zip(
        self.data_ids, self.vocabulary_sizes, self.embedding_sizes)

    matrices = []
    for index, (_, vocabulary_size, embedding_size) in enumerate(factors):
        matrices.append(
            get_variable(
                name="embedding_matrix_{}".format(index),
                shape=[vocabulary_size, embedding_size],
                initializer=tf.glorot_uniform_initializer()))
    return matrices
def embedding_matrix(self) -> tf.Variable:
    """Return the matrix used for embedding the input words.

    When ``self.embeddings_source`` is set, its embedding matrix is
    reused. Otherwise ``self.embedding_size`` must be set and the
    ``word_embeddings`` variable of shape
    ``[len(self.vocabulary), self.embedding_size]`` is obtained.
    """
    source = self.embeddings_source
    if source is not None:
        return source.embedding_matrix

    assert self.embedding_size is not None
    matrix_shape = [len(self.vocabulary), self.embedding_size]
    return get_variable("word_embeddings", matrix_shape)
def decoding_w(self) -> tf.Variable:
    """Return the weight matrix of the output projection.

    With ``self.tie_embeddings`` set, the transposed embedding matrix is
    returned. Otherwise the ``state_to_word_W`` variable of shape
    ``[self.output_dimension, len(self.vocabulary)]`` is obtained.

    Raises:
        ValueError: If the embeddings are tied and ``self.embedding_size``
            differs from ``self.output_dimension``.
    """
    if (self.tie_embeddings
            and self.embedding_size != self.output_dimension):
        raise ValueError(
            "`embedding_size` must be equal to the output_projection "
            "size when using the `tie_embeddings` option")

    with tf.name_scope("output_projection"):
        if self.tie_embeddings:
            return tf.transpose(self.embedding_matrix)

        return get_variable(
            "state_to_word_W",
            [self.output_dimension, len(self.vocabulary)],
            initializer=tf.glorot_uniform_initializer())
def decoding_w(self) -> tf.Variable:
    """Return the weight matrix of the output projection.

    With ``self.tie_embeddings`` set, the transposed embedding matrix is
    returned. Otherwise the ``state_to_word_W`` variable of shape
    ``[self.output_dimension, len(self.vocabulary)]`` is obtained,
    initialized uniformly from the range -0.5 to 0.5.

    Raises:
        ValueError: If the embeddings are tied and ``self.embedding_size``
            differs from ``self.output_dimension``.
    """
    if (self.tie_embeddings
            and self.embedding_size != self.output_dimension):
        raise ValueError(
            "`embedding_size` must be equal to the output_projection "
            "size when using the `tie_embeddings` option")

    with tf.name_scope("output_projection"):
        if self.tie_embeddings:
            return tf.transpose(self.embedding_matrix)

        return get_variable(
            "state_to_word_W",
            [self.output_dimension, len(self.vocabulary)],
            initializer=tf.random_uniform_initializer(-0.5, 0.5))
def embedding_matrices(self) -> List[tf.Tensor]:
    """Return a list of embedding matrices, one for each factor.

    When ``self.embeddings_source`` is set, its matrices are reused.
    """
    # Note: The matrices are numbered instead of being named after the
    # data id, so the data_id string may differ across experiments.
    source = self.embeddings_source
    if source is not None:
        return source.embedding_matrices

    matrices = []
    factors = zip(
        self.data_ids, self.vocabulary_sizes, self.embedding_sizes)
    for index, (_, vocabulary_size, embedding_size) in enumerate(factors):
        matrix = get_variable(
            name="embedding_matrix_{}".format(index),
            shape=[vocabulary_size, embedding_size],
            trainable=self.trainable)
        matrices.append(matrix)
    return matrices
def embedding_matrices(self) -> List[tf.Tensor]:
    """Return a list of embedding matrices, one for each factor.

    The matrices of ``self.embeddings_source`` are returned when such a
    source is set.
    """
    # Note: The matrices carry a number rather than the data id in their
    # name, so the data_id string may differ across experiments.
    if self.embeddings_source is None:
        result = []
        triples = zip(
            self.data_ids, self.vocabulary_sizes, self.embedding_sizes)
        for position, (_, rows, columns) in enumerate(triples):
            result.append(
                get_variable(
                    name="embedding_matrix_{}".format(position),
                    shape=[rows, columns],
                    trainable=self.trainable))
        return result

    return self.embeddings_source.embedding_matrices
def __init__(self,
             name: str,
             attention_state_size: int,
             share_attn_projections: bool = False,
             use_sentinels: bool = False,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Store the attention settings and create the ``attn_v`` variable.

    Args:
        name: Name of the model part; also used in ``att_scope_name``.
        attention_state_size: Size of the last dimension of ``attn_v``.
        share_attn_projections: Stored as ``self._share_projections``.
        use_sentinels: Stored as ``self._use_sentinels``.
        save_checkpoint: Forwarded to ``BaseAttention.__init__``.
        load_checkpoint: Forwarded to ``BaseAttention.__init__``.
        initializers: Forwarded to ``BaseAttention.__init__``.
    """
    BaseAttention.__init__(
        self, name, save_checkpoint, load_checkpoint, initializers)

    self.attentions_in_time = []  # type: List[tf.Tensor]
    self.attention_state_size = attention_state_size
    self._share_projections = share_attn_projections
    self._use_sentinels = use_sentinels

    self.att_scope_name = "attention_{}".format(name)

    with self.use_scope():
        v_shape = [1, 1, self.attention_state_size]
        self.attn_v = get_variable(
            name="attn_v",
            shape=v_shape,
            initializer=tf.random_normal_initializer(stddev=0.001))
def key_projection_matrix(self) -> tf.Variable:
    """Return the ``attn_key_projection`` variable.

    Its shape is ``[self.context_vector_size, self.state_size]``.
    """
    # TODO: this is not correct (the original note does not say what is)
    projection_shape = [self.context_vector_size, self.state_size]
    return get_variable("attn_key_projection", projection_shape)
def query_projection_matrix(self) -> tf.Variable:
    """Return the ``attn_query_projection`` variable.

    The variable is obtained in the ``Attention`` scope and has the shape
    ``[self.query_state_size, self.state_size]``.
    """
    with tf.variable_scope("Attention"):
        projection_shape = [self.query_state_size, self.state_size]
        return get_variable("attn_query_projection", projection_shape)
def query_projection_matrix(self) -> tf.Variable:
    """Return the matrix that projects the attention query.

    The ``attn_query_projection`` variable of shape
    ``[self.query_state_size, self.state_size]`` is obtained inside the
    ``Attention`` variable scope.
    """
    with tf.variable_scope("Attention"):
        matrix = get_variable(
            "attn_query_projection",
            [self.query_state_size, self.state_size])
        return matrix
def coverage_weights(self) -> tf.Variable:
    """Return the ``coverage_matrix`` variable.

    Its shape is ``[1, 1, 1, self.state_size]``.
    """
    weights_shape = [1, 1, 1, self.state_size]
    return get_variable(name="coverage_matrix", shape=weights_shape)
def decoding_residual_w(self) -> tf.Variable:
    """Return the ``emb_to_word_W`` variable.

    Its shape is
    ``[self.encoder.input_sequence.dimension, len(self.vocabulary)]``.
    """
    rows = self.encoder.input_sequence.dimension
    columns = len(self.vocabulary)
    return get_variable(
        "emb_to_word_W",
        [rows, columns],
        initializer=tf.glorot_normal_initializer())
def decoding_w(self) -> tf.Variable:
    """Return the ``state_to_word_W`` variable.

    Its shape is ``[self.rnn_size, len(self.vocabulary)]``; no initializer
    is passed to ``get_variable``.
    """
    weights_shape = [self.rnn_size, len(self.vocabulary)]
    return get_variable("state_to_word_W", weights_shape)
def bias_term(self) -> tf.Variable:
    """Return the scalar, zero-initialized ``attn_bias`` variable."""
    zeros = tf.zeros_initializer()
    return get_variable("attn_bias", [], initializer=zeros)
def encoder_attn_biases(self) -> List[tf.Variable]:
    """Return one scalar ``attn_bias_<i>`` variable per encoder tensor.

    The variables are zero-initialized; ``<i>`` is the position of the
    tensor in ``self._encoders_tensors``.
    """
    count = len(self._encoders_tensors)

    result = []
    for position in range(count):
        result.append(
            get_variable(
                "attn_bias_{}".format(position),
                [],
                initializer=tf.zeros_initializer()))
    return result
def fertility_weights(self) -> tf.Variable:
    """Return the ``fertility_matrix`` variable.

    Its shape is ``[1, 1, self.context_vector_size]``.
    """
    weights_shape = [1, 1, self.context_vector_size]
    return get_variable(name="fertility_matrix", shape=weights_shape)
def similarity_bias_vector(self) -> tf.Variable:
    """Return the ``attn_similarity_v`` variable.

    Its shape is ``[self.state_size]``.
    """
    vector_shape = [self.state_size]
    return get_variable("attn_similarity_v", vector_shape)
def attn_v(self) -> tf.Tensor:
    """Return the ``attn_v`` variable.

    Its shape is ``[1, 1, self.attention_state_size]`` and it is
    initialized from a normal distribution with ``stddev=0.001``.
    """
    v_shape = [1, 1, self.attention_state_size]
    small_normal = tf.random_normal_initializer(stddev=0.001)
    return get_variable(name="attn_v", shape=v_shape,
                        initializer=small_normal)
def projection_bias_vector(self) -> tf.Variable:
    """Return the zero-initialized ``attn_projection_bias`` variable.

    Its shape is ``[self.state_size]``.
    """
    bias_shape = [self.state_size]
    return get_variable(
        "attn_projection_bias",
        bias_shape,
        initializer=tf.zeros_initializer())
def similarity_bias_vector(self) -> tf.Variable:
    """Return the vector variable named ``attn_similarity_v``.

    The variable has ``self.state_size`` entries; no initializer is
    passed to ``get_variable``.
    """
    similarity_v = get_variable("attn_similarity_v", [self.state_size])
    return similarity_v
def projection_bias_vector(self) -> tf.Variable:
    """Return the bias vector named ``attn_projection_bias``.

    The variable has ``self.state_size`` entries and is initialized with
    zeros.
    """
    zeros = tf.zeros_initializer()
    projection_bias = get_variable(
        "attn_projection_bias", [self.state_size], initializer=zeros)
    return projection_bias
def decoding_w(self) -> tf.Variable:
    """Return the ``state_to_word_W`` variable.

    Its shape is ``[self.rnn_size, len(self.vocabulary)]`` and it uses the
    Glorot normal initializer.
    """
    weights_shape = [self.rnn_size, len(self.vocabulary)]
    return get_variable(
        "state_to_word_W",
        weights_shape,
        initializer=tf.glorot_normal_initializer())
def decoding_b(self) -> tf.Variable:
    """Return the zero-initialized ``state_to_word_b`` variable.

    It has one entry per item of ``self.vocabulary``.
    """
    bias_shape = [len(self.vocabulary)]
    return get_variable(
        "state_to_word_b",
        bias_shape,
        initializer=tf.zeros_initializer())
def bias_term(self) -> tf.Variable:
    """Return the attention bias.

    It is the scalar variable ``attn_bias`` initialized with zeros.
    """
    attn_bias = get_variable(
        "attn_bias", [], initializer=tf.zeros_initializer())
    return attn_bias