def testTransformerAutoencoder(self):
  hparams = imagetransformer_latent_tiny()
  hparams.mode = tf.estimator.ModeKeys.TRAIN
  block_dim = int(hparams.hidden_size // hparams.num_blocks)
  block_v_size = 2**(hparams.bottleneck_bits /
                     (hparams.num_residuals * hparams.num_blocks))
  block_v_size = int(block_v_size)
  means = tf.get_variable(
      name="means",
      shape=[hparams.num_residuals, hparams.num_blocks,
             block_v_size, block_dim],
      initializer=tf.uniform_unit_scaling_initializer())
  hparams.bottleneck = functools.partial(
      discretization.discrete_bottleneck,
      hidden_size=hparams.hidden_size,
      z_size=hparams.bottleneck_bits,
      filter_size=hparams.filter_size,
      startup_steps=hparams.startup_steps,
      bottleneck_kind=hparams.bottleneck_kind,
      num_blocks=hparams.num_blocks,
      num_residuals=hparams.num_residuals,
      reshape_method=hparams.reshape_method,
      beta=hparams.vq_beta,
      decay=hparams.vq_decay,
      soft_em=hparams.soft_em,
      num_samples=hparams.num_samples,
      epsilon=hparams.vq_epsilon,
      ema=hparams.ema,
      means=means)
  inputs = None
  batch_size = hparams.batch_size
  targets = tf.random_uniform(
      [batch_size, hparams.img_len, hparams.img_len, hparams.hidden_size],
      minval=-1., maxval=1.)
  target_space_id = None
  tf.train.create_global_step()
  decoder_output, losses, cache = latent_layers.transformer_autoencoder(
      inputs, targets, target_space_id, hparams)
  self.assertEqual(set(losses), {"extra", "extra_loss", "latent_pred"})

  self.evaluate(tf.global_variables_initializer())
  decoder_output_, extra_loss_, latent_pred_ = self.evaluate(
      [decoder_output, losses["extra_loss"], losses["latent_pred"]])
  self.assertEqual(decoder_output_.shape,
                   (batch_size, hparams.img_len, hparams.img_len,
                    hparams.hidden_size))
  self.assertEqual(extra_loss_.shape, (batch_size,))
  self.assertEqual(latent_pred_.shape, (batch_size,))
  self.assertAllGreaterEqual(extra_loss_, 0.)
  self.assertAllGreaterEqual(latent_pred_, 0.)
  self.assertEqual(cache, None)
def conv1d(x,
           num_filters,
           filter_length,
           name,
           dilation=1,
           causal=True,
           kernel_initializer=tf.uniform_unit_scaling_initializer(1.0),
           biases_initializer=tf.constant_initializer(0.0),
           is_training=True):
  """Fast 1D convolution that supports causal padding and dilation.

  Args:
    x: The [mb, time, channels] float tensor that we convolve.
    num_filters: The number of filter maps in the convolution.
    filter_length: The integer length of the filter.
    name: The name of the scope for the variables.
    dilation: The amount of dilation.
    causal: Whether or not this is a causal convolution.
    kernel_initializer: The kernel initialization function.
    biases_initializer: The biases initialization function.
    is_training: Whether or not to use trainable variables.

  Returns:
    y: The output of the 1D convolution.
  """
  batch_size, length, num_input_channels = x.get_shape().as_list()
  assert length % dilation == 0

  kernel_shape = [1, filter_length, num_input_channels, num_filters]
  strides = [1, 1, 1, 1]
  biases_shape = [num_filters]
  padding = 'VALID' if causal else 'SAME'

  with tf.variable_scope(name):
    weights = tf.get_variable(
        'W', shape=kernel_shape, initializer=kernel_initializer,
        trainable=is_training)
    biases = tf.get_variable(
        'biases', shape=biases_shape, initializer=biases_initializer,
        trainable=is_training)

  x_ttb = time_to_batch(x, dilation)
  if filter_length > 1 and causal:
    x_ttb = tf.pad(x_ttb, [[0, 0], [filter_length - 1, 0], [0, 0]])

  x_ttb_shape = x_ttb.get_shape().as_list()
  x_4d = tf.reshape(x_ttb,
                    [x_ttb_shape[0], 1, x_ttb_shape[1], num_input_channels])
  y = tf.nn.conv2d(x_4d, weights, strides, padding=padding)
  y = tf.nn.bias_add(y, biases)
  y_shape = y.get_shape().as_list()
  y = tf.reshape(y, [y_shape[0], y_shape[2], num_filters])
  y = batch_to_time(y, dilation)
  y.set_shape([batch_size, length, num_filters])
  return y
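A minimal usage sketch for the layer above; the shapes are illustrative, and it assumes the `time_to_batch`/`batch_to_time` helpers from the same module are in scope:

# Hypothetical example: a causal, dilated 1-D convolution over a
# [batch=4, time=64, channels=32] input. `time` must be divisible by
# `dilation`, and a static batch size is required by the reshapes above.
x = tf.random_uniform([4, 64, 32])
y = conv1d(x, num_filters=128, filter_length=3, name="dilated_conv",
           dilation=2, causal=True)
print(y.get_shape().as_list())  # [4, 64, 128]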
def _fully_connected(self, x, out_dim):
  """FullyConnected layer for final output."""
  num_non_batch_dimensions = len(x.shape)
  prod_non_batch_dimensions = 1
  for ii in range(num_non_batch_dimensions - 1):
    prod_non_batch_dimensions *= int(x.shape[ii + 1])
  x = tf.reshape(x, [tf.shape(x)[0], -1])
  w = tf.get_variable(
      'DW', [prod_non_batch_dimensions, out_dim],
      initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
  b = tf.get_variable('biases', [out_dim],
                      initializer=tf.constant_initializer())
  return tf.nn.xw_plus_b(x, w, b)
def init_vq_bottleneck(bottleneck_size, hidden_size):
  """Get lookup table for VQ bottleneck."""
  means = tf.get_variable(
      name="means",
      shape=[bottleneck_size, hidden_size],
      initializer=tf.uniform_unit_scaling_initializer())
  ema_count = tf.get_variable(
      name="ema_count",
      shape=[bottleneck_size],
      initializer=tf.constant_initializer(0),
      trainable=False)
  with tf.colocate_with(means):
    ema_means = tf.get_variable(
        name="ema_means",
        initializer=means.initialized_value(),
        trainable=False)
  return means, ema_means, ema_count
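A short usage sketch; the codebook size and hidden size below are illustrative, not values from the original model:

# Builds a [256, 64] codebook plus the shadow variables used for
# exponential-moving-average codebook updates.
means, ema_means, ema_count = init_vq_bottleneck(bottleneck_size=256,
                                                 hidden_size=64)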
def __init__(self, embedding_dim, num_embeddings, commitment_cost,
             name='vq_layer'):
  super(VectorQuantizer, self).__init__(name=name)
  self._embedding_dim = embedding_dim
  self._num_embeddings = num_embeddings
  self._commitment_cost = commitment_cost

  with self._enter_variable_scope():
    initializer = tf.uniform_unit_scaling_initializer()
    self._w = tf.get_variable('embedding', [embedding_dim, num_embeddings],
                              initializer=initializer, trainable=True)
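A minimal construction sketch for this Sonnet-style module; the hyperparameter values are illustrative assumptions:

# The [embedding_dim, num_embeddings] codebook `_w` is created eagerly in the
# module's variable scope; commitment_cost weights the encoder-commitment loss.
vq = VectorQuantizer(embedding_dim=64, num_embeddings=512,
                     commitment_cost=0.25)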
def uniform_scaling(shape=None, factor=1.0, dtype=tf.float32, seed=None):
  """Uniform Scaling.

  Initialization with random values from a uniform distribution without
  scaling the variance. When initializing a deep network, it is in principle
  advantageous to keep the scale of the input variance constant, so it does
  not explode or diminish by the time it reaches the final layer. If the
  input is `x` and the operation is `x * W`, and we want to initialize `W`
  uniformly at random, we need to pick `W` from

      [-sqrt(3) / sqrt(dim), sqrt(3) / sqrt(dim)]

  to keep the scale intact, where `dim = W.shape[0]` (the size of the input).
  A similar calculation for convolutional networks gives an analogous result
  with `dim` equal to the product of the first 3 dimensions. When
  nonlinearities are present, we need to multiply this by a constant
  `factor`. See [Sussillo et al., 2014](https://arxiv.org/abs/1412.6558)
  ([pdf](http://arxiv.org/pdf/1412.6558.pdf)) for deeper motivation,
  experiments and the calculation of constants. In section 2.3 there, the
  constants were numerically computed: for a linear layer it's 1.0,
  relu: ~1.43, tanh: ~1.15.

  Arguments:
    shape: List of `int`. A shape to initialize a Tensor (optional).
    factor: `float`. A multiplicative factor by which the values will be
      scaled.
    dtype: The tensor data type. Only float types are supported.
    seed: `int`. Used to create a random seed for the distribution.

  Returns:
    The Initializer, or an initialized `Tensor` if shape is specified.
  """
  if shape:
    input_size = 1.0
    for dim in shape[:-1]:
      input_size *= float(dim)
    max_val = math.sqrt(3 / input_size) * factor
    return tf.random_uniform(shape, -max_val, max_val, dtype, seed=seed)
  else:
    return tf.uniform_unit_scaling_initializer(seed=seed, dtype=dtype)
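A worked example of the bound described in the docstring; the matrix shape is illustrative:

import math

# For a [256, 128] weight matrix, dim = 256, so with factor=1.0 values are
# drawn uniformly from roughly [-0.108, 0.108] (= sqrt(3/256)).
max_val = math.sqrt(3.0 / 256.0) * 1.0
print(round(max_val, 3))  # 0.108

# Equivalent call returning an initialized tensor of that shape.
w_init = uniform_scaling(shape=[256, 128], factor=1.0)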
def __init__(self, *args, **kwargs):
  super(TransformerAE, self).__init__(*args, **kwargs)
  self.predict_mask = 1.0

  # Define bottleneck function
  self._hparams.bottleneck = functools.partial(
      discretization.discrete_bottleneck,
      hidden_size=self._hparams.hidden_size,
      z_size=self._hparams.z_size,
      filter_size=self._hparams.filter_size,
      bottleneck_kind=self._hparams.bottleneck_kind,
      num_blocks=self._hparams.num_blocks,
      num_residuals=self.hparams.num_residuals,
      reshape_method=self._hparams.reshape_method,
      beta=self._hparams.beta,
      ema=self._hparams.ema,
      epsilon=self._hparams.epsilon,
      decay=self._hparams.decay,
      random_top_k=self._hparams.random_top_k,
      soft_em=self.hparams.soft_em,
      num_samples=self.hparams.num_samples,
      softmax_k=self._hparams.softmax_k,
      temperature_warmup_steps=self._hparams.temperature_warmup_steps,
      do_hard_gumbel_softmax=self._hparams.do_hard_gumbel_softmax,
      num_flows=self._hparams.num_flows,
      approximate_gs_entropy=self._hparams.approximate_gs_entropy,
      discrete_mix=self._hparams.d_mix,
      noise_dev=self._hparams.noise_dev,
      startup_steps=self.hparams.startup_steps,
      summary=_DO_SUMMARIES)

  # Set the discretization bottleneck specific things here
  if self._hparams.bottleneck_kind in ["dvq", "gumbel-softmax-dvq"]:
    z_size_per_residual = self._hparams.z_size / self._hparams.num_residuals
    block_dim = int(self._hparams.hidden_size // self._hparams.num_blocks)
    block_v_size = 2**(z_size_per_residual / self._hparams.num_blocks)
    block_v_size = int(block_v_size)

    if self._hparams.reshape_method == "project":
      tf.logging.info("Using projections for DVQ")
      tf.logging.info("Trainable projections = {}".format(
          self._hparams.trainable_projections))

      projection_tensors = tf.get_variable(
          name="projection",
          shape=[
              self._hparams.num_residuals, self._hparams.num_blocks,
              self._hparams.hidden_size, block_dim
          ],
          initializer=tf.initializers.glorot_uniform(),
          trainable=self._hparams.trainable_projections)

      self._hparams.bottleneck = functools.partial(
          self._hparams.bottleneck, projection_tensors=projection_tensors)
    elif self._hparams.reshape_method == "slice":
      tf.logging.info("Using slices for DVQ")
    else:
      raise ValueError("Unknown reshape method")

    means = tf.get_variable(
        name="means",
        shape=[
            self._hparams.num_residuals, self._hparams.num_blocks,
            block_v_size, block_dim
        ],
        initializer=tf.uniform_unit_scaling_initializer())

    # Create the shadow variables if we are using EMA
    ema_count = None
    ema_means = None
    if self._hparams.ema:
      ema_count = []
      for i in range(self._hparams.num_residuals):
        ema_count_i = tf.get_variable(
            "ema_count_{}".format(i),
            [self._hparams.num_blocks, block_v_size],
            initializer=tf.constant_initializer(0),
            trainable=False)
        ema_count.append(ema_count_i)
      with tf.colocate_with(means):
        ema_means = []
        for i in range(self._hparams.num_residuals):
          ema_means_i = tf.get_variable(
              "ema_means_{}".format(i),
              [self._hparams.num_blocks, block_v_size, block_dim],
              initializer=(
                  lambda shape, dtype=None, partition_info=None,  # pylint: disable=g-long-lambda
                  verify_shape=None: means.initialized_value()[i]),
              trainable=False)
          ema_means.append(ema_means_i)

    # Update bottleneck
    self._hparams.bottleneck = functools.partial(
        self._hparams.bottleneck,
        means=means,
        ema_count=ema_count,
        ema_means=ema_means)
def encoder(features, mode, vocab, hps):
  """Model function.

  Attention seq2seq model, augmented with an encoder over the targets of
  the nearest neighbors.

  Args:
    features: Dictionary of input Tensors.
    mode: train or eval. Keys from tf.estimator.ModeKeys.
    vocab: A list of strings of words in the vocabulary.
    hps: Hyperparams.

  Returns:
    Encoder outputs.
  """
  # [batch_size, src_len]
  src_inputs = features["src_inputs"]
  src_len = features["src_len"]

  with tf.variable_scope("embeddings"):
    embeddings = tf.get_variable(
        "embeddings", [vocab.size(), hps.emb_dim],
        dtype=tf.float32,
        initializer=tf.uniform_unit_scaling_initializer())

  # [batch_size, src_len, emb_dim]
  src_encoder_input_emb = tf.nn.embedding_lookup(embeddings, src_inputs)
  if mode == tf.estimator.ModeKeys.TRAIN and hps.emb_drop > 0.:
    src_encoder_input_emb = tf.nn.dropout(
        src_encoder_input_emb, keep_prob=1.0 - hps.emb_drop)

  src_att_context, neighbor_att_context = None, None
  src_copy_context, neighbor_copy_context = None, None

  with tf.variable_scope("src_encoder"):
    # 2 * [batch_size, src_len, encoder_dim]
    src_encoder_outputs, src_encoder_states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=get_rnn_cell(
            mode=mode, hps=hps,
            input_dim=hps.emb_dim,
            num_units=hps.encoder_dim,
            num_layers=hps.num_encoder_layers,
            dropout=hps.encoder_drop,
            cell_type=hps.rnn_cell),
        cell_bw=get_rnn_cell(
            mode=mode, hps=hps,
            input_dim=hps.emb_dim,
            num_units=hps.encoder_dim,
            num_layers=hps.num_encoder_layers,
            dropout=hps.encoder_drop,
            cell_type=hps.rnn_cell),
        inputs=src_encoder_input_emb,
        dtype=tf.float32,
        sequence_length=src_len)
    # [batch_size, src_len, 2*encoder_dim]
    src_encoder_outputs = tf.concat(src_encoder_outputs, 2)

  with tf.variable_scope("src_att_context"):
    src_att_context = _build_context(
        hps=hps, encoder_outputs=src_encoder_outputs)
  if hps.use_copy:
    with tf.variable_scope("src_copy_context"):
      src_copy_context = _build_context(
          hps=hps, encoder_outputs=src_encoder_outputs)

  if hps.model == "nn2seq":
    # [batch_size, neighbor_len]
    neighbor_inputs = features["neighbor_inputs"]
    neighbor_len = features["neighbor_len"]

    # [batch_size, neighbor_len, emb_dim]
    neighbor_input_emb = tf.nn.embedding_lookup(embeddings, neighbor_inputs)
    if mode == tf.estimator.ModeKeys.TRAIN and hps.emb_drop > 0.:
      neighbor_input_emb = tf.nn.dropout(
          neighbor_input_emb, keep_prob=1.0 - hps.emb_drop)

    with tf.variable_scope("neighbor_encoder"):
      # 2 * [batch_size, neighbor_len, encoder_dim]
      neighbor_encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
          cell_fw=get_rnn_cell(
              mode=mode, hps=hps,
              input_dim=hps.emb_dim,
              num_units=hps.encoder_dim,
              num_layers=1,
              dropout=hps.encoder_drop,
              cell_type=hps.rnn_cell),
          cell_bw=get_rnn_cell(
              mode=mode, hps=hps,
              input_dim=hps.emb_dim,
              num_units=hps.encoder_dim,
              num_layers=1,
              dropout=hps.encoder_drop,
              cell_type=hps.rnn_cell),
          inputs=neighbor_input_emb,
          dtype=tf.float32,
          sequence_length=neighbor_len)
      neighbor_encoder_outputs = tf.concat(neighbor_encoder_outputs, 2)

    with tf.variable_scope("neighbor_att_context"):
      neighbor_att_context = _build_context(
          hps=hps, encoder_outputs=neighbor_encoder_outputs)
    if hps.use_copy:
      with tf.variable_scope("neighbor_copy_context"):
        neighbor_copy_context = _build_context(
            hps=hps, encoder_outputs=neighbor_encoder_outputs)

  att_context, copy_context = None, None
  if hps.model == "nn2seq":
    att_context = tf.concat([src_att_context, neighbor_att_context], 1)
    if hps.use_copy:
      copy_context = tf.concat(
          [src_copy_context, neighbor_copy_context], 1)
  elif hps.model == "seq2seq":
    att_context = src_att_context
    if hps.use_copy:
      copy_context = src_copy_context
  else:
    assert False, "baseline `model` should be [`nn2seq`, `seq2seq`]."

  if hps.use_bridge:
    with tf.variable_scope("bridge"):
      out_dim = hps.num_decoder_layers * hps.decoder_dim
      if hps.rnn_cell == "lstm":
        fw_states, bw_states = src_encoder_states
        hs = tf.concat([fw_states[-1].h, bw_states[-1].h], axis=1)
        cs = tf.concat([fw_states[-1].c, bw_states[-1].c], axis=1)
        h_state = tf.layers.dense(
            hs, units=out_dim, activation=tf.nn.tanh, use_bias=True,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name="h_layer")
        c_state = tf.layers.dense(
            cs, units=out_dim, activation=tf.nn.tanh, use_bias=True,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name="c_layer")
      elif hps.rnn_cell == "gru":
        fw_states, bw_states = src_encoder_states
        hs = tf.concat([fw_states[-1], bw_states[-1]], axis=1)
        h_state = tf.layers.dense(
            hs, units=out_dim, activation=tf.nn.tanh, use_bias=True,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name="h_layer")
        c_state = None
  else:
    h_state, c_state = None, None

  return EncoderOutputs(
      embeddings=embeddings,
      att_context=att_context,
      copy_context=copy_context,
      states=(h_state, c_state))
def lenet(use_pretrained=False):
  # modified from the LeNet model
  if not use_pretrained:
    # Random initialization
    weights = {
        'conv1': tf.get_variable(
            'LN_conv1_w', [5, 5, 3, 64],
            initializer=tf.uniform_unit_scaling_initializer()),
        'conv2': tf.get_variable(
            'LN_conv2_w', [5, 5, 64, 128],
            initializer=tf.uniform_unit_scaling_initializer()),
        'ip1': tf.get_variable(
            'LN_ip1_w', [5 * 5 * 128, 1024],
            initializer=tf.uniform_unit_scaling_initializer()),
        'ip2': tf.get_variable(
            'LN_ip2_w', [1024, 10],
            initializer=tf.uniform_unit_scaling_initializer())
    }
    biases = {
        'conv1': tf.Variable(tf.random_normal(shape=[64], stddev=0.5),
                             name='LN_conv1_b'),
        'conv2': tf.Variable(tf.random_normal(shape=[128], stddev=0.5),
                             name='LN_conv2_b'),
        'ip1': tf.Variable(tf.random_normal(shape=[1024], stddev=0.5),
                           name='LN_ip1_b'),
        'ip2': tf.Variable(tf.random_normal(shape=[10], stddev=0.5),
                           name='LN_ip2_b')
    }
  else:
    # Initialize from pre-trained weights
    npyfile = np.load('student.npy')
    npyfile = npyfile.item()
    weights = {
        'conv1': tf.Variable(npyfile['conv1']['weights'], name='LN_conv1_w'),
        'conv2': tf.Variable(npyfile['conv2']['weights'], name='LN_conv2_w'),
        'ip1': tf.Variable(npyfile['ip1']['weights'], name='LN_ip1_w'),
        'ip2': tf.Variable(npyfile['ip2']['weights'], name='LN_ip2_w'),
    }
    biases = {
        'conv1': tf.Variable(npyfile['conv1']['biases'], name='LN_conv1_b'),
        'conv2': tf.Variable(npyfile['conv2']['biases'], name='LN_conv2_b'),
        'ip1': tf.Variable(npyfile['ip1']['biases'], name='LN_ip1_b'),
        'ip2': tf.Variable(npyfile['ip2']['biases'], name='LN_ip2_b'),
    }

  conv1 = conv(x, weights['conv1'], biases['conv1'], padding='VALID')
  pool1 = maxpool2d(conv1, k=2, s=2)
  conv2 = conv(pool1, weights['conv2'], biases['conv2'], padding='VALID')
  pool2 = maxpool2d(conv2, k=2, s=2, padding='VALID')

  ip1 = tf.reshape(pool2, [-1, weights['ip1'].get_shape().as_list()[0]])
  ip1 = tf.add(tf.matmul(ip1, weights['ip1']), biases['ip1'])
  ip1_relu = tf.nn.relu(ip1)
  ip2 = tf.add(tf.matmul(ip1_relu, weights['ip2']), biases['ip2'])
  return ip2
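A hedged usage sketch: the function reads a module-level input tensor `x` (assumed here to be a CIFAR-10-sized placeholder) and the `conv`/`maxpool2d` helpers from the same file:

# With a 32x32x3 input, the two VALID 5x5 convs and 2x2 pools reduce the
# feature map to 5x5x128, matching the 5*5*128 fan-in of `ip1`.
x = tf.placeholder(tf.float32, [None, 32, 32, 3])
logits = lenet(use_pretrained=False)  # [batch, 10] class scores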
def __init__(self,
             num_units,
             mem_input,
             use_peepholes=False,
             cell_clip=None,
             initializer=None,
             num_proj=None,
             proj_clip=None,
             num_unit_shards=None,
             num_proj_shards=None,
             forget_bias=1.0,
             state_is_tuple=True,
             activation=None,
             reuse=None,
             name=None,
             dtype=None,
             use_beam=False,
             hps=None):
  """Initialize the HyperLSTM cell.

  Args:
    num_units: int, The number of units in the LSTM cell.
    mem_input: mem_input.
    use_peepholes: bool, use peephole connections or not.
    cell_clip: (optional) A float value, if provided the cell state is
      clipped by this value prior to the cell output activation.
    initializer: (optional) The initializer to use for the weight and
      projection matrices.
    num_proj: (optional) int, The output dimensionality for the projection
      matrices. If None, no projection is performed.
    proj_clip: (optional) A float value. If `num_proj > 0` and `proj_clip` is
      provided, then the projected values are clipped elementwise to within
      `[-proj_clip, proj_clip]`.
    num_unit_shards: Deprecated, will be removed by Jan. 2017. Use a
      variable_scope partitioner instead.
    num_proj_shards: Deprecated, will be removed by Jan. 2017. Use a
      variable_scope partitioner instead.
    forget_bias: float, The bias added to forget gates (see above). Must be
      set to `0.0` manually when restoring from CudnnLSTM-trained
      checkpoints.
    state_is_tuple: If True, accepted and returned states are 2-tuples of the
      `c_state` and `m_state`. If False, they are concatenated along the
      column axis. The latter behavior will soon be deprecated.
    activation: Activation function of the inner states. Default: `tanh`.
    reuse: (optional) Python boolean describing whether to reuse variables in
      an existing scope. If not `True`, and the existing scope already has
      the given variables, an error is raised.
    name: String, the name of the layer. Layers with the same name will share
      weights, but to avoid mistakes we require reuse=True in such cases.
    dtype: Default dtype of the layer (default of `None` means use the type
      of the first input). Required when `build` is called before `call`.
    use_beam: Use beam search or not.
    hps: hyperparameters.
  """
  super(HyperLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
  if not state_is_tuple:
    tf.logging.warn(
        "%s: Using a concatenated state is slower and will soon "
        "be deprecated. Use state_is_tuple=True.", self)
  if num_unit_shards is not None or num_proj_shards is not None:
    tf.logging.warn(
        "%s: The num_unit_shards and proj_unit_shards parameters are "
        "deprecated and will be removed in Jan 2017. "
        "Use a variable scope with a partitioner instead.", self)

  assert not use_peepholes, "currently not supporting peephole connections"
  assert hps is not None

  # Inputs must be 2-dimensional.
  self.input_spec = tf.layers.InputSpec(ndim=2)

  self._num_units = num_units
  self._rank = hps.rank
  assert self._rank == self._num_units or self._rank == 2 * self._num_units
  self._use_peepholes = use_peepholes
  self._cell_clip = cell_clip
  self._initializer = initializer
  self._num_proj = num_proj
  self._proj_clip = proj_clip
  self._num_unit_shards = num_unit_shards
  self._num_proj_shards = num_proj_shards
  self._forget_bias = forget_bias
  self._state_is_tuple = state_is_tuple
  self._activation = activation or tf.tanh
  self._sigma_norm = hps.sigma_norm
  self._beam_width = hps.beam_width
  self._mem_input = mem_input
  self._use_beam = use_beam

  if num_proj:
    self._state_size = (
        tf.nn.rnn_cell.LSTMStateTuple(num_units, num_proj)
        if state_is_tuple else num_units + num_proj)
    self._output_size = num_proj
  else:
    self._state_size = (
        tf.nn.rnn_cell.LSTMStateTuple(num_units, num_units)
        if state_is_tuple else 2 * num_units)
    self._output_size = num_units

  input_depth = hps.emb_dim + hps.decoder_dim
  # if hps.encode_neighbor:
  #   input_depth += hps.decoder_dim
  h_depth = self._num_units if self._num_proj is None else self._num_proj
  maybe_partitioner = (
      tf.fixed_size_partitioner(self._num_unit_shards)
      if self._num_unit_shards is not None else None)

  # `u`s are matrices of [input_shape, rank], `v`s being [rank, hidden_size];
  # they are the collection of rank-1 parameter matrices.
  # The full parameter matrix is constructed by taking `U \sigma V`, with the
  # diagonal matrix `\sigma` computed in the `self.initialize` function.
  redundant_rank = (self._rank > self._num_units)

  # `u`, `v` used to construct matrix from input `x` to input_gate `i`.
  u_xi, v_xi = self._orthogonal_init(
      shape=[input_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_xi = tf.get_variable(
      "u_xi/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_xi,
      partitioner=maybe_partitioner)
  self._v_xi = tf.get_variable(
      "v_xi/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_xi,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct matrix that maps input `x` to cell_state `j`.
  u_xj, v_xj = self._orthogonal_init(
      shape=[input_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_xj = tf.get_variable(
      "u_xj/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_xj,
      partitioner=maybe_partitioner)
  self._v_xj = tf.get_variable(
      "v_xj/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_xj,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct matrix that maps input `x` to forget_gate `f`.
  u_xf, v_xf = self._orthogonal_init(
      shape=[input_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_xf = tf.get_variable(
      "u_xf/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_xf,
      partitioner=maybe_partitioner)
  self._v_xf = tf.get_variable(
      "v_xf/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_xf,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct matrix that maps input `x` to output_gate `o`.
  u_xo, v_xo = self._orthogonal_init(
      shape=[input_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_xo = tf.get_variable(
      "u_xo/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_xo,
      partitioner=maybe_partitioner)
  self._v_xo = tf.get_variable(
      "v_xo/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_xo,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct matrix that maps hid_state `h` to
  # input_gate `i`.
  u_hi, v_hi = self._orthogonal_init(
      shape=[h_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_hi = tf.get_variable(
      "u_hi/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_hi,
      partitioner=maybe_partitioner)
  self._v_hi = tf.get_variable(
      "v_hi/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_hi,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct matrix that maps hid_state `h` to
  # cell_state `j`.
  u_hj, v_hj = self._orthogonal_init(
      shape=[h_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_hj = tf.get_variable(
      "u_hj/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_hj,
      partitioner=maybe_partitioner)
  self._v_hj = tf.get_variable(
      "v_hj/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_hj,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct matrix that maps hid_state `h` to
  # forget_gate `f`.
  u_hf, v_hf = self._orthogonal_init(
      shape=[h_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_hf = tf.get_variable(
      "u_hf/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_hf,
      partitioner=maybe_partitioner)
  self._v_hf = tf.get_variable(
      "v_hf/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_hf,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct matrix that maps hid_state `h` to
  # output_gate `o`.
  u_ho, v_ho = self._orthogonal_init(
      shape=[h_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_ho = tf.get_variable(
      "u_ho/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_ho,
      partitioner=maybe_partitioner)
  self._v_ho = tf.get_variable(
      "v_ho/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_ho,
      partitioner=maybe_partitioner)

  self._c = tf.get_variable(
      "c/%s" % _WEIGHTS_VARIABLE_NAME,
      shape=[self._num_units, self._rank],
      initializer=tf.contrib.layers.xavier_initializer(),
      partitioner=maybe_partitioner)

  initializer = tf.zeros_initializer(dtype=tf.float32)
  self._b = tf.get_variable(
      "b/%s" % _BIAS_VARIABLE_NAME,
      shape=[4 * h_depth, self._rank],
      initializer=initializer)

  if self._num_proj is not None:
    if self._num_proj_shards is not None:
      maybe_proj_partitioner = tf.fixed_size_partitioner(
          self._num_proj_shards)
    else:
      maybe_proj_partitioner = None
    self._proj_kernel = self.add_variable(
        "projection/%s" % _WEIGHTS_VARIABLE_NAME,
        shape=[self._num_units, self._num_proj],
        initializer=tf.uniform_unit_scaling_initializer(),
        partitioner=maybe_proj_partitioner)

  self.initialize()
  self.built = True
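A hypothetical construction sketch; the `hps` values and the `mem_input` shape are assumptions inferred from how they are read above (`rank` must equal `num_units` or `2 * num_units`), not the repository's actual configuration:

# Assumed hyperparameters; only the fields read in __init__ are set here.
hps = tf.contrib.training.HParams(
    rank=256, sigma_norm=1.0, beam_width=5, emb_dim=128, decoder_dim=256)
batch_size = 16
mem_input = tf.zeros([batch_size, hps.decoder_dim])  # assumed shape
cell = HyperLSTMCell(num_units=256, mem_input=mem_input, hps=hps)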