Example #1
  def testPartitioners(self):
    partitioners = {
        "gamma": tf.fixed_size_partitioner(num_shards=2),
        "beta": tf.fixed_size_partitioner(num_shards=2),
    }

    inputs = tf.placeholder(tf.float32, shape=[None, 10])
    ln = snt.LayerNorm(partitioners=partitioners)
    self.assertEqual(ln.partitioners, partitioners)
    ln(inputs)

    self.assertEqual(type(ln.gamma), variables.PartitionedVariable)
    self.assertEqual(type(ln.beta), variables.PartitionedVariable)
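The Sonnet tests above only assert on variable types. Below is a minimal standalone sketch (assuming TensorFlow 1.x graph mode; the scope and variable names are illustrative) of what tf.fixed_size_partitioner does when attached to a variable scope:

import tensorflow as tf

with tf.variable_scope("ln_params",
                       partitioner=tf.fixed_size_partitioner(num_shards=2)):
  # A [10] vector becomes a PartitionedVariable backed by two [5] shards.
  gamma = tf.get_variable("gamma", shape=[10], dtype=tf.float32,
                          initializer=tf.ones_initializer())

for shard in list(gamma):
  # e.g. ln_params/gamma/part_0:0 (5,) and ln_params/gamma/part_1:0 (5,)
  print(shard.name, shard.shape)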
Example #2
    def testPartitioners(self):
        if tf.executing_eagerly():
            self.skipTest(
                "Partitioned variables are not supported in eager mode.")

        inputs = tf.ones(dtype=tf.float32,
                         shape=[self.batch_size, self.in_size])
        prev_state = tf.ones(dtype=tf.float32,
                             shape=[self.batch_size, self.hidden_size])

        with self.assertRaisesRegexp(KeyError, "Invalid partitioner keys.*"):
            snt.VanillaRNN(name="rnn",
                           hidden_size=self.hidden_size,
                           partitioners={"invalid": None})

        err = "Partitioner for 'w' is not a callable function"
        with self.assertRaisesRegexp(TypeError, err):
            snt.VanillaRNN(
                name="rnn",
                hidden_size=self.hidden_size,
                partitioners={"in_to_hidden": {
                    "w": tf.zeros([10, 10])
                }})

        # Nested partitioners.
        valid_partitioners = {
            "in_to_hidden": {
                "w": tf.fixed_size_partitioner(num_shards=2),
                "b": tf.fixed_size_partitioner(num_shards=2),
            },
            "hidden_to_hidden": {
                "w": tf.fixed_size_partitioner(num_shards=2),
                "b": tf.fixed_size_partitioner(num_shards=2),
            }
        }

        vanilla_rnn = snt.VanillaRNN(name="rnn",
                                     hidden_size=self.hidden_size,
                                     partitioners=valid_partitioners)

        vanilla_rnn(inputs, prev_state)

        self.assertEqual(type(vanilla_rnn.in_to_hidden_linear.w),
                         variables.PartitionedVariable)
        self.assertEqual(type(vanilla_rnn.in_to_hidden_linear.b),
                         variables.PartitionedVariable)
        self.assertEqual(type(vanilla_rnn.hidden_to_hidden_linear.w),
                         variables.PartitionedVariable)
        self.assertEqual(type(vanilla_rnn.hidden_to_hidden_linear.b),
                         variables.PartitionedVariable)
Example #3
def create_emb_for_encoder_and_decoder(vocab_size,
                                       embed_size,
                                       dtype=tf.float32,
                                       num_partitions=0,
                                       scope=None):
    """Create embedding matrix for both encoder and decoder."""

    if num_partitions <= 1:
        partitioner = None
    else:
        # Note: num_partitions > 1 is required for distributed training because
        # embedding_lookup tries to colocate a single-partition embedding variable
        # with the lookup ops, which may cause embedding variables to be placed on
        # worker jobs.
        partitioner = tf.fixed_size_partitioner(num_partitions)

    with tf.variable_scope(scope or "embeddings",
                           dtype=dtype,
                           partitioner=partitioner) as scope:
        # Share embedding
        embedding_encoder = tf.get_variable("shared_embedding",
                                            [vocab_size, embed_size], dtype)
        embedding_decoder = embedding_encoder

    return embedding_encoder, embedding_decoder
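A usage sketch for the function above (TensorFlow 1.x assumed; the vocabulary size, embedding size, and placeholder shape are illustrative). tf.nn.embedding_lookup accepts the partitioned variable directly, so the lookup looks the same with or without sharding:

encoder_emb, decoder_emb = create_emb_for_encoder_and_decoder(
    vocab_size=32000, embed_size=512, num_partitions=4)

ids = tf.placeholder(tf.int32, shape=[None, None])  # [batch, time]
# embedding_lookup gathers rows from whichever shard holds them.
encoder_inputs = tf.nn.embedding_lookup(encoder_emb, ids)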
Example #4
def model_fn(ids, labels):
    # Commented-out alternative using tf.get_embedding_variable:
    # embedding = tf.get_embedding_variable(
    #     "var_dist", embedding_dim=24, steps_to_live=4000,
    #     steps_to_live_l2reg=1024 * 1024, l2reg_theta=0.01, l2reg_lambda=0.01,
    #     initializer=tf.ones_initializer,
    #     partitioner=tf.fixed_size_partitioner(num_shards=4))
    embedding = tfra.embedding_variable.get_variable(
        name="var_dist",
        embedding_dim=24,
        partitioner=tf.fixed_size_partitioner(num_shards=4),
        initializer=tf.keras.initializers.RandomNormal(0.0, 0.1))

    values = tf.nn.embedding_lookup(embedding, ids)

    features = tf.reshape(values, shape=[1024, 24])

    # W = tf.Variable(tf.zeros([24, 10]), initializer=tf.initializers.random_uniform)
    # b = tf.Variable(tf.zeros([10]), initializer=tf.initializers.random_uniform)
    W = tf.Variable(tf.zeros([24, 10]))
    b = tf.Variable(tf.zeros([10]))

    pred = tf.matmul(features, W) + b

    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=labels))

    global_step = tf.train.get_or_create_global_step()
    optimizer = tfra.embedding_variable.AdagradOptimizer(
        learning_rate=0.001).minimize(loss, global_step)

    tf.summary.scalar("loss", loss)
    return loss, optimizer, features
Example #5
  def testPartitioners(self, offset, scale):
    partitioners = {}

    if scale:
      partitioners["gamma"] = tf.fixed_size_partitioner(num_shards=2)
    if offset:
      partitioners["beta"] = tf.fixed_size_partitioner(num_shards=2)

    inputs_shape = [10, 10]
    inputs = tf.placeholder(tf.float32, shape=[None] + inputs_shape)
    bn = snt.BatchNorm(offset=offset, scale=scale, partitioners=partitioners)
    self.assertEqual(bn.partitioners, partitioners)
    bn(inputs, is_training=True)

    if scale:
      self.assertEqual(type(bn.gamma), variables.PartitionedVariable)
    if offset:
      self.assertEqual(type(bn.beta), variables.PartitionedVariable)
Example #6
    def test_load_with_partitioner_raises_error(self):
        model = self.Model()
        model_dir = self.get_temp_dir()
        tf.saved_model.save(model, model_dir)

        strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
            self.cluster_resolver, tf1.fixed_size_partitioner(2))
        with self.assertRaisesRegex(ValueError, "`variable_partitioner`"):
            with strategy.scope():
                tf.saved_model.load(model_dir)
Example #7
  def test_load_with_partitioner_raises_error(self):
    model = self.Model()
    model_dir = self.get_temp_dir()
    tf.saved_model.save(model, model_dir)

    strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
        self.cluster_resolver, tf1.fixed_size_partitioner(2))
    with self.assertRaises(errors_impl.InvalidArgumentError):
      with strategy.scope():
        tf.saved_model.load(model_dir)
Example #8
  def test_sharded_variable(self):
    strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
        self.cluster_resolver, tf1.fixed_size_partitioner(2))
    model_dir = self.get_temp_dir()
    with strategy.scope():
      m = self.Model()
      self.assertIsInstance(m.v1, sharded_variable.ShardedVariable)
    m.train()
    tf.saved_model.save(m, model_dir)

    self.assertAllEqual(self.load_and_run_v1(model_dir, {"x": 1}), [6, 6, 6, 6])
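For reference, a sketch of the same idea spelled with the public TF2 API, assuming cluster_resolver already describes a running parameter-server cluster (this does not run standalone):

import tensorflow as tf

partitioners = tf.distribute.experimental.partitioners
strategy = tf.distribute.experimental.ParameterServerStrategy(
    cluster_resolver,
    variable_partitioner=partitioners.FixedShardsPartitioner(num_shards=2))

with strategy.scope():
  # Variables created in scope are split along their first axis into two
  # shards and surfaced as a ShardedVariable.
  v = tf.Variable(tf.zeros([1024, 64]))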
Example #9
  def testPartitioners(self, offset, scale):
    partitioners = {}

    if scale:
      partitioners["gamma"] = tf.fixed_size_partitioner(num_shards=2)
    if offset:
      partitioners["beta"] = tf.fixed_size_partitioner(num_shards=2)

    inputs_shape = [10, 10]
    inputs = tf.placeholder(tf.float32, shape=[None] + inputs_shape)
    bn = snt.BatchNormV2(
        offset=offset,
        scale=scale,
        partitioners=partitioners)
    self.assertEqual(bn.partitioners, partitioners)
    bn(inputs, is_training=True)

    if scale:
      self.assertLen(tf.global_variables("batch_norm/gamma"), 2)
    if offset:
      self.assertLen(tf.global_variables("batch_norm/beta"), 2)
Example #10
    def testPartitionedVariable(self):
        save_path = os.path.join(self.get_temp_dir(), 'partitioned_variable')
        var_name = 'my_partitioned_var'

        g1 = tf.Graph()
        with g1.as_default():

            def initializer1(shape, dtype, partition_info):
                _ = partition_info  # Not used for creation.
                return tf.constant(True, dtype, shape)

            partitioned_var1 = list(
                tf.get_variable(var_name,
                                shape=[1 << 3, 10],
                                partitioner=tf.fixed_size_partitioner(4),
                                initializer=initializer1,
                                dtype=tf.bool))

            with self.test_session(graph=g1) as session:
                with tf.device('/cpu:0'):
                    tf.global_variables_initializer().run()
                    pv1 = session.run(partitioned_var1)
                    save = tf.train.Saver(partitioned_var1)
                    save.save(session, save_path)

        g2 = tf.Graph()
        with g2.as_default():
            initializer2 = initializers.restore_initializer(
                save_path, var_name, '')
            partitioned_var2 = list(
                tf.get_variable(var_name,
                                shape=[1 << 3, 10],
                                partitioner=tf.fixed_size_partitioner(4),
                                initializer=initializer2,
                                dtype=tf.bool))
            with self.test_session(graph=g2) as session:
                tf.global_variables_initializer().run()
                pv2 = session.run(partitioned_var2)

        self.assertAllEqual(pv1, pv2)
Example #11
  def testPartitioners(self):
    if tf.executing_eagerly():
      self.skipTest("Eager does not support partitioned variables.")

    partitioners = {
        "w": tf.fixed_size_partitioner(num_shards=2),
        "b": tf.fixed_size_partitioner(num_shards=2),
    }

    alex_net = snt.nets.AlexNetMini(
        partitioners=partitioners, name="alexnet1")

    input_shape = [alex_net._min_size, alex_net._min_size, 3]
    inputs = tf.placeholder(tf.float32, shape=[None] + input_shape)
    alex_net(inputs)

    for conv_module in alex_net.conv_modules:
      self.assertEqual(type(conv_module.w), variables.PartitionedVariable)
      self.assertEqual(type(conv_module.b), variables.PartitionedVariable)

    for linear_module in alex_net.linear_modules:
      self.assertEqual(type(linear_module.w), variables.PartitionedVariable)
      self.assertEqual(type(linear_module.b), variables.PartitionedVariable)
Example #12
    def setUp(self):
        super(MLPTest, self).setUp()

        self.output_sizes = [11, 13, 17]
        self.batch_size = 5
        self.input_size = 7
        self.module_name = "mlp"
        self.initializers = {
            "w": tf.truncated_normal_initializer(stddev=1.0),
        }
        self.regularizers = {
            "w": contrib_layers.l1_regularizer(scale=0.1),
        }
        self.partitioners = {
            "w": tf.fixed_size_partitioner(num_shards=2),
        }
Example #13
  def test_return_all_variables_from_checkpoint_with_partition(self):
    with tf.Graph().as_default():
      partitioner = tf.fixed_size_partitioner(2)
      variables = [
          tf.get_variable(name='weights',
                          shape=(2, 2),
                          partitioner=partitioner),
          tf.Variable([1.0, 2.0], name='biases')
      ]
      checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt')
      init_op = tf.global_variables_initializer()
      saver = tf.train.Saver(variables)
      with self.test_session() as sess:
        sess.run(init_op)
        saver.save(sess, checkpoint_path)
      out_variables = variables_helper.get_variables_available_in_checkpoint(
          variables, checkpoint_path)
    self.assertCountEqual(out_variables, variables)
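As a quick sanity check (a sketch; it reuses the checkpoint_path from the test above), the shards of a partitioned variable carry save-slice info, so the saver merges them into a single checkpoint entry under the base name with the full shape:

for name, shape in tf.train.list_variables(checkpoint_path):
  print(name, shape)  # expected: biases [2] and weights [2, 2]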
Example #14
    def test_sharded_variable(self):
        strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
            self.cluster_resolver, tf1.fixed_size_partitioner(2))
        model_dir = self.get_temp_dir()
        with strategy.scope():
            m = self.Model()
            self.assertIsInstance(m.v1, sharded_variable.ShardedVariable)
        m.train()
        tf.saved_model.save(m, model_dir)

        # ShardedVariable loading only works in v1.
        self.assertAllEqual(self.load_and_run_v1(model_dir, {"x": 1}),
                            [6, 6, 6, 6])

        with self.assertRaisesWithLiteralMatch(
                ValueError, "Loading `ShardedVariable` is not supported"):
            with strategy.scope():
                tf.saved_model.load(model_dir)

        with self.assertRaisesWithLiteralMatch(
                ValueError, "Loading `ShardedVariable` is not supported"):
            tf.saved_model.load(model_dir)
Example #15
def create_emb_for_encoder_and_decoder(src_vocab_size,
                                       tgt_vocab_size,
                                       src_embed_size,
                                       tgt_embed_size,
                                       dtype=tf.float32,
                                       num_enc_partitions=0,
                                       num_dec_partitions=0,
                                       src_vocab_file=None,
                                       tgt_vocab_file=None,
                                       src_embed_file=None,
                                       tgt_embed_file=None,
                                       scope=None):
  """Create embedding matrix for both encoder and decoder.

  Args:
    src_vocab_size: An integer. The source vocab size.
    tgt_vocab_size: An integer. The target vocab size.
    src_embed_size: An integer. The embedding dimension for the encoder's
      embedding.
    tgt_embed_size: An integer. The embedding dimension for the decoder's
      embedding.
    dtype: dtype of the embedding matrix. Default to float32.
    num_enc_partitions: number of partitions used for the encoder's embedding
      vars.
    num_dec_partitions: number of partitions used for the decoder's embedding
      vars.
    src_vocab_file: A string. The source vocabulary file.
    tgt_vocab_file: A string. The target vocabulary file.
    src_embed_file: A string. The source embedding file.
    tgt_embed_file: A string. The target embedding file.
    scope: VariableScope for the created subgraph. Default to "embedding".

  Returns:
    embedding_encoder: Encoder's embedding matrix.
    embedding_decoder: Decoder's embedding matrix.

  Raises:
    ValueError: if source and target have different vocab size.
  """
  if num_enc_partitions <= 1:
    enc_partitioner = None
  else:
    # Note: num_partitions > 1 is required for distributed training because
    # embedding_lookup tries to colocate a single-partition embedding variable
    # with the lookup ops, which may cause embedding variables to be placed on
    # worker jobs.
    enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions)

  if num_dec_partitions <= 1:
    dec_partitioner = None
  else:
    # Note: num_partitions > 1 is required for distributed training because
    # embedding_lookup tries to colocate a single-partition embedding variable
    # with the lookup ops, which may cause embedding variables to be placed on
    # worker jobs.
    dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions)

  if src_embed_file and enc_partitioner:
    raise ValueError(
        "Can't set num_enc_partitions > 1 when using pretrained encoder "
        "embedding")

  if tgt_embed_file and dec_partitioner:
    raise ValueError(
        "Can't set num_dec_partitions > 1 when using pretrained decdoer "
        "embedding")

  with tf.variable_scope(
      scope or "embeddings", dtype=dtype, partitioner=enc_partitioner) as scope:
    if src_vocab_size != tgt_vocab_size:
      raise ValueError("Share embedding but different src/tgt vocab sizes"
                       " %d vs. %d" % (src_vocab_size, tgt_vocab_size))
    assert src_embed_size == tgt_embed_size
    utils.print_out("# Use the same embedding for source and target")
    vocab_file = src_vocab_file or tgt_vocab_file
    embed_file = src_embed_file or tgt_embed_file

    embedding_encoder = _create_or_load_embed(
        "embedding_share", vocab_file, embed_file,
        src_vocab_size, src_embed_size, dtype)
    embedding_decoder = embedding_encoder

  return embedding_encoder, embedding_decoder
Example #16
File: hyperlstm.py Project: yyht/language
    def __init__(self,
                 num_units,
                 mem_input,
                 use_peepholes=False,
                 cell_clip=None,
                 initializer=None,
                 num_proj=None,
                 proj_clip=None,
                 num_unit_shards=None,
                 num_proj_shards=None,
                 forget_bias=1.0,
                 state_is_tuple=True,
                 activation=None,
                 reuse=None,
                 name=None,
                 dtype=None,
                 use_beam=False,
                 hps=None):
        """Initialize the HyperLSTM cell.

    Args:
      num_units: int, The number of units in the LSTM cell.
      mem_input: mem_input.
      use_peepholes: bool, use peephole connections or not.
      cell_clip: (optional) A float value, if provided the cell state is clipped
        by this value prior to the cell output activation.
      initializer: (optional) The initializer to use for the weight and
        projection matrices.
      num_proj: (optional) int, The output dimensionality for the projection
        matrices.  If None, no projection is performed.
      proj_clip: (optional) A float value.  If `num_proj > 0` and `proj_clip` is
        provided, then the projected values are clipped elementwise to within
        `[-proj_clip, proj_clip]`.
      num_unit_shards: Deprecated, will be removed by Jan. 2017.
        Use a variable_scope partitioner instead.
      num_proj_shards: Deprecated, will be removed by Jan. 2017.
        Use a variable_scope partitioner instead.
      forget_bias: float, The bias added to forget gates (see above).
        Must set to `0.0` manually when restoring from CudnnLSTM-trained
        checkpoints.
      state_is_tuple: If True, accepted and returned states are 2-tuples of
        the `c_state` and `m_state`.  If False, they are concatenated
        along the column axis.  The latter behavior will soon be deprecated.
      activation: Activation function of the inner states.  Default: `tanh`.
      reuse: (optional) Python boolean describing whether to reuse variables
        in an existing scope.  If not `True`, and the existing scope already has
        the given variables, an error is raised.
      name: String, the name of the layer. Layers with the same name will
        share weights, but to avoid mistakes we require reuse=True in such
        cases.
      dtype: Default dtype of the layer (default of `None` means use the type
        of the first input). Required when `build` is called before `call`.
      use_beam: Use beam search or not.
      hps: hyperparameters.
    """

        super(HyperLSTMCell, self).__init__(_reuse=reuse,
                                            name=name,
                                            dtype=dtype)
        if not state_is_tuple:
            tf.logging.warn(
                "%s: Using a concatenated state is slower and will soon "
                "be deprecated.  Use state_is_tuple=True.", self)
        if num_unit_shards is not None or num_proj_shards is not None:
            tf.logging.warn(
                "%s: The num_unit_shards and proj_unit_shards parameters are "
                "deprecated and will be removed in Jan 2017.  "
                "Use a variable scope with a partitioner instead.", self)

        assert not use_peepholes, "currently not supporting peephole connections"
        assert hps is not None
        # Inputs must be 2-dimensional.
        self.input_spec = tf.layers.InputSpec(ndim=2)

        self._num_units = num_units
        self._rank = hps.rank
        assert self._rank == self._num_units or self._rank == 2 * self._num_units
        self._use_peepholes = use_peepholes
        self._cell_clip = cell_clip
        self._initializer = initializer
        self._num_proj = num_proj
        self._proj_clip = proj_clip
        self._num_unit_shards = num_unit_shards
        self._num_proj_shards = num_proj_shards
        self._forget_bias = forget_bias
        self._state_is_tuple = state_is_tuple
        self._activation = activation or tf.tanh
        self._sigma_norm = hps.sigma_norm
        self._beam_width = hps.beam_width
        self._mem_input = mem_input
        self._use_beam = use_beam

        if num_proj:
            self._state_size = (tf.nn.rnn_cell.LSTMStateTuple(
                num_units, num_proj) if state_is_tuple else num_units +
                                num_proj)
            self._output_size = num_proj
        else:
            self._state_size = (tf.nn.rnn_cell.LSTMStateTuple(
                num_units, num_units) if state_is_tuple else 2 * num_units)
            self._output_size = num_units

        input_depth = hps.emb_dim + hps.decoder_dim
        # if hps.encode_neighbor:
        #   input_depth += hps.decoder_dim
        h_depth = self._num_units if self._num_proj is None else self._num_proj

        maybe_partitioner = (tf.fixed_size_partitioner(self._num_unit_shards)
                             if self._num_unit_shards is not None else None)

        # `u`s are matrices of [input_shape, rank], `v`s being [rank, hidden_size]
        # they are the collection of rank-1 parameter matrices.
        # The full parameter matrix is constructed by taking `U\sigma V`,
        # with diagonal matrix `\sigma` computed in the `self.initialize` function.

        redundant_rank = (self._rank > self._num_units)
        # `u`, `v` used to construct matrix from input `x` to input_gate `i`.
        u_xi, v_xi = self._orthogonal_init(
            shape=[input_depth, self._num_units],
            initializer=initializer,
            redundant_rank=redundant_rank)
        self._u_xi = tf.get_variable("u_xi/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_xi,
                                     partitioner=maybe_partitioner)
        self._v_xi = tf.get_variable("v_xi/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_xi,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix that maps input `x` to cell_state `j`.
        u_xj, v_xj = self._orthogonal_init(
            shape=[input_depth, self._num_units],
            initializer=initializer,
            redundant_rank=redundant_rank)
        self._u_xj = tf.get_variable("u_xj/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_xj,
                                     partitioner=maybe_partitioner)
        self._v_xj = tf.get_variable("v_xj/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_xj,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix
        # that maps input `x` to forget_gate `f`.
        u_xf, v_xf = self._orthogonal_init(
            shape=[input_depth, self._num_units],
            initializer=initializer,
            redundant_rank=redundant_rank)
        self._u_xf = tf.get_variable("u_xf/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_xf,
                                     partitioner=maybe_partitioner)
        self._v_xf = tf.get_variable("v_xf/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_xf,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix
        # that maps input `x` to output_gate `o`.
        u_xo, v_xo = self._orthogonal_init(
            shape=[input_depth, self._num_units],
            initializer=initializer,
            redundant_rank=redundant_rank)
        self._u_xo = tf.get_variable("u_xo/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_xo,
                                     partitioner=maybe_partitioner)
        self._v_xo = tf.get_variable("v_xo/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_xo,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix
        # that maps hid_state `h` to input_gate `i`.
        u_hi, v_hi = self._orthogonal_init(shape=[h_depth, self._num_units],
                                           initializer=initializer,
                                           redundant_rank=redundant_rank)
        self._u_hi = tf.get_variable("u_hi/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_hi,
                                     partitioner=maybe_partitioner)
        self._v_hi = tf.get_variable("v_hi/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_hi,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix
        # that maps hid_state `h` to cell_state `j`.
        u_hj, v_hj = self._orthogonal_init(shape=[h_depth, self._num_units],
                                           initializer=initializer,
                                           redundant_rank=redundant_rank)
        self._u_hj = tf.get_variable("u_hj/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_hj,
                                     partitioner=maybe_partitioner)
        self._v_hj = tf.get_variable("v_hj/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_hj,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix
        # that maps hid_state `h` to forget_gate `f`.
        u_hf, v_hf = self._orthogonal_init(shape=[h_depth, self._num_units],
                                           initializer=initializer,
                                           redundant_rank=redundant_rank)
        self._u_hf = tf.get_variable("u_hf/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_hf,
                                     partitioner=maybe_partitioner)
        self._v_hf = tf.get_variable("v_hf/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_hf,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix
        # that maps hid_state `h` to output_gate `o`.
        u_ho, v_ho = self._orthogonal_init(shape=[h_depth, self._num_units],
                                           initializer=initializer,
                                           redundant_rank=redundant_rank)
        self._u_ho = tf.get_variable("u_ho/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_ho,
                                     partitioner=maybe_partitioner)
        self._v_ho = tf.get_variable("v_ho/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_ho,
                                     partitioner=maybe_partitioner)

        self._c = tf.get_variable(
            "c/%s" % _WEIGHTS_VARIABLE_NAME,
            shape=[self._num_units, self._rank],
            initializer=tf.contrib.layers.xavier_initializer(),
            partitioner=maybe_partitioner)

        initializer = tf.zeros_initializer(dtype=tf.float32)
        self._b = tf.get_variable("b/%s" % _BIAS_VARIABLE_NAME,
                                  shape=[4 * h_depth, self._rank],
                                  initializer=initializer)

        if self._num_proj is not None:
            if self._num_proj_shards is not None:
                maybe_proj_partitioner = tf.fixed_size_partitioner(
                    self._num_proj_shards)
            else:
                maybe_proj_partitioner = None
            self._proj_kernel = self.add_variable(
                "projection/%s" % _WEIGHTS_VARIABLE_NAME,
                shape=[self._num_units, self._num_proj],
                initializer=tf.uniform_unit_scaling_initializer(),
                partitioner=maybe_proj_partitioner)
        self.initialize()
        self.built = True
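An illustrative-only sketch of the low-rank construction described in the comments above: each full weight matrix is assembled as U @ diag(sigma) @ V, where sigma is computed elsewhere (inside initialize() in the real cell; here it is just random). The shapes below are assumptions for the sake of the example:

import tensorflow as tf

input_depth, rank, num_units = 128, 64, 64
u = tf.random.normal([input_depth, rank])   # plays the role of a `u_x*` factor
v = tf.random.normal([rank, num_units])     # plays the role of a `v_x*` factor
sigma = tf.random.normal([rank])            # the diagonal, from `initialize()` in the cell

# Scaling the columns of `u` by `sigma` is the same as u @ tf.linalg.diag(sigma).
w_full = tf.matmul(u * sigma, v)            # full [input_depth, num_units] weight matrix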