def testPartitioners(self):
  partitioners = {
      "gamma": tf.fixed_size_partitioner(num_shards=2),
      "beta": tf.fixed_size_partitioner(num_shards=2),
  }

  inputs = tf.placeholder(tf.float32, shape=[None, 10])
  ln = snt.LayerNorm(partitioners=partitioners)
  self.assertEqual(ln.partitioners, partitioners)
  ln(inputs)

  self.assertEqual(type(ln.gamma), variables.PartitionedVariable)
  self.assertEqual(type(ln.beta), variables.PartitionedVariable)
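# A minimal sketch of the behavior the test above relies on, assuming TF1
# graph mode: tf.fixed_size_partitioner(num_shards) returns a partitioner
# callable that splits a variable into `num_shards` slices along axis 0, and
# tf.get_variable under that partitioner yields a PartitionedVariable backed
# by that many shard variables. (Scope and shape here are illustrative only.)
import tensorflow.compat.v1 as tf

with tf.Graph().as_default():
  with tf.variable_scope(
      "demo", partitioner=tf.fixed_size_partitioner(num_shards=2)):
    v = tf.get_variable("w", shape=[10, 4], dtype=tf.float32)
  # Each shard covers half of axis 0: two [5, 4] variables.
  print([shard.shape.as_list() for shard in v])  # [[5, 4], [5, 4]]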
def testPartitioners(self):
  if tf.executing_eagerly():
    self.skipTest("Partitioned variables are not supported in eager mode.")

  inputs = tf.ones(dtype=tf.float32, shape=[self.batch_size, self.in_size])
  prev_state = tf.ones(
      dtype=tf.float32, shape=[self.batch_size, self.hidden_size])

  with self.assertRaisesRegexp(KeyError, "Invalid partitioner keys.*"):
    snt.VanillaRNN(name="rnn",
                   hidden_size=self.hidden_size,
                   partitioners={"invalid": None})

  err = "Partitioner for 'w' is not a callable function"
  with self.assertRaisesRegexp(TypeError, err):
    snt.VanillaRNN(name="rnn",
                   hidden_size=self.hidden_size,
                   partitioners={"in_to_hidden": {"w": tf.zeros([10, 10])}})

  # Nested partitioners.
  valid_partitioners = {
      "in_to_hidden": {
          "w": tf.fixed_size_partitioner(num_shards=2),
          "b": tf.fixed_size_partitioner(num_shards=2),
      },
      "hidden_to_hidden": {
          "w": tf.fixed_size_partitioner(num_shards=2),
          "b": tf.fixed_size_partitioner(num_shards=2),
      }
  }

  vanilla_rnn = snt.VanillaRNN(name="rnn",
                               hidden_size=self.hidden_size,
                               partitioners=valid_partitioners)

  vanilla_rnn(inputs, prev_state)

  self.assertEqual(type(vanilla_rnn.in_to_hidden_linear.w),
                   variables.PartitionedVariable)
  self.assertEqual(type(vanilla_rnn.in_to_hidden_linear.b),
                   variables.PartitionedVariable)
  self.assertEqual(type(vanilla_rnn.hidden_to_hidden_linear.w),
                   variables.PartitionedVariable)
  self.assertEqual(type(vanilla_rnn.hidden_to_hidden_linear.b),
                   variables.PartitionedVariable)
def create_emb_for_encoder_and_decoder(vocab_size,
                                       embed_size,
                                       dtype=tf.float32,
                                       num_partitions=0,
                                       scope=None):
  """Create an embedding matrix shared by the encoder and the decoder."""
  if num_partitions <= 1:
    partitioner = None
  else:
    # Note: num_partitions > 1 is required for distributed training because
    # embedding_lookup tries to colocate a single-partition embedding
    # variable with the lookup ops, which may place embedding variables on
    # worker jobs.
    partitioner = tf.fixed_size_partitioner(num_partitions)

  with tf.variable_scope(
      scope or "embeddings", dtype=dtype, partitioner=partitioner) as scope:
    # Share the embedding between encoder and decoder.
    embedding_encoder = tf.get_variable(
        "shared_embedding", [vocab_size, embed_size], dtype)
    embedding_decoder = embedding_encoder
  return embedding_encoder, embedding_decoder
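# A short usage sketch for the helper above (hypothetical sizes; TF1 graph
# mode assumed). With num_partitions=2 the shared embedding is created as a
# partitioned variable, and embedding_lookup gathers rows across its shards.
with tf.Graph().as_default():
  encoder_emb, decoder_emb = create_emb_for_encoder_and_decoder(
      vocab_size=1000, embed_size=32, num_partitions=2)
  ids = tf.constant([[1, 2], [3, 4]])
  encoded = tf.nn.embedding_lookup(encoder_emb, ids)  # shape [2, 2, 32]
  assert encoder_emb is decoder_emb  # one matrix shared by both sides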
def model_fn(ids, labels):
  # Alternative: create the embedding with get_embedding_variable, which also
  # exposes eviction and regularization options, e.g.:
  # embedding = tf.get_embedding_variable(
  #     "var_dist", embedding_dim=24, steps_to_live=4000,
  #     steps_to_live_l2reg=1024 * 1024, l2reg_theta=0.01, l2reg_lambda=0.01,
  #     initializer=tf.ones_initializer,
  #     partitioner=tf.fixed_size_partitioner(num_shards=4))
  embedding = tfra.embedding_variable.get_variable(
      name="var_dist",
      embedding_dim=24,
      partitioner=tf.fixed_size_partitioner(num_shards=4),
      initializer=tf.keras.initializers.RandomNormal(0.0, 0.1))

  values = tf.nn.embedding_lookup(embedding, ids)
  features = tf.reshape(values, shape=[1024, 24])

  W = tf.Variable(tf.zeros([24, 10]))
  b = tf.Variable(tf.zeros([10]))
  pred = tf.matmul(features, W) + b

  loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=labels))
  global_step = tf.train.get_or_create_global_step()
  optimizer = tfra.embedding_variable.AdagradOptimizer(
      learning_rate=0.001).minimize(loss, global_step)
  tf.summary.scalar("loss", loss)
  return loss, optimizer, features
def testPartitioners(self, offset, scale):
  partitioners = {}

  if scale:
    partitioners["gamma"] = tf.fixed_size_partitioner(num_shards=2)
  if offset:
    partitioners["beta"] = tf.fixed_size_partitioner(num_shards=2)

  inputs_shape = [10, 10]
  inputs = tf.placeholder(tf.float32, shape=[None] + inputs_shape)
  bn = snt.BatchNorm(offset=offset, scale=scale, partitioners=partitioners)
  self.assertEqual(bn.partitioners, partitioners)
  bn(inputs, is_training=True)

  if scale:
    self.assertEqual(type(bn.gamma), variables.PartitionedVariable)
  if offset:
    self.assertEqual(type(bn.beta), variables.PartitionedVariable)
def test_load_with_partitioner_raises_error(self):
  model = self.Model()
  model_dir = self.get_temp_dir()
  tf.saved_model.save(model, model_dir)
  strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      self.cluster_resolver, tf1.fixed_size_partitioner(2))
  with self.assertRaisesRegex(ValueError, "`variable_partitioner`"):
    with strategy.scope():
      tf.saved_model.load(model_dir)
def test_load_with_partitioner_raises_error(self):
  model = self.Model()
  model_dir = self.get_temp_dir()
  tf.saved_model.save(model, model_dir)
  strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      self.cluster_resolver, tf1.fixed_size_partitioner(2))
  with self.assertRaises(errors_impl.InvalidArgumentError):
    with strategy.scope():
      tf.saved_model.load(model_dir)
def test_sharded_variable(self):
  strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      self.cluster_resolver, tf1.fixed_size_partitioner(2))
  model_dir = self.get_temp_dir()
  with strategy.scope():
    m = self.Model()
    self.assertIsInstance(m.v1, sharded_variable.ShardedVariable)
    m.train()
  tf.saved_model.save(m, model_dir)

  self.assertAllEqual(self.load_and_run_v1(model_dir, {"x": 1}), [6, 6, 6, 6])
def testPartitioners(self, offset, scale):
  partitioners = {}

  if scale:
    partitioners["gamma"] = tf.fixed_size_partitioner(num_shards=2)
  if offset:
    partitioners["beta"] = tf.fixed_size_partitioner(num_shards=2)

  inputs_shape = [10, 10]
  inputs = tf.placeholder(tf.float32, shape=[None] + inputs_shape)
  bn = snt.BatchNormV2(offset=offset, scale=scale, partitioners=partitioners)
  self.assertEqual(bn.partitioners, partitioners)
  bn(inputs, is_training=True)

  if scale:
    self.assertLen(tf.global_variables("batch_norm/gamma"), 2)
  if offset:
    self.assertLen(tf.global_variables("batch_norm/beta"), 2)
def testPartitionedVariable(self):
  save_path = os.path.join(self.get_temp_dir(), 'partitioned_variable')
  var_name = 'my_partitioned_var'

  g1 = tf.Graph()
  with g1.as_default():

    def initializer1(shape, dtype, partition_info):
      _ = partition_info  # Not used for creation.
      return tf.constant(True, dtype, shape)

    partitioned_var1 = list(
        tf.get_variable(var_name,
                        shape=[1 << 3, 10],
                        partitioner=tf.fixed_size_partitioner(4),
                        initializer=initializer1,
                        dtype=tf.bool))
    with self.test_session(graph=g1) as session:
      with tf.device('/cpu:0'):
        tf.global_variables_initializer().run()
        pv1 = session.run(partitioned_var1)
        save = tf.train.Saver(partitioned_var1)
        save.save(session, save_path)

  g2 = tf.Graph()
  with g2.as_default():
    initializer2 = initializers.restore_initializer(save_path, var_name, '')
    partitioned_var2 = list(
        tf.get_variable(var_name,
                        shape=[1 << 3, 10],
                        partitioner=tf.fixed_size_partitioner(4),
                        initializer=initializer2,
                        dtype=tf.bool))

    with self.test_session(graph=g2) as session:
      tf.global_variables_initializer().run()
      pv2 = session.run(partitioned_var2)

  self.assertAllEqual(pv1, pv2)
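# A minimal sketch, assuming TF1 graph mode, of the shard naming that makes
# the save/restore round trip above line up: each slice of a partitioned
# variable is an ordinary variable whose name records its shard index, so the
# same shape and partitioner reproduce the same shard layout in a new graph.
with tf.Graph().as_default():
  v = tf.get_variable("my_partitioned_var", shape=[8, 10],
                      partitioner=tf.fixed_size_partitioner(4))
  print([shard.name for shard in v])
  # ['my_partitioned_var/part_0:0', 'my_partitioned_var/part_1:0',
  #  'my_partitioned_var/part_2:0', 'my_partitioned_var/part_3:0']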
def testPartitioners(self):
  if tf.executing_eagerly():
    self.skipTest("Eager does not support partitioned variables.")

  partitioners = {
      "w": tf.fixed_size_partitioner(num_shards=2),
      "b": tf.fixed_size_partitioner(num_shards=2),
  }

  alex_net = snt.nets.AlexNetMini(partitioners=partitioners, name="alexnet1")

  input_shape = [alex_net._min_size, alex_net._min_size, 3]
  inputs = tf.placeholder(tf.float32, shape=[None] + input_shape)
  alex_net(inputs)

  for conv_module in alex_net.conv_modules:
    self.assertEqual(type(conv_module.w), variables.PartitionedVariable)
    self.assertEqual(type(conv_module.b), variables.PartitionedVariable)

  for linear_module in alex_net.linear_modules:
    self.assertEqual(type(linear_module.w), variables.PartitionedVariable)
    self.assertEqual(type(linear_module.b), variables.PartitionedVariable)
def setUp(self):
  super(MLPTest, self).setUp()

  self.output_sizes = [11, 13, 17]
  self.batch_size = 5
  self.input_size = 7
  self.module_name = "mlp"
  self.initializers = {
      "w": tf.truncated_normal_initializer(stddev=1.0),
  }
  self.regularizers = {
      "w": contrib_layers.l1_regularizer(scale=0.1),
  }
  self.partitioners = {
      "w": tf.fixed_size_partitioner(num_shards=2),
  }
def test_return_all_variables_from_checkpoint_with_partition(self):
  with tf.Graph().as_default():
    partitioner = tf.fixed_size_partitioner(2)
    variables = [
        tf.get_variable(
            name='weights', shape=(2, 2), partitioner=partitioner),
        tf.Variable([1.0, 2.0], name='biases')
    ]
    checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt')
    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver(variables)
    with self.test_session() as sess:
      sess.run(init_op)
      saver.save(sess, checkpoint_path)
    out_variables = variables_helper.get_variables_available_in_checkpoint(
        variables, checkpoint_path)
  self.assertCountEqual(out_variables, variables)
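# A hedged follow-up sketch: inspecting the checkpoint written above. The
# saver records each shard with its slice info under the base variable name,
# so tf.train.list_variables reports the full, un-partitioned shape rather
# than one entry per shard.
for name, shape in tf.train.list_variables(checkpoint_path):
  print(name, shape)  # e.g. ('biases', [2]) and ('weights', [2, 2])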
def test_sharded_variable(self):
  strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      self.cluster_resolver, tf1.fixed_size_partitioner(2))
  model_dir = self.get_temp_dir()
  with strategy.scope():
    m = self.Model()
    self.assertIsInstance(m.v1, sharded_variable.ShardedVariable)
    m.train()
  tf.saved_model.save(m, model_dir)

  # ShardedVariable loading only works in v1.
  self.assertAllEqual(self.load_and_run_v1(model_dir, {"x": 1}), [6, 6, 6, 6])

  with self.assertRaisesWithLiteralMatch(
      ValueError, "Loading `ShardedVariable` is not supported"):
    with strategy.scope():
      tf.saved_model.load(model_dir)

  with self.assertRaisesWithLiteralMatch(
      ValueError, "Loading `ShardedVariable` is not supported"):
    tf.saved_model.load(model_dir)
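# A hedged sketch of the object the strategy hands back under the hood: a
# ShardedVariable wraps a list of per-shard variables and concatenates them
# along axis 0 when read as a tensor. (Built directly here for illustration
# with toy values; normally ParameterServerStrategyV2 constructs it from the
# `variable_partitioner` argument, and the class itself is internal API.)
sv = sharded_variable.ShardedVariable(
    [tf.Variable([1.0, 2.0]), tf.Variable([3.0, 4.0])])
print(tf.convert_to_tensor(sv).numpy())  # [1. 2. 3. 4.]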
def create_emb_for_encoder_and_decoder(src_vocab_size,
                                       tgt_vocab_size,
                                       src_embed_size,
                                       tgt_embed_size,
                                       dtype=tf.float32,
                                       num_enc_partitions=0,
                                       num_dec_partitions=0,
                                       src_vocab_file=None,
                                       tgt_vocab_file=None,
                                       src_embed_file=None,
                                       tgt_embed_file=None,
                                       scope=None):
  """Create embedding matrix for both encoder and decoder.

  Args:
    src_vocab_size: An integer. The source vocab size.
    tgt_vocab_size: An integer. The target vocab size.
    src_embed_size: An integer. The embedding dimension for the encoder's
      embedding.
    tgt_embed_size: An integer. The embedding dimension for the decoder's
      embedding.
    dtype: dtype of the embedding matrix. Default to float32.
    num_enc_partitions: number of partitions used for the encoder's embedding
      vars.
    num_dec_partitions: number of partitions used for the decoder's embedding
      vars.
    src_vocab_file: A string. The source vocabulary file.
    tgt_vocab_file: A string. The target vocabulary file.
    src_embed_file: A string. The source embedding file.
    tgt_embed_file: A string. The target embedding file.
    scope: VariableScope for the created subgraph. Default to "embeddings".

  Returns:
    embedding_encoder: Encoder's embedding matrix.
    embedding_decoder: Decoder's embedding matrix.

  Raises:
    ValueError: if source and target have different vocab sizes.
  """
  if num_enc_partitions <= 1:
    enc_partitioner = None
  else:
    # Note: num_partitions > 1 is required for distributed training because
    # embedding_lookup tries to colocate a single-partition embedding
    # variable with the lookup ops, which may place embedding variables on
    # worker jobs.
    enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions)

  if num_dec_partitions <= 1:
    dec_partitioner = None
  else:
    # See the note above: partitioning keeps the decoder embedding off the
    # worker jobs as well.
    dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions)

  if src_embed_file and enc_partitioner:
    raise ValueError(
        "Can't set num_enc_partitions > 1 when using pretrained encoder "
        "embedding")

  if tgt_embed_file and dec_partitioner:
    raise ValueError(
        "Can't set num_dec_partitions > 1 when using pretrained decoder "
        "embedding")

  with tf.variable_scope(
      scope or "embeddings", dtype=dtype,
      partitioner=enc_partitioner) as scope:
    if src_vocab_size != tgt_vocab_size:
      raise ValueError("Share embedding but different src/tgt vocab sizes"
                       " %d vs. %d" % (src_vocab_size, tgt_vocab_size))
    assert src_embed_size == tgt_embed_size
    utils.print_out("# Use the same embedding for source and target")
    vocab_file = src_vocab_file or tgt_vocab_file
    embed_file = src_embed_file or tgt_embed_file

    embedding_encoder = _create_or_load_embed("embedding_share", vocab_file,
                                              embed_file, src_vocab_size,
                                              src_embed_size, dtype)
    embedding_decoder = embedding_encoder

  return embedding_encoder, embedding_decoder
def __init__(self,
             num_units,
             mem_input,
             use_peepholes=False,
             cell_clip=None,
             initializer=None,
             num_proj=None,
             proj_clip=None,
             num_unit_shards=None,
             num_proj_shards=None,
             forget_bias=1.0,
             state_is_tuple=True,
             activation=None,
             reuse=None,
             name=None,
             dtype=None,
             use_beam=False,
             hps=None):
  """Initialize the HyperLSTM cell.

  Args:
    num_units: int, The number of units in the LSTM cell.
    mem_input: mem_input.
    use_peepholes: bool, whether to use peephole connections.
    cell_clip: (optional) A float value, if provided the cell state is
      clipped by this value prior to the cell output activation.
    initializer: (optional) The initializer to use for the weight and
      projection matrices.
    num_proj: (optional) int, The output dimensionality for the projection
      matrices. If None, no projection is performed.
    proj_clip: (optional) A float value. If `num_proj > 0` and `proj_clip` is
      provided, then the projected values are clipped elementwise to within
      `[-proj_clip, proj_clip]`.
    num_unit_shards: Deprecated, will be removed by Jan. 2017. Use a
      variable_scope partitioner instead.
    num_proj_shards: Deprecated, will be removed by Jan. 2017. Use a
      variable_scope partitioner instead.
    forget_bias: float, The bias added to forget gates (see above). Must be
      set to `0.0` manually when restoring from CudnnLSTM-trained
      checkpoints.
    state_is_tuple: If True, accepted and returned states are 2-tuples of the
      `c_state` and `m_state`. If False, they are concatenated along the
      column axis. The latter behavior will soon be deprecated.
    activation: Activation function of the inner states. Default: `tanh`.
    reuse: (optional) Python boolean describing whether to reuse variables in
      an existing scope. If not `True`, and the existing scope already has
      the given variables, an error is raised.
    name: String, the name of the layer. Layers with the same name will share
      weights, but to avoid mistakes we require reuse=True in such cases.
    dtype: Default dtype of the layer (default of `None` means use the type
      of the first input). Required when `build` is called before `call`.
    use_beam: Whether to use beam search.
    hps: hyperparameters.
  """
  super(HyperLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
  if not state_is_tuple:
    tf.logging.warn(
        "%s: Using a concatenated state is slower and will soon "
        "be deprecated. Use state_is_tuple=True.", self)
  if num_unit_shards is not None or num_proj_shards is not None:
    tf.logging.warn(
        "%s: The num_unit_shards and num_proj_shards parameters are "
        "deprecated and will be removed in Jan 2017. "
        "Use a variable scope with a partitioner instead.", self)
  assert not use_peepholes, "currently not supporting peephole connections"
  assert hps is not None

  # Inputs must be 2-dimensional.
  self.input_spec = tf.layers.InputSpec(ndim=2)

  self._num_units = num_units
  self._rank = hps.rank
  assert self._rank == self._num_units or self._rank == 2 * self._num_units
  self._use_peepholes = use_peepholes
  self._cell_clip = cell_clip
  self._initializer = initializer
  self._num_proj = num_proj
  self._proj_clip = proj_clip
  self._num_unit_shards = num_unit_shards
  self._num_proj_shards = num_proj_shards
  self._forget_bias = forget_bias
  self._state_is_tuple = state_is_tuple
  self._activation = activation or tf.tanh
  self._sigma_norm = hps.sigma_norm
  self._beam_width = hps.beam_width
  self._mem_input = mem_input
  self._use_beam = use_beam

  if num_proj:
    self._state_size = (
        tf.nn.rnn_cell.LSTMStateTuple(num_units, num_proj)
        if state_is_tuple else num_units + num_proj)
    self._output_size = num_proj
  else:
    self._state_size = (
        tf.nn.rnn_cell.LSTMStateTuple(num_units, num_units)
        if state_is_tuple else 2 * num_units)
    self._output_size = num_units

  input_depth = hps.emb_dim + hps.decoder_dim
  # if hps.encode_neighbor:
  #   input_depth += hps.decoder_dim
  h_depth = self._num_units if self._num_proj is None else self._num_proj
  maybe_partitioner = (
      tf.fixed_size_partitioner(self._num_unit_shards)
      if self._num_unit_shards is not None else None)

  # The `u`s are matrices of shape [input_depth, rank] and the `v`s of shape
  # [rank, hidden_size]; together they form the collection of rank-1
  # parameter matrices. The full parameter matrix is constructed as
  # `U \sigma V`, with the diagonal matrix `\sigma` computed in
  # `self.initialize`.
  redundant_rank = (self._rank > self._num_units)

  # `u`, `v` used to construct the matrix from input `x` to input_gate `i`.
  u_xi, v_xi = self._orthogonal_init(
      shape=[input_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_xi = tf.get_variable(
      "u_xi/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_xi,
      partitioner=maybe_partitioner)
  self._v_xi = tf.get_variable(
      "v_xi/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_xi,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct the matrix from input `x` to cell_state `j`.
  u_xj, v_xj = self._orthogonal_init(
      shape=[input_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_xj = tf.get_variable(
      "u_xj/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_xj,
      partitioner=maybe_partitioner)
  self._v_xj = tf.get_variable(
      "v_xj/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_xj,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct the matrix from input `x` to forget_gate `f`.
  u_xf, v_xf = self._orthogonal_init(
      shape=[input_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_xf = tf.get_variable(
      "u_xf/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_xf,
      partitioner=maybe_partitioner)
  self._v_xf = tf.get_variable(
      "v_xf/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_xf,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct the matrix from input `x` to output_gate `o`.
  u_xo, v_xo = self._orthogonal_init(
      shape=[input_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_xo = tf.get_variable(
      "u_xo/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_xo,
      partitioner=maybe_partitioner)
  self._v_xo = tf.get_variable(
      "v_xo/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_xo,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct the matrix from hid_state `h` to input_gate `i`.
  u_hi, v_hi = self._orthogonal_init(
      shape=[h_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_hi = tf.get_variable(
      "u_hi/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_hi,
      partitioner=maybe_partitioner)
  self._v_hi = tf.get_variable(
      "v_hi/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_hi,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct the matrix from hid_state `h` to cell_state `j`.
  u_hj, v_hj = self._orthogonal_init(
      shape=[h_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_hj = tf.get_variable(
      "u_hj/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_hj,
      partitioner=maybe_partitioner)
  self._v_hj = tf.get_variable(
      "v_hj/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_hj,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct the matrix from hid_state `h` to forget_gate `f`.
  u_hf, v_hf = self._orthogonal_init(
      shape=[h_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_hf = tf.get_variable(
      "u_hf/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_hf,
      partitioner=maybe_partitioner)
  self._v_hf = tf.get_variable(
      "v_hf/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_hf,
      partitioner=maybe_partitioner)

  # `u`, `v` used to construct the matrix from hid_state `h` to output_gate `o`.
  u_ho, v_ho = self._orthogonal_init(
      shape=[h_depth, self._num_units],
      initializer=initializer,
      redundant_rank=redundant_rank)
  self._u_ho = tf.get_variable(
      "u_ho/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=u_ho,
      partitioner=maybe_partitioner)
  self._v_ho = tf.get_variable(
      "v_ho/%s" % _WEIGHTS_VARIABLE_NAME,
      initializer=v_ho,
      partitioner=maybe_partitioner)

  self._c = tf.get_variable(
      "c/%s" % _WEIGHTS_VARIABLE_NAME,
      shape=[self._num_units, self._rank],
      initializer=tf.contrib.layers.xavier_initializer(),
      partitioner=maybe_partitioner)

  initializer = tf.zeros_initializer(dtype=tf.float32)
  self._b = tf.get_variable(
      "b/%s" % _BIAS_VARIABLE_NAME,
      shape=[4 * h_depth, self._rank],
      initializer=initializer)

  if self._num_proj is not None:
    maybe_proj_partitioner = (
        tf.fixed_size_partitioner(self._num_proj_shards)
        if self._num_proj_shards is not None else None)
    self._proj_kernel = self.add_variable(
        "projection/%s" % _WEIGHTS_VARIABLE_NAME,
        shape=[self._num_units, self._num_proj],
        initializer=tf.uniform_unit_scaling_initializer(),
        partitioner=maybe_proj_partitioner)

  self.initialize()
  self.built = True
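# A small numeric sketch (NumPy, hypothetical shapes) of the factorization
# the comments above describe: each full weight matrix is assembled as
# U @ diag(sigma) @ V, where U is [input_depth, rank], V is
# [rank, num_units], and the diagonal `sigma` is produced later from
# `mem_input` inside `self.initialize()`.
import numpy as np

input_depth, rank, num_units = 6, 4, 4
u = np.random.randn(input_depth, rank)
v = np.random.randn(rank, num_units)
sigma = np.random.randn(rank)
w_full = u @ np.diag(sigma) @ v  # effective [input_depth, num_units] weight
assert w_full.shape == (input_depth, num_units)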