def _setup_last_update_step(self):
  with tf.variable_scope(
      self._spec.name, use_resource=self._spec.use_tpu) as scope:
    try:
      last_update_step = tf.get_variable(
          'last_mask_update_step', [],
          initializer=tf.zeros_initializer(),
          trainable=False,
          dtype=tf.int32)
    except ValueError:
      scope.reuse_variables()
      last_update_step = tf.get_variable(
          'last_mask_update_step', dtype=tf.int32)
  return last_update_step
def testExponentialMovingAverageIncludingMovingVars(self):
  task = BaseTaskTest.TestParams()
  task.input = base_input_generator.BaseSequenceInputGenerator.Params()
  task.train.ema_decay = 0.9
  task.train.ema_decay_moving_vars = True
  p = base_model.SingleTaskModel.Params(task)
  model = p.Instantiate()
  self.assertIsNotNone(model.ema)
  model.ConstructFPropBPropGraph()
  with tf.variable_scope('base_mdl', reuse=True):
    beta = tf.get_variable('x/beta/var')
    mean = tf.get_variable('x/moving_mean/var')
    self.assertIsNotNone(model.ema.average(beta))
    self.assertIsNotNone(model.ema.average(mean))
def testExponentialMovingAverage(self):
  p = base_model.SingleTaskModel.Params()
  p.task = BaseTaskTest.TestParams()
  p.task.input = base_input_generator.BaseSequenceInputGenerator.Params()
  p.train.ema_decay = 0.9
  model = p.Instantiate()
  model._task.CreateChild(
      'a', layers.BatchNormLayer.Params().Set(name='a', dim=1))
  model._task._train_op = tf.no_op()
  model._task.ApplyExponentialMovingAverage(model.ema)
  with tf.variable_scope('', reuse=True):
    beta = tf.get_variable('a/beta/var')
    mean = tf.get_variable('a/moving_mean/var')
    self.assertIsNotNone(model.ema.average(beta))
    self.assertIsNone(model.ema.average(mean))
def testExponentialMovingAverage(self):
  p = base_model.SingleTaskModel.Params()
  p.task = BaseTaskTest.TestParams()
  p.task.input = base_input_generator.BaseSequenceInputGenerator.Params()
  p.task.train.ema_decay = 0.9
  p.task.train.ema_decay_moving_vars = False
  model = p.Instantiate()
  task = model._task
  task._train_op = tf.no_op()
  task.ApplyExponentialMovingAverage(model.ema)
  with tf.variable_scope('base_mdl', reuse=True):
    beta = tf.get_variable('x/beta/var')
    mean = tf.get_variable('x/moving_mean/var')
    self.assertIsNotNone(model.ema.average(beta))
    self.assertIsNone(model.ema.average(mean))
def testNoPS(self):
  p = cluster_factory.Cluster.Params()
  p.worker.name = '/job:trainer'
  p.worker.replicas = 1
  p.ps.name = '/job:trainer'
  p.ps.replicas = 1
  c = cluster_factory.Cluster(p)
  g = tf.Graph()
  vs = []
  with g.as_default():
    with tf.device(c.GetPlacer()):
      for i in range(10):
        vs.append(tf.get_variable('x%d' % i, (10, 10, 10)))
      sum_all = tf.add_n(vs)
  for v in vs:
    self.assertEqual(
        v.device,
        cluster.MakeDeviceString(
            job_name='/job:trainer', task_id=0, device_name='CPU',
            device_id=0))
  self.assertEqual(
      sum_all.device,
      cluster.MakeDeviceString(
          job_name='/job:trainer', task_id=0, device_name='CPU', device_id=0))
def _CreateVariableStub(name,
                        params,
                        reuse=None,
                        trainable=True,
                        collections=None,
                        default_seed=None,
                        synchronization=None,
                        aggregation=None):
  """Return a zero tensor of the right shape instead of creating variable."""
  del reuse
  del default_seed
  del synchronization
  del aggregation
  dtype = params.dtype
  shape = py_utils.ToStaticShape(params.shape)
  # For total samples counters we have to actually create variables so that
  # we can access the 'value' attribute during construction.
  if 'total_samples' in name:
    var = tf.get_variable(
        name,
        shape,
        dtype,
        tf.constant_initializer(0),
        collections=collections,
        trainable=trainable,
        validate_shape=True)
  else:
    key = (tf.get_default_graph(), tuple(shape))
    if key in variable_cache:
      var = variable_cache[key]
    else:
      var = tf.zeros(shape, dtype)
      variable_cache[key] = var
  return var, var
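# A hypothetical illustration (not from the original file) of how a stub like
# _CreateVariableStub is typically swapped in: the test monkey-patches the
# real variable-creation function so that graph construction gets cached zero
# tensors instead of real variables. The patch target py_utils.CreateVariable
# and the name `model_params` are assumptions for illustration only.
from unittest import mock

variable_cache = {}  # Shared cache keyed by (graph, shape), as used above.

with mock.patch.object(py_utils, 'CreateVariable', _CreateVariableStub):
  model = model_params.Instantiate()  # Variables become cached zero tensors.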
def testDefaultParams(self):
  p = cluster_factory.Cluster.Params()
  c = cluster_factory.Cluster(p)
  self.assertFalse(c.add_summary)
  g = tf.Graph()
  vs = []
  with g.as_default():
    with tf.device(c.GetPlacer()):
      for i in range(10):
        vs.append(tf.get_variable('x%d' % i, (10, 10, 10)))
      sum_all = tf.add_n(vs)
  for v in vs:
    self.assertEqual(
        v.device,
        c._MakeDeviceString(
            job_name='/job:localhost',
            task_id=0,
            device_name='CPU',
            device_id=0))
  self.assertEqual(
      sum_all.device,
      c._MakeDeviceString(
          job_name='/job:localhost',
          task_id=0,
          device_name='CPU',
          device_id=0))
def testPSWithGPUs(self):
  p = cluster_factory.Cluster.Params()
  p.worker.name = '/job:trainer'
  p.worker.replicas = 1
  p.ps.name = '/job:ps'
  p.ps.replicas = 4
  p.ps.gpus_per_replica = 2
  c = cluster_factory.Cluster(p)
  g = tf.Graph()
  vs = []
  with g.as_default():
    with tf.device(c.GetPlacer()):
      for i in range(10):
        vs.append(tf.get_variable('x%d' % i, (10, 10, 10)))
      sum_all = tf.add_n(vs)
  for i, v in enumerate(vs):
    # Variables are placed round-robin across 4 ps replicas with 2 GPUs each.
    self.assertEqual(
        v.device,
        cluster.MakeDeviceString(
            job_name='/job:ps',
            task_id=(i // 2) % 4,  # Integer division; i / 2 is a float in py3.
            device_name='GPU',
            device_id=i % 2))
  self.assertEqual(
      sum_all.device,
      cluster.MakeDeviceString(
          job_name='/job:trainer', task_id=0, device_name='CPU', device_id=0))
def ModuleFn(training):
  """Builds the graph and signature for the stub TF-hub module."""
  image_data = tf.placeholder(
      shape=[None, input_image_height, input_image_width, 3],
      dtype=tf.float32)
  # Linearly project image_data to shape [1, output_feature_dim] features.
  encoder_output = tf.compat.v1.layers.dense(
      tf.reshape(image_data,
                 [-1, input_image_height * input_image_width * 3]),
      output_feature_dim)
  # Add a non-trainable 'count' variable that can be updated through an
  # UPDATE_OP. This is analogous to a batch-norm moving average that should
  # be updated during fine-tuning.
  v = tf.get_variable('count', initializer=0, dtype=tf.int32, trainable=False)
  if training:
    update_op = v.assign_add(1).op
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op)
  hub.add_signature(
      'default', inputs={'images': image_data}, outputs=encoder_output)
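# For context: a module_fn like ModuleFn is normally wrapped into a module
# spec with separate train/inference graph variants. A minimal sketch using
# the TF1 tensorflow_hub API; the placeholder constants below stand in for
# the values ModuleFn closes over, and the export path is illustrative.
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub

input_image_height, input_image_width, output_feature_dim = 32, 32, 8

# One graph variant tagged 'train' (training=True, registers the UPDATE_OP)
# and an untagged inference variant (training=False).
spec = hub.create_module_spec(
    ModuleFn,
    tags_and_args=[({'train'}, {'training': True}),
                   (set(), {'training': False})])
with tf.Graph().as_default():
  module = hub.Module(spec, trainable=True, tags={'train'})
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    module.export('/tmp/stub_module', sess)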
def testDefaultParamsWithDynamicShape(self):
  p = cluster_factory.Cluster.Params()
  c = cluster_factory.Cluster(p)
  g = tf.Graph()
  vs = []
  with g.as_default():
    with tf.device(c.GetPlacer()):
      for i in range(10):
        dyn_shape = tf.constant([2], dtype=tf.int32)
        dyn_shape = tf.placeholder_with_default(dyn_shape, shape=[None])
        v = tf.get_variable(
            'x%d_wb/var' % i,
            initializer=tf.random.uniform(dyn_shape, dtype=tf.float64),
            validate_shape=False)
        vs.append(v)
      sum_all = tf.add_n(vs)
  for v in vs:
    self.assertEqual(
        v.device,
        cluster.MakeDeviceString(
            job_name='/job:localhost',
            task_id=0,
            device_name='CPU',
            device_id=0))
  self.assertEqual(
      sum_all.device,
      cluster.MakeDeviceString(
          job_name='/job:localhost',
          task_id=0,
          device_name='CPU',
          device_id=0))
def _CreateVariableStub(name,
                        params,
                        reuse=None,
                        trainable=True,
                        init_wrapper=None,
                        collections=None):
  """Return a zero tensor of the right shape instead of creating variable."""
  del reuse
  dtype = params.dtype
  shape = py_utils.ToStaticShape(params.shape)
  if init_wrapper:
    var = init_wrapper(dtype, tf.constant_initializer(0, dtype=dtype))
  # For total samples counters we have to actually create variables so that
  # we can access the 'value' attribute during construction.
  elif 'total_samples' in name:
    var = tf.get_variable(
        name,
        shape,
        dtype,
        tf.constant_initializer(0, dtype=dtype),
        collections=collections,
        trainable=trainable,
        validate_shape=True)
  else:
    key = hash(tuple(shape))
    if key in variable_cache:
      var = variable_cache[key]
    else:
      var = tf.zeros(shape, dtype)
      variable_cache[key] = var
  return var, var
def testExponentialMovingAverage(self):
  task = BaseTaskTest.TestParams()
  task.input = base_input_generator.BaseSequenceInputGenerator.Params()
  task.train.ema_decay = 0.9
  task.train.ema_decay_moving_vars = False
  p = base_model.SingleTaskModel.Params(task)
  model = p.Instantiate()
  self.assertIsNotNone(model.ema)
  model.ConstructFPropBPropGraph()
  # Test that EMA is accessible by a sublayer.
  x = model.GetTask().x
  self.assertIsNotNone(x.ema)
  self.assertIs(x.ema, model.ema)
  with tf.variable_scope('base_mdl', reuse=True):
    beta = tf.get_variable('x/beta/var')
    mean = tf.get_variable('x/moving_mean/var')
    self.assertIsNotNone(model.ema.average(beta))
    self.assertIsNone(model.ema.average(mean))
def testFactorizedMaxPool(self, input_shape, window_shape):
  weights = tf.get_variable("weights", shape=input_shape)
  pooling_kwargs = {
      "window_shape": window_shape,
      "pooling_type": "MAX",
      "strides": window_shape,
      "padding": "SAME"
  }
  self._compare_pooling_methods(weights, pooling_kwargs)
def testVarWrapperTrackAssign(self):
  with tf.Graph().as_default():
    var = tf.get_variable('v0', shape=[8, 16], dtype=tf.float32)
    wrapper = var_tmp_wrappers.VarWrapperTrackAssign(var)
    ones = tf.ones_like(wrapper)
    a = wrapper.assign(ones)
    b = wrapper.assign_add(ones)
    c = wrapper.assign_sub(ones)
    self.assertSameElements(wrapper.previous_assigns(), [a, b, c])
def testStackedVarWrapperWithManualSharding(self):
  with tf.Graph().as_default():
    var = tf.get_variable('v2', shape=[8, 16], dtype=tf.float32)
    wrapper = var_tmp_wrappers.StackedVarWrapperWithManualSharding(var)
    ones = tf.ones_like(wrapper)
    wrapper.assign(ones)
    wrapper.assign_add(ones)
    wrapper.assign_sub(ones)
    self.assertEqual(ones.shape, [16])
def testTensorPartitioner(self):
  with tf.Session():
    w1 = tf.get_variable('w1', [255, 255], tf.float32)
    self.evaluate(tf.global_variables_initializer())
    partition_info = distributed_shampoo.PartitionConfig(200, 128)
    grad = tf.constant(w1.eval())
    metadata = distributed_shampoo.TensorPartitioner.partition_metadata(
        w1, partition_info)
    partitioned_grad = distributed_shampoo.TensorPartitioner.partition_tensor(
        w1, partition_info)
    reformed_grad = distributed_shampoo.TensorPartitioner.reform_tensor(
        partitioned_grad, metadata.num_splits_per_dim)
    self.assertAllCloseAccordingToType(reformed_grad, grad)
def _create_slots(self, var_list):
  if not self._counter:
    self._counter = tf.get_variable(
        shape=[], initializer=tf.zeros_initializer, name='update_count')
  for v in var_list:
    vo = self._opt._zeros_slot(v, 'grad_accum', 'GradientAccumulator')  # pylint: disable=protected-access
    sharding = None
    try:
      sharding = gshard_utils.GetVarSharding(v)
    except ValueError:
      continue
    if sharding and not sharding.is_replicated:
      sharding.ApplyToVariable(vo)
def testPartitionedVariableMasking(self):
  partitioner = tf.variable_axis_size_partitioner(40)
  with self.cached_session() as session:
    with tf.variable_scope("", partitioner=partitioner):
      sparsity = tf.Variable(0.5, name="Sparsity")
      weights = tf.get_variable(
          "weights", initializer=tf.linspace(1.0, 100.0, 100))
      masked_weights = pruning.apply_mask(
          weights, scope=tf.get_variable_scope())
    p = pruning.Pruning(sparsity=sparsity)
    p._spec.threshold_decay = 0.0
    mask_update_op = p.mask_update_op()
    tf.global_variables_initializer().run()
    masked_weights_val = masked_weights.eval()
    session.run(mask_update_op)
    masked_weights_val = masked_weights.eval()
    self.assertAllEqual(np.count_nonzero(masked_weights_val), 50)
def weight_threshold_variable(var, scope):
  """Create a scalar threshold for the weights.

  This function adds a variable 'threshold' to the graph.

  Args:
    var: The weight variable that needs to be masked.
    scope: The variable scope of the variable var.

  Returns:
    A scalar threshold variable initialized to 0.
  """
  with tf.variable_scope(scope):
    threshold = tf.get_variable(
        'threshold', [],
        initializer=tf.zeros_initializer(),
        trainable=False,
        dtype=var.dtype)
    return threshold
def weight_mask_variable(var, scope):
  """Create a mask for the weights.

  This function adds a variable 'mask' to the graph.

  Args:
    var: The weight variable that needs to be masked.
    scope: The variable scope of the variable var.

  Returns:
    The mask variable of the same size and shape as var, initialized to all
    1s.
  """
  with tf.variable_scope(scope):
    mask = tf.get_variable(
        'mask',
        var.get_shape(),
        initializer=tf.ones_initializer(),
        trainable=False,
        dtype=var.dtype)
    return mask
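# A minimal sketch (illustrative, not part of the library above) of how the
# mask and threshold variables combine in magnitude-based pruning: weights
# whose absolute value falls below the threshold are zeroed with an
# element-wise multiply. The scope name 'pruning' and the variable shape are
# assumptions for this example.
weights = tf.get_variable('weights', shape=[256, 256])
mask = weight_mask_variable(weights, 'pruning')  # All 1s initially.
threshold = weight_threshold_variable(weights, 'pruning')

# Recompute the mask: keep only weights whose magnitude exceeds the
# threshold.
new_mask = tf.cast(tf.greater(tf.abs(weights), threshold), weights.dtype)
update_mask_op = tf.assign(mask, new_mask)

# The forward pass uses the masked weights instead of the raw variable.
masked_weights = tf.multiply(weights, mask, name='masked_weights')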
def testRematerialize(self):
  # Test the dropout consistency between fprop and bprop.
  b = builder.Base.Params()
  b = b.Instantiate()
  start_block = layers.DeterministicDropoutLayer.Params().Set(
      name='start_dropout', keep_prob=0.7)
  # Build 4 dropout layers, each wrapped by RematerializeFn.
  num_blocks = 4
  blocks = []
  blocks_per_cell = 2
  for i in range(num_blocks):
    blocks.append(layers.DeterministicDropoutLayer.Params().Set(
        name='dropout_{}'.format(i), keep_prob=0.7))
  cells = []
  while blocks:
    heads, blocks = blocks[:blocks_per_cell], blocks[blocks_per_cell:]
    cell_name = 'cell_{}'.format(len(cells))
    cells.append(
        b._Rematerialize(name=cell_name, body=b._Seq(cell_name, *heads)))
  with self.session(use_gpu=False, graph=tf.Graph()) as sess:
    tf.random.set_seed(12345)
    p = b._Seq('test', start_block, *cells)
    mdl = p.Instantiate()
    # y = mdl.FProp(x * w)
    # Fake input.
    x = tf.ones([4, 5])
    # Construct weights.
    w = tf.get_variable(
        'w', shape=[4, 5], initializer=tf.constant_initializer([[1] * 5] * 4))
    y = mdl.FPropDefaultTheta(x * w)
    # Construct loss function such that gradients = final activation.
    # dy/dw = y = mdl.FProp(x * w) when w is 1.
    loss = tf.reduce_sum(y)
    grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(w=w))
    tf.global_variables_initializer().run()
    y_val, grads_val = sess.run([y, grads.Transform(tuple)])
    grads_val = grads_val['w'][1]
    self.assertAllClose(y_val, grads_val)
    self.assertEqual(py_utils.GetStepSeed().eval(), 1553244033)
def testPSRandomSize(self):
  p = cluster_factory.Cluster.Params()
  p.worker.name = '/job:trainer'
  p.ps.name = '/job:ps'
  p.ps.replicas = 10
  c = cluster_factory.Cluster(p)
  g = tf.Graph()
  vs = []
  np.random.seed(301)
  with g.as_default():
    with tf.device(c.GetPlacer()):
      # Creates 200 variables with different sizes.
      for i in range(200):
        if i % 13:
          size = np.random.randint(10000)
        elif i % 7:
          size = np.random.randint(100)
        else:
          size = np.random.randint(10)
        vs.append(tf.get_variable('x%d' % i, shape=(size)))
      sum_all = tf.add_n([tf.reduce_sum(x) for x in vs])
  # Computes the total size of variables placed on each device.
  total_size = {}  # device name -> size.
  for v in vs:
    size = tf.TensorShape(v.op.get_attr('shape')).num_elements()
    if v.device in total_size:
      total_size[v.device] += size
    else:
      total_size[v.device] = size
  for (device, allocated) in zip(
      sorted(total_size),
      [91701, 91361, 90346, 88738, 87240, 89265, 91944, 92472, 88051, 95053]):
    self.assertEqual(total_size[device], allocated)
  self.assertEqual(
      sum_all.device,
      cluster.MakeDeviceString(
          job_name='/job:trainer',
          replica_id=0,
          task_id=0,
          device_name='CPU',
          device_id=0))
def _TestSaveRestoreHelper(self, direction):
  """Test opaque params stay 'equivalent' after save-restore."""
  input_dim = 4
  cell_dim = 3
  with tf.variable_scope('s1'):
    params_size_t = self._ParamsSize(input_dim, cell_dim, direction)
    params = tf.get_variable(
        'cudnn_params',
        initializer=tf.random_uniform([params_size_t]),
        validate_shape=False)
    reset_params_op = tf.assign(params, tf.zeros_like(params))
    cur_scope_name = tf.get_variable_scope().name
    saveable = self._CreateSaveable(params, input_dim, cell_dim, direction,
                                    cur_scope_name)
    canonical_wts, canonical_bs = (
        saveable.format_converter._opaque_to_cu_canonical(
            saveable._variables))
    saver = saver_lib.Saver()
  with self.session(use_gpu=True) as sess:
    sess.run(tf.global_variables_initializer())
    save_path = os.path.join(self.get_temp_dir(), 'save-restore-unidi')
    saver.save(sess, save_path)
    canonical_wts_v, canonical_bs_v = sess.run([canonical_wts, canonical_bs])
  with self.session(use_gpu=False) as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(reset_params_op)
    saver.restore(sess, save_path)
    canonical_wts_v_restored, canonical_bs_v_restored = sess.run(
        [canonical_wts, canonical_bs])
    # The weight portion of the opaque params is exactly the same. For the
    # bias portion, it's expected that the sum of the biases for each gate
    # stays the same.
    self._CompareWeights(canonical_wts_v, canonical_wts_v_restored)
    self._CompareBiases(canonical_bs_v, canonical_bs_v_restored, direction)
def testDropoutInRecurrent(self, graph_seed):
  with self.session() as sess:
    if graph_seed:
      tf.random.set_seed(12345)
    l = lingvo_layers.DeterministicDropoutLayer.Params().Set(
        name='dropout', keep_prob=0.7).Instantiate()
    # Input variable.
    w = tf.get_variable(
        'w', shape=[9, 20], initializer=tf.ones_initializer())
    sess.run(tf.global_variables_initializer())
    prev_sum = np.sum(np.isclose(sess.run(w), 0.0))

    def Step(theta, state0, unused_inputs):
      w = l.FProp(theta.l, state0.w)
      state1 = py_utils.NestedMap(w=w)
      return state1, py_utils.NestedMap()

    acc, final = recurrent.Recurrent(
        theta=py_utils.NestedMap(l=l.theta),
        state0=py_utils.NestedMap(w=w),
        inputs=py_utils.NestedMap(x=tf.zeros([4])),
        cell_fn=Step)

    acc_w = sess.run(acc.w)
    self.assertLen(acc_w, 4)
    for acc_w_i in acc_w:
      next_sum = np.sum(np.isclose(acc_w_i, 0.0))
      self.assertGreater(next_sum, prev_sum)
      prev_sum = next_sum

    # Construct loss function such that gradients = final activation.
    loss = tf.reduce_sum(final.w)
    grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(w=w))
    w_val, grads_val = sess.run([final.w, grads.w.grad])
    self.assertAllClose(w_val, grads_val)