def setUp(self):
  """Connects to the TPU cluster and builds the mid level objects under test.

  NOTE(review): the block structure was reconstructed from flattened source;
  the CPU mid level is assumed to be built outside `strategy.scope()` --
  confirm.
  """
  super(TPUEmbeddingCheckpointTest, self).setUp()
  self.resolver = tpu_cluster_resolver.TPUClusterResolver(
      tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project)
  remote.connect_to_cluster(self.resolver)
  tpu_strategy_util.initialize_tpu_system(self.resolver)
  self.strategy = tpu_strategy.TPUStrategy(self.resolver)
  self.num_rows = self.strategy.num_replicas_in_sync
  contents_shape = (self.num_rows, 4)

  # These tests use two mid level API objects, initialized with different
  # values. These have the same sizes.
  with self.strategy.scope():
    # First object: table filled with ones.
    self.first_mid_level_contents = np.ones(contents_shape)
    self.first_mid_level_optimizer = tpu_embedding_v2_utils.SGD(
        learning_rate=0.1)
    self.first_mid_level = self.build_mid_level(
        self.first_mid_level_contents,
        self.first_mid_level_optimizer)

    # Second object: table filled with twos, built with
    # `initialize_tpu_embedding=False`.
    self.second_mid_level_contents = np.ones(contents_shape) * 2
    self.second_mid_level_optimizer = tpu_embedding_v2_utils.SGD(
        learning_rate=0.1)
    self.second_mid_level = self.build_mid_level(
        self.second_mid_level_contents,
        self.second_mid_level_optimizer,
        initialize_tpu_embedding=False)

  # The CPU object reuses the second object's contents.
  self.cpu_mid_level_optimizer = tpu_embedding_v2_utils.SGD(
      learning_rate=0.1)
  self.cpu_mid_level = self.build_mid_level(
      self.second_mid_level_contents,
      self.cpu_mid_level_optimizer)
def _create_strategy_and_mid_level(self, optimizer_name):
  """Creates a strategy plus a mid level API driven by the named optimizer.

  Args:
    optimizer_name: One of 'sgd', 'adagrad', 'adam', 'ftrl' or
      'adagrad_momentum'.

  Returns:
    A `(strategy, mid_level_api, optimizer)` tuple.

  Raises:
    ValueError: If `optimizer_name` is not one of the supported names.
  """
  strategy = self._get_strategy()

  # NOTE(review): source was flattened; the mid level is assumed to be created
  # inside the strategy scope -- confirm.
  with strategy.scope():
    if optimizer_name == 'sgd':
      optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
    elif optimizer_name == 'adagrad':
      optimizer = tpu_embedding_v2_utils.Adagrad(learning_rate=0.1)
    elif optimizer_name == 'adam':
      optimizer = tpu_embedding_v2_utils.Adam(learning_rate=0.1)
    elif optimizer_name == 'ftrl':
      optimizer = tpu_embedding_v2_utils.FTRL(learning_rate=0.1)
    elif optimizer_name == 'adagrad_momentum':
      optimizer = tpu_embedding_v2_utils.AdagradMomentum(
          learning_rate=0.1,
          momentum=0.9,
          use_nesterov=True,
          exponent=3.0,
          epsilon=0.1,
          beta2=0.9)
    else:
      # Format the name into the message; passing it as a second positional
      # argument would make str(error) render as a tuple.
      raise ValueError(
          'optimizer is not recognized: {}'.format(optimizer_name))
    mid_level_api = self._create_mid_level(optimizer=optimizer)

  return strategy, mid_level_api, optimizer
def create_mid_level(optimizer=None):
  """Returns a `TPUEmbedding` built from the enclosing config.

  Args:
    optimizer: Embedding optimizer to use; when None, SGD with a learning
      rate of 0.1 is used.
  """
  # Create `TPUEmbedding` object.
  chosen_optimizer = (
      tpu_embedding_v2_utils.SGD(learning_rate=0.1)
      if optimizer is None else optimizer)
  return tpu_embedding_v2.TPUEmbedding(
      feature_config=feature_config,
      batch_size=batch_size,
      optimizer=chosen_optimizer)
def test_cpu_sequence_lookup_ragged(self):
  """Compares the CPU sequence lookup of a ragged feature against numpy."""
  feature_config = (
      tpu_embedding_v2_utils.FeatureConfig(
          table=self.table_video, name='watched', max_sequence_length=2),)
  mid_level = tpu_embedding_v2.TPUEmbedding(
      feature_config=feature_config,
      optimizer=tpu_embedding_v2_utils.SGD(learning_rate=0.1))

  ragged_features = self._get_ragged_tensors()[:1]
  activations = tpu_embedding_v2.cpu_embedding_lookup(
      ragged_features,
      weights=None,
      tables=mid_level.embedding_tables,
      feature_config=feature_config)

  # The numpy reference is fed sparse indices/values, so convert first.
  as_sparse = ragged_features[0].to_sparse()
  table_values = mid_level.embedding_tables[self.table_video].numpy()
  expected = self._numpy_sequence_lookup(
      table_values,
      as_sparse.indices.numpy(),
      as_sparse.values.numpy(),
      self.data_batch_size,
      feature_config[0].max_sequence_length,
      self.table_video.dim)

  self.assertAllClose(activations[0], expected)
def _create_mid_level(self, optimizer=None):
  """Returns a `TPUEmbeddingV0` for `self.feature_config`.

  Args:
    optimizer: Embedding optimizer to use; when None, SGD with a learning
      rate of 0.1 is used.
  """
  # Create `TPUEmbedding` object.
  chosen_optimizer = (
      tpu_embedding_v2_utils.SGD(learning_rate=0.1)
      if optimizer is None else optimizer)
  return tpu_embedding_v1.TPUEmbeddingV0(
      feature_config=self.feature_config,
      optimizer=chosen_optimizer)
def test_checkpoint_save_and_restore(self):
  """Saves one mid level object and restores it into a second one.

  NOTE(review): the extent of each `strategy.scope()` block was reconstructed
  from flattened source (checkpoint save/restore assumed outside) -- confirm.
  """
  strategy = self._get_strategy()

  def build_mid_level(contents):
    # Builds a 4x4 'table' embedding using a Constant initializer over
    # `contents`. Must be called under the strategy scope.
    optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
    table = tpu_embedding_v2_utils.TableConfig(
        vocabulary_size=4,
        dim=4,
        initializer=init_ops_v2.Constant(contents),
        combiner='sum',
        name='table')
    feature_config = (
        tpu_embedding_v2_utils.FeatureConfig(table=table, name='feature'),)
    mid_level = tpu_embedding_v1.TPUEmbeddingV0(feature_config, optimizer)
    mid_level.build()
    return mid_level

  with strategy.scope():
    first_mid_level_contents = np.ones((4, 4))
    first_mid_level = build_mid_level(first_mid_level_contents)

  save_prefix = self._get_tmpdir('restore', 'save')
  util.Checkpoint(model=first_mid_level).save(save_prefix)

  with strategy.scope():
    second_mid_level_contents = np.ones((4, 4)) * 2
    second_mid_level = build_mid_level(second_mid_level_contents)

  # We restore the checkpoint of our first model into our second model.
  restore_path = self._get_tmpdir('restore', 'save-1')
  util.Checkpoint(model=second_mid_level).restore(restore_path)

  restored = second_mid_level._variables['table']['parameters'].numpy()
  self.assertAllClose(
      first_mid_level_contents,
      restored,
      msg='Second mid level api should have restored the first model values.')
def test_cpu_high_dimensional_lookup_ragged(self):
  """Checks the result shape of a ragged lookup with `output_shape=[2, 2]`."""
  friends_feature = tpu_embedding_v2_utils.FeatureConfig(
      table=self.table_user, name='friends', output_shape=[2, 2])
  feature_config = (friends_feature,)
  mid_level = tpu_embedding_for_serving.TPUEmbeddingForServing(
      feature_config=feature_config,
      optimizer=tpu_embedding_v2_utils.SGD(learning_rate=0.1))

  ragged_features = self._get_ragged_tensors()[2:3]
  activations = mid_level(ragged_features, weights=None)

  self.assertAllClose(activations[0].shape, (2, 2, 2))
def create_embedding(self):
  """Builds a single-table `TPUEmbedding` and stores it on `self`.

  The table has `self._rows` rows of width 4 and uses `self._initializer`;
  the result is assigned to `self.tpu_embedding`.
  """
  table_config = tpu_embedding_v2_utils.TableConfig(
      vocabulary_size=self._rows,
      dim=4,
      initializer=self._initializer,
      combiner='sum',
      name='table')
  features = (
      tpu_embedding_v2_utils.FeatureConfig(table=table_config, name='feature'),
  )
  sgd = tpu_embedding_v2_utils.SGD()
  # Arguments are positional: feature config, `self._rows`, optimizer.
  self.tpu_embedding = tpu_embedding_v2.TPUEmbedding(
      features, self._rows, sgd)
def _create_mid_level(self, optimizer=None):
  """Returns a `TPUEmbedding` sized for the current distribution strategy.

  The batch size passed to the embedding is `self.batch_size` multiplied by
  the current strategy's `num_replicas_in_sync`.

  Args:
    optimizer: Embedding optimizer to use; when None, SGD with a learning
      rate of 0.1 is used.
  """
  # Create `TPUEmbedding` object.
  chosen_optimizer = (
      tpu_embedding_v2_utils.SGD(learning_rate=0.1)
      if optimizer is None else optimizer)

  current_strategy = distribution_strategy_context.get_strategy()
  replica_count = current_strategy.num_replicas_in_sync

  return tpu_embedding_v2.TPUEmbedding(
      feature_config=self.feature_config,
      batch_size=self.batch_size * replica_count,
      optimizer=chosen_optimizer)
def test_cpu_high_dimensional_sequence_lookup_ragged(self):
  """Checks the result shape of a ragged lookup with `output_shape=[2, 4]`."""
  # Prod of output shape is a factor of the data batch size.
  # The divide result will be the sequence length.
  friends_feature = tpu_embedding_v2_utils.FeatureConfig(
      table=self.table_user, name='friends', output_shape=[2, 4])
  feature_config = (friends_feature,)
  mid_level = tpu_embedding_for_serving.TPUEmbeddingForServing(
      feature_config=feature_config,
      optimizer=tpu_embedding_v2_utils.SGD(learning_rate=0.1))

  ragged_features = self._get_ragged_tensors()[2:3]
  activations = mid_level(ragged_features, weights=None)

  self.assertAllClose(activations[0].shape, (2, 4, 2))
def test_cpu_multiple_creation(self):
  """Builds two `TPUEmbedding` objects that share config and optimizer."""
  friends_feature = tpu_embedding_v2_utils.FeatureConfig(
      table=self.table_user, name='friends', max_sequence_length=2)
  feature_config = (friends_feature,)
  shared_optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)

  embedding_one = tpu_embedding_v2.TPUEmbedding(
      feature_config=feature_config, optimizer=shared_optimizer)
  embedding_two = tpu_embedding_v2.TPUEmbedding(
      feature_config=feature_config, optimizer=shared_optimizer)

  # Both of the tpu embedding tables should be able to build on cpu.
  for embedding in (embedding_one, embedding_two):
    embedding.build()
def create_strategy_and_mid_level(optimizer_name):
  """Creates a strategy plus an embedding driven by the named optimizer.

  Args:
    optimizer_name: One of 'sgd', 'adagrad' or 'adam'.

  Returns:
    A `(strategy, embedding, optimizer)` tuple.

  Raises:
    ValueError: If `optimizer_name` is not one of the supported names.
  """
  strategy = get_strategy()

  # NOTE(review): source was flattened; the embedding is assumed to be created
  # inside the strategy scope -- confirm.
  with strategy.scope():
    if optimizer_name == 'sgd':
      optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
    elif optimizer_name == 'adagrad':
      optimizer = tpu_embedding_v2_utils.Adagrad(learning_rate=0.1)
    elif optimizer_name == 'adam':
      optimizer = tpu_embedding_v2_utils.Adam(learning_rate=0.1)
    else:
      # Format the name into the message; passing it as a second positional
      # argument would make str(error) render as a tuple.
      raise ValueError(
          'optimizer is not recognized: {}'.format(optimizer_name))
    embedding = create_mid_level(optimizer=optimizer)

  return strategy, embedding, optimizer
def test_cpu_high_dimensional_invalid_lookup_ragged(self):
  """Expects a ValueError for a ragged lookup with `output_shape=[3]`."""
  # Prod of output shape is not a factor of the data batch size.
  # An error will be raised in this case.
  friends_feature = tpu_embedding_v2_utils.FeatureConfig(
      table=self.table_user, name='friends', output_shape=[3])
  feature_config = (friends_feature,)
  mid_level = tpu_embedding_for_serving.TPUEmbeddingForServing(
      feature_config=feature_config,
      optimizer=tpu_embedding_v2_utils.SGD(learning_rate=0.1))

  ragged_features = self._get_ragged_tensors()[2:3]
  expected_message = (
      'Output shape set in the FeatureConfig should be the factor')
  with self.assertRaisesRegex(ValueError, expected_message):
    mid_level(ragged_features, weights=None)
def test_cpu_sequence_lookup(self):
  """Expects `cpu_embedding_lookup` to reject a sparse sequence feature."""
  watched_feature = tpu_embedding_v2_utils.FeatureConfig(
      table=self.table_video, name='watched', max_sequence_length=2)
  feature_config = (watched_feature,)
  mid_level = tpu_embedding_v2.TPUEmbedding(
      feature_config=feature_config,
      optimizer=tpu_embedding_v2_utils.SGD(learning_rate=0.1))

  sparse_features = tuple(self._get_sparse_tensors()[:1])
  expected_message = 'Sequence features unsupported at this time.'
  with self.assertRaisesRegex(ValueError, expected_message):
    tpu_embedding_v2.cpu_embedding_lookup(
        sparse_features,
        weights=None,
        tables=mid_level.embedding_tables,
        feature_config=feature_config)
def test_cpu_sequence_lookup_sparse(self):
  """Compares the serving sequence lookup of a sparse feature against numpy."""
  friends_feature = tpu_embedding_v2_utils.FeatureConfig(
      table=self.table_user, name='friends', max_sequence_length=2)
  feature_config = (friends_feature,)
  mid_level = tpu_embedding_for_serving.TPUEmbeddingForServing(
      feature_config=feature_config,
      optimizer=tpu_embedding_v2_utils.SGD(learning_rate=0.1))

  sparse_features = self._get_sparse_tensors()[2:3]
  activations = mid_level(sparse_features, weights=None)

  # Recompute the lookup in numpy from the same table and sparse ids.
  table_values = mid_level.embedding_tables[self.table_user].numpy()
  expected = self._numpy_sequence_lookup(
      table_values,
      sparse_features[0].indices.numpy(),
      sparse_features[0].values.numpy(),
      self.data_batch_size,
      feature_config[0].max_sequence_length,
      self.table_user.dim)

  self.assertAllClose(activations[0], expected)
def _create_strategy_and_mid_level(self, optimizer_name):
  """Creates a strategy plus a mid level API driven by the named optimizer.

  Args:
    optimizer_name: One of 'sgd', 'adagrad', 'adam' or 'ftrl'.

  Returns:
    A `(strategy, mid_level_api, optimizer)` tuple.

  Raises:
    ValueError: If `optimizer_name` is not one of the supported names.
  """
  strategy = self._get_strategy()

  # NOTE(review): source was flattened; the mid level is assumed to be created
  # inside the strategy scope -- confirm.
  with strategy.scope():
    if optimizer_name == 'sgd':
      optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
    elif optimizer_name == 'adagrad':
      optimizer = tpu_embedding_v2_utils.Adagrad(learning_rate=0.1)
    elif optimizer_name == 'adam':
      optimizer = tpu_embedding_v2_utils.Adam(learning_rate=0.1)
    elif optimizer_name == 'ftrl':
      optimizer = tpu_embedding_v2_utils.FTRL(learning_rate=0.1)
    else:
      # Format the name into the message; passing it as a second positional
      # argument would make str(error) render as a tuple.
      raise ValueError(
          'optimizer is not recognized: {}'.format(optimizer_name))
    mid_level_api = self._create_mid_level(optimizer=optimizer)

  return strategy, mid_level_api, optimizer
def test_multiple_creation(self):
  """Expects building a second TPU embedding to fail after the first.

  NOTE(review): source was flattened; both `build` calls are assumed to sit
  outside `strategy.scope()` -- confirm.
  """
  feature_config = tpu_embedding_v2_utils.FeatureConfig(
      table=self.table_user, name='friends', max_sequence_length=2)
  shared_optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
  strategy = self._get_strategy()

  with strategy.scope():
    embedding_one = tpu_embedding_v2.TPUEmbedding(
        feature_config=feature_config, optimizer=shared_optimizer)
    embedding_two = tpu_embedding_v2.TPUEmbedding(
        feature_config=feature_config, optimizer=shared_optimizer)

  # The first TPU embedding should be able to be built.
  # The second one should fail with a runtime error indicating another TPU
  # embedding has already been initialized on TPU.
  embedding_one.build(64)
  expected_message = 'TPU is already initialized for embeddings.'
  with self.assertRaisesRegex(RuntimeError, expected_message):
    embedding_two.build(64)
def test_tables_with_same_name(self):
  """Expects a ValueError when two distinct tables are both named 'table'."""
  expected_message = 'Multiple tables with name table found.'
  with self.assertRaisesRegex(ValueError, expected_message):
    with self._get_strategy().scope():
      # Two separate TableConfig objects that only share their name.
      watched_table = tpu_embedding_v2_utils.TableConfig(
          name='table',
          vocabulary_size=4,
          dim=2,
          initializer=self.initializer)
      watched = tpu_embedding_v2_utils.FeatureConfig(
          table=watched_table, name='watched')

      favorited_table = tpu_embedding_v2_utils.TableConfig(
          name='table',
          vocabulary_size=4,
          dim=2,
          initializer=self.initializer)
      favorited = tpu_embedding_v2_utils.FeatureConfig(
          table=favorited_table, name='favorited')

      tpu_embedding_v2.TPUEmbedding(
          (watched, favorited),
          tpu_embedding_v2_utils.SGD(learning_rate=0.1))
def test_missing_feature(self, is_sparse):
  """Runs an enqueue/dequeue step on input whose last sample has no ids.

  NOTE(review): source was flattened; only the optimizer and mid level API
  creation are assumed to sit inside `strategy.scope()` -- confirm.

  Args:
    is_sparse: When true the feature is a `SparseTensor`, otherwise a
      `RaggedTensor`.
  """
  strategy = self._get_strategy()

  with strategy.scope():
    mid_level_api = tpu_embedding_v2.TPUEmbedding(
        feature_config=tpu_embedding_v2_utils.FeatureConfig(
            table=self.table_video, name='watched'),
        optimizer=tpu_embedding_v2_utils.SGD(learning_rate=0.1))

  # Create sparse or ragged feature with last sample missing.
  if is_sparse:
    features = sparse_tensor.SparseTensor(
        indices=self.feature_watched_indices[:-1],
        values=self.feature_watched_values[:-1],
        dense_shape=[self.data_batch_size, 2])
  else:
    features = ragged_tensor.RaggedTensor.from_row_lengths(
        row_lengths=[1, 2, 2, 0],
        values=self.feature_watched_values[:-1])

  global_batch_size = self.batch_size * strategy.num_replicas_in_sync
  dataset = dataset_ops.DatasetV2.from_tensors(features)
  dataset = dataset.unbatch().repeat().batch(
      global_batch_size, drop_remainder=True)

  input_options = distribute_lib.InputOptions(
      experimental_fetch_to_device=False)
  distributed_dataset = strategy.experimental_distribute_dataset(
      dataset, options=input_options)
  dataset_iter = iter(distributed_dataset)

  @def_function.function
  def test_fn():

    def get_activations():
      return mid_level_api.dequeue()

    mid_level_api.enqueue(next(dataset_iter), training=False)
    return strategy.run(get_activations)

  test_fn()
def _create_strategy_and_mid_level(self, optimizer_name):
  """Creates a strategy, a mid level API and the matching Keras optimizer.

  Each supported name yields a Keras optimizer plus the corresponding
  embedding optimizer; the mid level API is created with the embedding
  optimizer, while the Keras optimizer is what gets returned.

  Args:
    optimizer_name: One of 'sgd', 'adagrad', 'adam' or 'ftrl'.

  Returns:
    A `(strategy, mid_level_api, optimizer)` tuple where `optimizer` is the
    Keras optimizer.

  Raises:
    ValueError: If `optimizer_name` is not one of the supported names.
  """
  strategy = self._get_strategy()

  # Keras optimizers has to be translated to embedding optimizer with slot
  # variable creation fn properly populated.
  # NOTE(review): source was flattened; the mid level is assumed to be created
  # inside the strategy scope -- confirm.
  with strategy.scope():
    if optimizer_name == 'sgd':
      optimizer = optimizer_v2.gradient_descent.SGD(learning_rate=0.1)
      embedding_optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
    elif optimizer_name == 'adagrad':
      optimizer = optimizer_v2.adagrad.Adagrad(learning_rate=0.1)
      embedding_optimizer = tpu_embedding_v2_utils.Adagrad(
          learning_rate=0.1,
          slot_variable_creation_fn=self._get_slot_variable_creation_fn(
              optimizer))
    elif optimizer_name == 'adam':
      optimizer = optimizer_v2.adam.Adam(learning_rate=0.1)
      embedding_optimizer = tpu_embedding_v2_utils.Adam(
          learning_rate=0.1,
          slot_variable_creation_fn=self._get_slot_variable_creation_fn(
              optimizer))
    elif optimizer_name == 'ftrl':
      optimizer = optimizer_v2.ftrl.Ftrl(learning_rate=0.1)
      embedding_optimizer = tpu_embedding_v2_utils.FTRL(
          learning_rate=0.1,
          slot_variable_creation_fn=self._get_slot_variable_creation_fn(
              optimizer))
    else:
      # Format the name into the message; passing it as a second positional
      # argument would make str(error) render as a tuple.
      raise ValueError(
          'optimizer is not recognized: {}'.format(optimizer_name))
    mid_level_api = self._create_mid_level(optimizer=embedding_optimizer)

  return strategy, mid_level_api, optimizer
def _create_mid_level(self):
  """Returns a `TPUEmbedding` for `self.feature_config` using SGD (lr 0.1)."""
  sgd = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
  mid_level = tpu_embedding_v2.TPUEmbedding(
      feature_config=self.feature_config,
      optimizer=sgd)
  return mid_level
def test_checkpoint_restore_loads(self):
  """Checks that restoring a checkpoint changes what the TPU holds.

  NOTE(review): the extent of each `strategy.scope()` block was reconstructed
  from flattened source (`_load_variables` and the checkpoint calls are
  assumed to sit outside the scope) -- confirm.
  """
  strategy = self._get_strategy()
  num_rows = strategy.num_replicas_in_sync

  def get_values(mid):
    return ops.convert_to_tensor(
        mid._variables['table']['parameters'].variables[0])

  def build_and_load(contents):
    # Creates and builds a mid level object under the strategy scope using a
    # Constant initializer over `contents`, then loads its variables.
    with strategy.scope():
      optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
      table = tpu_embedding_v2_utils.TableConfig(
          vocabulary_size=num_rows,
          dim=4,
          initializer=init_ops_v2.Constant(contents),
          combiner='sum',
          name='table')
      feature_config = (
          tpu_embedding_v2_utils.FeatureConfig(table=table, name='feature'),)
      mid_level = tpu_embedding_v2.TPUEmbedding(feature_config, optimizer)
      mid_level.build(64)

    mid_level._load_variables()
    return mid_level

  first_mid_level_contents = np.ones((num_rows, 4))
  first_mid_level = build_and_load(first_mid_level_contents)

  save_prefix = self._get_tmpdir('restore', 'save')
  util.Checkpoint(model=first_mid_level).save(save_prefix)

  tpu_strategy_util.initialize_tpu_system(self.resolver)

  second_mid_level_contents = np.ones((num_rows, 4)) * 2
  second_mid_level = build_and_load(second_mid_level_contents)

  self.assertAllClose(
      second_mid_level_contents,
      get_values(second_mid_level),
      msg='Second mid level api should contain its initial values.',
  )

  # We restore the checkpoint of our first model into our second model.
  # This should load the first mid level API object onto the TPU.
  restore_path = self._get_tmpdir('restore', 'save-1')
  util.Checkpoint(model=second_mid_level).restore(restore_path)

  # Call retrieve here as a way to check what the TPU contains.
  # Calling the retrieve ops directly might make for a cleaner separation of
  # test and module, though.
  second_mid_level._retrieve_variables()

  self.assertAllClose(
      first_mid_level_contents,
      get_values(second_mid_level),
      msg='Second mid level api should have retrieved the first model values.'
  )
def test_checkpoint_save_retrieves(self):
  """Checks that saving a checkpoint picks up the values held on the TPU.

  NOTE(review): the extent of each `strategy.scope()` block was reconstructed
  from flattened source (`_load_variables` is assumed to sit outside the
  scope) -- confirm.
  """
  strategy = self._get_strategy()
  num_rows = strategy.num_replicas_in_sync

  def build_and_load(contents):
    # Creates and builds a mid level object under the strategy scope using a
    # Constant initializer over `contents`, then loads its variables.
    with strategy.scope():
      optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
      table = tpu_embedding_v2_utils.TableConfig(
          vocabulary_size=num_rows,
          dim=4,
          initializer=init_ops_v2.Constant(contents),
          combiner='sum',
          name='table')
      feature_config = (
          tpu_embedding_v2_utils.FeatureConfig(table=table, name='feature'),)
      mid_level = tpu_embedding_v2.TPUEmbedding(feature_config, optimizer)
      mid_level.build(64)

    mid_level._load_variables()
    return mid_level

  # Ensure that the variables from the first model are loaded.
  first_mid_level_contents = np.ones((num_rows, 4))
  first_mid_level = build_and_load(first_mid_level_contents)

  saved_before = self.make_checkpoint_and_get_embedding(
      'before_load', first_mid_level, num_rows)
  self.assertAllClose(
      first_mid_level_contents,
      saved_before,
      msg='Checkpoint should contain values from the first api object.')

  # Reinitialize the tpu.
  tpu_strategy_util.initialize_tpu_system(self.resolver)

  second_mid_level_contents = np.ones((num_rows, 4)) * 2
  build_and_load(second_mid_level_contents)

  # When we load the variables from the second mid level API object to the TPU
  # we expect that checkpointing the first mid level API object will now
  # retrieve the values from the TPU which are now different from the current
  # variables in the first mid level.
  saved_after = self.make_checkpoint_and_get_embedding(
      'after_load', first_mid_level, num_rows)
  self.assertAllClose(
      second_mid_level_contents,
      saved_after,
      msg='Checkpoint should contain values from the second api object.')
def test_model_export_cpu(self):
  """Exports a serving model restored from a `TPUEmbeddingV0` checkpoint.

  NOTE(review): source was flattened; only the first mid level object is
  assumed to be created and built inside `strategy.scope()` -- confirm.
  """
  strategy = self._get_strategy()

  with strategy.scope():
    first_mid_level_contents = np.ones((4, 4))
    first_mid_level_optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
    table = tpu_embedding_v2_utils.TableConfig(
        vocabulary_size=4,
        dim=4,
        initializer=init_ops_v2.Constant(first_mid_level_contents),
        combiner='sum',
        name='table')
    feature_config = (
        tpu_embedding_v2_utils.FeatureConfig(table=table, name='feature'),)
    first_mid_level = tpu_embedding_v1.TPUEmbeddingV0(
        feature_config, first_mid_level_optimizer)
    first_mid_level.build()

  # The serving object shares the feature config of the first object.
  cpu_mid_level_optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
  cpu_mid_level = tpu_embedding_for_serving.TPUEmbeddingForServing(
      feature_config, cpu_mid_level_optimizer)
  cpu_mid_level.build()

  save_prefix = self._get_tmpdir('export_cpu', 'save')
  util.Checkpoint(model=first_mid_level).save(save_prefix)

  # We restore the checkpoint of our tpu mid level onto our cpu mid level.
  restore_path = self._get_tmpdir('export_cpu', 'save-1')
  util.Checkpoint(model=cpu_mid_level).restore(restore_path)

  @def_function.function
  def serve_tensors(features):
    features = tpu_embedding_for_serving.cpu_embedding_lookup(
        features,
        None,
        cpu_mid_level.embedding_tables,
        cpu_mid_level._feature_config)
    return features[0]

  input_spec = (
      tensor_spec.TensorSpec(shape=(2,), dtype=dtypes.int32, name='feature'),)
  signatures = {
      'serving_default': serve_tensors.get_concrete_function(input_spec)
  }
  save.save(
      cpu_mid_level,
      export_dir=self._get_tmpdir('export_cpu', 'exported_model'),
      signatures=signatures)

  imported = load.load(self._get_tmpdir('export_cpu', 'exported_model'))
  predict_fn = imported.signatures['serving_default']

  input_feature_value = np.array([1, 0])
  input_batch = (
      constant_op.constant(input_feature_value, dtype=dtypes.int32),)
  prediction = predict_fn(*input_batch)['output_0']

  # The served rows should equal the rows of the first object's contents
  # selected by the input ids.
  expected_rows = first_mid_level_contents[input_feature_value]
  self.assertAllClose(prediction.numpy(), expected_rows)
def test_sequence_embeddings(self, sparse):
  """Checks sequence activations before and after one gradient update.

  NOTE(review): source was flattened; only the `TPUEmbedding` construction is
  assumed to sit inside `strategy.scope()` -- confirm.

  Args:
    sparse: When true the input dataset is sparse, otherwise ragged.
  """
  watched = tpu_embedding_v2_utils.FeatureConfig(
      table=self.table_video, name='watched', max_sequence_length=2)
  favorited = tpu_embedding_v2_utils.FeatureConfig(
      table=self.table_video, name='favorited', max_sequence_length=2)
  friends = tpu_embedding_v2_utils.FeatureConfig(
      table=self.table_user, name='friends', max_sequence_length=3)
  feature_config = (watched, favorited, friends)

  optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
  strategy = self._get_strategy()
  num_replicas = strategy.num_replicas_in_sync

  with strategy.scope():
    mid_level = tpu_embedding_v2.TPUEmbedding(
        feature_config=feature_config, optimizer=optimizer)

  # Call build here. We call 'next' outside of the tf.function and this
  # results in data where the shape of the sparse tensor is a tensor which we
  # can't tell the shape of at tracing time.
  mid_level.build(self.batch_size)

  if sparse:
    dataset = self._create_sparse_dataset(strategy)
  else:
    dataset = self._create_ragged_dataset(strategy)

  input_options = distribute_lib.InputOptions(
      experimental_fetch_to_device=False)
  distributed_dataset = strategy.experimental_distribute_dataset(
      dataset, options=input_options)
  data = next(iter(distributed_dataset))

  @def_function.function
  def embedding_and_set_gradients(data):

    def tpu_fn():
      activations = mid_level.dequeue()
      mid_level.apply_gradients(
          nest.map_structure(array_ops.ones_like, activations))
      return activations

    mid_level.enqueue(data)
    return strategy.run(tpu_fn)

  @def_function.function
  def embedding_only(data):

    def tpu_fn():
      return mid_level.dequeue()

    mid_level.enqueue(data)
    return strategy.run(tpu_fn)

  # Only check core 0.
  before_update = self._get_replica_numpy(
      embedding_and_set_gradients(data), strategy, 0)
  after_update = self._get_replica_numpy(embedding_only(data), strategy, 0)

  # For videos table, row 0 and row 1 are looked up 3*num_replicas times as
  # they occur 3 times per replica (considering the features 0 and 1 which are
  # both looked up in the videos table).
  # Feature 0 has ids [0, 0, 1], [0, 1, 1], ... repeated over num_replicas
  # Feature 1 has ids [0, 1, 1], [0, 0, 1], ... repeated over num_replicas
  # This means that both rows 0 and 1 get a -0.1*3*num_replicas update
  # For users table, each row is looked up twice:
  # Feature 2 has ids [3, 0, 1, 2], .. repeated over num_replicas
  # This means that we get a -0.1*num_replicas update to the third feature.

  # In general this means that after the update, if we lookup feature 0 and 1
  # the values will be 0.3*num_replicas lower per entry and for feature 2 they
  # will be 0.1*num_replicas lower.
  # The one issue is that these lookups contain padding values.
  # For core 0, we get the first 2 elements of the 4 element batch.
  # For feature 0, the indices are [[0, 0], [1, 0], [1, 1]] with max sequence
  # length of 2, which means that [0, 1] will be 0s.
  # For feature 1, the indices are [[0, 0], [0, 1], [1, 0]] with max sequence
  # length of 2, which means that [1, 1] will be 0s.
  # For feature 2, the indices are [[0, 0], [1, 0], [1, 1], [1, 2]] with max
  # sequence length of 3, which means that [0, 1], [0, 2] will be 0s.
  # The following masks represent that so that we only apply the above updates
  # to the non-padding rows:
  masks = (
      np.array([[[1], [0]], [[1], [1]]]),
      np.array([[[1], [1]], [[1], [0]]]),
      np.array([[[1], [0], [0]], [[1], [1], [1]]]))

  per_row_update = (
      0.3 * num_replicas,
      0.3 * num_replicas,
      0.1 * num_replicas)
  golden = tuple(
      before - update * mask
      for before, update, mask in zip(before_update, per_row_update, masks))

  self.assertAllClose(golden, after_update)
def test_variable_learning_rate(self):
  """Checks SGD updates when the learning rate is a callable schedule.

  The learning rate decays linearly from `starting_lr` towards `ending_lr`
  as `step_counter` advances, and the activations returned at every step are
  compared against the negative partial sums of that schedule.

  NOTE(review): source was flattened; the `strategy.scope()` block is assumed
  to end after the `TPUEmbedding` construction, and the gradient computation
  is assumed to sit outside the `GradientTape` block -- confirm.
  """
  num_steps = 10
  num_steps_float = float(num_steps)
  starting_lr = 1.0
  ending_lr = 0.5

  strategy = self._get_strategy()
  num_replicas = strategy.num_replicas_in_sync

  # Create the step counter, optimizer and embedding under the strategy scope.
  with strategy.scope():
    step_counter = tf_variables.Variable(0.0, dtypes.float32)

    def lr_function():
      return gen_math_ops.maximum(
          ending_lr,
          starting_lr + ((ending_lr - starting_lr) * step_counter) /
          num_steps_float)

    optimizer = tpu_embedding_v2_utils.SGD(learning_rate=lr_function)
    table_config = tpu_embedding_v2_utils.TableConfig(
        vocabulary_size=num_replicas,
        dim=4,
        initializer=init_ops_v2.Constant(np.zeros((num_replicas, 4))),
        combiner='sum',
        name='table')
    mid_level_api = tpu_embedding_v2.TPUEmbedding(
        feature_config={
            'feature': tpu_embedding_v2_utils.FeatureConfig(
                table=table_config, name='feature')},
        optimizer=optimizer)

  feature = {
      'feature': constant_op.constant([0], shape=(1, 1), dtype=dtypes.int32)
  }

  def input_fn(ctx):
    del ctx
    return dataset_ops.DatasetV2.from_tensors(feature).repeat()

  dist = strategy.distribute_datasets_from_function(
      input_fn,
      options=distribute_lib.InputOptions(experimental_fetch_to_device=False))
  dist_iter = iter(dist)

  @def_function.function
  def test_fn():

    def step():
      with backprop.GradientTape() as tape:
        activations = mid_level_api.dequeue()
        tape.watch(activations)
        result = math_ops.reduce_sum(activations['feature'])
        loss = result / num_replicas
      grads = tape.gradient(loss, activations)
      mid_level_api.apply_gradients(grads)
      return activations['feature']

    mid_level_api.enqueue(next(dist_iter), training=True)
    return strategy.run(step)

  # Run model.
  results = []
  for _ in range(num_steps):
    result = test_fn()
    results.append(self._unpack(strategy, result))
    step_counter.assign_add(1.0)

  # Table rows are `table_config.dim` (4) elements wide, per-replica batch
  # size of 1, with id 0.
  # Loss for the gradient is the sum of the entries divided by the number of
  # replicas. Thus the per replica gradient is 1/#of replicas for row 0 and no
  # other updates. The reduced gradient is therefore 1.
  # Learning rate schedule over num_steps steps:
  # 1.0 0.95 0.9 0.85 0.8 ...
  # Since use SGD and the gradient is one, every entry of the first row of the
  # table takes the values
  # 0 -1.0 -1.95 -2.85 ... (the negative partial sums of the above).
  learning_rates = [starting_lr - (starting_lr - ending_lr) / num_steps * j
                    for j in range(num_steps)]
  cumsum = [sum(learning_rates[0:j]) for j in range(num_steps)]
  # One golden per executed step; iterate over `num_steps` rather than a
  # literal so the expectation stays in sync with the loop above.
  goldens = [[[-cumsum[i]] * table_config.dim] * num_replicas
             for i in range(num_steps)]
  self.assertAllClose(results, goldens)