@classmethod
def setUpClass(cls):
  super().setUpClass()
  cls.strategy = tf.distribute.experimental.ParameterServerStrategy(
      multi_worker_testing_utils.make_parameter_server_cluster(3, 2),
      variable_partitioner=(
          tf.distribute.experimental.partitioners.FixedShardsPartitioner(2)),
  )
def _model_compile(self,
                   strategy,
                   steps_per_execution=1,
                   run_eagerly=False,
                   with_normalization_layer=False,
                   use_lookup_layer=False):

  class ResultAssertingCallback(callbacks_lib.Callback):
    """A callback that asserts the result of the tests."""

    def __init__(self):
      self._prev_epoch = -1

    def on_epoch_end(self, epoch, logs=None):
      logging.info("testModelFit: epoch=%r, logs=%r", epoch, logs)
      if epoch <= self._prev_epoch:
        raise RuntimeError("Epoch is supposed to be larger than previous.")
      self._prev_epoch = epoch
      is_loss_float = (
          logs.get("loss", None) is not None and
          isinstance(logs["loss"], (float, np.floating)))
      if not is_loss_float:
        raise RuntimeError("loss is supposed to be in the logs and float.")

    def on_train_end(self, logs=None):
      if self._prev_epoch != 9:
        raise RuntimeError(
            "Unexpected last epoch: {}".format(self._prev_epoch))

  # TODO(b/182193218): Use ParameterServerStrategy as a proper strategy
  # combination.
  if strategy == "ParameterServerStrategy":
    gpu_devices = tf.config.list_physical_devices("GPU")
    if len(gpu_devices) > 1:
      self.skipTest("b/178452835: Multi-GPUs not supported in "
                    "ParameterServerStrategy.")
    strategy = tf.distribute.experimental.ParameterServerStrategy(
        multi_worker_testing_utils.make_parameter_server_cluster(3, 2),
        variable_partitioner=(
            tf.distribute.experimental.partitioners.FixedShardsPartitioner(2)))

  with strategy.scope():
    model = sequential.Sequential([core_layers.Dense(10)])
    if with_normalization_layer:
      norm = keras.layers.BatchNormalization(
          axis=-1, input_shape=(4, 4, 3), momentum=0.8)
      model.add(norm)
    model.add(core_layers.Dense(1, activation="sigmoid"))

    self._metric = keras.metrics.Accuracy()

  model.compile(
      gradient_descent.SGD(),
      loss="binary_crossentropy",
      metrics=[self._metric],
      steps_per_execution=steps_per_execution,
      run_eagerly=run_eagerly)
  return model, [ResultAssertingCallback()]
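# A hedged usage sketch of how _model_compile is typically consumed by a fit
# test. The `testModelFit` body, dataset shapes, and step counts below are
# illustrative assumptions, not part of this excerpt; the one firm constraint
# is that ResultAssertingCallback expects training to end at epoch index 9,
# i.e. exactly 10 epochs.
def testModelFit(self):
  model, callbacks = self._model_compile(self.strategy,
                                         steps_per_execution=10)
  x = tf.random.uniform((10, 10))
  y = tf.random.uniform((10,))
  dataset = tf.data.Dataset.from_tensor_slices(
      (x, y)).shuffle(10).repeat().batch(10)
  model.fit(dataset, epochs=10, steps_per_epoch=10, callbacks=callbacks)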
def test_slot_variable_checkpoint_load_with_diff_shards(self):
  with self.strategy.scope():
    # Set a name so the ShardedVariable is well-named for slot var keying.
    var = tf.Variable([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="test")
    opt = keras.optimizers.optimizer_v2.adam.Adam()

  # Run once to trigger apply_gradients to populate optimizer slot
  # variables.
  def train_step():
    with tf.GradientTape() as tape:
      loss = sum(var)
    opt.minimize(loss, var.variables, tape=tape)

  self.strategy.run(train_step)

  # Check that we can call get_slot using each slot, before and after
  # checkpointing, and get the same results.
  pre_ckpt_slots = []
  for slot in opt.get_slot_names():
    pre_ckpt_slots.extend(
        tf.concat(list(opt.get_slot(var, slot)), axis=0).numpy())

  ckpt = tf.train.Checkpoint(var=var, opt=opt)
  saved_dir = self.get_temp_dir()
  ckpt_prefix = f"{saved_dir}/ckpt"
  ckpt.save(ckpt_prefix)

  # Create a new strategy with a different number of shards.
  strategy2 = tf.distribute.experimental.ParameterServerStrategy(
      multi_worker_testing_utils.make_parameter_server_cluster(3, 2),
      variable_partitioner=(
          tf.distribute.experimental.partitioners.FixedShardsPartitioner(3)),
  )

  # Create a new variable with different values, to be overwritten by the
  # checkpoint.
  with strategy2.scope():
    var = tf.Variable([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], name="test")
    opt = keras.optimizers.optimizer_v2.adam.Adam()

  # Run once to trigger apply_gradients to populate optimizer slot
  # variables.
  strategy2.run(train_step)

  new_ckpt = tf.train.Checkpoint(var=var, opt=opt)
  new_ckpt.restore(tf.train.latest_checkpoint(saved_dir))
  post_ckpt_slots = []
  for slot in new_ckpt.opt.get_slot_names():
    post_ckpt_slots.extend(
        tf.concat(list(new_ckpt.opt.get_slot(var, slot)), axis=0).numpy())
  self.assertAllClose(pre_ckpt_slots, post_ckpt_slots)
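# A minimal standalone sketch (an illustration, not part of the test file) of
# the ShardedVariable behavior the test above relies on: under a
# ParameterServerStrategy scope with a FixedShardsPartitioner, tf.Variable
# returns a ShardedVariable whose component shards are exposed via
# `.variables` and by iteration, which is why the test concatenates
# `list(opt.get_slot(...))` to recover full slot values.
strategy = tf.distribute.experimental.ParameterServerStrategy(
    multi_worker_testing_utils.make_parameter_server_cluster(3, 2),
    variable_partitioner=(
        tf.distribute.experimental.partitioners.FixedShardsPartitioner(2)))

with strategy.scope():
  v = tf.Variable([1.0, 2.0, 3.0, 4.0], name="demo")

print(len(v.variables))                    # 2 component shards
print(tf.concat(list(v), axis=0).numpy())  # full value, reassembled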
def test_saved_model_min_size_partitioner(self):
  # Set min_shard_bytes such that the Dense kernel is split into 2 shards
  # and the bias into 1.
  partitioner = (
      tf.distribute.experimental.partitioners.MinSizePartitioner(
          min_shard_bytes=(6 * 6 * 4) // 2, max_shards=2))
  cluster_resolver = (
      multi_worker_testing_utils.make_parameter_server_cluster(3, 2))
  strategy = tf.distribute.experimental.ParameterServerStrategy(
      cluster_resolver, variable_partitioner=partitioner)

  def create_dense_model():
    inputs = keras.layers.Input(shape=(6,))
    outputs = keras.layers.Dense(6)(inputs)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mean_squared_error")
    return model

  x = tf.cast(tf.expand_dims(tf.range(6), 0), tf.float32)
  with strategy.scope():
    model = create_dense_model()
    expect = model(x)
  # 2 kernel variables, 1 bias.
  self.assertLen(model.variables, 3)

  saved_dir = self.get_temp_dir()
  model.save(saved_dir)

  # Set min_shard_bytes such that the Dense kernel is split into 3 shards
  # and the bias into 1.
  partitioner2 = (
      tf.distribute.experimental.partitioners.MinSizePartitioner(
          min_shard_bytes=(6 * 6 * 4) // 3, max_shards=3))
  strategy2 = tf.distribute.experimental.ParameterServerStrategy(
      cluster_resolver, variable_partitioner=partitioner2)
  with strategy2.scope():
    loaded_model = keras.models.load_model(saved_dir)
    got = loaded_model(x)
    self.assertAllClose(got, expect)
  # 3 kernel variables, 1 bias.
  self.assertLen(loaded_model.variables, 4)
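# A hedged sketch of the shard-count arithmetic above: a 6x6 float32 kernel
# occupies 6 * 6 * 4 = 144 bytes, so min_shard_bytes=72 with max_shards=2
# yields 2 shards and min_shard_bytes=48 with max_shards=3 yields 3, while
# the small bias stays in 1 shard. Partitioner instances are callable with
# (shape, dtype) and return the number of partitions per axis.
partitioner = tf.distribute.experimental.partitioners.MinSizePartitioner(
    min_shard_bytes=(6 * 6 * 4) // 2, max_shards=2)
print(partitioner(tf.TensorShape([6, 6]), tf.float32))  # [2, 1]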
def make_coordinator(num_workers, num_ps, variable_partitioner=None):
  return tf.distribute.experimental.coordinator.ClusterCoordinator(
      tf.distribute.experimental.ParameterServerStrategy(
          multi_worker_testing_utils.make_parameter_server_cluster(
              num_workers, num_ps),
          variable_partitioner=variable_partitioner))
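# A hedged usage sketch (the variable and step function below are
# illustrative): the returned ClusterCoordinator dispatches tf.functions to
# remote workers via `schedule`, which yields RemoteValues; `join` blocks
# until all scheduled functions finish, after which `fetch` materializes
# results on the coordinator.
coordinator = make_coordinator(num_workers=3, num_ps=2)

with coordinator.strategy.scope():
  counter = tf.Variable(0.0)

@tf.function
def step_fn():
  counter.assign_add(1.0)
  return counter.read_value()

result = coordinator.schedule(step_fn)
coordinator.join()
print(result.fetch())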
@classmethod
def setUpClass(cls):
  super(KPLCreatedInDatasetsFromFunctionTest, cls).setUpClass()
  cls.coordinator = tf.distribute.experimental.coordinator.ClusterCoordinator(
      tf.distribute.experimental.ParameterServerStrategy(
          multi_worker_testing_utils.make_parameter_server_cluster(3, 2)))
def test_saved_model_combined(self, shard_config, model_type):
  """Test saving and loading models with various fixed numbers of shards.

  Args:
    shard_config: The number of shards to use per variable before and
      after loading. For example, [1, 3] means to create and save the
      model with 1 shard (i.e., no variable partitioning), and load it
      into 3 shards per variable.
    model_type: Either "dense" or "embedding"; which simple model to
      test.
  """

  def create_embedding_model():
    inputs = keras.layers.Input(shape=(6,))
    embedding = keras.layers.Embedding(output_dim=2, input_dim=6)
    outputs = embedding(inputs)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mean_squared_error")
    return model

  def create_dense_model():
    inputs = keras.layers.Input(shape=(6,))
    outputs = keras.layers.Dense(6)(inputs)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mean_squared_error")
    return model

  # Maybe create a new strategy with a different number of shards.
  if shard_config[0] > 2:
    strategy = tf.distribute.experimental.ParameterServerStrategy(
        multi_worker_testing_utils.make_parameter_server_cluster(3, 3),
        variable_partitioner=(
            tf.distribute.experimental.partitioners.FixedShardsPartitioner(
                shard_config[0])),
    )
  elif shard_config[0] == 2:
    strategy = self.strategy
  else:
    # Just one shard, so use the default strategy.
    strategy = tf.distribute.get_strategy()

  x = tf.cast(tf.expand_dims(tf.range(6), 0), tf.float32)
  with strategy.scope():
    model = (create_dense_model()
             if model_type == "dense" else create_embedding_model())
    expect = model(x)

  # Dense layers have two variables (kernel and bias); embedding layers
  # have one.
  n_expected_variables = shard_config[0] * (
      2 if model_type == "dense" else 1)
  self.assertLen(model.variables, n_expected_variables)
  model_weights = [v.numpy() for v in model.variables]

  saved_dir = self.get_temp_dir()
  model.save(saved_dir)

  if shard_config[1] > 2:
    strategy2 = tf.distribute.experimental.ParameterServerStrategy(
        multi_worker_testing_utils.make_parameter_server_cluster(3, 3),
        variable_partitioner=(
            tf.distribute.experimental.partitioners.FixedShardsPartitioner(
                shard_config[1])),
    )
  elif shard_config[1] == 2:
    strategy2 = self.strategy
  else:
    # Just one shard, so use the default strategy.
    strategy2 = tf.distribute.get_strategy()

  with strategy2.scope():
    loaded_model = keras.models.load_model(saved_dir)
    got = loaded_model(x)
    self.assertAllClose(got, expect)

  n_expected_variables = shard_config[1] * (
      2 if model_type == "dense" else 1)
  self.assertLen(loaded_model.variables, n_expected_variables)

  loaded_model_weights = [v.numpy() for v in loaded_model.variables]
  self.assertAllClose(
      np.concatenate([w.flatten() for w in model_weights]),
      np.concatenate([w.flatten() for w in loaded_model_weights]),
  )
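# A small numpy sketch (an illustration, not part of the test file) of why
# the final comparison above works across different shard counts: shards are
# split along axis 0, so flattening each shard in order and concatenating
# reproduces the same sequence regardless of how many shards were used.
kernel = np.arange(36, dtype=np.float32).reshape(6, 6)
one_shard = [kernel]
three_shards = np.split(kernel, 3, axis=0)
np.testing.assert_allclose(
    np.concatenate([w.flatten() for w in one_shard]),
    np.concatenate([w.flatten() for w in three_shards]))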