# NOTE: imports reconstructed for readability. The helper imports
# (create_trial_fast_refresh, is_tf_2_2, FetchTensorCallback) come from the
# surrounding test suite's utils and their exact paths are assumed here.
import time

import tensorflow as tf
import tensorflow_datasets as tfds

import smdebug.tensorflow as smd
from smdebug.core.collection import CollectionKeys
from smdebug.exceptions import TensorUnavailable, TensorUnavailableForStep
from smdebug.tensorflow import KerasHook, ReductionConfig, SaveConfig


def test_include_regex(out_dir, tf_eager_mode, workers):
    hook = KerasHook(
        out_dir=out_dir,
        save_config=SaveConfig(save_interval=9),
        include_collections=["custom_coll"],
        include_workers=workers,
    )
    hook.get_collection("custom_coll").include("dense")
    strategy, _ = train_model(out_dir, hook=hook, steps=["train"], eager=tf_eager_mode)

    tr = create_trial_fast_refresh(out_dir)
    tnames = tr.tensor_names(collection="custom_coll")

    if tf_eager_mode:
        assert len(tnames) == (12 if is_tf_2_2() else 4)
    else:
        assert len(tnames) == 4 + 3 * strategy.num_replicas_in_sync
    for tname in tnames:
        assert tr.tensor(tname).value(0) is not None
        assert len(tr.tensor(tname).workers(0)) == (
            1 if workers == "one" else strategy.num_replicas_in_sync
        )

def test_include_regex_opt_var(out_dir, tf_eager_mode, workers):
    include_collections = ["custom_optimizer_variables"]
    save_config = SaveConfig(save_interval=3)
    hook = KerasHook(
        out_dir=out_dir,
        save_config=save_config,
        include_collections=include_collections,
        include_workers=workers,
    )
    hook.get_collection("custom_optimizer_variables").include("Adam")
    strategy, _ = train_model(out_dir, hook=hook, steps=["train"], eager=tf_eager_mode)

    tr = create_trial_fast_refresh(out_dir)
    tnames = tr.tensor_names(collection="custom_optimizer_variables")

    if tf_eager_mode:
        assert len(tnames) == 5
    else:
        assert len(tnames) == 4 + 3 * strategy.num_replicas_in_sync
    for tname in tnames:
        assert tr.tensor(tname).value(0) is not None
        assert len(tr.tensor(tname).workers(0)) == (
            1 if workers == "one" else strategy.num_replicas_in_sync
        )

def test_collection_reductions(out_dir, tf_eager_mode):
    tf.keras.backend.clear_session()
    hook = KerasHook(
        out_dir=out_dir,
        save_config=SaveConfig(save_interval=3),
        include_collections=[
            CollectionKeys.WEIGHTS,
            CollectionKeys.BIASES,
            CollectionKeys.GRADIENTS,
        ],
    )
    hook.get_collection(CollectionKeys.WEIGHTS).reduction_config = ReductionConfig(norms=["l1"])
    train_model(out_dir, hook=hook, steps=["train"], eager=tf_eager_mode)

    tr = create_trial_fast_refresh(out_dir)
    weight_name = tr.tensor_names(collection=CollectionKeys.WEIGHTS)[0]

    try:
        tr.tensor(weight_name).value(0)
        assert False  # only the l1 reduction should have been saved, not the full value
    except TensorUnavailableForStep:
        try:
            assert tr.tensor(weight_name).reduction_value(0, "l1") is not None
        except ValueError:
            # the reduction can't be computed for some tensors
            pass
    except TensorUnavailable:
        # the tensor may not have been saved at all if it was only being
        # saved as a reduction and the reduction computation failed
        pass

def test_include_regex(out_dir):
    hook = KerasHook(
        out_dir, save_config=SaveConfig(save_interval=9), include_collections=["custom_coll"]
    )
    hook.get_collection("custom_coll").include("dense")
    # the save_config is already attached to the hook, so it is not passed again here
    train_model(out_dir, hook=hook, steps=["train"])

    tr = create_trial_fast_refresh(out_dir)
    tnames = tr.tensor_names(collection="custom_coll")
    assert len(tnames) == 12
    for tname in tnames:
        assert tr.tensor(tname).value(0) is not None

def test_include_regex(out_dir):
    hook = KerasHook(
        out_dir=out_dir,
        save_config=SaveConfig(save_interval=9),
        include_collections=["custom_coll"],
        include_workers="all",
    )
    hook.get_collection("custom_coll").include("dense")
    # train_model (distributed variant below) returns (strategy, saved_scalars)
    strategy, _ = train_model(out_dir, hook=hook, steps=["train"])

    tr = create_trial_fast_refresh(out_dir)
    tnames = tr.tensor_names(collection="custom_coll")
    assert len(tnames) == 4 + 4 + 3 * strategy.num_replicas_in_sync
    for tname in tnames:
        assert tr.tensor(tname).value(0) is not None

def test_collection_reductions(out_dir):
    hook = KerasHook(
        out_dir,
        save_config=SaveConfig(save_interval=3),
        include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.GRADIENTS],
    )
    hook.get_collection(CollectionKeys.GRADIENTS).reduction_config = ReductionConfig(norms=["l1"])
    train_model(out_dir, hook=hook, steps=["train"])

    tr = create_trial_fast_refresh(out_dir)
    weight_name = tr.tensor_names(collection=CollectionKeys.WEIGHTS)[0]
    grad_name = tr.tensor_names(collection=CollectionKeys.GRADIENTS)[0]

    assert tr.tensor(weight_name).value(0) is not None
    try:
        tr.tensor(grad_name).value(0)
        assert False  # gradients are saved only as their l1 reduction
    except TensorUnavailableForStep:
        # the reduction is configured on the gradients collection, so it is the
        # gradient tensor whose l1 norm must be available
        assert tr.tensor(grad_name).reduction_value(0, "l1") is not None

# Variant of train_model for the non-distributed Keras tests.
def train_model(
    trial_dir,
    save_all=False,
    include_collections=None,
    reduction_config=None,
    save_config=None,
    use_tf_keras=True,
    hook=None,
    eager=False,
    use_keras_optimizer=True,
    create_relu_collection=False,
    steps=None,
    add_callbacks=None,
):
    if use_tf_keras:
        from tensorflow import keras
    else:
        import keras

    mnist = keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    relu_layer = keras.layers.Dense(128, activation="relu")
    model = keras.models.Sequential(
        [
            keras.layers.Flatten(input_shape=(28, 28)),
            relu_layer,
            keras.layers.Dropout(0.2),
            keras.layers.Dense(10, activation="softmax"),
        ]
    )

    if hook is None:
        if save_config is None:
            save_config = SaveConfig(save_interval=3)
        hook = KerasHook(
            trial_dir,
            save_config=save_config,
            save_all=save_all,
            include_collections=include_collections,
            reduction_config=reduction_config,
        )
        if not save_all and include_collections is not None:
            # stop saving any default collection the caller did not ask for
            for cname in hook.include_collections:
                if cname not in include_collections:
                    hook.get_collection(cname).save_config = SaveConfig(end_step=0)

    if create_relu_collection:
        hook.get_collection("relu").add_keras_layer(relu_layer, inputs=True, outputs=True)

    if use_keras_optimizer:
        opt = keras.optimizers.RMSprop()
    else:
        # TF1-style optimizer path (graph mode)
        opt = tf.train.RMSPropOptimizer(0.1)

    opt = hook.wrap_optimizer(opt)

    if use_tf_keras:
        model.compile(
            optimizer=opt,
            loss="sparse_categorical_crossentropy",
            run_eagerly=eager,
            metrics=["accuracy"],
        )
    else:
        model.compile(optimizer=opt, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

    hooks = []
    if add_callbacks:
        if "tensorboard" in add_callbacks:
            hooks.append(
                tf.keras.callbacks.TensorBoard(
                    log_dir="/tmp/logs", histogram_freq=1, write_grads=True, write_images=True
                )
            )
        if "fetch_tensor" in add_callbacks:
            hooks.append(FetchTensorCallback(model.outputs + model.weights))
    hooks.append(hook)

    if steps is None:
        steps = ["train"]
    for step in steps:
        if step == "train":
            model.fit(x_train, y_train, epochs=1, steps_per_epoch=10, callbacks=hooks, verbose=0)
        elif step == "eval":
            model.evaluate(x_test, y_test, steps=10, callbacks=hooks, verbose=0)
        elif step == "predict":
            model.predict(x_test[:100], callbacks=hooks, verbose=0)

    hook._cleanup()

# Variant of train_model for the tf.distribute.MirroredStrategy tests; returns
# the strategy and the scalars the tests expect to find saved.
def train_model(
    trial_dir,
    save_all=False,
    hook=None,
    include_collections=None,
    reduction_config=None,
    save_config=None,
    eager=True,
    strategy=None,
    steps=None,
    add_callbacks=None,
    include_workers="all",
):
    tf.keras.backend.clear_session()
    if not eager:
        tf.compat.v1.disable_eager_execution()

    datasets, info = tfds.load(name="mnist", with_info=True, as_supervised=True)
    mnist_train, mnist_test = datasets["train"], datasets["test"]

    if strategy is None:
        strategy = tf.distribute.MirroredStrategy()

    # info.splits.total_num_examples gives the total number of examples in the dataset
    BUFFER_SIZE = 10000
    BATCH_SIZE_PER_REPLICA = 64
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

    def scale(image, label):
        image = tf.cast(image, tf.float32)
        image /= 255
        return image, label

    train_dataset = mnist_train.map(scale).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
    eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE)

    if hook is None:
        if save_config is None:
            save_config = SaveConfig(save_interval=3)
        hook = KerasHook(
            out_dir=trial_dir,
            save_config=save_config,
            reduction_config=reduction_config,
            include_collections=include_collections,
            save_all=save_all,
            include_workers=include_workers,
        )
        if not save_all and include_collections is not None:
            # stop saving any default collection the caller did not ask for
            for cname in hook.include_collections:
                if cname not in include_collections:
                    hook.get_collection(cname).save_config = SaveConfig(end_step=0)

    opt = tf.keras.optimizers.Adam()
    opt = hook.wrap_optimizer(opt)

    with strategy.scope():
        relu_layer = tf.keras.layers.Dense(64, activation="relu")
        model = tf.keras.Sequential(
            [
                tf.keras.layers.Conv2D(32, 3, activation="relu", input_shape=(28, 28, 1)),
                tf.keras.layers.MaxPooling2D(),
                tf.keras.layers.Flatten(),
                relu_layer,
                tf.keras.layers.Dense(10, activation="softmax"),
            ]
        )
        model.compile(loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

    hooks = []
    if add_callbacks:
        if "tensorboard" in add_callbacks:
            # write_grads=True crashes with a "handle must be created in scope" error,
            # as in https://stackoverflow.com/questions/56836895/custom-training-loop-using-tensorflow-gpu-1-14-and-tf-distribute-mirroredstrateg
            # and it crashes even when the callback is off, so it is left out here
            hooks.append(
                tf.keras.callbacks.TensorBoard(
                    log_dir="/tmp/logs", histogram_freq=4, write_images=True
                )
            )
    hooks.append(hook)

    # record a custom scalar before training so tests can verify it was saved
    scalars_to_be_saved = dict()
    ts = time.time()
    scalars_to_be_saved["scalar/foobar"] = (ts, steps)
    hook.save_scalar("foobar", 1, sm_metric=True, timestamp=ts)

    if steps is None:
        steps = ["train"]
    for step in steps:
        if step == "train":
            model.fit(train_dataset, epochs=1, steps_per_epoch=10, callbacks=hooks, verbose=0)
        elif step == "eval":
            model.evaluate(eval_dataset, steps=10, callbacks=hooks, verbose=0)
        elif step == "predict":
            model.predict(train_dataset, steps=4, callbacks=hooks, verbose=0)

    smd.get_hook().close()
    return strategy, scalars_to_be_saved
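

# A minimal usage sketch, not part of the test suite: in the tests, `out_dir`
# is a pytest fixture and the functions above run under pytest. This assumed
# entry point just exercises the distributed train_model and prints what the
# hook saved, using the create_trial_fast_refresh helper from this file.
if __name__ == "__main__":
    out_dir = "/tmp/smdebug_demo"  # hypothetical output directory
    strategy, saved_scalars = train_model(out_dir, steps=["train"])
    trial = create_trial_fast_refresh(out_dir)
    print("replicas in sync:", strategy.num_replicas_in_sync)
    print("saved scalars:", saved_scalars)
    print("saved tensors:", len(trial.tensor_names()))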