def test_new_graph(out_dir):
    # tests that we can correctly interpret an explicitly created graph
    g1 = tf.get_default_graph()
    g = tf.Graph()
    with g.as_default():
        assert g != g1
        assert g == tf.get_default_graph()

        hook = smd.SessionHook(
            out_dir,
            include_collections=["weights", "losses", "scalars"],
            save_config=smd.SaveConfig(save_steps=[0, 1, 2, 3]),
        )
        with tf.name_scope("foobar"):
            x = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            w = tf.Variable(initial_value=[[10.0], [10.0]], name="weight1")
        with tf.name_scope("foobaz"):
            w0 = [[1], [1.0]]
            y = tf.matmul(x, w0)
        loss = tf.reduce_mean((tf.matmul(x, w) - y) ** 2, name="loss")
        hook.get_collection("losses").add(loss)

        global_step = tf.Variable(17, name="global_step", trainable=False)
        increment_global_step_op = tf.assign(global_step, global_step + 1)

        optimizer = tf.train.AdamOptimizer(0.1)
        optimizer = hook.wrap_optimizer(optimizer)
        optimizer_op = optimizer.minimize(loss, global_step=increment_global_step_op)

        sess = tf.train.MonitoredSession(hooks=[hook])
        for i in range(5):
            x_ = np.random.random((10, 2)) * 0.1
            sess.run([loss, optimizer_op, increment_global_step_op], {x: x_})
        sess.close()

        tr = create_trial(out_dir)
        assert len(tr.tensor_names())
def main(): parser = argparse.ArgumentParser(description="Train resnet50 cifar10") parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--epoch", type=int, default=3) parser.add_argument("--model_dir", type=str, default="./model_keras_resnet") parser.add_argument("--out_dir", type=str) parser.add_argument("--save_interval", type=int, default=500) opt = parser.parse_args() model = ResNet50(weights=None, input_shape=(32, 32, 3), classes=10) ##### Enabling SageMaker Debugger ########### # creating hook hook = smd.KerasHook( out_dir=opt.out_dir, include_collections=["weights", "gradients", "losses"], save_config=smd.SaveConfig(save_interval=opt.save_interval), ) optimizer = tf.keras.optimizers.Adam() ##### Enabling SageMaker Debugger ########### # wrap the optimizer so the hook can identify the gradients optimizer = hook.wrap_optimizer(optimizer) model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]) # start the training. train(opt.batch_size, opt.epoch, model, hook)
def test_subclassed_model(out_dir):
    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data("MNIST-data")
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Add a channels dimension
    x_train = x_train[..., tf.newaxis]
    x_test = x_test[..., tf.newaxis]

    # Create an instance of the model
    model = MyModel()

    train_ds = (
        tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000, seed=123).batch(2)
    )

    MyModel.hook = smd.KerasHook(
        out_dir,
        save_all=True,
        save_config=smd.SaveConfig(save_steps=[x for x in range(10)], save_interval=1),
    )
    MyModel.hook.register_model(model)

    model.compile(optimizer="Adam", loss="mse", run_eagerly=True)
    model.fit(train_ds, epochs=1, steps_per_epoch=10, callbacks=[MyModel.hook])

    trial = smd.create_trial(out_dir)
    assert len(trial.tensor_names(collection=smd.CollectionKeys.LAYERS)) == 8
    assert trial.tensor_names(collection=smd.CollectionKeys.INPUTS) == ["model_input"]
    assert trial.tensor_names(collection=smd.CollectionKeys.OUTPUTS) == ["labels", "predictions"]
    assert trial.tensor_names(collection=smd.CollectionKeys.LOSSES) == ["loss"]
    assert len(trial.tensor_names(collection=smd.CollectionKeys.GRADIENTS)) == 6
def test_mnist(out_dir, on_s3=False):
    if on_s3:
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        bucket = "smdebug-testing"
        prefix = "outputs/hooks/estimator_modes/" + run_id
        out_dir = f"s3://{bucket}/{prefix}"
    help_test_mnist(out_dir, save_config=smd.SaveConfig(save_interval=2), num_steps=2, steps=None)
    helper_test_mnist_trial(out_dir)
def test_shapes(out_dir, save_raw_tensor=False):
    pre_test_clean_up()
    rdnc = smd.ReductionConfig(save_shape=True, save_raw_tensor=save_raw_tensor)
    hook = smd.SessionHook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_interval=1),
        reduction_config=rdnc,
        include_collections=["weights", "gradients", "losses"],
    )
    simple_model(hook)
    verify_shapes(out_dir, 0)
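# NOTE: a plausible sketch of the `simple_model` helper used by test_shapes
# (assumed, not shown in this section). It mirrors the small linear-regression
# graph built in the other session-based examples here: a weight variable, a
# synthetic target, a mean-squared loss, and a MonitoredSession run with the hook.
def simple_model(hook):
    x = tf.placeholder(shape=(None, 2), dtype=tf.float32)
    w = tf.Variable(initial_value=[[10.0], [10.0]], name="weight1")
    y = tf.matmul(x, [[1.0], [1.0]])
    loss = tf.reduce_mean((tf.matmul(x, w) - y) ** 2, name="loss")
    hook.get_collection("losses").add(loss)

    # Wrap the optimizer so the hook can identify gradients
    optimizer = hook.wrap_optimizer(tf.train.AdamOptimizer(0.1))
    optimizer_op = optimizer.minimize(loss)

    sess = tf.train.MonitoredSession(hooks=[hook])
    for _ in range(5):
        sess.run([loss, optimizer_op], {x: np.random.random((10, 2))})
    sess.close()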
def test_mode_changes(out_dir):
    help_test_mnist(
        out_dir,
        save_config=smd.SaveConfig(save_interval=2),
        num_steps=2,
        steps=["train", "eval", "train", "eval", "train", "train"],
    )
    tr = create_trial(out_dir)
    print(tr.steps(), tr.steps(mode=smd.modes.TRAIN), tr.steps(mode=smd.modes.EVAL))
    assert len(tr.steps()) == 6
    assert len(tr.steps(mode=smd.modes.TRAIN)) == 4
    assert len(tr.steps(mode=smd.modes.EVAL)) == 2
    assert len(tr.tensor_names()) == 13
def test_mnist_shapes(out_dir, on_s3=False):
    if on_s3:
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        bucket = "smdebug-testing"
        prefix = "outputs/hooks/estimator_modes/" + run_id
        out_dir = f"s3://{bucket}/{prefix}"
    help_test_mnist(
        out_dir,
        save_all=True,
        save_config=smd.SaveConfig(save_steps=[0]),
        num_steps=1,
        steps=None,
        reduction_config=smd.ReductionConfig(save_shape=True),
    )
    verify_shapes(out_dir, 0)
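# NOTE: a minimal sketch of what the shared `verify_shapes` helper might check
# (assumed from its call sites above, not the original code): with
# ReductionConfig(save_shape=True), only tensor shapes are persisted, so the
# trial should report a shape for every saved tensor at the given step.
def verify_shapes(out_dir, step_num):
    tr = create_trial(out_dir)
    for tname in tr.tensor_names():
        # shape() reads the stored shape record; no full value is expected.
        assert tr.tensor(tname).shape(step_num) is not None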
def test_reductions(out_dir, save_raw_tensor=False):
    pre_test_clean_up()
    rdnc = smd.ReductionConfig(
        reductions=ALLOWED_REDUCTIONS,
        abs_reductions=ALLOWED_REDUCTIONS,
        norms=ALLOWED_NORMS,
        abs_norms=ALLOWED_NORMS,
        save_raw_tensor=save_raw_tensor,
    )
    hook = smd.SessionHook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_interval=1),
        reduction_config=rdnc,
        include_collections=["weights", "gradients", "losses"],
    )
    helper_test_reductions(out_dir, hook, save_raw_tensor)
def test_multiple_inputs(out_dir):
    my_model = MyModel()
    hook = smd.KerasHook(
        out_dir, save_all=True, save_config=smd.SaveConfig(save_steps=[0], save_interval=1)
    )
    hook.register_model(my_model)

    x_train = np.random.random((1000, 20))
    y_train = np.random.random((1000, 1))
    my_model.compile(optimizer="Adam", loss="mse", run_eagerly=True)
    my_model.fit(x_train, y_train, epochs=1, steps_per_epoch=1, callbacks=[hook])

    trial = create_trial(path=out_dir)
    tnames = sorted(trial.tensor_names(collection=smd.CollectionKeys.LAYERS))
    assert "concatenate" in tnames[0]
    assert len(trial.tensor(tnames[0]).value(0)) == 2
    assert trial.tensor(tnames[0]).shape(0) == (2, 1000, 20)
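# NOTE: a hypothetical sketch of the `MyModel` class used by
# test_multiple_inputs (its definition is not included here, and it differs
# from the MNIST MyModel in test_subclassed_model). To satisfy the assertions
# above, the model needs a layer whose name contains "concatenate" and which
# receives a list of two input tensors; feeding the same tensor twice is one
# simple way to get there. Layer choices are assumptions, not the original code.
class MyModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.concat = tf.keras.layers.Concatenate()
        self.dense = tf.keras.layers.Dense(1)

    def call(self, x):
        # The Concatenate layer sees a list of two inputs, which the hook
        # records as a pair of tensors for the "concatenate" layer.
        return self.dense(self.concat([x, x]))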
def test_mnist_local_multi_save_configs(out_dir, on_s3=False):
    # Runs in 0:04
    if on_s3:
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        bucket = "smdebug-testing"
        prefix = "outputs/hooks/estimator_modes/" + run_id
        out_dir = f"s3://{bucket}/{prefix}"
    help_test_mnist(
        out_dir,
        smd.SaveConfig(
            {
                smd.modes.TRAIN: smd.SaveConfigMode(save_interval=2),
                smd.modes.EVAL: smd.SaveConfigMode(save_interval=3),
            }
        ),
        include_collections=["losses"],
        num_steps=3,
    )
    helper_test_multi_save_configs_trial(out_dir)
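# NOTE: a plausible sketch of `helper_test_multi_save_configs_trial` (assumed
# from the per-mode save configs above, not the original helper): TRAIN steps
# should land on multiples of the TRAIN interval (2) and EVAL steps on
# multiples of the EVAL interval (3), counted from the start of each mode.
def helper_test_multi_save_configs_trial(out_dir):
    tr = create_trial(out_dir)
    assert all(s % 2 == 0 for s in tr.steps(mode=smd.modes.TRAIN))
    assert all(s % 3 == 0 for s in tr.steps(mode=smd.modes.EVAL))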
def test_functional_model(out_dir, tf_eager_mode):
    if tf_eager_mode is False:
        tf.compat.v1.disable_eager_execution()
    else:
        return

    num_classes = 10
    train_ds, test_ds = create_dataset()

    # Input image dimensions
    img_rows, img_cols = 28, 28

    img_inputs = Input(shape=(28, 28, 1))
    x = Conv2D(32, kernel_size=(3, 3), activation="relu")(img_inputs)
    x1 = Conv2D(64, (3, 3), activation="relu")(x)
    x = MaxPooling2D(pool_size=(2, 2))(x1)
    x = Dropout(0.25)(x)
    x = Flatten()(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.5)(x)
    out = Dense(num_classes, activation="softmax")(x)

    model = tf.keras.models.Model(inputs=img_inputs, outputs=out)

    smd_callback = smd.KerasHook(
        export_tensorboard=False, out_dir=out_dir, include_collections=["custom"]
    )
    smd_callback.get_collection("custom").add_for_mode([x1], mode=smd.modes.TRAIN)
    smd_callback.save_config = smd.SaveConfig(save_interval=1)

    opt = tf.keras.optimizers.Adadelta(1.0)
    model.compile(
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        optimizer=opt,
        experimental_run_tf_function=False,
    )
    callbacks = [smd_callback]
    model.fit(train_ds, epochs=1, steps_per_epoch=100, callbacks=callbacks)

    trial = smd.create_trial(out_dir)
    assert len(trial.tensor_names(collection="custom")) == 1
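# NOTE: a minimal sketch of the `create_dataset` helper assumed by
# test_functional_model: it should return batched (image, label) tf.data
# datasets shaped for the 28x28x1 MNIST input the model expects. Batch size
# and preprocessing are illustrative, not the original implementation.
def create_dataset():
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    x_train = (x_train / 255.0)[..., tf.newaxis].astype("float32")
    x_test = (x_test / 255.0)[..., tf.newaxis].astype("float32")
    train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(64)
    test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(64)
    return train_ds, test_ds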
def main(): parser = argparse.ArgumentParser(description="Train resnet50 cifar10") parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--epoch", type=int, default=3) parser.add_argument("--model_dir", type=str, default="./model_keras_resnet") parser.add_argument("--out_dir", type=str) parser.add_argument("--save_interval", type=int, default=500) opt = parser.parse_args() model = ResNet50(weights=None, input_shape=(32, 32, 3), classes=10) ##### Enabling SageMaker Debugger ########### # creating hook hook = smd.KerasHook( out_dir=opt.out_dir, # Information on default collections https://github.com/awslabs/sagemaker-debugger/blob/master/docs/api.md#default-collections-saved include_collections=[ "weights", "biases", "default", "gradients", "optimizer_variables", "outputs", ], save_config=smd.SaveConfig(save_interval=opt.save_interval), ) optimizer = tf.keras.optimizers.Adam() ##### Enabling SageMaker Debugger ########### # wrap the optimizer so the hook can identify the gradients model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]) # start the training. train(opt.batch_size, opt.epoch, model, hook)
def main(): parser = argparse.ArgumentParser(description="Train resnet50 cifar10") parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--epoch", type=int, default=3) parser.add_argument("--model_dir", type=str, default="./model_keras_resnet") parser.add_argument("--out_dir", type=str) parser.add_argument("--save_interval", type=int, default=500) opt = parser.parse_args() model = ResNet50(weights=None, input_shape=(32, 32, 3), classes=10) ##### Enabling SageMaker Debugger ########### # creating hook hook = smd.KerasHook( out_dir=opt.out_dir, # Information on default collections https://github.com/awslabs/sagemaker-debugger/blob/master/docs/api.md#default-collections-saved include_collections=["weights", "biases", "default", "gradients"], save_config=smd.SaveConfig(save_interval=opt.save_interval), ) # start the training. train(opt.batch_size, opt.epoch, model, hook)
print('Not properly initialized...')

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)

if use_amp:
    # loss scaling is currently required when using mixed precision
    optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')

callbacks = []
if enable_sagemaker_debugger:
    import smdebug.tensorflow as smd
    callback = smd.KerasHook(
        out_dir=output_data_dir,
        export_tensorboard=True,
        tensorboard_dir=tensorboard_logs_path,
        save_config=smd.SaveConfig(save_interval=100),
        # save_all=True,
        include_collections=['metrics', 'losses', 'sm_metrics'],
        include_workers='all')
    callbacks.append(callback)
    optimizer = callback.wrap_optimizer(optimizer)
else:
    callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard_logs_path)
    callbacks.append(callback)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
print('Trained model {}'.format(model))
def main(args):
    # Horovod: initialize Horovod.
    hvd.init()

    if not args.use_only_cpu:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    else:
        config = None

    K.set_session(tf.Session(config=config))

    batch_size = 128
    num_classes = 10

    # Horovod: adjust number of epochs based on number of GPUs.
    epochs = int(math.ceil(args.num_epochs / hvd.size()))

    # Input image dimensions
    img_rows, img_cols = 28, 28

    # The data, shuffled and split between train and test sets
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    if K.image_data_format() == "channels_first":
        x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
        x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
        input_shape = (1, img_rows, img_cols)
    else:
        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
        input_shape = (img_rows, img_cols, 1)

    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")
    x_train /= 255
    x_test /= 255
    print("x_train shape:", x_train.shape)
    print(x_train.shape[0], "train samples")
    print(x_test.shape[0], "test samples")

    # Convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=input_shape))
    model.add(Conv2D(64, (3, 3), activation="relu"))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))

    # Horovod: adjust learning rate based on number of GPUs.
    opt = keras.optimizers.Adadelta(1.0 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    ##### Enabling SageMaker Debugger ###########
    # creating hook
    smd_hook = smd.KerasHook(
        out_dir=args.out_dir,
        save_config=smd.SaveConfig(save_interval=args.save_interval),
        include_collections=["weights", "gradients"],
        include_workers=args.include_workers,
    )

    ##### Enabling SageMaker Debugger ###########
    # wrapping optimizer so hook can identify gradients
    opt = smd_hook.wrap_optimizer(opt)

    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt, metrics=["accuracy"])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        ##### Enabling SageMaker Debugger ###########
        # adding smd hook as a callback
        smd_hook,
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            keras.callbacks.ModelCheckpoint(os.path.join(args.model_dir, "checkpoint-{epoch}.h5"))
        )

    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        callbacks=callbacks,
        epochs=epochs,
        verbose=1 if hvd.rank() == 0 else 0,
        validation_data=(x_test, y_test),
    )
    score = model.evaluate(x_test, y_test, verbose=0)
    print("Test loss:", score[0])
    print("Test accuracy:", score[1])
def test_mnist_local(out_dir):
    help_test_mnist(out_dir, smd.SaveConfig(save_interval=2), num_steps=2)
    tr = create_trial(out_dir)
    assert len(tr.collection("losses").tensor_names) == 1
    for t in tr.collection("losses").tensor_names:
        assert len(tr.tensor(t).steps()) == 3
def main(): parser = argparse.ArgumentParser() parser.add_argument("--lr", type=float, default=0.001) parser.add_argument("--random_seed", type=bool, default=False) parser.add_argument("--out_dir", type=str) parser.add_argument("--save_interval", type=int, default=500) parser.add_argument("--num_epochs", type=int, default=5, help="Number of epochs to train for") parser.add_argument( "--num_steps", type=int, help= "Number of steps to train for. If this is passed, it overrides num_epochs", ) parser.add_argument( "--num_eval_steps", type=int, help="Number of steps to evaluate for. If this" "is passed, it doesnt evaluate over the full eval set", ) parser.add_argument("--model_dir", type=str, default="/tmp/mnist_model") args = parser.parse_args() if args.random_seed: tf.set_random_seed(2) np.random.seed(2) random.seed(12) ##### Enabling SageMaker Debugger ########### # creating hook hook = smd.EstimatorHook( out_dir=args.out_dir, include_collections=["weights", "gradients"], save_config=smd.SaveConfig(save_interval=args.save_interval), ) def cnn_model_fn(features, labels, mode): """Model function for CNN.""" # Input Layer input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) # Convolutional Layer #1 conv1 = tf.layers.conv2d( inputs=input_layer, filters=32, kernel_size=[5, 5], padding="same", activation=tf.nn.relu, ) # Pooling Layer #1 pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) # Convolutional Layer #2 and Pooling Layer #2 conv2 = tf.layers.conv2d(inputs=pool1, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu) pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) # Dense Layer pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu) dropout = tf.layers.dropout( inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN) # Logits Layer logits = tf.layers.dense(inputs=dropout, units=10) predictions = { # Generate predictions (for PREDICT and EVAL mode) "classes": tf.argmax(input=logits, axis=1), # Add `softmax_tensor` to the graph. It is used for PREDICT and by the # `logging_hook`. 
"probabilities": tf.nn.softmax(logits, name="softmax_tensor"), } if mode == tf.estimator.ModeKeys.PREDICT: return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) # Calculate Loss (for both TRAIN and EVAL modes) loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) # Configure the Training Op (for TRAIN mode) if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.GradientDescentOptimizer( learning_rate=args.lr) ##### Enabling SageMaker Debugger ########### # Wrap your optimizer as follows to help SageMaker Debugger identify gradients # This does not change your optimization logic, it returns back the same optimizer optimizer = hook.wrap_optimizer(optimizer) train_op = optimizer.minimize( loss=loss, global_step=tf.train.get_global_step()) return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) # Add evaluation metrics (for EVAL mode) eval_metric_ops = { "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"]) } return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) # Load training and eval data ((train_data, train_labels), (eval_data, eval_labels)) = tf.keras.datasets.mnist.load_data() train_data = train_data / np.float32(255) train_labels = train_labels.astype(np.int32) # not required eval_data = eval_data / np.float32(255) eval_labels = eval_labels.astype(np.int32) # not required mnist_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, model_dir=args.model_dir) train_input_fn = tf.estimator.inputs.numpy_input_fn( x={"x": train_data}, y=train_labels, batch_size=128, num_epochs=args.num_epochs, shuffle=True, ) eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False) ##### Enabling SageMaker Debugger ########### # Set training mode so SMDebug can classify the steps into training mode hook.set_mode(smd.modes.TRAIN) ##### Enabling SageMaker Debugger ########### # pass hook to hooks parameter of train method mnist_classifier.train(input_fn=train_input_fn, steps=args.num_steps, hooks=[hook]) ##### Enabling SageMaker Debugger ########### # Set eval mode so SMDebug can classify the steps into eval mode hook.set_mode(smd.modes.EVAL) ##### Enabling SageMaker Debugger ########### # pass hook to hooks parameter of evaluate method mnist_classifier.evaluate(input_fn=eval_input_fn, steps=args.num_eval_steps, hooks=[hook])
def main(): parser = argparse.ArgumentParser() parser.add_argument("--model_dir", type=str, help="S3 path for the model") parser.add_argument("--lr", type=float, help="Learning Rate", default=0.001) parser.add_argument("--steps", type=int, help="Number of steps to run", default=100) parser.add_argument("--scale", type=float, help="Scaling factor for inputs", default=1.0) parser.add_argument("--random_seed", type=bool, default=False) parser.add_argument("--out_dir", type=str) parser.add_argument("--save_interval", type=int, default=500) args = parser.parse_args() # these random seeds are only intended for test purpose. # for now, 2,2,12 could promise no assert failure when running tests # if you wish to change the number, notice that certain steps' tensor value may be capable of variation if args.random_seed: tf.set_random_seed(2) np.random.seed(2) random.seed(12) hook = smd.EstimatorHook( out_dir=args.out_dir, include_collections=["weights", "gradients"], save_config=smd.SaveConfig(save_interval=args.save_interval), ) # Network definition # Note the use of name scopes with tf.name_scope("foobar"): x = tf.placeholder(shape=(None, 2), dtype=tf.float32) w = tf.Variable(initial_value=[[10.0], [10.0]], name="weight1") with tf.name_scope("foobaz"): w0 = [[1], [1.0]] y = tf.matmul(x, w0) loss = tf.reduce_mean((tf.matmul(x, w) - y)**2, name="loss") hook.add_to_collection("losses", loss) global_step = tf.Variable(17, name="global_step", trainable=False) increment_global_step_op = tf.assign(global_step, global_step + 1) optimizer = tf.train.AdamOptimizer(args.lr) # Wrap the optimizer with wrap_optimizer so smdebug can find gradients to save optimizer = hook.wrap_optimizer(optimizer) # use this wrapped optimizer to minimize loss optimizer_op = optimizer.minimize(loss, global_step=increment_global_step_op) # pass the hook to hooks parameter of monitored session sess = tf.train.MonitoredSession(hooks=[hook]) # use this session for running the tensorflow model hook.set_mode(smd.modes.TRAIN) for i in range(args.steps): x_ = np.random.random((10, 2)) * args.scale _loss, opt, gstep = sess.run( [loss, optimizer_op, increment_global_step_op], {x: x_}) print(f"Step={i}, Loss={_loss}") hook.set_mode(smd.modes.EVAL) for i in range(args.steps): x_ = np.random.random((10, 2)) * args.scale sess.run([loss, increment_global_step_op], {x: x_})
def test_only_w_g(out_dir):
    pre_test_clean_up()
    hook = smd.SessionHook(out_dir, save_all=False, save_config=smd.SaveConfig(save_interval=2))
    helper_test_only_w_g(out_dir, hook)
def helper_mirrored(
    trial_dir,
    save_all=False,
    num_steps=3,
    save_config=None,
    reduction_config=None,
    include_collections=None,
    steps=None,
    zcc=False,
    eval_distributed=False,
    include_workers="all",
):
    num_gpus = get_available_gpus()
    num_devices = num_gpus if num_gpus > 0 else 1
    batch_size = 10 * num_devices

    # input_fn which serves Dataset
    input_fn_provider = InputFnProvider(per_device_batch_size(batch_size, num_devices))

    # Use multiple GPUs via MirroredStrategy.
    # All available GPUs will be used if `num_gpus` is omitted.
    # if num_devices > 1:
    distribution = tf.contrib.distribute.MirroredStrategy()
    #     print("### Doing Multi GPU Training")
    # else:
    #     distribution = None

    # Pass to RunConfig
    config = tf.estimator.RunConfig(
        train_distribute=distribution,
        eval_distribute=distribution if eval_distributed else None,
        model_dir="/tmp/mnist_convnet_model",
    )

    if save_config is None:
        save_config = smd.SaveConfig(save_interval=2)

    if include_collections is None:
        include_collections = [
            CollectionKeys.WEIGHTS,
            CollectionKeys.BIASES,
            CollectionKeys.GRADIENTS,
            CollectionKeys.LOSSES,
        ]

    if not zcc:
        ts_hook = smd.SessionHook(
            out_dir=trial_dir,
            save_all=save_all,
            include_collections=include_collections,
            save_config=save_config,
            reduction_config=reduction_config,
            include_workers=include_workers,
        )
    else:
        print("zcc is passed. ignoring include_collections and save_config")

    mnist_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, config=config)
    if steps is None:
        steps = ["train"]

    for s in steps:
        if s == "train":
            print("Starting train")
            if not zcc:
                ts_hook.set_mode(smd.modes.TRAIN)
                # Train the model
                mnist_classifier.train(
                    input_fn=input_fn_provider.train_input_fn, steps=num_steps, hooks=[ts_hook]
                )
            else:
                mnist_classifier.train(input_fn=input_fn_provider.train_input_fn, steps=num_steps)
        elif s == "eval":
            print("Starting eval")
            if not zcc:
                ts_hook.set_mode(smd.modes.EVAL)
                # Evaluate the model and print results
                mnist_classifier.evaluate(
                    input_fn=input_fn_provider.eval_input_fn, steps=num_steps, hooks=[ts_hook]
                )
            else:
                mnist_classifier.evaluate(input_fn=input_fn_provider.eval_input_fn, steps=num_steps)
        elif s == "predict":
            print("Starting predict")
            if not zcc:
                ts_hook.set_mode(smd.modes.PREDICT)
                # Run prediction and consume the results
                p = mnist_classifier.predict(
                    input_fn=input_fn_provider.eval_input_fn, hooks=[ts_hook]
                )
            else:
                p = mnist_classifier.predict(input_fn=input_fn_provider.eval_input_fn)
            for i in range(num_steps):
                next(p)

    get_hook()._cleanup()
    return distribution