Example #1
def test_new_graph(out_dir):
    # tests that we can correctly interpret an explicitly created graph
    g1 = tf.get_default_graph()
    g = tf.Graph()
    with g.as_default():
        assert g != g1
        assert g == tf.get_default_graph()
        hook = smd.SessionHook(
            out_dir,
            include_collections=["weights", "losses", "scalars"],
            save_config=smd.SaveConfig(save_steps=[0, 1, 2, 3]),
        )
        with tf.name_scope("foobar"):
            x = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            w = tf.Variable(initial_value=[[10.0], [10.0]], name="weight1")
        with tf.name_scope("foobaz"):
            w0 = [[1], [1.0]]
            y = tf.matmul(x, w0)
        loss = tf.reduce_mean((tf.matmul(x, w) - y)**2, name="loss")
        hook.get_collection("losses").add(loss)
        global_step = tf.Variable(17, name="global_step", trainable=False)
        increment_global_step_op = tf.assign(global_step, global_step + 1)

        optimizer = tf.train.AdamOptimizer(0.1)
        optimizer = hook.wrap_optimizer(optimizer)
        optimizer_op = optimizer.minimize(loss,
                                          global_step=increment_global_step_op)
        sess = tf.train.MonitoredSession(hooks=[hook])
        for i in range(5):
            x_ = np.random.random((10, 2)) * 0.1
            sess.run([loss, optimizer_op, increment_global_step_op], {x: x_})
        sess.close()
        tr = create_trial(out_dir)
        assert len(tr.tensor_names())
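
The final assertion above only checks that something was saved. A minimal sketch of a deeper inspection with smdebug's trial API follows; the exact stored tensor names depend on the graph, so the loop over the "losses" collection is illustrative:

from smdebug.trials import create_trial

# Inspect the trial written by the hook above; value() returns a numpy array.
tr = create_trial(out_dir)
for name in tr.tensor_names(collection="losses"):
    t = tr.tensor(name)
    for step in t.steps():
        print(name, step, t.value(step))
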
def main():
    parser = argparse.ArgumentParser(description="Train resnet50 cifar10")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--epoch", type=int, default=3)
    parser.add_argument("--model_dir",
                        type=str,
                        default="./model_keras_resnet")
    parser.add_argument("--out_dir", type=str)
    parser.add_argument("--save_interval", type=int, default=500)
    opt = parser.parse_args()

    model = ResNet50(weights=None, input_shape=(32, 32, 3), classes=10)

    ##### Enabling SageMaker Debugger ###########
    # creating hook
    hook = smd.KerasHook(
        out_dir=opt.out_dir,
        include_collections=["weights", "gradients", "losses"],
        save_config=smd.SaveConfig(save_interval=opt.save_interval),
    )

    optimizer = tf.keras.optimizers.Adam()

    ##### Enabling SageMaker Debugger ###########
    # wrap the optimizer so the hook can identify the gradients
    optimizer = hook.wrap_optimizer(optimizer)
    model.compile(loss="categorical_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"])

    # start the training.
    train(opt.batch_size, opt.epoch, model, hook)
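
The train() helper is not shown in this excerpt. A plausible minimal sketch, under the assumption that it trains on CIFAR-10 via tf.keras.datasets and passes the smdebug hook to model.fit as a standard Keras callback:

import tensorflow as tf

# Hypothetical train() helper (assumption: CIFAR-10 data, hook passed as a
# Keras callback; not the original implementation).
def train(batch_size, epoch, model, hook):
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0
    y_train = tf.keras.utils.to_categorical(y_train, 10)
    y_test = tf.keras.utils.to_categorical(y_test, 10)
    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epoch,
        validation_data=(x_test, y_test),
        callbacks=[hook],  # the KerasHook doubles as a Keras callback
    )
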
def test_subclassed_model(out_dir):
    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data("MNIST-data")
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Add a channels dimension
    x_train = x_train[..., tf.newaxis]
    x_test = x_test[..., tf.newaxis]

    # Create an instance of the model
    model = MyModel()

    train_ds = (
        tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000, seed=123).batch(2)
    )

    MyModel.hook = smd.KerasHook(
        out_dir,
        save_all=True,
        save_config=smd.SaveConfig(save_steps=list(range(10)), save_interval=1),
    )

    MyModel.hook.register_model(model)
    model.compile(optimizer="Adam", loss="mse", run_eagerly=True)
    model.fit(train_ds, epochs=1, steps_per_epoch=10, callbacks=[MyModel.hook])

    trial = smd.create_trial(out_dir)
    assert len(trial.tensor_names(collection=smd.CollectionKeys.LAYERS)) == 8

    assert trial.tensor_names(collection=smd.CollectionKeys.INPUTS) == ["model_input"]
    assert trial.tensor_names(collection=smd.CollectionKeys.OUTPUTS) == ["labels", "predictions"]
    assert trial.tensor_names(collection=smd.CollectionKeys.LOSSES) == ["loss"]
    assert len(trial.tensor_names(collection=smd.CollectionKeys.GRADIENTS)) == 6
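
MyModel is not defined in this excerpt. A minimal sketch that would be consistent with the assertions above (4 layers give 8 layer input/output tensors; 3 weighted layers with kernel + bias give 6 gradients), offered as an assumption rather than the original definition:

import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Dense, Flatten

# Hypothetical subclassed model matching the test's tensor counts.
class MyModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.conv1 = Conv2D(32, 3, activation="relu")
        self.flatten = Flatten()
        self.d1 = Dense(128, activation="relu")
        self.d2 = Dense(10, activation="softmax")

    def call(self, x):
        x = self.conv1(x)
        x = self.flatten(x)
        x = self.d1(x)
        return self.d2(x)
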
def test_mnist(out_dir, on_s3=False):
    if on_s3:
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        bucket = "smdebug-testing"
        prefix = "outputs/hooks/estimator_modes/" + run_id
        out_dir = f"s3://{bucket}/{prefix}"
    help_test_mnist(out_dir,
                    save_config=smd.SaveConfig(save_interval=2),
                    num_steps=2,
                    steps=None)
    helper_test_mnist_trial(out_dir)
def test_shapes(out_dir, save_raw_tensor=False):
    pre_test_clean_up()
    rdnc = smd.ReductionConfig(save_shape=True,
                               save_raw_tensor=save_raw_tensor)
    hook = smd.SessionHook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_interval=1),
        reduction_config=rdnc,
        include_collections=["weights", "gradients", "losses"],
    )
    simple_model(hook)
    verify_shapes(out_dir, 0)
def test_mode_changes(out_dir):
    help_test_mnist(
        out_dir,
        save_config=smd.SaveConfig(save_interval=2),
        num_steps=2,
        steps=["train", "eval", "train", "eval", "train", "train"],
    )
    tr = create_trial(out_dir)
    print(tr.steps(), tr.steps(mode=smd.modes.TRAIN),
          tr.steps(mode=smd.modes.EVAL))
    assert len(tr.steps()) == 6
    assert len(tr.steps(mode=smd.modes.TRAIN)) == 4
    assert len(tr.steps(mode=smd.modes.EVAL)) == 2
    assert len(tr.tensor_names()) == 13
def test_mnist_shapes(out_dir, on_s3=False):
    if on_s3:
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        bucket = "smdebug-testing"
        prefix = "outputs/hooks/estimator_modes/" + run_id
        out_dir = f"s3://{bucket}/{prefix}"
    help_test_mnist(
        out_dir,
        save_all=True,
        save_config=smd.SaveConfig(save_steps=[0]),
        num_steps=1,
        steps=None,
        reduction_config=smd.ReductionConfig(save_shape=True),
    )
    verify_shapes(out_dir, 0)
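
verify_shapes is not shown here. A hedged sketch of what it might check: with ReductionConfig(save_shape=True), only shapes are stored, so tensor.shape(step) is readable while full values are not:

from smdebug.trials import create_trial

# Hypothetical shape check (assumption: this mirrors what verify_shapes does).
def verify_shapes(out_dir, step_num):
    tr = create_trial(out_dir)
    for name in tr.tensor_names():
        print(name, tr.tensor(name).shape(step_num))
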
Example #8
def test_reductions(out_dir, save_raw_tensor=False):
    pre_test_clean_up()
    rdnc = smd.ReductionConfig(
        reductions=ALLOWED_REDUCTIONS,
        abs_reductions=ALLOWED_REDUCTIONS,
        norms=ALLOWED_NORMS,
        abs_norms=ALLOWED_NORMS,
        save_raw_tensor=save_raw_tensor,
    )
    hook = smd.SessionHook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_interval=1),
        reduction_config=rdnc,
        include_collections=["weights", "gradients", "losses"],
    )
    helper_test_reductions(out_dir, hook, save_raw_tensor)
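
With a ReductionConfig like the one above, only the requested reductions are stored instead of raw tensors. A minimal sketch of reading one back, assuming "mean" appears in ALLOWED_REDUCTIONS:

from smdebug.trials import create_trial

# Read a stored reduction per step; reduction_value is smdebug's trial API.
tr = create_trial(out_dir)
for name in tr.tensor_names(collection="weights"):
    t = tr.tensor(name)
    for step in t.steps():
        print(name, step, t.reduction_value(step, "mean", abs=False))
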
def test_multiple_inputs(out_dir):
    my_model = MyModel()
    hook = smd.KerasHook(
        out_dir, save_all=True, save_config=smd.SaveConfig(save_steps=[0], save_interval=1)
    )

    hook.register_model(my_model)
    x_train = np.random.random((1000, 20))
    y_train = np.random.random((1000, 1))
    my_model.compile(optimizer="Adam", loss="mse", run_eagerly=True)
    my_model.fit(x_train, y_train, epochs=1, steps_per_epoch=1, callbacks=[hook])

    trial = create_trial(path=out_dir)
    tnames = sorted(trial.tensor_names(collection=smd.CollectionKeys.LAYERS))
    assert "concatenate" in tnames[0]
    assert len(trial.tensor(tnames[0]).value(0)) == 2
    assert trial.tensor(tnames[0]).shape(0) == (2, 1000, 20)
def test_mnist_local_multi_save_configs(out_dir, on_s3=False):
    # Runs in 0:04
    if on_s3:
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        bucket = "smdebug-testing"
        prefix = "outputs/hooks/estimator_modes/" + run_id
        out_dir = f"s3://{bucket}/{prefix}"
    help_test_mnist(
        out_dir,
        smd.SaveConfig({
            smd.modes.TRAIN: smd.SaveConfigMode(save_interval=2),
            smd.modes.EVAL: smd.SaveConfigMode(save_interval=3),
        }),
        include_collections=["losses"],
        num_steps=3,
    )
    helper_test_multi_save_configs_trial(out_dir)
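
With the per-mode SaveConfigMode mapping above, training steps are saved every 2 steps and evaluation steps every 3. A minimal sketch listing the saved steps by mode:

import smdebug.tensorflow as smd
from smdebug.trials import create_trial

tr = create_trial(out_dir)
print(tr.steps(mode=smd.modes.TRAIN))  # saved every 2nd TRAIN step
print(tr.steps(mode=smd.modes.EVAL))   # saved every 3rd EVAL step
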
def test_functional_model(out_dir, tf_eager_mode):
    # This test only exercises graph (non-eager) execution; skip in eager mode.
    if tf_eager_mode:
        return
    tf.compat.v1.disable_eager_execution()
    num_classes = 10
    train_ds, test_ds = create_dataset()

    # Input image dimensions
    img_rows, img_cols = 28, 28

    img_inputs = Input(shape=(28, 28, 1))
    x = Conv2D(32, kernel_size=(3, 3), activation="relu")(img_inputs)
    x1 = Conv2D(64, (3, 3), activation="relu")(x)
    x = MaxPooling2D(pool_size=(2, 2))(x1)
    x = Dropout(0.25)(x)
    x = Flatten()(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.5)(x)
    out = Dense(num_classes, activation="softmax")(x)

    model = tf.keras.models.Model(inputs=img_inputs, outputs=out)

    smd_callback = smd.KerasHook(export_tensorboard=False,
                                 out_dir=out_dir,
                                 include_collections=["custom"])

    smd_callback.get_collection("custom").add_for_mode([x1],
                                                       mode=smd.modes.TRAIN)
    smd_callback.save_config = smd.SaveConfig(save_interval=1)
    opt = tf.keras.optimizers.Adadelta(1.0)

    model.compile(
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        optimizer=opt,
        experimental_run_tf_function=False,
    )

    callbacks = [smd_callback]
    model.fit(train_ds, epochs=1, steps_per_epoch=100, callbacks=callbacks)

    trial = smd.create_trial(out_dir)
    assert len(trial.tensor_names(collection="custom")) == 1
Example #12
def main():
    parser = argparse.ArgumentParser(description="Train resnet50 cifar10")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--epoch", type=int, default=3)
    parser.add_argument("--model_dir",
                        type=str,
                        default="./model_keras_resnet")
    parser.add_argument("--out_dir", type=str)
    parser.add_argument("--save_interval", type=int, default=500)
    opt = parser.parse_args()

    model = ResNet50(weights=None, input_shape=(32, 32, 3), classes=10)

    ##### Enabling SageMaker Debugger ###########
    # creating hook
    hook = smd.KerasHook(
        out_dir=opt.out_dir,
        # Information on default collections https://github.com/awslabs/sagemaker-debugger/blob/master/docs/api.md#default-collections-saved
        include_collections=[
            "weights",
            "biases",
            "default",
            "gradients",
            "optimizer_variables",
            "outputs",
        ],
        save_config=smd.SaveConfig(save_interval=opt.save_interval),
    )

    optimizer = tf.keras.optimizers.Adam()

    ##### Enabling SageMaker Debugger ###########
    # wrap the optimizer so the hook can identify the gradients
    optimizer = hook.wrap_optimizer(optimizer)
    model.compile(loss="categorical_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"])

    # start the training.
    train(opt.batch_size, opt.epoch, model, hook)
def main():
    parser = argparse.ArgumentParser(description="Train resnet50 cifar10")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--epoch", type=int, default=3)
    parser.add_argument("--model_dir",
                        type=str,
                        default="./model_keras_resnet")
    parser.add_argument("--out_dir", type=str)
    parser.add_argument("--save_interval", type=int, default=500)
    opt = parser.parse_args()

    model = ResNet50(weights=None, input_shape=(32, 32, 3), classes=10)

    ##### Enabling SageMaker Debugger ###########
    # creating hook
    hook = smd.KerasHook(
        out_dir=opt.out_dir,
        # Information on default collections https://github.com/awslabs/sagemaker-debugger/blob/master/docs/api.md#default-collections-saved
        include_collections=["weights", "biases", "default", "gradients"],
        save_config=smd.SaveConfig(save_interval=opt.save_interval),
    )

    # start the training.
    train(opt.batch_size, opt.epoch, model, hook)
            print('Not properly initialized...')

        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
        if use_amp:
            # loss scaling is currently required when using mixed precision
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer, 'dynamic')

        callbacks = []
        if enable_sagemaker_debugger:
            import smdebug.tensorflow as smd
            callback = smd.KerasHook(
                out_dir=output_data_dir,
                export_tensorboard=True,
                tensorboard_dir=tensorboard_logs_path,
                save_config=smd.SaveConfig(save_interval=100),
                # save_all=True,
                include_collections=['metrics', 'losses', 'sm_metrics'],
                include_workers='all')
            callbacks.append(callback)
            optimizer = callback.wrap_optimizer(optimizer)
        else:
            callback = tf.keras.callbacks.TensorBoard(
                log_dir=tensorboard_logs_path)
            callbacks.append(callback)

        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

        model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
        print('Trained model {}'.format(model))
def main(args):
    # Horovod: initialize Horovod.
    hvd.init()

    if not args.use_only_cpu:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    else:
        config = None

    K.set_session(tf.Session(config=config))

    batch_size = 128
    num_classes = 10

    # Horovod: adjust number of epochs based on number of GPUs.
    epochs = int(math.ceil(args.num_epochs / hvd.size()))

    # Input image dimensions
    img_rows, img_cols = 28, 28

    # The data, shuffled and split between train and test sets
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    if K.image_data_format() == "channels_first":
        x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
        x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
        input_shape = (1, img_rows, img_cols)
    else:
        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
        input_shape = (img_rows, img_cols, 1)

    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")
    x_train /= 255
    x_test /= 255
    print("x_train shape:", x_train.shape)
    print(x_train.shape[0], "train samples")
    print(x_test.shape[0], "test samples")

    # Convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    model = Sequential()
    model.add(
        Conv2D(32,
               kernel_size=(3, 3),
               activation="relu",
               input_shape=input_shape))
    model.add(Conv2D(64, (3, 3), activation="relu"))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))

    # Horovod: adjust learning rate based on number of GPUs.
    opt = keras.optimizers.Adadelta(1.0 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    ##### Enabling SageMaker Debugger ###########
    # creating hook
    smd_hook = smd.KerasHook(
        out_dir=args.out_dir,
        save_config=smd.SaveConfig(save_interval=args.save_interval),
        include_collections=["weights", "gradients"],
        include_workers=args.include_workers,
    )

    ##### Enabling SageMaker Debugger ###########
    # wrapping optimizer so hook can identify gradients
    opt = smd_hook.wrap_optimizer(opt)

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt,
                  metrics=["accuracy"])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        ##### Enabling SageMaker Debugger ###########
        # adding smd hook as a callback
        smd_hook,
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            keras.callbacks.ModelCheckpoint(
                os.path.join(args.model_dir, "checkpoint-{epoch}.h5")))

    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        callbacks=callbacks,
        epochs=epochs,
        verbose=1 if hvd.rank() == 0 else 0,
        validation_data=(x_test, y_test),
    )
    score = model.evaluate(x_test, y_test, verbose=0)
    print("Test loss:", score[0])
    print("Test accuracy:", score[1])
Example #16
def test_mnist_local(out_dir):
    help_test_mnist(out_dir, smd.SaveConfig(save_interval=2), num_steps=2)
    tr = create_trial(out_dir)
    assert len(tr.collection("losses").tensor_names) == 1
    for t in tr.collection("losses").tensor_names:
        assert len(tr.tensor(t).steps()) == 3
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--random_seed", type=bool, default=False)
    parser.add_argument("--out_dir", type=str)
    parser.add_argument("--save_interval", type=int, default=500)
    parser.add_argument("--num_epochs",
                        type=int,
                        default=5,
                        help="Number of epochs to train for")
    parser.add_argument(
        "--num_steps",
        type=int,
        help="Number of steps to train for. If this is passed, it overrides num_epochs",
    )
    parser.add_argument(
        "--num_eval_steps",
        type=int,
        help="Number of steps to evaluate for. If this"
        "is passed, it doesnt evaluate over the full eval set",
    )
    parser.add_argument("--model_dir", type=str, default="/tmp/mnist_model")
    args = parser.parse_args()

    if args.random_seed:
        tf.set_random_seed(2)
        np.random.seed(2)
        random.seed(12)

    ##### Enabling SageMaker Debugger ###########
    # creating hook
    hook = smd.EstimatorHook(
        out_dir=args.out_dir,
        include_collections=["weights", "gradients"],
        save_config=smd.SaveConfig(save_interval=args.save_interval),
    )

    def cnn_model_fn(features, labels, mode):
        """Model function for CNN."""
        # Input Layer
        input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])

        # Convolutional Layer #1
        conv1 = tf.layers.conv2d(
            inputs=input_layer,
            filters=32,
            kernel_size=[5, 5],
            padding="same",
            activation=tf.nn.relu,
        )

        # Pooling Layer #1
        pool1 = tf.layers.max_pooling2d(inputs=conv1,
                                        pool_size=[2, 2],
                                        strides=2)

        # Convolutional Layer #2 and Pooling Layer #2
        conv2 = tf.layers.conv2d(inputs=pool1,
                                 filters=64,
                                 kernel_size=[5, 5],
                                 padding="same",
                                 activation=tf.nn.relu)
        pool2 = tf.layers.max_pooling2d(inputs=conv2,
                                        pool_size=[2, 2],
                                        strides=2)

        # Dense Layer
        pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
        dense = tf.layers.dense(inputs=pool2_flat,
                                units=1024,
                                activation=tf.nn.relu)
        dropout = tf.layers.dropout(
            inputs=dense,
            rate=0.4,
            training=mode == tf.estimator.ModeKeys.TRAIN)

        # Logits Layer
        logits = tf.layers.dense(inputs=dropout, units=10)

        predictions = {
            # Generate predictions (for PREDICT and EVAL mode)
            "classes": tf.argmax(input=logits, axis=1),
            # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
            # `logging_hook`.
            "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
        }

        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions)

        # Calculate Loss (for both TRAIN and EVAL modes)
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                      logits=logits)

        # Configure the Training Op (for TRAIN mode)
        if mode == tf.estimator.ModeKeys.TRAIN:
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=args.lr)

            ##### Enabling SageMaker Debugger ###########
            # Wrap your optimizer as follows to help SageMaker Debugger identify gradients
            # This does not change your optimization logic, it returns back the same optimizer
            optimizer = hook.wrap_optimizer(optimizer)

            train_op = optimizer.minimize(
                loss=loss, global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        # Add evaluation metrics (for EVAL mode)
        eval_metric_ops = {
            "accuracy":
            tf.metrics.accuracy(labels=labels,
                                predictions=predictions["classes"])
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metric_ops)

    # Load training and eval data
    ((train_data, train_labels),
     (eval_data, eval_labels)) = tf.keras.datasets.mnist.load_data()

    train_data = train_data / np.float32(255)
    train_labels = train_labels.astype(np.int32)  # not required

    eval_data = eval_data / np.float32(255)
    eval_labels = eval_labels.astype(np.int32)  # not required

    mnist_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                              model_dir=args.model_dir)

    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=128,
        num_epochs=args.num_epochs,
        shuffle=True,
    )

    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": eval_data},
                                                       y=eval_labels,
                                                       num_epochs=1,
                                                       shuffle=False)

    ##### Enabling SageMaker Debugger ###########
    # Set training mode so SMDebug can classify the steps into training mode
    hook.set_mode(smd.modes.TRAIN)

    ##### Enabling SageMaker Debugger ###########
    # pass hook to hooks parameter of train method
    mnist_classifier.train(input_fn=train_input_fn,
                           steps=args.num_steps,
                           hooks=[hook])

    ##### Enabling SageMaker Debugger ###########
    # Set eval mode so SMDebug can classify the steps into eval mode
    hook.set_mode(smd.modes.EVAL)

    ##### Enabling SageMaker Debugger ###########
    # pass hook to hooks parameter of evaluate method
    mnist_classifier.evaluate(input_fn=eval_input_fn,
                              steps=args.num_eval_steps,
                              hooks=[hook])
Example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", type=str, help="S3 path for the model")
    parser.add_argument("--lr",
                        type=float,
                        help="Learning Rate",
                        default=0.001)
    parser.add_argument("--steps",
                        type=int,
                        help="Number of steps to run",
                        default=100)
    parser.add_argument("--scale",
                        type=float,
                        help="Scaling factor for inputs",
                        default=1.0)
    parser.add_argument("--random_seed", type=bool, default=False)
    parser.add_argument("--out_dir", type=str)
    parser.add_argument("--save_interval", type=int, default=500)
    args = parser.parse_args()

    # These random seeds are only intended for testing.
    # The values 2, 2, 12 are known to keep the test assertions stable; changing
    # them may change the tensor values saved at certain steps.
    if args.random_seed:
        tf.set_random_seed(2)
        np.random.seed(2)
        random.seed(12)

    hook = smd.EstimatorHook(
        out_dir=args.out_dir,
        include_collections=["weights", "gradients"],
        save_config=smd.SaveConfig(save_interval=args.save_interval),
    )

    # Network definition
    # Note the use of name scopes
    with tf.name_scope("foobar"):
        x = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        w = tf.Variable(initial_value=[[10.0], [10.0]], name="weight1")
    with tf.name_scope("foobaz"):
        w0 = [[1], [1.0]]
        y = tf.matmul(x, w0)
    loss = tf.reduce_mean((tf.matmul(x, w) - y)**2, name="loss")

    hook.add_to_collection("losses", loss)

    global_step = tf.Variable(17, name="global_step", trainable=False)
    increment_global_step_op = tf.assign(global_step, global_step + 1)

    optimizer = tf.train.AdamOptimizer(args.lr)

    # Wrap the optimizer with wrap_optimizer so smdebug can find gradients to save
    optimizer = hook.wrap_optimizer(optimizer)

    # use this wrapped optimizer to minimize loss
    optimizer_op = optimizer.minimize(loss,
                                      global_step=increment_global_step_op)

    # pass the hook to hooks parameter of monitored session
    sess = tf.train.MonitoredSession(hooks=[hook])

    # use this session for running the tensorflow model
    hook.set_mode(smd.modes.TRAIN)
    for i in range(args.steps):
        x_ = np.random.random((10, 2)) * args.scale
        _loss, opt, gstep = sess.run(
            [loss, optimizer_op, increment_global_step_op], {x: x_})
        print(f"Step={i}, Loss={_loss}")

    hook.set_mode(smd.modes.EVAL)
    for i in range(args.steps):
        x_ = np.random.random((10, 2)) * args.scale
        sess.run([loss, increment_global_step_op], {x: x_})
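
wrap_optimizer above is what exposes the gradients to the hook. A minimal sketch confirming they were written (an illustration, not part of the script):

from smdebug.trials import create_trial

tr = create_trial(args.out_dir)
# Gradients appear only because the optimizer was wrapped before minimize().
for name in tr.tensor_names(collection="gradients"):
    print(name)
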
Example #19
def test_only_w_g(out_dir):
    pre_test_clean_up()
    hook = smd.SessionHook(out_dir,
                           save_all=False,
                           save_config=smd.SaveConfig(save_interval=2))
    helper_test_only_w_g(out_dir, hook)
Example #20
def helper_mirrored(
    trial_dir,
    save_all=False,
    num_steps=3,
    save_config=None,
    reduction_config=None,
    include_collections=None,
    steps=None,
    zcc=False,
    eval_distributed=False,
    include_workers="all",
):
    num_gpus = get_available_gpus()
    num_devices = num_gpus if num_gpus > 0 else 1
    batch_size = 10 * num_devices

    # input_fn which serves Dataset
    input_fn_provider = InputFnProvider(
        per_device_batch_size(batch_size, num_devices))

    # Use multiple GPUs via MirroredStrategy.
    # All available GPUs will be used if `num_gpus` is omitted.
    # if num_devices > 1:
    distribution = tf.contrib.distribute.MirroredStrategy()
    # print("### Doing Multi GPU Training")
    # else:
    #     distribution = None
    # Pass to RunConfig
    config = tf.estimator.RunConfig(
        train_distribute=distribution,
        eval_distribute=distribution if eval_distributed else None,
        model_dir="/tmp/mnist_convnet_model",
    )

    if save_config is None:
        save_config = smd.SaveConfig(save_interval=2)

    if include_collections is None:
        include_collections = [
            CollectionKeys.WEIGHTS,
            CollectionKeys.BIASES,
            CollectionKeys.GRADIENTS,
            CollectionKeys.LOSSES,
        ]

    if not zcc:
        ts_hook = smd.SessionHook(
            out_dir=trial_dir,
            save_all=save_all,
            include_collections=include_collections,
            save_config=save_config,
            reduction_config=reduction_config,
            include_workers=include_workers,
        )
    else:
        print("zcc is passed. ignoring include_collections and save_config")

    mnist_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                              config=config)
    if steps is None:
        steps = ["train"]

    for s in steps:
        if s == "train":
            print("Starting train")
            if not zcc:
                ts_hook.set_mode(smd.modes.TRAIN)
                # Train the model
                mnist_classifier.train(
                    input_fn=input_fn_provider.train_input_fn,
                    steps=num_steps,
                    hooks=[ts_hook])
            else:
                mnist_classifier.train(
                    input_fn=input_fn_provider.train_input_fn, steps=num_steps)
        elif s == "eval":
            print("Starting eval")

            if not zcc:
                ts_hook.set_mode(smd.modes.EVAL)
                # Evaluate the model and print results
                mnist_classifier.evaluate(
                    input_fn=input_fn_provider.eval_input_fn,
                    steps=num_steps,
                    hooks=[ts_hook])
            else:
                mnist_classifier.evaluate(
                    input_fn=input_fn_provider.eval_input_fn, steps=num_steps)
        elif s == "predict":
            print("Starting predict")
            if not zcc:
                ts_hook.set_mode(smd.modes.PREDICT)
                # Get predictions from the model
                p = mnist_classifier.predict(
                    input_fn=input_fn_provider.eval_input_fn, hooks=[ts_hook])
            else:
                p = mnist_classifier.predict(
                    input_fn=input_fn_provider.eval_input_fn)
            for i in range(num_steps):
                next(p)
    get_hook()._cleanup()
    return distribution