示例#1
0
def main(_):
    """Train the deep MNIST network and publish its validation accuracy.

    Verifies that every file listed in FILENAMES exists under
    FLAGS.data_dir (printing a hint and returning early otherwise), builds
    the graph through deepnn, runs 500 Adam steps on batches of 50 while
    printing the training accuracy every 100 steps, and finally evaluates
    on the validation split and publishes the result under 'accuracy'.
    """
    # Guard clause: stop at the first dataset file that is missing.
    for filename in FILENAMES:
        expected_path = os.path.join(FLAGS.data_dir, filename)
        if os.path.isfile(expected_path):
            continue
        print(expected_path)
        print(
            'Make sure training data is in specified directory.\n',
            'You should see following files in %s directory:\n' %
            FLAGS.data_dir, ','.join(FILENAMES))
        return

    # Load the dataset with one-hot encoded labels.
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Graph inputs: images of width 784 and one-hot labels of width 10.
    x = tf.placeholder(tf.float32, [None, 784])
    y_ = tf.placeholder(tf.float32, [None, 10])

    # deepnn returns the logits together with the keep_prob placeholder.
    y_conv, keep_prob = deepnn(x)

    # Loss, optimizer and accuracy metric.
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=y_, logits=y_conv)
    cross_entropy = tf.reduce_mean(per_example_loss)
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    is_hit = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(is_hit, tf.float32))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for i in range(500):
            images, labels = mnist.train.next_batch(50)

            if i % 100 == 0:
                # Measure accuracy on the current batch with keep_prob at 1.0.
                eval_feed = {x: images, y_: labels, keep_prob: 1.0}
                train_accuracy = sess.run(accuracy, feed_dict=eval_feed)
                print('step %d, training accuracy %g' % (i, train_accuracy))

            # The optimization step feeds keep_prob = 0.5.
            train_feed = {x: images, y_: labels, keep_prob: 0.5}
            sess.run(train_step, feed_dict=train_feed)

        validation_feed = {
            x: mnist.validation.images,
            y_: mnist.validation.labels,
            keep_prob: 1.0,
        }
        validation_accuracy = sess.run(accuracy, feed_dict=validation_feed)
        print('Validation accuracy %g' % validation_accuracy)
        publish({'accuracy': str(validation_accuracy)})
示例#2
0
文件: main.py 项目: hzjai0624/nauta
def publish_progress():
    """Publish the completion percentage until it reaches 100 or a stop is requested.

    Once per second the module-level ``progress`` and ``max_progress`` are
    read and turned into a percentage (100 when ``max_progress`` is falsy).
    Whenever the percentage changed since the previous poll it is published
    under ``PROGRESS_METRIC_KEY``, formatted with one decimal place. The
    loop ends when the percentage reaches 100 or ``stop_thread`` is truthy.
    """
    logging.debug("starting publish_progress ...")
    progress_percent = 0
    # Compare with "<" and clamp to 100: an exact "!= 100" test would keep
    # looping forever if progress ever overshoots max_progress.
    while progress_percent < 100 and not stop_thread:
        new_progress_percent = min(progress / max_progress * 100, 100) if max_progress else 100
        # Let logging do the formatting lazily, only when DEBUG is enabled.
        logging.debug("new_progress_percent: %.1f", new_progress_percent)
        if new_progress_percent != progress_percent:
            progress_percent = new_progress_percent

            metrics = {PROGRESS_METRIC_KEY: "%.1f" % progress_percent}
            logging.debug("publishing metrics ...")
            publish(metrics)

        sleep(1)
示例#3
0
def main(_):
    """Train a single-node MNIST model, publish metrics and save the results.

    Runs FLAGS.steps training steps on batches of 64, writing TensorBoard
    summaries and publishing loss/accuracy after every step. Afterwards the
    model is evaluated on the validation split, a checkpoint is written
    below EXPERIMENT_OUTPUT_PATH and, when FLAGS.export_dir is set, a
    servable SavedModel is exported to FLAGS.export_dir/MODEL_VERSION.
    """
    mnist = tf.contrib.learn.datasets.mnist.read_data_sets(FLAGS.data_dir)

    images_placeholder = tf.placeholder(tf.float32, [None, 784])
    # Defaults to 1.0, so it only has to be fed during training.
    dense_dropout_placeholder = tf.placeholder_with_default(1.0, [])
    labels_placeholder = tf.placeholder(tf.int64, [None])

    logits, scores, predictions = build_net(images_placeholder,
                                            dense_dropout_placeholder)

    loss = tf.losses.softmax_cross_entropy(tf.one_hot(labels_placeholder, 10),
                                           logits)
    train = tf.train.AdamOptimizer().minimize(loss)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(predictions, labels_placeholder), tf.float32))

    tf.summary.scalar("loss", loss)
    tf.summary.scalar("accuracy", accuracy)
    summary_op = tf.summary.merge_all()

    # Summaries are saved to EXPERIMENT_OUTPUT_PATH, which makes them
    # accessible by the user and by tensorboard.
    summary_writer = tf.summary.FileWriter(
        os.path.join(EXPERIMENT_OUTPUT_PATH, "tensorboard"))

    try:
        # The context manager releases the session's resources on exit.
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())

            saver = tf.train.Saver()

            for i in range(FLAGS.steps):
                images, labels = mnist.train.next_batch(64)
                _, summary_out, loss_val, accuracy_val = session.run(
                    [train, summary_op, loss, accuracy],
                    feed_dict={
                        images_placeholder: images,
                        labels_placeholder: labels,
                        dense_dropout_placeholder: 0.5
                    })

                if i % 100 == 0:
                    print("Step {}, Loss: {}, Accuracy: {}".format(
                        i, loss_val, accuracy_val))

                summary_writer.add_summary(summary_out, global_step=i)

                # Example of nauta metrics usage. Simply construct a dict
                # of keys and string values that you want to bind with them
                # and call publish. Old values of the same key will be
                # overwritten.
                publish({
                    "global_step": str(i),
                    "loss": str(loss_val),
                    "accuracy": str(accuracy_val)
                })

            # Validate the trained model on the MNIST validation set. The
            # dropout placeholder is not fed, so its default of 1.0 applies.
            validation_accuracy_val = session.run(
                accuracy,
                feed_dict={
                    images_placeholder: mnist.validation.images,
                    labels_placeholder: mnist.validation.labels
                })
            print("Validation accuracy: {}".format(validation_accuracy_val))

            # Checkpoints are saved to EXPERIMENT_OUTPUT_PATH, which makes
            # them accessible by the user.
            saver.save(
                session,
                os.path.join(EXPERIMENT_OUTPUT_PATH, "checkpoints",
                             "model.ckpt"))

            # Publish validation accuracy the same way as before.
            publish({"validation_accuracy": str(validation_accuracy_val)})

            # Export a servable model below FLAGS.export_dir when one was
            # given. Truthiness is used instead of `is not ""`, which
            # compares object identity rather than string content.
            if FLAGS.export_dir:
                export_dir = os.path.join(FLAGS.export_dir,
                                          str(MODEL_VERSION))
                builder = tf.saved_model.builder.SavedModelBuilder(export_dir)

                prediction_signature = (
                    tf.saved_model.signature_def_utils.build_signature_def(
                        inputs={
                            MODEL_INPUT_NAME:
                            tf.saved_model.utils.build_tensor_info(
                                images_placeholder)
                        },
                        outputs={
                            MODEL_OUTPUT_NAME:
                            tf.saved_model.utils.build_tensor_info(scores)
                        },
                        method_name=tf.saved_model.signature_constants.
                        PREDICT_METHOD_NAME))

                builder.add_meta_graph_and_variables(
                    session, [tf.saved_model.tag_constants.SERVING],
                    signature_def_map={
                        MODEL_SIGNATURE_NAME: prediction_signature
                    },
                    main_op=tf.tables_initializer(),
                    strip_default_attrs=True)

                builder.save()
    finally:
        # Flush buffered summary events to disk even if training failed.
        summary_writer.close()
示例#4
0
def main(_):
    """Train the MNIST model with Horovod and export it from rank 0.

    Parses --data_dir and --steps, validates the dataset directory when one
    is given, trains inside a MonitoredTrainingSession with the step budget
    divided by the number of Horovod workers, and publishes metrics from
    rank 0. After training, rank 0 re-imports the meta graph that was
    exported before any training ops were added, restores the latest
    checkpoint into it and writes a servable SavedModel.
    """
    # Horovod: initialize Horovod.
    hvd.init()
    hvd_size = hvd.size()
    print("hvd size: {}".format(hvd_size))
    rank = hvd.rank()
    is_master = rank == 0

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--data_dir', type=str, help='Directory which contains dataset')
    arg_parser.add_argument('--steps', type=int, default=300, help='steps')
    args, _unknown = arg_parser.parse_known_args()

    # Ensure the data directory passed to the script contains a proper
    # dataset; without one, fall back to a per-rank directory.
    if args.data_dir is None:
        data_dir = os.path.join('/tensorflow/test', 'input_data_{}'.format(rank))
    else:
        if not os.path.isdir(args.data_dir):
            print("Provided data_dir path: {} does not exist!".format(args.data_dir))
            sys.exit(1)

        first_missing = next(
            (name for name in FILENAMES
             if not os.path.isfile(os.path.join(args.data_dir, name))),
            None)
        if first_missing is not None:
            print("Required file: {} does not exist!".format(first_missing))
            sys.exit(1)
        data_dir = args.data_dir

    mnist = learn.datasets.mnist.read_data_sets(data_dir)

    # The images placeholder is named so that it can be retrieved from the
    # saved meta graph later on.
    images_placeholder = tf.placeholder(tf.float32, [None, 784], name=INPUT_NAME)
    dense_dropout_placeholder = tf.placeholder_with_default(1.0, [])
    labels_placeholder = tf.placeholder(tf.int64, [None])
    logits, scores, predictions = build_net(images_placeholder, dense_dropout_placeholder)

    # Export the meta graph at this point: it holds neither Horovod specific
    # ops nor training specific ops yet, which suits it for serving.
    tf.train.export_meta_graph("graph.meta", as_text=True)

    one_hot_labels = tf.one_hot(labels_placeholder, 10)
    loss = tf.losses.softmax_cross_entropy(one_hot_labels, logits)
    is_correct = tf.equal(predictions, labels_placeholder)
    accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

    # Summary ops saved for later use in tensorboard.
    tf.summary.scalar("accuracy", accuracy)
    tf.summary.scalar("loss", loss)
    summary_op = tf.summary.merge_all()

    # Horovod: adjust learning rate based on number of workers.
    optimizer = tf.train.RMSPropOptimizer(0.001 * hvd_size)

    global_step = tf.contrib.framework.get_or_create_global_step()

    # Wrap the standard optimizer in the Horovod distributed one.
    distributed_optimizer = hvd.DistributedOptimizer(optimizer)
    train = distributed_optimizer.minimize(loss, global_step=global_step)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
    # states from rank 0 to all other processes. This is necessary to ensure
    # consistent initialization of all workers when training is started with
    # random weights or restored from a checkpoint.
    broadcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    # Horovod: adjust number of steps based on number of workers.
    stop_hook = tf.train.StopAtStepHook(args.steps // hvd_size)
    logging_hook = tf.train.LoggingTensorHook(
        tensors={'step': global_step, 'loss': loss}, every_n_iter=10)
    hooks = [broadcast_hook, stop_hook, logging_hook]

    # Only master saves summaries; they go below EXPERIMENT_OUTPUT_PATH so
    # that tensorboard can discover them.
    if is_master:
        hooks.append(
            tf.train.SummarySaverHook(
                save_steps=10,
                output_dir=os.path.join(EXPERIMENT_OUTPUT_PATH, "tensorboard"),
                summary_op=summary_op))

    # Horovod: save checkpoints only on worker 0 to prevent other workers
    # from corrupting them. Checkpoints go below EXPERIMENT_OUTPUT_PATH,
    # which makes them accessible by the user.
    if is_master:
        checkpoint_dir = os.path.join(EXPERIMENT_OUTPUT_PATH, "checkpoints")
    else:
        checkpoint_dir = None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks) as mon_sess:
        while not mon_sess.should_stop():
            images, labels = mnist.train.next_batch(64)
            train_feed = {
                images_placeholder: images,
                labels_placeholder: labels,
                dense_dropout_placeholder: 0.5,
            }
            _, loss_val, accuracy_val, global_step_val = mon_sess.run(
                [train, loss, accuracy, global_step], feed_dict=train_feed)

            # Only master publishes metrics, as in the single node example.
            if is_master:
                publish({"loss": str(loss_val), "accuracy": str(accuracy_val), "global_step": str(global_step_val)})

    # Only the Horovod master saves the servable model.
    if not is_master:
        return

    # Import the previously exported meta graph into a brand new graph.
    with tf.Graph().as_default():
        restorer = tf.train.import_meta_graph("graph.meta")
        with tf.Session() as session:
            checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
            restorer.restore(session, checkpoint_file)

            # Look up the images placeholder and the scores op by name.
            serving_graph = tf.get_default_graph()
            images_placeholder = serving_graph.get_tensor_by_name(INPUT_NAME + ":0")
            scores = serving_graph.get_tensor_by_name(SCORES_NAME + ":0")

            # Save the servable model to EXPERIMENT_OUTPUT_PATH to make it
            # accessible to the user.
            builder = tf.saved_model.builder.SavedModelBuilder(
                os.path.join(EXPERIMENT_OUTPUT_PATH, "1"))

            input_info = tf.saved_model.utils.build_tensor_info(images_placeholder)
            output_info = tf.saved_model.utils.build_tensor_info(scores)
            prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(
                inputs={MODEL_INPUT_NAME: input_info},
                outputs={MODEL_OUTPUT_NAME: output_info},
                method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)

            builder.add_meta_graph_and_variables(
                session, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={MODEL_SIGNATURE_NAME: prediction_signature},
                main_op=tf.tables_initializer(),
                strip_default_attrs=True)

            builder.save()
示例#5
0
def main(_):
    """Run one task of a parameter-server based distributed MNIST training.

    The cluster layout, job name and task index come from parse_tf_config.
    A "ps" task only joins its server. Worker tasks build the model, train
    asynchronously until the shared global step reaches 500, and save a
    checkpoint whenever the global step they observe is a multiple of 100.
    The chief (task index 0) additionally publishes metrics, writes
    summaries and exports a final checkpoint plus a servable SavedModel
    below EXPERIMENT_OUTPUT_PATH.
    """
    cluster, job_name, task_index = parse_tf_config()

    # Create a cluster from the parameter server and worker hosts.
    cluster_spec = tf.train.ClusterSpec(cluster)

    # Create and start a server for the local task.
    server = tf.train.Server(cluster_spec,
                             job_name=job_name,
                             task_index=task_index)

    if job_name == "ps":
        server.join()
        return

    is_chief = task_index == 0

    # Assigns ops to the local worker by default.
    with tf.device(
        tf.train.replica_device_setter(
            worker_device="/job:worker/task:{task_index}".format(task_index=task_index),
            cluster=cluster
        )
    ):
        # Name images placeholder to be able to retrieve it from saved meta graph.
        images_placeholder = tf.placeholder(tf.float32, [None, 784], name=INPUT_NAME)

        dense_dropout_placeholder = tf.placeholder_with_default(1.0, [])
        labels_placeholder = tf.placeholder(tf.int64, [None])

        logits, scores, predictions = build_net(images_placeholder, dense_dropout_placeholder)

        loss = tf.losses.softmax_cross_entropy(tf.one_hot(labels_placeholder, 10), logits)
        global_step = tf.train.get_or_create_global_step()
        train = tf.train.AdamOptimizer().minimize(loss, global_step=global_step)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(predictions, labels_placeholder), tf.float32))

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("accuracy", accuracy)
        summary_op = tf.summary.merge_all()

        # Summaries are saved to EXPERIMENT_OUTPUT_PATH so that they can be
        # automatically discovered by tensorboard.
        summary_writer = tf.summary.FileWriter(os.path.join(EXPERIMENT_OUTPUT_PATH, "tensorboard"))

        # These ops will be later needed to save servable model.
        # tf.global_variables_initializer replaces the deprecated
        # tf.initialize_all_variables.
        init_op = tf.global_variables_initializer()
        saver = tf.train.Saver()

    # Export meta graph to restore it later when saving.
    tf.train.export_meta_graph("graph.meta", as_text=True)

    # Create a "supervisor", which oversees the training process. Saving and
    # summary writing are done manually below, hence saver/summary_writer
    # are disabled here.
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=EXPERIMENT_OUTPUT_PATH,
                             init_op=init_op,
                             summary_op=summary_op,
                             saver=None,
                             global_step=global_step,
                             summary_writer=None)

    # Read/download dataset locally.
    mnist = input_data.read_data_sets(FLAGS.data_dir)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
        # Loop until the supervisor shuts down or 500 steps have completed.
        global_step_val = 0
        while not sv.should_stop() and global_step_val < 500:
            # Run a training step asynchronously.
            # See `tf.train.SyncReplicasOptimizer` for additional details on how to
            # perform *synchronous* training.
            images, labels = mnist.train.next_batch(64)
            _, loss_val, accuracy_val, global_step_val, summary_out = sess.run(
                [train, loss, accuracy, global_step, summary_op],
                feed_dict={images_placeholder: images,
                           labels_placeholder: labels,
                           dense_dropout_placeholder: 0.5})

            # Only chief publishes metrics.
            if is_chief:
                # Publish metrics just like in the single node example.
                publish({"loss": str(loss_val), "accuracy": str(accuracy_val), "global_step": str(global_step_val)})

            if global_step_val % 100 == 0:
                print("Step {}, Loss: {}, Accuracy: {}".format(global_step_val, loss_val, accuracy_val))
                # Save model every 100 steps without chief constraint because for example step 100 can only be taken
                # on 1 worker so they won't interfere with each other. As mentioned previously - checkpoints are saved
                # to EXPERIMENT_OUTPUT_PATH to be accessible by user.
                saver.save(sess, os.path.join(EXPERIMENT_OUTPUT_PATH, "checkpoints", "model"),
                           global_step=global_step_val)

            # Only chief writes summary.
            if is_chief:
                summary_writer.add_summary(summary_out, global_step=global_step_val)

        # Save model by chief at the end.
        if is_chief:
            saver.save(sess, os.path.join(EXPERIMENT_OUTPUT_PATH, "checkpoints", "model"), global_step=global_step_val)

            # Unfinalize the graph: the distributed training process already
            # finalized it, and new ops (e.g. the tables initializer passed
            # as main_op below) still have to be added for the export.
            tf.get_default_graph()._unsafe_unfinalize()

            # Save servable model to EXPERIMENT_OUTPUT_PATH to make it accessible to the user.
            builder = tf.saved_model.builder.SavedModelBuilder(
                os.path.join(EXPERIMENT_OUTPUT_PATH, "models", "00001"))

            prediction_signature = (
                tf.saved_model.signature_def_utils.build_signature_def(
                    inputs={MODEL_INPUT_NAME: tf.saved_model.utils.build_tensor_info(images_placeholder)},
                    outputs={MODEL_OUTPUT_NAME: tf.saved_model.utils.build_tensor_info(scores)},
                    method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

            builder.add_meta_graph_and_variables(
                sess, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={
                    MODEL_SIGNATURE_NAME:
                        prediction_signature
                },
                main_op=tf.tables_initializer(),
                clear_devices=True,
                strip_default_attrs=True)

            builder.save()

    # Flush any buffered summary events to disk before shutting down.
    summary_writer.close()

    # Model saving can hang whole multinode experiment when done at the end. Sleep to give chief time to save.
    time.sleep(30)

    # Ask for all the services to stop.
    sv.stop()
示例#6
0
 def on_epoch_end(self, epoch, logs: dict = None):
     """Publish the accuracy and loss values found in ``logs``.

     Reads the 'acc', 'loss', 'val_acc' and 'val_loss' entries of ``logs``
     and publishes their string form; an absent entry is published as the
     string 'None'. ``epoch`` is not used.
     """
     # The signature allows logs to be omitted; fall back to an empty
     # mapping so the lookups below cannot fail on None.
     if logs is None:
         logs = {}
     publish({'accuracy': str(logs.get('acc')),
              'loss': str(logs.get('loss')),
              'validation_accuracy': str(logs.get('val_acc')),
              'validation_loss': str(logs.get('val_loss'))})