def main(_):
    """Train the `deepnn` MNIST classifier and publish its validation accuracy.

    Every name in FILENAMES must exist as a file inside FLAGS.data_dir;
    otherwise the first missing path and a hint are printed and the function
    returns without training. Training runs for 500 steps with batches of 50,
    reporting training accuracy every 100 steps.

    Args:
        _: Unused positional argument.
    """
    # Find the first required dataset file that is absent, if any.
    candidate_paths = (os.path.join(FLAGS.data_dir, name) for name in FILENAMES)
    missing_path = next(
        (path for path in candidate_paths if not os.path.isfile(path)), None)
    if missing_path is not None:
        print(missing_path)
        print(
            'Make sure training data is in specified directory.\n',
            'You should see following files in %s directory:\n' % FLAGS.data_dir,
            ','.join(FILENAMES))
        return

    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Model input (784 features per example) and one-hot target (10 classes).
    x = tf.placeholder(tf.float32, [None, 784])
    y_ = tf.placeholder(tf.float32, [None, 10])

    # Build the graph for the deep net
    y_conv, keep_prob = deepnn(x)

    # Loss and optimizer.
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=y_, logits=y_conv)
    cross_entropy = tf.reduce_mean(per_example_loss)
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    # Accuracy: fraction of examples whose arg-max prediction matches the label.
    predicted_class = tf.argmax(y_conv, 1)
    expected_class = tf.argmax(y_, 1)
    correct_prediction = tf.equal(predicted_class, expected_class)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    def make_feed(images, labels, keep):
        # Bundle one batch together with the dropout keep probability.
        return {x: images, y_: labels, keep_prob: keep}

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for step in range(500):
            batch = mnist.train.next_batch(50)
            if step % 100 == 0:
                # Evaluate with keep_prob 1.0, then train on the same batch.
                train_accuracy = accuracy.eval(
                    feed_dict=make_feed(batch[0], batch[1], 1.0))
                print('step %d, training accuracy %g' % (step, train_accuracy))
            train_step.run(feed_dict=make_feed(batch[0], batch[1], 0.5))

        validation_accuracy = accuracy.eval(
            feed_dict=make_feed(
                mnist.validation.images, mnist.validation.labels, 1.0))
        print('Validation accuracy %g' % validation_accuracy)
        publish({'accuracy': str(validation_accuracy)})
def publish_progress():
    """Poll the shared progress counters once a second and publish changes.

    Reads `progress`, `max_progress` and `stop_thread` from the enclosing
    scope. The loop ends when the computed percentage equals 100 or when
    `stop_thread` becomes truthy. A falsy `max_progress` is treated as 100%.
    A metric is published under PROGRESS_METRIC_KEY only when the percentage
    differs from the previously published one.
    """
    logging.debug("starting publish_progress ...")
    progress_percent = 0
    while progress_percent != 100 and not stop_thread:
        # Guard against division by zero: no work to do counts as complete.
        new_progress_percent = progress / max_progress * 100 if max_progress else 100
        # Lazy %-style arguments: formatting happens only if DEBUG is enabled.
        logging.debug("new_progress_percent: %.1f", new_progress_percent)

        if new_progress_percent != progress_percent:
            progress_percent = new_progress_percent
            metrics = {PROGRESS_METRIC_KEY: "%.1f" % progress_percent}
            logging.debug("publishing metrics ...")
            publish(metrics)

        sleep(1)
def main(_):
    """Train the MNIST network on a single node, then checkpoint and export it.

    Runs FLAGS.steps training steps with batches of 64. Every 100 steps the
    loss and accuracy are printed, written as a TensorBoard summary and
    published as metrics. After training, validation accuracy is computed and
    published, a checkpoint is written under EXPERIMENT_OUTPUT_PATH, and, when
    FLAGS.export_dir is set, a SavedModel is written under
    FLAGS.export_dir/MODEL_VERSION.

    Args:
        _: Unused positional argument.
    """
    mnist = tf.contrib.learn.datasets.mnist.read_data_sets(FLAGS.data_dir)

    images_placeholder = tf.placeholder(tf.float32, [None, 784])
    # Defaults to 1.0, so evaluation runs need not feed it.
    dense_dropout_placeholder = tf.placeholder_with_default(1.0, [])
    labels_placeholder = tf.placeholder(tf.int64, [None])

    logits, scores, predictions = build_net(images_placeholder,
                                            dense_dropout_placeholder)

    loss = tf.losses.softmax_cross_entropy(
        tf.one_hot(labels_placeholder, 10), logits)
    train = tf.train.AdamOptimizer().minimize(loss)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(predictions, labels_placeholder), tf.float32))

    tf.summary.scalar("loss", loss)
    tf.summary.scalar("accuracy", accuracy)
    summary_op = tf.summary.merge_all()

    # Summaries are saved to EXPERIMENT_OUTPUT_PATH, which makes them
    # accessible by the user and by tensorboard.
    summary_writer = tf.summary.FileWriter(
        os.path.join(EXPERIMENT_OUTPUT_PATH, "tensorboard"))

    # The context manager closes the session even if training fails.
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()

        try:
            for i in range(FLAGS.steps):
                images, labels = mnist.train.next_batch(64)
                _, summary_out, loss_val, accuracy_val = session.run(
                    [train, summary_op, loss, accuracy],
                    feed_dict={
                        images_placeholder: images,
                        labels_placeholder: labels,
                        dense_dropout_placeholder: 0.5,
                    })

                if i % 100 == 0:
                    print("Step {}, Loss: {}, Accuracy: {}".format(
                        i, loss_val, accuracy_val))
                    summary_writer.add_summary(summary_out, global_step=i)
                    # Example of nauta metrics usage. Simply construct a dict
                    # of keys and string values and call publish. Old values
                    # of the same key will be overwritten.
                    publish({
                        "global_step": str(i),
                        "loss": str(loss_val),
                        "accuracy": str(accuracy_val),
                    })
        finally:
            # Flush pending summary events and release the event file.
            summary_writer.close()

        # Validate trained model on MNIST validation set.
        validation_accuracy_val = session.run(
            accuracy,
            feed_dict={
                images_placeholder: mnist.validation.images,
                labels_placeholder: mnist.validation.labels,
            })
        print("Validation accuracy: {}".format(validation_accuracy_val))

        # Checkpoints are saved to EXPERIMENT_OUTPUT_PATH, which makes them
        # accessible by the user.
        saver.save(
            session,
            os.path.join(EXPERIMENT_OUTPUT_PATH, "checkpoints", "model.ckpt"))

        # Publish validation accuracy the same way as before.
        publish({"validation_accuracy": str(validation_accuracy_val)})

        # Save a servable model when an export directory was given. A
        # truthiness check is used because `is not ""` compares object
        # identity, which is not a reliable emptiness test for strings.
        if FLAGS.export_dir:
            export_dir = os.path.join(FLAGS.export_dir, str(MODEL_VERSION))
            builder = tf.saved_model.builder.SavedModelBuilder(export_dir)

            prediction_signature = (
                tf.saved_model.signature_def_utils.build_signature_def(
                    inputs={
                        MODEL_INPUT_NAME:
                            tf.saved_model.utils.build_tensor_info(images_placeholder)
                    },
                    outputs={
                        MODEL_OUTPUT_NAME:
                            tf.saved_model.utils.build_tensor_info(scores)
                    },
                    method_name=tf.saved_model.signature_constants.
                    PREDICT_METHOD_NAME))

            builder.add_meta_graph_and_variables(
                session, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={MODEL_SIGNATURE_NAME: prediction_signature},
                main_op=tf.tables_initializer(),
                strip_default_attrs=True)

            builder.save()
def main(_):
    """Train the MNIST network with Horovod and export a servable model.

    Parses `--data_dir` and `--steps` from the command line. When `--data_dir`
    is given it must be a directory containing every file in FILENAMES,
    otherwise the process exits with status 1. Rank 0 alone writes summaries
    and checkpoints, publishes metrics and exports the SavedModel.

    Args:
        _: Unused positional argument.
    """
    # Horovod: initialize Horovod.
    hvd.init()
    hvd_size = hvd.size()
    print("hvd size: {}".format(hvd_size))
    is_master = hvd.rank() == 0

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str,
                        help='Directory which contains dataset')
    parser.add_argument('--steps', type=int, default=300, help='steps')
    args, _unknown = parser.parse_known_args()

    if args.data_dir is None:
        # No dataset supplied: use a per-rank directory.
        data_dir = os.path.join('/tensorflow/test',
                                'input_data_{}'.format(hvd.rank()))
    else:
        # Ensure data directory passed to the script contains proper dataset.
        if not os.path.isdir(args.data_dir):
            print("Provided data_dir path: {} does not exist!".format(args.data_dir))
            sys.exit(1)
        for required_file in FILENAMES:
            required_path = os.path.join(args.data_dir, required_file)
            if not os.path.isfile(required_path):
                print("Required file: {} does not exist!".format(required_file))
                sys.exit(1)
        data_dir = args.data_dir

    mnist = learn.datasets.mnist.read_data_sets(data_dir)

    # Name images placeholder to be able to retrieve it from saved meta graph.
    images_placeholder = tf.placeholder(tf.float32, [None, 784], name=INPUT_NAME)
    dense_dropout_placeholder = tf.placeholder_with_default(1.0, [])
    labels_placeholder = tf.placeholder(tf.int64, [None])

    logits, scores, predictions = build_net(images_placeholder,
                                            dense_dropout_placeholder)

    # Export the meta graph now, before any Horovod or training ops are
    # added, so the exported graph holds only what serving needs.
    tf.train.export_meta_graph("graph.meta", as_text=True)

    one_hot_labels = tf.one_hot(labels_placeholder, 10)
    loss = tf.losses.softmax_cross_entropy(one_hot_labels, logits)
    matches = tf.equal(predictions, labels_placeholder)
    accuracy = tf.reduce_mean(tf.cast(matches, tf.float32))

    # Define summary ops to save summaries for later use in tensorboard.
    tf.summary.scalar("accuracy", accuracy)
    tf.summary.scalar("loss", loss)
    summary_op = tf.summary.merge_all()

    # Horovod: adjust learning rate based on number of workers.
    optimizer = tf.train.RMSPropOptimizer(0.001 * hvd.size())
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Wrap standard optimizer in Horovod distributed one.
    distributed_optimizer = hvd.DistributedOptimizer(optimizer)
    train = distributed_optimizer.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: broadcast initial variable states from rank 0 to all other
        # processes, so every worker starts from the same weights whether
        # they were randomly initialized or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),
        # Horovod: adjust number of steps based on number of workers.
        tf.train.StopAtStepHook(args.steps // hvd_size),
        tf.train.LoggingTensorHook(
            tensors={'step': global_step, 'loss': loss}, every_n_iter=10),
    ]

    # Only master saves summaries, under EXPERIMENT_OUTPUT_PATH so that
    # tensorboard can discover them.
    if is_master:
        hooks.append(
            tf.train.SummarySaverHook(
                save_steps=10,
                output_dir=os.path.join(EXPERIMENT_OUTPUT_PATH, "tensorboard"),
                summary_op=summary_op))

    # Horovod: save checkpoints only on worker 0 to prevent other workers
    # from corrupting them.
    if is_master:
        checkpoint_dir = os.path.join(EXPERIMENT_OUTPUT_PATH, "checkpoints")
    else:
        checkpoint_dir = None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks) as mon_sess:
        fetches = [train, loss, accuracy, global_step]
        while not mon_sess.should_stop():
            images, labels = mnist.train.next_batch(64)
            _, loss_val, accuracy_val, global_step_val = mon_sess.run(
                fetches,
                feed_dict={
                    images_placeholder: images,
                    labels_placeholder: labels,
                    dense_dropout_placeholder: 0.5,
                })

            # Only master publishes metrics.
            if is_master:
                publish({
                    "loss": str(loss_val),
                    "accuracy": str(accuracy_val),
                    "global_step": str(global_step_val),
                })

    # Save servable model only from Horovod master.
    if not is_master:
        return

    # Import the previously exported meta graph into a fresh graph.
    with tf.Graph().as_default():
        restorer = tf.train.import_meta_graph("graph.meta")
        with tf.Session() as session:
            checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
            restorer.restore(session, checkpoint_file)

            # Look up the images placeholder and scores op by name.
            serving_graph = tf.get_default_graph()
            served_images = serving_graph.get_tensor_by_name(INPUT_NAME + ":0")
            served_scores = serving_graph.get_tensor_by_name(SCORES_NAME + ":0")

            # Save servable model to EXPERIMENT_OUTPUT_PATH to make it
            # accessible to the user.
            builder = tf.saved_model.builder.SavedModelBuilder(
                os.path.join(EXPERIMENT_OUTPUT_PATH, "1"))

            input_info = tf.saved_model.utils.build_tensor_info(served_images)
            output_info = tf.saved_model.utils.build_tensor_info(served_scores)
            prediction_signature = (
                tf.saved_model.signature_def_utils.build_signature_def(
                    inputs={MODEL_INPUT_NAME: input_info},
                    outputs={MODEL_OUTPUT_NAME: output_info},
                    method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

            builder.add_meta_graph_and_variables(
                session, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={MODEL_SIGNATURE_NAME: prediction_signature},
                main_op=tf.tables_initializer(),
                strip_default_attrs=True)

            builder.save()
def main(_):
    """Run one task of a parameter-server/worker distributed MNIST training job.

    The cluster layout, job name and task index come from `parse_tf_config()`.
    A "ps" task only joins the server. A worker builds the model, trains
    asynchronously until the global step reaches 500 or the supervisor stops,
    and checkpoints every 100 steps. The chief (task index 0) also publishes
    metrics, writes summaries and exports a SavedModel under
    EXPERIMENT_OUTPUT_PATH/models/00001.

    Args:
        _: Unused positional argument.
    """
    cluster, job_name, task_index = parse_tf_config()

    # Create a cluster from the parameter server and worker hosts.
    cluster_spec = tf.train.ClusterSpec(cluster)

    # Create and start a server for the local task.
    server = tf.train.Server(cluster_spec, job_name=job_name, task_index=task_index)

    if job_name == "ps":
        server.join()
        return

    # Assigns ops to the local worker by default.
    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:{task_index}".format(task_index=task_index),
                cluster=cluster
            )
    ):
        # Name images placeholder to be able to retrieve it from saved meta graph.
        images_placeholder = tf.placeholder(tf.float32, [None, 784], name=INPUT_NAME)
        dense_dropout_placeholder = tf.placeholder_with_default(1.0, [])
        labels_placeholder = tf.placeholder(tf.int64, [None])

        logits, scores, predictions = build_net(images_placeholder, dense_dropout_placeholder)

        loss = tf.losses.softmax_cross_entropy(tf.one_hot(labels_placeholder, 10), logits)
        global_step = tf.train.get_or_create_global_step()
        train = tf.train.AdamOptimizer().minimize(loss, global_step=global_step)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(predictions, labels_placeholder), tf.float32))

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("accuracy", accuracy)
        summary_op = tf.summary.merge_all()

        # Summaries are saved to EXPERIMENT_OUTPUT_PATH so that they can be
        # automatically discovered by tensorboard.
        summary_writer = tf.summary.FileWriter(os.path.join(EXPERIMENT_OUTPUT_PATH, "tensorboard"))

        # These ops will be later needed to save servable model.
        # tf.global_variables_initializer() replaces the deprecated
        # tf.initialize_all_variables().
        init_op = tf.global_variables_initializer()
        saver = tf.train.Saver()

        # Export meta graph to restore it later when saving.
        tf.train.export_meta_graph("graph.meta", as_text=True)

    is_chief = task_index == 0

    # Create a "supervisor", which oversees the training process. Saving and
    # summary writing are done manually below, hence saver/summary_writer=None.
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=EXPERIMENT_OUTPUT_PATH,
                             init_op=init_op,
                             summary_op=summary_op,
                             saver=None,
                             global_step=global_step,
                             summary_writer=None)

    # Read/download dataset locally.
    mnist = input_data.read_data_sets(FLAGS.data_dir)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
        # Loop until the supervisor shuts down or 500 steps have completed.
        global_step_val = 0
        while not sv.should_stop() and global_step_val < 500:
            # Run a training step asynchronously.
            # See `tf.train.SyncReplicasOptimizer` for additional details on how to
            # perform *synchronous* training.
            images, labels = mnist.train.next_batch(64)
            _, loss_val, accuracy_val, global_step_val, summary_out = sess.run(
                [train, loss, accuracy, global_step, summary_op],
                feed_dict={images_placeholder: images,
                           labels_placeholder: labels,
                           dense_dropout_placeholder: 0.5})

            # Only chief publishes metrics.
            if is_chief:
                # Publish metrics just like in the single node example.
                publish({"loss": str(loss_val),
                         "accuracy": str(accuracy_val),
                         "global_step": str(global_step_val)})

            if global_step_val % 100 == 0:
                print("Step {}, Loss: {}, Accuracy: {}".format(global_step_val, loss_val, accuracy_val))

                # Save model every 100 steps without chief constraint: a given
                # global step value (e.g. 100) is only observed by one worker,
                # so the workers won't interfere with each other. Checkpoints
                # go to EXPERIMENT_OUTPUT_PATH to be accessible by the user.
                saver.save(sess, os.path.join(EXPERIMENT_OUTPUT_PATH, "checkpoints", "model"),
                           global_step=global_step_val)

                # Only chief writes summary.
                if is_chief:
                    summary_writer.add_summary(summary_out, global_step=global_step_val)

        # Save model by chief at the end.
        if is_chief:
            saver.save(sess, os.path.join(EXPERIMENT_OUTPUT_PATH, "checkpoints", "model"),
                       global_step=global_step_val)

            # Unfinalize the graph: the distributed training process already
            # finalized it, and the export below still creates ops (for
            # example tf.tables_initializer()).
            tf.get_default_graph()._unsafe_unfinalize()

            # Save servable model to EXPERIMENT_OUTPUT_PATH to make it accessible to the user.
            builder = tf.saved_model.builder.SavedModelBuilder(
                os.path.join(EXPERIMENT_OUTPUT_PATH, "models", "00001"))

            prediction_signature = (
                tf.saved_model.signature_def_utils.build_signature_def(
                    inputs={MODEL_INPUT_NAME: tf.saved_model.utils.build_tensor_info(images_placeholder)},
                    outputs={MODEL_OUTPUT_NAME: tf.saved_model.utils.build_tensor_info(scores)},
                    method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

            builder.add_meta_graph_and_variables(
                sess, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={
                    MODEL_SIGNATURE_NAME: prediction_signature
                },
                main_op=tf.tables_initializer(),
                clear_devices=True,
                strip_default_attrs=True)

            builder.save()

        # Model saving can hang whole multinode experiment when done at the
        # end. Sleep to give chief time to save.
        time.sleep(30)

    # Ask for all the services to stop.
    sv.stop()
def on_epoch_end(self, epoch, logs: dict = None):
    """Publish the epoch's accuracy and loss values as string metrics.

    Args:
        epoch: Index of the finished epoch (not used here).
        logs: Mapping read for the keys 'acc', 'loss', 'val_acc' and
            'val_loss'. Missing keys, or a missing mapping, are published
            as the string 'None'.
    """
    # `logs` defaults to None; fall back to an empty mapping so the lookups
    # below do not raise AttributeError.
    logs = logs or {}
    publish({'accuracy': str(logs.get('acc')),
             'loss': str(logs.get('loss')),
             'validation_accuracy': str(logs.get('val_acc')),
             'validation_loss': str(logs.get('val_loss'))})