def _map_fun(args, ctx):
  import tensorflow as tf
  import time  # needed by the post-feed sleep below
  from tensorflowonspark import TFNode

  cluster, server = TFNode.start_cluster_server(ctx)
  if ctx.job_name == "ps":
    server.join()
  elif ctx.job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % ctx.task_index,
        cluster=cluster)):
      x = tf.placeholder(tf.int32, [None, 1])
      sq = tf.square(x)
      init_op = tf.global_variables_initializer()

    with tf.train.MonitoredTrainingSession(is_chief=(ctx.task_index == 0)) as sess:
      tf_feed = TFNode.DataFeed(ctx.mgr, False)
      while not sess.should_stop() and not tf_feed.should_stop():
        batch = tf_feed.next_batch(10)
        if len(batch) > 0:
          outputs = sess.run([sq], feed_dict={x: batch})
          tf_feed.batch_results(outputs[0])

    # simulate post-feed actions that raise an exception
    time.sleep(2)
    raise Exception("FAKE exception after feeding")
def _tf_train(args, ctx):
  """Basic linear regression in a distributed TF cluster using InputMode.TENSORFLOW"""
  import tensorflow as tf
  from tensorflowonspark import TFNode

  tf.reset_default_graph()  # reset graph in case we're re-using a Spark python worker

  cluster, server = TFNode.start_cluster_server(ctx)

  def _get_examples(batch_size):
    """Generate test data (mocking a queue_runner of file inputs)"""
    features = tf.random_uniform([batch_size, 2])  # (batch_size x 2)
    weights = tf.constant([[3.14], [1.618]])       # (2, 1)
    labels = tf.matmul(features, weights)
    return features, labels

  if ctx.job_name == "ps":
    server.join()
  elif ctx.job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % ctx.task_index,
        cluster=cluster)):
      x, y_ = _get_examples(10)  # no input placeholders, TF code reads (or in this case "generates") input
      w = tf.Variable(tf.truncated_normal([2, 1]), name='w')
      y = tf.matmul(x, w, name='y')
      global_step = tf.Variable(0)

      cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
      optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cost, global_step)

      init_op = tf.global_variables_initializer()
      saver = tf.train.Saver()

    sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0), init_op=init_op)
    step = 0
    with sv.managed_session(server.target) as sess:
      while not sv.should_stop() and step < args.steps:
        opt, weights, step = sess.run([optimizer, w, global_step])
        if (step % 100 == 0):
          print("step: {}, weights: {}".format(step, weights))

      if sv.is_chief:
        if args.model_dir:
          # manually save checkpoint
          ckpt_name = args.model_dir + "/model.ckpt"
          print("Saving checkpoint to: {}".format(ckpt_name))
          saver.save(sess, ckpt_name)

    sv.stop()
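# A minimal driver-side sketch for launching the map function above with
# InputMode.TENSORFLOW (workers read/generate their own input, no RDD feeding).
# Assumes `sc` (the SparkContext) and `args` already exist on the driver; the
# 3-executor / 1-ps sizing is illustrative, not taken from the source.
from tensorflowonspark import TFCluster

cluster = TFCluster.run(sc, _tf_train, args, 3, 1, False, TFCluster.InputMode.TENSORFLOW)
cluster.shutdown()  # blocks until all TF nodes exit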
def __call__(self, args, ctx):
  self.task_index = ctx.task_index
  self.job_name = ctx.job_name
  self.cluster, self.server = TFNode.start_cluster_server(ctx)
  self.tf_feed = TFNode.DataFeed(ctx.mgr)

  if ctx.job_name == "ps":
    self.server.join()
  elif ctx.job_name == "worker":
    self.build_model()
    self.execute()
def _spark_train(args, ctx):
  """Basic linear regression in a distributed TF cluster using InputMode.SPARK"""
  import tensorflow as tf
  from tensorflowonspark import TFNode

  tf.reset_default_graph()  # reset graph in case we're re-using a Spark python worker

  cluster, server = TFNode.start_cluster_server(ctx)
  if ctx.job_name == "ps":
    server.join()
  elif ctx.job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % ctx.task_index,
        cluster=cluster)):
      x = tf.placeholder(tf.float32, [None, 2], name='x')
      y_ = tf.placeholder(tf.float32, [None, 1], name='y_')
      w = tf.Variable(tf.truncated_normal([2, 1]), name='w')
      y = tf.matmul(x, w, name='y')
      y2 = tf.square(y, name="y2")  # extra/optional output for testing multiple output tensors
      cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
      optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cost)

      init_op = tf.global_variables_initializer()
      saver = tf.train.Saver()

    sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0), init_op=init_op)
    with sv.managed_session(server.target) as sess:
      tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
      while not sv.should_stop() and not tf_feed.should_stop():
        batch = tf_feed.next_batch(10)
        if args.input_mapping:
          if len(batch['x']) > 0:
            feed = {x: batch['x'], y_: batch['y_']}
            opt = sess.run(optimizer, feed_dict=feed)

      if sv.is_chief:
        if args.model_dir:
          # manually save checkpoint
          ckpt_name = args.model_dir + "/model.ckpt"
          print("Saving checkpoint to: {}".format(ckpt_name))
          saver.save(sess, ckpt_name)
        elif args.export_dir:
          # export a saved_model
          signatures = {
              'test_key': {
                  'inputs': {'features': x},
                  'outputs': {'prediction': y},
                  'method_name': 'test'
              }
          }
          TFNode.export_saved_model(sess,
                                    export_dir=args.export_dir,
                                    tag_set='test_tag',
                                    signatures=signatures)
        else:
          print("WARNING: model state not saved.")

    sv.stop()
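# Driver-side counterpart for the InputMode.SPARK snippet above: RDD partitions
# are pushed into each worker's TFNode.DataFeed. A hedged sketch; `sc`,
# `dataRDD` (rows matching args.input_mapping), and `args` are assumed to exist
# on the driver, and the cluster sizing is illustrative.
from tensorflowonspark import TFCluster

cluster = TFCluster.run(sc, _spark_train, args, 3, 1, False, TFCluster.InputMode.SPARK)
cluster.train(dataRDD)  # feed training data; workers consume it via tf_feed.next_batch()
cluster.shutdown()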
def __call__(self, args, ctx):
  self.task_index = ctx.task_index
  self.job_name = ctx.job_name
  self.cluster, self.server = TFNode.start_cluster_server(ctx)
  self.tf_feed = TFNode.DataFeed(ctx.mgr)

  if ctx.job_name == "ps":
    self.server.join()
  elif ctx.job_name == "worker":
    self.create_tmp_dir()
    self.process()
    self.delete_tmp_dir()
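# Both __call__ snippets above assume an enclosing class whose *instance* is
# handed to TFCluster.run as the map function. A hypothetical skeleton of that
# shape (method names taken from the snippets; bodies are placeholders):
from tensorflowonspark import TFNode

class WorkerJob(object):
  def build_model(self):
    pass  # construct the TF graph on the worker

  def execute(self):
    pass  # session loop that consumes self.tf_feed

  # __call__(self, args, ctx) as defined in the snippets above; the second
  # variant instead wraps execute() with create_tmp_dir()/delete_tmp_dir()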
def main_fun(argv, ctx):
  import tensorflow as tf

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  # ps/worker branching left disabled (quoted out) in the original:
  '''
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
  '''
  hello = tf.constant('Hello, TensorFlow!')
  sess = tf.Session()
  print(sess.run(hello))
def main_fun(argv, ctx):
  from src import facenet_distributed_train
  from src import vipus_distributed_train
  import sys

  job_name = ctx.job_name
  assert job_name in ['ps', 'worker'], 'job_name must be ps or worker'

  print("argv:", argv)
  sys.argv = argv

  cluster, server = TFNode.start_cluster_server(ctx, num_gpus=1)
  if job_name == 'ps':
    server.join()
  else:
    if argv.model == 'FACENET':
      facenet_distributed_train.train(server, ctx.cluster_spec, argv, ctx)
    elif argv.model == 'VIPUS':
      vipus_distributed_train.train(server, ctx.cluster_spec, argv, ctx)
def _map_fun(args, ctx): import tensorflow as tf cluster, server = TFNode.start_cluster_server(ctx) if ctx.job_name == "ps": server.join() elif ctx.job_name == "worker": with tf.device(tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % ctx.task_index, cluster=cluster)): x = tf.placeholder(tf.int32, [None, 1]) sq = tf.square(x) init_op = tf.global_variables_initializer() sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0), init_op=init_op) with sv.managed_session(server.target) as sess: tf_feed = TFNode.DataFeed(ctx.mgr, False) while not sv.should_stop() and not tf_feed.should_stop(): outputs = sess.run([sq], feed_dict={ x: tf_feed.next_batch(10) }) tf_feed.batch_results(outputs[0]) sv.stop()
def main_fun(argv, ctx):
  import sys  # used for sys.argv below; missing in the original
  import tensorflow as tf
  from inception import inception_eval
  from inception.imagenet_data import ImagenetData

  print("argv:", argv)
  sys.argv = argv

  FLAGS = tf.app.flags.FLAGS
  FLAGS._parse_flags()
  print("FLAGS:", FLAGS.__dict__['__flags'])

  dataset = ImagenetData(subset=FLAGS.subset)
  assert dataset.data_files()

  if tf.gfile.Exists(FLAGS.eval_dir):
    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  tf.gfile.MakeDirs(FLAGS.eval_dir)

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  inception_eval.evaluate(dataset)
def main_fun(argv, ctx):
  # extract node metadata from ctx
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  assert job_name in ['ps', 'worker'], 'job_name must be ps or worker'

  import sys  # used for sys.argv below; missing in the original
  from inception import inception_distributed_train
  from inception.imagenet_data import ImagenetData
  import tensorflow as tf

  # instantiate FLAGS on workers using argv from driver and add job_name and task_id
  print("argv:", argv)
  sys.argv = argv

  FLAGS = tf.app.flags.FLAGS
  FLAGS.job_name = job_name
  FLAGS.task_id = task_index
  print("FLAGS:", FLAGS.__dict__['__flags'])

  # Get TF cluster and server instances
  cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    # Only the chief checks for or creates train_dir.
    if FLAGS.task_id == 0:
      if not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_distributed_train.train(server.target, dataset, cluster_spec, ctx)
def _map_fun(args, ctx): import tensorflow as tf cluster, server = TFNode.start_cluster_server(ctx) if ctx.job_name == "ps": server.join() elif ctx.job_name == "worker": with tf.device( tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % ctx.task_index, cluster=cluster)): x = tf.placeholder(tf.int32, [None, 1]) sq = tf.square(x) init_op = tf.global_variables_initializer() sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0), init_op=init_op) with sv.managed_session(server.target) as sess: tf_feed = TFNode.DataFeed(ctx.mgr, False) while not sv.should_stop() and not tf_feed.should_stop(): outputs = sess.run( [sq], feed_dict={x: tf_feed.next_batch(10)}) tf_feed.batch_results(outputs[0]) sv.stop()
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import os
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec

  IMAGE_PIXELS = 28

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size = args.batch_size

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def writeFileToHDFS():
    # HdfsClient was used without an import in the original; assumes the pyhdfs package
    from pyhdfs import HdfsClient
    rootdir = '/tmp/mnist_model'
    client = HdfsClient(hosts='localhost:50070')
    client.mkdirs('/user/root/mnist_model')
    for parent, dirnames, filenames in os.walk(rootdir):
      for dirname in dirnames:
        print("parent is:{0}".format(parent))
      for filename in filenames:
        client.copy_from_local(os.path.join(parent, filename),
                               os.path.join('/user/root/mnist_model', filename),
                               overwrite=True)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    xs = xs / 255.0
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                             stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
      y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(0.01).minimize(loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    # logdir = TFNode.hdfs_path(ctx, args.model)
    logdir = "hdfs:///tmp/" + args.model
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               summary_writer=summary_writer,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step,
                                                         sess.run(accuracy, {x: batch_xs, y_: batch_ys})))
            if sv.is_chief:
              summary_writer.add_summary(summary, step)
          else:  # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)
            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p)
                       for l, p in zip(labels, preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()
        writeFileToHDFS()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import getpass
  import math
  import numpy
  import os
  import signal
  import tensorflow as tf
  import time

  IMAGE_PIXELS = 28

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec
  num_workers = len(cluster_spec['worker'])

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size = 100

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def read_csv_examples(image_dir, label_dir, batch_size=100, num_epochs=None, task_index=None, num_workers=None):
    # print_log is assumed to be defined at module scope in the original source
    print_log(worker_num, "num_epochs: {0}".format(num_epochs))

    # Setup queue of csv image filenames
    tf_record_pattern = os.path.join(image_dir, 'part-*')
    images = tf.gfile.Glob(tf_record_pattern)
    print_log(worker_num, "images: {0}".format(images))
    image_queue = tf.train.string_input_producer(images, shuffle=False, capacity=1000,
                                                 num_epochs=num_epochs, name="image_queue")

    # Setup queue of csv label filenames
    tf_record_pattern = os.path.join(label_dir, 'part-*')
    labels = tf.gfile.Glob(tf_record_pattern)
    print_log(worker_num, "labels: {0}".format(labels))
    label_queue = tf.train.string_input_producer(labels, shuffle=False, capacity=1000,
                                                 num_epochs=num_epochs, name="label_queue")

    # Setup reader for image queue
    img_reader = tf.TextLineReader(name="img_reader")
    _, img_csv = img_reader.read(image_queue)
    image_defaults = [[1.0] for col in range(784)]
    img = tf.stack(tf.decode_csv(img_csv, image_defaults))  # tf.pack was renamed tf.stack in TF 1.0

    # Normalize values to [0,1]
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(img, norm)
    print_log(worker_num, "image: {0}".format(image))

    # Setup reader for label queue
    label_reader = tf.TextLineReader(name="label_reader")
    _, label_csv = label_reader.read(label_queue)
    label_defaults = [[1.0] for col in range(10)]
    label = tf.stack(tf.decode_csv(label_csv, label_defaults))
    print_log(worker_num, "label: {0}".format(label))

    # Return a batch of examples
    return tf.train.batch([image, label], batch_size, num_threads=args.readers, name="batch_csv")

  def read_tfr_examples(path, batch_size=100, num_epochs=None, task_index=None, num_workers=None):
    print_log(worker_num, "num_epochs: {0}".format(num_epochs))

    # Setup queue of TFRecord filenames
    tf_record_pattern = os.path.join(path, 'part-*')
    files = tf.gfile.Glob(tf_record_pattern)
    queue_name = "file_queue"

    # split input files across workers, if specified
    if task_index is not None and num_workers is not None:
      num_files = len(files)
      files = files[task_index:num_files:num_workers]
      queue_name = "file_queue_{0}".format(task_index)
    print_log(worker_num, "files: {0}".format(files))

    file_queue = tf.train.string_input_producer(files, shuffle=False, capacity=1000,
                                                num_epochs=num_epochs, name=queue_name)

    # Setup reader for examples
    reader = tf.TFRecordReader(name="reader")
    _, serialized = reader.read(file_queue)
    feature_def = {'label': tf.FixedLenFeature([10], tf.int64),
                   'image': tf.FixedLenFeature([784], tf.int64)}
    features = tf.parse_single_example(serialized, feature_def)
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(tf.to_float(features['image']), norm)
    print_log(worker_num, "image: {0}".format(image))
    label = tf.to_float(features['label'])
    print_log(worker_num, "label: {0}".format(label))

    # Return a batch of examples
    return tf.train.batch([image, label], batch_size, num_threads=args.readers, name="batch")

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                             stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # Placeholders or QueueRunner/Readers for input data
      num_epochs = 1 if args.mode == "inference" else None if args.epochs == 0 else args.epochs
      index = task_index if args.mode == "inference" else None
      workers = num_workers if args.mode == "inference" else None

      if args.format == "csv":
        images = TFNode.hdfs_path(ctx, args.images)
        labels = TFNode.hdfs_path(ctx, args.labels)
        x, y_ = read_csv_examples(images, labels, 100, num_epochs, index, workers)
      elif args.format == "tfr":
        images = TFNode.hdfs_path(ctx, args.images)
        x, y_ = read_tfr_examples(images, 100, num_epochs, index, workers)
      else:
        # the original raised a bare string, which is invalid in Python 3
        raise ValueError("{0} format not supported for tf input mode".format(args.format))

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(0.01).minimize(loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    output_dir = TFNode.hdfs_path(ctx, args.output)
    output_file = tf.gfile.Open("{0}/part-{1:05d}".format(output_dir, worker_num), mode='w')

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      count = 0
      while not sv.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using QueueRunners/Readers
        if args.mode == "train":
          if (step % 100 == 0):
            print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy)))
          _, summary, step = sess.run([train_op, summary_op, global_step])
          if sv.is_chief:
            summary_writer.add_summary(summary, step)
        else:  # args.mode == "inference"
          labels, pred, acc = sess.run([label, prediction, accuracy])
          # print("label: {0}, pred: {1}".format(labels, pred))
          print("acc: {0}".format(acc))
          for i in range(len(labels)):
            count += 1
            output_file.write("{0} {1}\n".format(labels[i], pred[i]))
          print("count: {0}".format(count))

      if args.mode == "inference":
        output_file.close()
        # Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
        # run inference and request stop before the other workers even start/sync their sessions.
        if task_index == 0:
          time.sleep(60)

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def main_fun(args, ctx):
  # ctx - node metadata like job_name, task_id
  import sys  # used for sys.path below; missing in the original
  main_path = args.main_path
  sys.path.append(main_path + "/CatDog-CNN-Tensorflow-OnSpark/")

  import tensorflow as tf
  import tensorflowonspark
  import conv_net
  import utils
  import datetime
  from image_op import get_tensor

  tf.app.flags.DEFINE_string('train_dir', main_path + '/data_catsdogs/train',
                             """Directory with training images """)
  tf.app.flags.DEFINE_string('checkpoint_path', main_path + 'checkpoints/catdog_spark',
                             """Directory with checkpoints """)
  tf.app.flags.DEFINE_string('graph_path', main_path + 'graphs/catdog_spark',
                             """Directory with graphs """)
  FLAGS = tf.app.flags.FLAGS

  cluster, server = TFNode.start_cluster_server(ctx)
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  log_file = main_path + "log_spark.txt"
  n_epoch = int(args.n_epoch)
  dataset_size = int(args.dataset_size)
  batch_size = int(args.batch_size)

  # NOTE: num_executors is not defined in this function; in the original source
  # it must be supplied by the enclosing module scope.
  model = conv_net.CatDogConvNet(FLAGS.checkpoint_path, FLAGS.graph_path,
                                 dataset_size=dataset_size, batch_size=batch_size,
                                 num_workers=num_executors, task_index=task_index,
                                 ctx=ctx, server=server, worker=worker_num)
  model.training_folder = FLAGS.train_dir
  model.log_file = main_path + "log_spark.txt"

  print('building a model')
  utils.write_log(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), 'Testing the model ...', log_file)

  with tf.name_scope('data'):
    # path, train_size, test_size, batch_size, desired_shape=300
    train_data, test_data = get_tensor(model.training_folder,
                                       int(model.dataset_size * (1 - model.test_percent)),
                                       int(model.dataset_size * model.test_percent),
                                       model.batch_size,
                                       desired_shape=model.desired_shape,
                                       num_workers=model.num_workers,
                                       task_index=model.task_index)
    # train_data = train_data.repeat(FLAGS.n_epoch+1)
    iterator = tf.data.Iterator.from_structure(train_data.output_types, train_data.output_shapes)
    img, model.label = iterator.get_next()

    # reshape the image to make it work with tf.nn.conv2d:
    img = tf.reshape(img, shape=[-1, model.desired_shape, model.desired_shape, 1])
    model.img = tf.cast(img, tf.float32)

    model.train_init = iterator.make_initializer(train_data)  # initializer for train_data
    model.test_init = iterator.make_initializer(test_data)    # initializer for test_data

  model.build()
  print('testing')
  model.eval_accuracy_spark()
def map_fun(args, ctx):
  # from com.yahoo.ml.tf import TFNode
  from tensorflowonspark import TFNode
  from datetime import datetime
  import logging
  import math
  import numpy
  import tensorflow as tf
  import time

  # the original used `log` without defining it; set up a logger here
  logging.basicConfig(level=logging.INFO)
  log = logging.getLogger(__name__)

  worker_num = ctx.worker_num      # number of workers
  job_name = ctx.job_name          # job name
  task_index = ctx.task_index      # task index
  cluster_spec = ctx.cluster_spec  # cluster spec

  IMAGE_PIXELS = 10  # image size; mnist is 28x28x1 (adjust to your own image size)
  channels = 4
  num_class = 2

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":  # ps (parameter server) node
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128            # NN hidden-layer units
  batch_size = args.batch_size  # samples per training batch

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    xs = xs / 255.0  # normalize the data
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # ----------- plain NN model (replace with your own model) -----------
      '''
      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      # tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                             stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      # tf.summary.histogram("softmax_weights", sm_w)
      '''

      # Create some wrappers for simplicity
      def conv2d(x, W, b, strides=1):
        # Conv2D wrapper, with bias and relu activation
        x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
        x = tf.nn.bias_add(x, b)  # middle strides of 1 mean no skipping in the x,y directions
        return tf.nn.relu(x)

      def maxpool2d(x, k=2):
        # MaxPool2D wrapper
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='SAME')  # middle strides of 2 downsample x,y by 2

      # Store layers weight & bias
      weights = {
          # 5x5 conv, `channels` inputs, 32 outputs (color images have 3 input channels, grayscale has 1)
          'wc1': tf.Variable(tf.random_normal([5, 5, channels, 32])),  # 5x5 convolution kernel
          # 5x5 conv, 32 inputs, 64 outputs
          'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
          # fully connected, 7*7*64 inputs, 1024 outputs
          'wd1': tf.Variable(tf.random_normal([(1 + IMAGE_PIXELS // 4) * (1 + IMAGE_PIXELS // 4) * 64, 1024])),
          # 1024 inputs, 10 outputs (class prediction)
          'out': tf.Variable(tf.random_normal([1024, num_class]))
      }

      biases = {
          'bc1': tf.Variable(tf.random_normal([32])),
          'bc2': tf.Variable(tf.random_normal([64])),
          'bd1': tf.Variable(tf.random_normal([1024])),
          'out': tf.Variable(tf.random_normal([num_class]))
      }

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * channels], name="x")  # mnist 28*28*1
      y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])  # mnist data is 28x28x1 (grayscale, 1 band)
      # tf.summary.image("x_img", x_img)

      # convolutional model
      conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
      conv1 = maxpool2d(conv1, k=2)
      conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
      conv2 = maxpool2d(conv2, k=2)
      fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
      fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
      fc1 = tf.nn.relu(fc1)
      if args.mode == "train":
        fc1 = tf.nn.dropout(fc1, 0.7)
      y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])

      '''
      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)  # tf.nn.add(tf.nn.matmul(x,hid_w),hid_b)
      hid = tf.nn.relu(hid_lin)
      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
      '''

      # global_step = tf.Variable(0)
      global_step = tf.Variable(0, name="global_step", trainable=False)

      # loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
      # tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(0.01).minimize(loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      # tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      # summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()
      # ----------- the model above can be replaced with your own -----------

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    # log.info("tensorflow model path: {0}".format(logdir))
    # summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:  # open the session
      print("{0} session ready".format(datetime.now().isoformat()))
      log.info("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            # _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            _, step = sess.run([train_op, global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step,
                                                         sess.run(accuracy, {x: batch_xs, y_: batch_ys})))
              log.info("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step,
                                                            sess.run(accuracy, {x: batch_xs, y_: batch_ys})))
            if sv.is_chief:
              pass  # summary_writer.add_summary(summary, step)
          else:  # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)
            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p)
                       for l, p in zip(labels, preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))
            log.info("acc: {0}".format(acc))

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    log.info("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def _spark_train(args, ctx):
  """Basic linear regression in a distributed TF cluster using InputMode.SPARK"""
  import tensorflow as tf
  from tensorflowonspark import TFNode
  from datetime import datetime  # used by ExportHook; missing in the original

  class ExportHook(tf.train.SessionRunHook):
    def __init__(self, export_dir, input_tensor, output_tensor):
      self.export_dir = export_dir
      self.input_tensor = input_tensor
      self.output_tensor = output_tensor

    def end(self, session):
      print("{} ======= Exporting to: {}".format(datetime.now().isoformat(), self.export_dir))
      signatures = {
          "test_key": {
              'inputs': {'features': self.input_tensor},
              'outputs': {'prediction': self.output_tensor},
              'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME
          }
      }
      TFNode.export_saved_model(session, self.export_dir, "test_tag", signatures)
      print("{} ======= Done exporting".format(datetime.now().isoformat()))

  tf.reset_default_graph()  # reset graph in case we're re-using a Spark python worker

  cluster, server = TFNode.start_cluster_server(ctx)
  if ctx.job_name == "ps":
    server.join()
  elif ctx.job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % ctx.task_index,
        cluster=cluster)):
      x = tf.placeholder(tf.float32, [None, 2], name='x')
      y_ = tf.placeholder(tf.float32, [None, 1], name='y_')
      w = tf.Variable(tf.truncated_normal([2, 1]), name='w')
      y = tf.matmul(x, w, name='y')
      y2 = tf.square(y, name="y2")  # extra/optional output for testing multiple output tensors
      global_step = tf.train.get_or_create_global_step()
      cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
      optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cost, global_step)

    chief_hooks = [ExportHook(ctx.absolute_path(args.export_dir), x, y)] if args.export_dir else []
    with tf.train.MonitoredTrainingSession(master=server.target,
                                           is_chief=(ctx.task_index == 0),
                                           checkpoint_dir=args.model_dir,
                                           chief_only_hooks=chief_hooks) as sess:
      tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
      while not sess.should_stop() and not tf_feed.should_stop():
        batch = tf_feed.next_batch(10)
        if args.input_mapping:
          if len(batch['x']) > 0:
            feed = {x: batch['x'], y_: batch['y_']}
            sess.run(optimizer, feed_dict=feed)
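# A minimal sketch of consuming the saved_model written by ExportHook, using
# the TF 1.x loader API. `export_dir` is assumed to hold the same path passed
# to ctx.absolute_path(args.export_dir) above; the tensor names 'x:0'/'y:0'
# come from the graph definitions in the snippet.
import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
  tf.saved_model.loader.load(sess, ["test_tag"], export_dir)
  x = sess.graph.get_tensor_by_name("x:0")
  y = sess.graph.get_tensor_by_name("y:0")
  print(sess.run(y, feed_dict={x: [[1.0, 2.0]]}))  # predict for one 2-feature row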
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time
  import logging
  import cnn_lstm_ctc_ocr
  #import redis_logger_handler

  #redis_logger_handler.logging_setup(args.redis)

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec
  worker_name = '(worker:%s tf:%s idx:%s)' % (worker_num, job_name, task_index)

  logging.info('{0} batch_size:{1} initial_learning_rate:{2} decay_steps:{3} decay_rate:{4} momentum:{5}'
               .format(worker_name, args.batch_size, args.initial_learning_rate,
                       args.decay_steps, args.decay_rate, args.momentum))

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  CHANNELS = 1
  IMAGE_WIDTH = 120
  IMAGE_HEIGHT = 45

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def sparse_tuple_from_label(sequences, dtype=numpy.int32):
    indices = []
    values = []
    for n, seq in enumerate(sequences):
      indices.extend(zip([n] * len(seq), range(len(seq))))
      values.extend(seq)
    indices = numpy.asarray(indices, dtype=numpy.int64)
    values = numpy.asarray(values, dtype=dtype)
    shape = numpy.asarray([len(sequences), numpy.asarray(indices).max(0)[1] + 1], dtype=numpy.int64)
    return indices, values, shape

  def get_input_lens(sequences):
    lengths = numpy.asarray([58 for s in sequences], dtype=numpy.int64)
    return sequences, lengths

  def placeholder_inputs(image_width, image_height, channels):
    images_placeholder = tf.placeholder(tf.float32, [None, image_height, image_width, channels])
    labels_placeholder = tf.sparse_placeholder(tf.int32)
    seqlen_placeholder = tf.placeholder(tf.int32, [None])
    keep_prob = tf.placeholder(tf.float32)
    return images_placeholder, labels_placeholder, seqlen_placeholder, keep_prob

  def format_batch(data_set, batch_size, image_height, image_width, channels):
    batch = data_set.next_batch(batch_size)
    images = []
    labels = []
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    # [batch_size, height * width] => [batch_size, height, width, channels]
    xs = xs.reshape(batch_size, image_height, image_width, channels)
    xs = xs.astype(numpy.float32)
    xs = xs / 255.
    ys = labels
    return xs, ys

  def fill_feed_dict(xs, ys, images_pl, labels_pl, seqlen_pl, keep_prob, train=True):
    images_feed, seqlen_feed = get_input_lens(xs)
    labels_feed = sparse_tuple_from_label(ys)
    if train:
      feed_dict = {
          images_pl: images_feed,
          labels_pl: labels_feed,
          seqlen_pl: seqlen_feed,
          keep_prob: 0.5,
      }
    else:
      feed_dict = {
          images_pl: images_feed,
          labels_pl: labels_feed,
          seqlen_pl: seqlen_feed,
          keep_prob: 1,
      }
    return feed_dict

  def do_eval(sess, dense_decoded, lastbatch_err, learning_rate, images_placeholder,
              labels_placeholder, seqlen_placeholder, keep_prob, train, xs, ys):
    true_count = 0  # Counts the number of correct predictions.
    feed_dict = fill_feed_dict(xs, ys, images_placeholder, labels_placeholder,
                               seqlen_placeholder, keep_prob, train)
    dd, lerr, lr = sess.run([dense_decoded, lastbatch_err, learning_rate], feed_dict=feed_dict)

    # accuracy calculation
    for i, origin_label in enumerate(ys):
      decoded_label = [j for j in dd[i] if j != -1]
      if i < 10:
        logging.info('{0} seq {1} => origin:{2} decoded:{3}'.format(worker_name, i, origin_label, decoded_label))
      if origin_label == decoded_label:
        true_count += 1

    # accuracy
    acc = true_count * 1.0 / len(ys)

    # print subsummary
    logging.info("%s accuracy = %.3f, lastbatch_err = %.3f, learning_rate = %.8f"
                 % (worker_name, acc, lerr, lr))

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Generate placeholders for the images, labels and seqlens.
      images_placeholder, labels_placeholder, seqlen_placeholder, keep_prob = placeholder_inputs(
          IMAGE_WIDTH, IMAGE_HEIGHT, CHANNELS)

      # Build a Graph that computes predictions from the inference model.
      # images_lp, seqlen_lp, num_features, num_layers, hidden_units
      logits = cnn_lstm_ctc_ocr.inference(images_placeholder, seqlen_placeholder, keep_prob,
                                          args.hidden_units, args.mode, args.batch_size)

      # Add to the Graph the Ops for loss calculation.
      # logits, labels_lp, seqlen_lp
      loss = cnn_lstm_ctc_ocr.loss(logits, labels_placeholder, seqlen_placeholder)
      tf.summary.scalar('loss', loss)

      # global counter
      global_step = tf.Variable(0, name='global_step', trainable=False)

      # Add to the Graph the Ops that calculate and apply gradients.
      # loss, initial_learning_rate, decay_steps, decay_rate, momentum
      train_op, learning_rate = cnn_lstm_ctc_ocr.training(loss, global_step,
                                                          args.initial_learning_rate,
                                                          args.decay_steps,
                                                          args.decay_rate,
                                                          args.momentum)

      # Add the Op to compare the logits to the labels during evaluation.
      dense_decoded, lerr = cnn_lstm_ctc_ocr.evaluation(logits, labels_placeholder, seqlen_placeholder)
      tf.summary.scalar('lerr', lerr)

      summary_op = tf.summary.merge_all()

      # Add the variable initializer Op.
      init_op = tf.global_variables_initializer()

      # Create a saver for writing training checkpoints.
      saver = tf.train.Saver()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    logging.info("{0} tensorflow model path: {1}".format(worker_name, logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=60)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    validation_xs = None
    validation_ys = None
    validation_batchs = 10
    with sv.managed_session(server.target) as sess:
      logging.info("{0} session ready".format(worker_name))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      g_step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")

      # for do_eval samples
      if validation_xs is None or validation_ys is None:
        validation_xs, validation_ys = format_batch(tf_feed, args.batch_size * validation_batchs,
                                                    IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS)

      while not sv.should_stop() and not tf_feed.should_stop() \
          and g_step < (args.steps * args.epochs - validation_batchs):
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.
        start_time = time.time()

        # using feed_dict
        xs, ys = format_batch(tf_feed, args.batch_size, IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS)
        feed_dict = fill_feed_dict(xs, ys, images_placeholder, labels_placeholder,
                                   seqlen_placeholder, keep_prob, args.mode == "train")

        # Run one step of the model.  The return values are the activations
        # from the `train_op` (which is discarded) and the `loss` Op.  To
        # inspect the values of your Ops or variables, you may include them
        # in the list passed to sess.run() and the value tensors will be
        # returned in the tuple from the call.
        _, loss_value, g_step = sess.run([train_op, loss, global_step], feed_dict=feed_dict)
        duration = time.time() - start_time

        if g_step % 20 == 0:
          # Print status to stdout.
          logging.info('%s [g_step:%d epoch:%d/%d step:%d/%d] loss = %.2f (%.3f sec)'
                       % (worker_name, g_step, g_step / args.steps, args.epochs,
                          g_step % args.steps, args.steps, loss_value, duration))

        # Write the summaries and print an overview fairly often.
        if g_step % 100 == 0:
          # Update the events file.
          if sv.is_chief:
            summary = sess.run(summary_op, feed_dict=feed_dict)
            summary_writer.add_summary(summary, g_step)
            summary_writer.flush()

        # Save a checkpoint and evaluate the model periodically.
        if (g_step + 1) % 500 == 0 or (g_step + 1) == args.steps:
          # Evaluate against the validation set.
          logging.info('{0} ---- Validation Data Eval: ----'.format(worker_name))
          do_eval(sess, dense_decoded, lerr, learning_rate, images_placeholder,
                  labels_placeholder, seqlen_placeholder, keep_prob,
                  args.mode == "train", validation_xs, validation_ys)

      if sv.should_stop() or g_step >= (args.steps * args.epochs - validation_batchs):
        logging.info("{0} terminating tf_feed".format(worker_name))
        tf_feed.terminate()

    # Ask for all the services to stop.
    logging.info("{0} stopping supervisor".format(worker_name))
    sv.stop()
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  # Parameters
  IMAGE_PIXELS = 28
  hidden_units = 128

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  # Create generator for Spark data feed
  tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")

  def rdd_generator():
    while not tf_feed.should_stop():
      batch = tf_feed.next_batch(1)[0]
      image = numpy.array(batch[0])
      image = image.astype(numpy.float32) / 255.0
      label = numpy.array(batch[1])
      # must match the tf.float32 output type declared below; the original cast to int64
      label = label.astype(numpy.float32)
      yield (image, label)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Dataset for input data
      ds = tf.data.Dataset.from_generator(rdd_generator,
                                          (tf.float32, tf.float32),
                                          (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]),
                                           tf.TensorShape([10]))).batch(args.batch_size)
      iterator = ds.make_one_shot_iterator()
      x, y_ = iterator.get_next()

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                             stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # # Placeholders or QueueRunner/Readers for input data
      # x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
      # y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(0.01).minimize(loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % worker_num, graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.
        if args.mode == "train":
          _, summary, step = sess.run([train_op, summary_op, global_step])
          # print accuracy and save model checkpoint to HDFS every 100 steps
          if (step % 100 == 0):
            print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy)))
          if sv.is_chief:
            summary_writer.add_summary(summary, step)
        else:  # args.mode == "inference"
          labels, preds, acc = sess.run([label, prediction, accuracy])
          results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p)
                     for l, p in zip(labels, preds)]
          tf_feed.batch_results(results)
          print("acc: {0}".format(acc))

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import os
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  # Parameters
  IMAGE_PIXELS = 28
  hidden_units = 128

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def _parse_csv(ln):
    splits = tf.string_split([ln], delimiter='|')
    lbl = splits.values[0]
    img = splits.values[1]
    image_defaults = [[0.0] for col in range(IMAGE_PIXELS * IMAGE_PIXELS)]
    image = tf.stack(tf.decode_csv(img, record_defaults=image_defaults))
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    normalized_image = tf.div(image, norm)
    label_value = tf.string_to_number(lbl, tf.int32)
    label = tf.one_hot(label_value, 10)
    return (normalized_image, label, label_value)

  def _parse_tfr(example_proto):
    print("example_proto: {}".format(example_proto))
    feature_def = {"label": tf.FixedLenFeature(10, tf.int64),
                   "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)}
    features = tf.parse_single_example(example_proto, feature_def)
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(tf.to_float(features['image']), norm)
    label = tf.to_float(features['label'])
    return (image, label)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Dataset for input data
      image_dir = TFNode.hdfs_path(ctx, args.images)
      file_pattern = os.path.join(image_dir, 'part-*')
      files = tf.gfile.Glob(file_pattern)

      if args.format == 'tfr':
        # TFRecord files need a TFRecordDataset; the original built a
        # TextLineDataset for both formats, which cannot parse TFRecords.
        ds = tf.data.TFRecordDataset(files).map(_parse_tfr).batch(args.batch_size)
        iterator = ds.make_initializable_iterator()
        x, y_ = iterator.get_next()
        y_val = tf.argmax(y_, 1)  # _parse_tfr yields no raw label column, so derive it
      else:
        ds = tf.data.TextLineDataset(files).map(_parse_csv).batch(args.batch_size)
        iterator = ds.make_initializable_iterator()
        x, y_, y_val = iterator.get_next()

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                             stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(0.01).minimize(loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % worker_num, graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    output_dir = TFNode.hdfs_path(ctx, args.output)
    tf.gfile.MkDir(output_dir)
    output_file = tf.gfile.Open("{0}/part-{1:05d}".format(output_dir, worker_num), mode='w')

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      sess.run(iterator.initializer)
      step = 0
      count = 0
      while not sv.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using QueueRunners/Readers
        if args.mode == "train":
          if (step % 100 == 0):
            print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy)))
          _, summary, step, yv = sess.run([train_op, summary_op, global_step, y_val])
          # print("yval: {}".format(yv))
          if sv.is_chief:
            summary_writer.add_summary(summary, step)
        else:  # args.mode == "inference"
          labels, pred, acc = sess.run([label, prediction, accuracy])
          # print("label: {0}, pred: {1}".format(labels, pred))
          print("acc: {0}".format(acc))
          for i in range(len(labels)):
            count += 1
            output_file.write("{0} {1}\n".format(labels[i], pred[i]))
          print("count: {0}".format(count))

      if args.mode == "inference":
        output_file.close()
        # Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
        # run inference and request stop before the other workers even start/sync their sessions.
        if task_index == 0:
          time.sleep(60)

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time
  import re

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec

  NUM_CLASSES = 100
  IMAGE_PIXELS = 32
  NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
  NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
  LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
  INITIAL_LEARNING_RATE = 0.1       # Initial learning rate.
  TOWER_NAME = 'tower'

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size = args.batch_size

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    xs = xs / 255.0
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):
      print("In a TFCluster.")
      # global_step = tf.train.get_or_create_global_step()

      # Input placeholders
      with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * 3], name='x-input')
        y_ = tf.placeholder(tf.float32, [None, 100], name='y-input')

      images = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 3])
      print(images.shape)
      tf.summary.image('input', images, 10)

      def _activation_summary(x):
        """Helper to create summaries for activations.

        Creates a summary that provides a histogram of activations.
        Creates a summary that measures the sparsity of activations.

        Args:
          x: Tensor
        Returns:
          nothing
        """
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        # session. This helps the clarity of presentation on tensorboard.
        tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
        tf.summary.histogram(tensor_name + '/activations', x)
        tf.summary.scalar(tensor_name + '/sparsity', tf.nn.zero_fraction(x))

      def _variable_on_cpu(name, shape, initializer):
        """Helper to create a Variable stored on CPU memory.

        Args:
          name: name of the variable
          shape: list of ints
          initializer: initializer for Variable
        Returns:
          Variable Tensor
        """
        with tf.device('/cpu:0'):
          dtype = tf.float32
          var = tf.get_variable(name, shape, initializer=initializer, dtype=dtype)
        return var

      def _variable_with_weight_decay(name, shape, stddev, wd):
        """Helper to create an initialized Variable with weight decay.

        Note that the Variable is initialized with a truncated normal distribution.
        A weight decay is added only if one is specified.

        Args:
          name: name of the variable
          shape: list of ints
          stddev: standard deviation of a truncated Gaussian
          wd: add L2Loss weight decay multiplied by this float. If None, weight
              decay is not added for this Variable.
        Returns:
          Variable Tensor
        """
        dtype = tf.float32
        var = _variable_on_cpu(name, shape,
                               tf.truncated_normal_initializer(stddev=stddev, dtype=dtype))
        if wd is not None:
          weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
          tf.add_to_collection('losses', weight_decay)
        return var

      with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights', shape=[5, 5, 3, 256], stddev=5e-2, wd=0.0)
        conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [256], tf.constant_initializer(0.0))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)

      # pool1
      pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                             padding='SAME', name='pool1')
      # norm1
      norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm1')

      # conv2
      with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights', shape=[5, 5, 256, 128], stddev=5e-2, wd=0.0)
        conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [128], tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv2)

      # norm2
      norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm2')
      # pool2
      pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                             padding='SAME', name='pool2')

      # local3
      with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        reshape = tf.contrib.layers.flatten(pool2)
        dim = reshape.get_shape()[1].value
        weights = _variable_with_weight_decay('weights', shape=[dim, 1024], stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [1024], tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
        _activation_summary(local3)

      # local4
      with tf.variable_scope('local4') as scope:
        weights = _variable_with_weight_decay('weights', shape=[1024, 256], stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [256], tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name)
        _activation_summary(local4)

      # linear layer(WX + b),
      # We don't apply softmax here because
      # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
      # and performs the softmax internally for efficiency.
      with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights', [256, NUM_CLASSES], stddev=1 / 256.0, wd=0.0)
        biases = _variable_on_cpu('biases', [NUM_CLASSES], tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases, name=scope.name)
        _activation_summary(softmax_linear)

      logits = softmax_linear

      # Calculate the average cross entropy loss across the batch.
      # labels = tf.reshape(y_, [100, 10])
      print(y_.shape)
      print(logits.shape)
      labels = tf.cast(y_, tf.int64)
      cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
          labels=labels, logits=logits, name='cross_entropy_per_example')
      cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
      tf.add_to_collection('losses', cross_entropy_mean)

      # The total loss is defined as the cross entropy loss plus all of the weight
      # decay terms (L2 loss).
      total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')

      global_step = tf.Variable(0)
      inc = tf.assign_add(global_step, 1, name='increment')

      # num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size
      # decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

      # Decay the learning rate exponentially based on the number of steps.
      # lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
      #                                 global_step,
      #                                 decay_steps,
      #                                 LEARNING_RATE_DECAY_FACTOR,
      #                                 staircase=True)
      # tf.summary.scalar('learning_rate', lr)

      train_step = tf.train.AdamOptimizer(1e-4).minimize(total_loss)

      correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(y_, 1))
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(logits, 1, name="prediction")

      ##########################################################

      # Merge all the summaries and write them out to
      # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default)
      merged = tf.summary.merge_all()
      saver = tf.train.Saver()  # restored: the inference Supervisor below needs `saver`
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    # logdir = TFNode.hdfs_path(ctx, args.model)
    logdir = "/tmp/" + args.model
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               summary_writer=summary_writer,
                               global_step=global_step,
                               stop_grace_secs=300,
                               saver=None
                               # save_model_secs=10
                               )
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = -1
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      tf_feed_test = TFNode.DataFeed(ctx.mgr, args.mode != "train")
      while step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.
# print (args.steps) # print (sv.should_stop()) # print (tf_feed.should_stop()) step = step + 1 # print (step) temp = sess.run(global_step) # print (temp) # using feed_dict batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size)) test_xs, test_ys = feed_dict( tf_feed_test.next_batch(batch_size)) feed = {x: batch_xs, y_: batch_ys} # print (len(batch_xs) > 0) if len(batch_xs) > 0: if args.mode == "train": summary, _, _ = sess.run([merged, train_step, inc], feed_dict=feed) # print accuracy and save model checkpoint to HDFS every 100 steps if (step % 100 == 0): labels, preds, acc = sess.run( [label, prediction, accuracy], feed_dict={ x: test_xs, y_: test_ys }) for l, p in zip(labels, preds): print( "{0} step: {1} accuracy: {2}, Label: {3}, Prediction: {4}" .format(datetime.now().isoformat(), temp, acc, l, p)) # results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)] # tf_feed.batch_results(results) if sv.is_chief: summary_writer.add_summary(summary, step) else: # args.mode == "inference" labels, preds, acc = sess.run( [label, prediction, accuracy], feed_dict=feed) results = [ "{0} Label: {1}, Prediction: {2}".format( datetime.now().isoformat(), l, p) for l, p in zip(labels, preds) ] tf_feed.batch_results(results) print("acc: {0}".format(acc)) if sv.should_stop() or step >= args.steps: tf_feed.terminate() # Ask for all the services to stop. print("{0} stopping supervisor".format(datetime.now().isoformat())) sv.stop()
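# A minimal sketch of the DataFeed shutdown pattern used above: once the step
# budget is exhausted, the worker calls terminate() so Spark stops queueing
# partitions for it. FakeFeed is a hypothetical stand-in exposing only the
# three TFNode.DataFeed methods the loop relies on.
class FakeFeed(object):
    def __init__(self, batches):
        self.batches = list(batches)
        self.terminated = False

    def should_stop(self):
        return self.terminated

    def next_batch(self, n):
        return self.batches.pop(0) if self.batches else []

    def terminate(self):
        self.terminated = True

feed = FakeFeed([[1, 2], [3, 4], [5, 6]])
step, max_steps = 0, 2
while not feed.should_stop() and step < max_steps:
    batch = feed.next_batch(2)
    step += 1
if feed.should_stop() or step >= max_steps:
    feed.terminate()
print(feed.should_stop())  # True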
def main_fun(argv, ctx): import tensorflow as tf import cifar10 sys.argv = argv FLAGS = tf.app.flags.FLAGS tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train', """Directory where to write event logs """ """and checkpoint.""") tf.app.flags.DEFINE_integer('max_steps', 1000000, """Number of batches to run.""") tf.app.flags.DEFINE_boolean('log_device_placement', False, """Whether to log device placement.""") tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""") # cifar10.maybe_download_and_extract() if tf.gfile.Exists(FLAGS.train_dir): tf.gfile.DeleteRecursively(FLAGS.train_dir) tf.gfile.MakeDirs(FLAGS.train_dir) cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma) # Train CIFAR-10 for a number of steps. with tf.Graph().as_default(): global_step = tf.contrib.framework.get_or_create_global_step() # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) class _LoggerHook(tf.train.SessionRunHook): """Logs loss and runtime.""" def begin(self): self._step = -1 def before_run(self, run_context): self._step += 1 self._start_time = time.time() return tf.train.SessionRunArgs(loss) # Asks for loss value. def after_run(self, run_context, run_values): duration = time.time() - self._start_time loss_value = run_values.results if self._step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch)) with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.train_dir, hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps), tf.train.NanTensorHook(loss), _LoggerHook()], config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) as mon_sess: while not mon_sess.should_stop(): mon_sess.run(train_op)
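# The _LoggerHook above converts per-step wall-clock time into throughput;
# the same arithmetic on made-up numbers (batch size and duration here are
# illustrative, not measured):
batch_size = 128
duration = 0.25  # seconds spent in one mon_sess.run(train_op)
examples_per_sec = batch_size / duration
sec_per_batch = float(duration)
print('%.1f examples/sec; %.3f sec/batch' % (examples_per_sec, sec_per_batch))
# 512.0 examples/sec; 0.250 sec/batch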
def main_fun(argv, ctx): import tensorflow as tf import cifar10 sys.argv = argv FLAGS = tf.app.flags.FLAGS tf.app.flags.DEFINE_string('eval_dir', '/tmp/cifar10_eval', """Directory where to write event logs.""") tf.app.flags.DEFINE_string('eval_data', 'test', """Either 'test' or 'train_eval'.""") tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train', """Directory where to read model checkpoints.""") tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5, """How often to run the eval.""") tf.app.flags.DEFINE_integer('num_examples', 10000, """Number of examples to run.""") tf.app.flags.DEFINE_boolean('run_once', False, """Whether to run eval only once.""") tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""") cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma) def eval_once(saver, summary_writer, top_k_op, summary_op): """Run Eval once. Args: saver: Saver. summary_writer: Summary writer. top_k_op: Top K op. summary_op: Summary op. """ with tf.Session() as sess: ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) if ckpt and ckpt.model_checkpoint_path: # Restores from checkpoint saver.restore(sess, ckpt.model_checkpoint_path) # Assuming model_checkpoint_path looks something like: # /my-favorite-path/cifar10_train/model.ckpt-0, # extract global_step from it. global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1] else: print('No checkpoint file found') return # Start the queue runners. coord = tf.train.Coordinator() try: threads = [] for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS): threads.extend(qr.create_threads(sess, coord=coord, daemon=True, start=True)) num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size)) true_count = 0 # Counts the number of correct predictions. total_sample_count = num_iter * FLAGS.batch_size step = 0 while step < num_iter and not coord.should_stop(): predictions = sess.run([top_k_op]) true_count += np.sum(predictions) step += 1 # Compute precision @ 1. precision = true_count / total_sample_count print('%s: precision @ 1 = %.3f' % (datetime.now(), precision)) summary = tf.Summary() summary.ParseFromString(sess.run(summary_op)) summary.value.add(tag='Precision @ 1', simple_value=precision) summary_writer.add_summary(summary, global_step) except Exception as e: # pylint: disable=broad-except coord.request_stop(e) coord.request_stop() coord.join(threads, stop_grace_period_secs=10) def evaluate(): """Eval CIFAR-10 for a number of steps.""" with tf.Graph().as_default() as g: # Get images and labels for CIFAR-10. eval_data = FLAGS.eval_data == 'test' images, labels = cifar10.inputs(eval_data=eval_data) # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate predictions. top_k_op = tf.nn.in_top_k(logits, labels, 1) # Restore the moving average version of the learned variables for eval. variable_averages = tf.train.ExponentialMovingAverage( cifar10.MOVING_AVERAGE_DECAY) variables_to_restore = variable_averages.variables_to_restore() saver = tf.train.Saver(variables_to_restore) # Build the summary operation based on the TF collection of Summaries. 
summary_op = tf.summary.merge_all() summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g) while True: eval_once(saver, summary_writer, top_k_op, summary_op) if FLAGS.run_once: break time.sleep(FLAGS.eval_interval_secs) #cifar10.maybe_download_and_extract() if tf.gfile.Exists(FLAGS.eval_dir): tf.gfile.DeleteRecursively(FLAGS.eval_dir) tf.gfile.MakeDirs(FLAGS.eval_dir) evaluate()
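# eval_once() above recovers the global step by string-splitting the
# checkpoint path and reports precision@1 as true_count over total samples;
# the same two computations on sample values (the path and counts are
# invented for illustration):
ckpt_path = '/my-favorite-path/cifar10_train/model.ckpt-1200'
global_step = ckpt_path.split('/')[-1].split('-')[-1]
print(global_step)  # '1200'

true_count, num_iter, batch_size = 9488, 100, 100
precision = true_count / float(num_iter * batch_size)
print('precision @ 1 = %.3f' % precision)  # precision @ 1 = 0.949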
def main_fun(args, ctx): import tensorflow as tf import argparse import time import os from six.moves import cPickle from model import Model from tensorflowonspark import TFNode from datetime import datetime import numpy as np worker_num = ctx.worker_num job_name = ctx.job_name task_index = ctx.task_index cluster_spec = ctx.cluster_spec num_workers = len(cluster_spec['worker']) # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict) if job_name == "ps": time.sleep((worker_num + 1) * 5) # Get TF cluster and server instances cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma) if job_name == "ps": server.join() else: with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:%d" % task_index, cluster=cluster)): model = Model(args) # instrument for tensorboard saver = tf.train.Saver() summary_op = tf.summary.merge_all() init_op = tf.global_variables_initializer() logdir = TFNode.hdfs_path(args.save_dir, ctx.defaultFS, ctx.working_dir) print("tensorflow model path: {0}".format(logdir)) summary_writer = TFNode.get_summary_writer(ctx) sv = tf.train.Supervisor(is_chief=(task_index == 0), logdir=logdir, init_op=init_op, summary_op=None, saver=saver, global_step=model.global_step, stop_grace_secs=300, save_model_secs=10) # The supervisor takes care of session initialization, restoring from # a checkpoint, and closing when done or an error occurs. with sv.managed_session(server.target) as sess: print("{0} session ready".format( datetime.now().isoformat())) state=sess.run(model.initial_state) # Loop until the supervisor shuts down or 1000000 steps have completed. step=0 tf_feed=TFNode.DataFeed(ctx.mgr, True) while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps: # Run a training step asynchronously. # See `tf.train.SyncReplicasOptimizer` for additional details on how to # perform *synchronous* training. # using feed_dict batch = tf_feed.next_batch(args.batch_size) batch_xs = np.asarray([data[0] for data in batch]) batch_ys = np.asarray([data[1] for data in batch]) feed={model.input_data: batch_xs, model.targets: batch_ys} for i, (c, h) in enumerate(model.initial_state): feed[c]=state[i].c feed[h]=state[i].h if len(batch_xs) > 0: # instrument for tensorboard summ, train_loss, state, _, step = sess.run( [summary_op, model.cost, model.final_state, model.train_op, model.global_step], feed_dict=feed) # print loss print("Step: {}, train_loss: {}".format(step, train_loss)) if sv.is_chief: summary_writer.add_summary(summ, step) if sv.should_stop() or step >= args.steps: tf_feed.terminate() # Ask for all the services to stop. print("{0} stopping supervisor".format(datetime.now().isoformat())) sv.stop()
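# The loop above threads the recurrent state between sess.run calls by
# pairing each layer's (c, h) placeholders in model.initial_state with the
# values returned for model.final_state. A sketch of that pairing, using
# hypothetical placeholder names in place of real tensors:
from collections import namedtuple

LSTMStateTuple = namedtuple('LSTMStateTuple', ['c', 'h'])
initial_state = [LSTMStateTuple('c0_ph', 'h0_ph'),
                 LSTMStateTuple('c1_ph', 'h1_ph')]
state = [LSTMStateTuple([0.1], [0.2]),
         LSTMStateTuple([0.3], [0.4])]  # values from the previous run

feed = {}
for i, (c, h) in enumerate(initial_state):
    feed[c] = state[i].c
    feed[h] = state[i].h
print(sorted(feed.items()))
# [('c0_ph', [0.1]), ('c1_ph', [0.3]), ('h0_ph', [0.2]), ('h1_ph', [0.4])]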
def main_fun(argv, ctx): import pprint import numpy as np import tensorflow as tf import online_model import tfos_online_data_reader sys.argv = argv flags = tf.app.flags FLAGS = flags.FLAGS flags.DEFINE_integer('batch_size', 100, 'data batch size') flags.DEFINE_integer('num_epoch', 1, 'train epoches for dataset ') flags.DEFINE_string('mapping_data', 'hdfs://appcluster-cdh/user/root/Adwin_Refactoring_Test/instance_build_txt/mix_dev_wx_interest2/20171022_map', 'id mapping path') flags.DEFINE_string('train_data', 'hdfs://appcluster-cdh/user/root/Adwin_Refactoring_Test/instance_build_txt/mix_dev_wx_interest2/20171022', 'train data path') #flags.DEFINE_string('mapping_data', # 'hdfs://appcluster-cdh/user/root/tensorflow/app/online_train_distributed/mix_dev_wx_interest2/20171022_map', # 'id mapping path') #flags.DEFINE_string('train_data', # 'hdfs://appcluster-cdh/user/root/tensorflow/app/online_train_distributed/mix_dev_wx_interest2/20171022', # 'train data path') flags.DEFINE_string('log_dir', 'hdfs://appcluster-cdh/user/root/tensorflow/app/online_train_distributed/model', 'log directory') flags.DEFINE_float('linear_lr', 0.1, 'wide part learning rate. default 0.1') flags.DEFINE_float('dnn_lr', 0.001, 'deep part learning rate. default 0.001') flags.DEFINE_string('linear_optimizer', 'ftrl', 'optimizer: adadelta | adagrad | sgd | adam | ftrl | momentum. default is ftrl') flags.DEFINE_string('dnn_optimizer', 'adagrad', 'optimizer: adadelta | adagrad | sgd | adam | ftrl | momentum. default is adagrad') flags.DEFINE_integer('input_dim', 13, 'input dimension') flags.DEFINE_string("model_network", "100,20", "The neural network of model, as 100,50,20") flags.DEFINE_string("model_type", "wide_deep", "model type: wide | deep | wide_deep") flags.DEFINE_integer('display_step', 200, 'display_step') flags.DEFINE_integer('ps_num', '64', 'Comma-separated list of hostname:port pairs') flags.DEFINE_integer('task_num', '128', 'Comma-separated list of hostname:port pairs') pprint.PrettyPrinter().pprint(FLAGS.__flags) cluster_spec, server = TFNode.start_cluster_server(ctx) if ctx.job_name == "ps": server.join() elif ctx.job_name == "worker": total_file_names = parse_files(FLAGS.train_data) print("total_file_names:") print(total_file_names) print("task_index: " + str(ctx.task_index)) task_file_names = [name for idx, name in enumerate(total_file_names) if idx % FLAGS.task_num == ctx.task_index] print("task_file_names:") print(task_file_names) train_reader = tfos_online_data_reader.Reader( task_file_names, FLAGS.mapping_data, batch_size=FLAGS.batch_size, delimiter='\t') wide_dim = train_reader.wide_dim with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:%d"%ctx.task_index, cluster=cluster_spec)): config = {} config['num_ps'] = FLAGS.ps_num dnn_model = online_model.DNNModel(FLAGS,wide_dim,config) dnn_model.build() dense_inputs = dnn_model.dense_inputs sparse_inputs = dnn_model.sparse_inputs labels = dnn_model.labels global_step = dnn_model.global_step step_update_op = dnn_model.step_update_op train_op = dnn_model.train_op loss = dnn_model.loss auc_op = dnn_model.auc_op summary_op = dnn_model.summary_op saver = tf.train.Saver() init_op = [tf.global_variables_initializer(), tf.local_variables_initializer()] summary_writer = tf.summary.FileWriter("tensorboard_%d" % ctx.worker_num, graph=tf.get_default_graph()) sv = tf.train.Supervisor(is_chief = (ctx.task_index == 0), logdir = FLAGS.log_dir, init_op = init_op, summary_op = None, summary_writer=summary_writer, global_step = global_step, 
saver=saver, save_model_secs = 300) shape = np.array([FLAGS.batch_size, wide_dim + 1]) begin_time = datetime.now() with sv.managed_session(server.target) as sess: if not sv.should_stop(): for epoch in range(FLAGS.num_epoch): train_batches = train_reader.yieldBatches() print("Epoch: %d" % epoch) step = 0 for dense_x,sparse_idx,sparse_values,y in train_batches: start_time = datetime.now() _ ,train_loss,train_auc,summ,_ = sess.run([train_op,loss,auc_op,summary_op,step_update_op], feed_dict={dense_inputs:dense_x,sparse_inputs:(sparse_idx,sparse_values,shape),labels:y}) step += 1 assert not np.isnan(train_loss), 'Model diverged with loss = NaN' time_used = datetime.now() - start_time if step % FLAGS.display_step == 0: g_step, = sess.run([global_step]) print("step: " + str(step) + ", global_step: " + str(g_step)) summary_writer.add_summary(summ,g_step) print("Step = {}, Examples = {}, Time = {}, Minibatch Loss = {}, Auc = {}".format( g_step, g_step*FLAGS.batch_size, time_used, train_loss, train_auc)) sys.stdout.flush() total_time = datetime.now() - begin_time print("Training Done!!") print("Total time used: {}".format(total_time))
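# The input sharding above assigns every task_num-th file to a worker so the
# dataset is split without any coordination between tasks; the same modulo
# filter on a toy file list:
total_file_names = ['part-%05d' % i for i in range(8)]
task_num, task_index = 4, 1
task_file_names = [name for idx, name in enumerate(total_file_names)
                   if idx % task_num == task_index]
print(task_file_names)  # ['part-00001', 'part-00005']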
def map_fun(args, ctx): from tensorflowonspark import TFNode from datetime import datetime import numpy import tensorflow as tf import time import math worker_num = ctx.worker_num job_name = ctx.job_name task_index = ctx.task_index cluster_spec = ctx.cluster_spec # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict) if job_name == "ps": time.sleep((worker_num + 1) * 5) # Parameters batch_size = args.batch_size # Get TF cluster and server instances cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma) def feed_dict(batch): # Convert from [images_labels] to two numpy arrays of the proper type images = [] labels = [] for item in batch: images.append(item[0: 4]) labels.append(item[4]) xs = numpy.array(images) xs = xs.astype(numpy.float32) ys = dense_to_one_hot(numpy.array(labels, dtype=numpy.uint), 3) ys = ys.astype(numpy.uint8) return (xs, ys) def dense_to_one_hot(labels_dense, num_classes): """Convert class labels from scalars to one-hot vectors.""" num_labels = labels_dense.shape[0] index_offset = numpy.arange(num_labels) * num_classes labels_one_hot = numpy.zeros((num_labels, num_classes)) tt = index_offset + labels_dense.ravel() tt = tt.astype(numpy.int32) labels_one_hot.flat[tt] = 1 return labels_one_hot if job_name == "ps": server.join() elif job_name == "worker": # Assigns ops to the local worker by default. with tf.device(tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % task_index, cluster=cluster)): # network x = tf.placeholder(tf.float32, [None, 4]) # paras W = tf.Variable(tf.zeros([4, 3])) b = tf.Variable(tf.zeros([3])) y = tf.nn.softmax(tf.matmul(x, W) + b) y_ = tf.placeholder(tf.float32, [None, 3]) # loss func cross_entropy = -tf.reduce_sum(y_ * tf.log(y)) global_step = tf.Variable(0) train_op = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy,global_step=global_step ) # Test trained model label = tf.argmax(y_, 1, name="label") #??? does the function argmax use in the right way ? prediction = tf.argmax(y, 1, name="prediction") correct_prediction = tf.equal(prediction, label) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy") tf.summary.scalar("acc", accuracy) saver = tf.train.Saver() summary_op = tf.summary.merge_all() init_op = tf.global_variables_initializer() # Create a "supervisor", which oversees the training process and stores model state into HDFS logdir = TFNode.hdfs_path(ctx, args.model) print("tensorflow model path: {0}".format(logdir)) summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph()) if args.mode == "train": sv = tf.train.Supervisor(is_chief=(task_index == 0), logdir=logdir, init_op=init_op, summary_op=None, saver=saver, global_step=global_step, stop_grace_secs=300, save_model_secs=1) # The supervisor takes care of session initialization, restoring from # a checkpoint, and closing when done or an error occurs. with sv.managed_session(server.target) as sess: print("{0} session ready".format(datetime.now().isoformat())) # Loop until the supervisor shuts down or 1000000 steps have completed. step = 0 tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train") while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps: # Run a training step asynchronously. # See `tf.train.SyncReplicasOptimizer` for additional details on how to # perform *synchronous* training. 
# using feed_dict batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size)) feed = {x: batch_xs, y_: batch_ys} if len(batch_xs) > 0: if args.mode == "train": _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed) # print accuracy and save model checkpoint to HDFS every 100 steps if (step % 100 == 0): print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys}))) if sv.is_chief: summary_writer.add_summary(summary, step) if sv.should_stop() or step >= args.steps: tf_feed.terminate() # Ask for all the services to stop. print("{0} stopping supervisor".format(datetime.now().isoformat())) sv.stop()
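# dense_to_one_hot() above fills a zeroed (num_labels, num_classes) matrix
# through flat indexing; the same computation traced on a toy label vector:
import numpy

labels_dense = numpy.array([0, 2, 1], dtype=numpy.uint8)
num_classes = 3
index_offset = numpy.arange(labels_dense.shape[0]) * num_classes  # [0 3 6]
labels_one_hot = numpy.zeros((labels_dense.shape[0], num_classes))
labels_one_hot.flat[(index_offset + labels_dense.ravel()).astype(numpy.int32)] = 1
print(labels_one_hot)
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]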
def main_fun(argv, ctx): import math import six import tensorflow as tf from datasets import dataset_factory from nets import nets_factory from preprocessing import preprocessing_factory sys.argv = argv slim = tf.contrib.slim tf.app.flags.DEFINE_integer( 'batch_size', 100, 'The number of samples in each batch.') tf.app.flags.DEFINE_integer( 'max_num_batches', None, 'Max number of batches to evaluate by default use all.') tf.app.flags.DEFINE_string( 'master', '', 'The address of the TensorFlow master to use.') tf.app.flags.DEFINE_string( 'checkpoint_path', '/tmp/tfmodel/', 'The directory where the model was written to or an absolute path to a ' 'checkpoint file.') tf.app.flags.DEFINE_string( 'eval_dir', '/tmp/tfmodel/', 'Directory where the results are saved to.') tf.app.flags.DEFINE_integer( 'num_preprocessing_threads', 4, 'The number of threads used to create the batches.') tf.app.flags.DEFINE_string( 'dataset_name', 'imagenet', 'The name of the dataset to load.') tf.app.flags.DEFINE_string( 'dataset_split_name', 'test', 'The name of the train/test split.') tf.app.flags.DEFINE_string( 'dataset_dir', None, 'The directory where the dataset files are stored.') tf.app.flags.DEFINE_integer( 'labels_offset', 0, 'An offset for the labels in the dataset. This flag is primarily used to ' 'evaluate the VGG and ResNet architectures which do not use a background ' 'class for the ImageNet dataset.') tf.app.flags.DEFINE_string( 'model_name', 'inception_v3', 'The name of the architecture to evaluate.') tf.app.flags.DEFINE_string( 'preprocessing_name', None, 'The name of the preprocessing to use. If left ' 'as `None`, then the model_name flag is used.') tf.app.flags.DEFINE_float( 'moving_average_decay', None, 'The decay to use for the moving average.' 'If left as None, then moving averages are not used.') tf.app.flags.DEFINE_integer( 'eval_image_size', None, 'Eval image size') FLAGS = tf.app.flags.FLAGS if not FLAGS.dataset_dir: raise ValueError('You must supply the dataset directory with --dataset_dir') cluster_spec, server = TFNode.start_cluster_server(ctx) tf.logging.set_verbosity(tf.logging.INFO) with tf.Graph().as_default(): #tf_global_step = slim.get_or_create_global_step() tf_global_step = tf.Variable(0, name="global_step") ###################### # Select the dataset # ###################### dataset = dataset_factory.get_dataset( FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir) #################### # Select the model # #################### network_fn = nets_factory.get_network_fn( FLAGS.model_name, num_classes=(dataset.num_classes - FLAGS.labels_offset), is_training=False) ############################################################## # Create a dataset provider that loads data from the dataset # ############################################################## provider = slim.dataset_data_provider.DatasetDataProvider( dataset, shuffle=False, common_queue_capacity=2 * FLAGS.batch_size, common_queue_min=FLAGS.batch_size) [image, label] = provider.get(['image', 'label']) label -= FLAGS.labels_offset ##################################### # Select the preprocessing function # ##################################### preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name image_preprocessing_fn = preprocessing_factory.get_preprocessing( preprocessing_name, is_training=False) eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size image = image_preprocessing_fn(image, eval_image_size, eval_image_size) images, labels = tf.train.batch( [image, label], 
batch_size=FLAGS.batch_size, num_threads=FLAGS.num_preprocessing_threads, capacity=5 * FLAGS.batch_size) #################### # Define the model # #################### logits, _ = network_fn(images) if FLAGS.moving_average_decay: variable_averages = tf.train.ExponentialMovingAverage( FLAGS.moving_average_decay, tf_global_step) variables_to_restore = variable_averages.variables_to_restore( slim.get_model_variables()) variables_to_restore[tf_global_step.op.name] = tf_global_step else: variables_to_restore = slim.get_variables_to_restore() predictions = tf.argmax(logits, 1) labels = tf.squeeze(labels) # Define the metrics: names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({ 'Accuracy': slim.metrics.streaming_accuracy(predictions, labels), 'Recall_5': slim.metrics.streaming_recall_at_k( logits, labels, 5), }) # Print the summaries to screen. for name, value in six.iteritems(names_to_values): summary_name = 'eval/%s' % name op = tf.summary.scalar(summary_name, value, collections=[]) op = tf.Print(op, [value], summary_name) tf.add_to_collection(tf.GraphKeys.SUMMARIES, op) # TODO(sguada) use num_epochs=1 if FLAGS.max_num_batches: num_batches = FLAGS.max_num_batches else: # This ensures that we make a single pass over all of the data. num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size)) if tf.gfile.IsDirectory(FLAGS.checkpoint_path): checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path) else: checkpoint_path = FLAGS.checkpoint_path tf.logging.info('Evaluating %s' % checkpoint_path) slim.evaluation.evaluate_once( master=FLAGS.master, checkpoint_path=checkpoint_path, logdir=FLAGS.eval_dir, num_evals=num_batches, eval_op=list(names_to_updates.values()), variables_to_restore=variables_to_restore)
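# evaluate_once() above sizes num_evals so a single pass covers the whole
# split; the same ceiling division on sample numbers (the counts below are
# illustrative):
import math

num_samples, batch_size = 10000, 100
num_batches = math.ceil(num_samples / float(batch_size))
print(num_batches)  # 100 batches (a float, 100.0, under Python 2)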
def map_fun(args, ctx): from tensorflowonspark import TFNode from datetime import datetime import getpass import math import numpy import os import signal import tensorflow as tf import time IMAGE_PIXELS=28 worker_num = ctx.worker_num job_name = ctx.job_name task_index = ctx.task_index cluster_spec = ctx.cluster_spec num_workers = len(cluster_spec['worker']) # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict) if job_name == "ps": time.sleep((worker_num + 1) * 5) # Parameters hidden_units = 128 batch_size = 100 # Get TF cluster and server instances cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma) def read_csv_examples(image_dir, label_dir, batch_size=100, num_epochs=None, task_index=None, num_workers=None): print_log(worker_num, "num_epochs: {0}".format(num_epochs)) # Setup queue of csv image filenames tf_record_pattern = os.path.join(image_dir, 'part-*') images = tf.gfile.Glob(tf_record_pattern) print_log(worker_num, "images: {0}".format(images)) image_queue = tf.train.string_input_producer(images, shuffle=False, capacity=1000, num_epochs=num_epochs, name="image_queue") # Setup queue of csv label filenames tf_record_pattern = os.path.join(label_dir, 'part-*') labels = tf.gfile.Glob(tf_record_pattern) print_log(worker_num, "labels: {0}".format(labels)) label_queue = tf.train.string_input_producer(labels, shuffle=False, capacity=1000, num_epochs=num_epochs, name="label_queue") # Setup reader for image queue img_reader = tf.TextLineReader(name="img_reader") _, img_csv = img_reader.read(image_queue) image_defaults = [ [1.0] for col in range(784) ] img = tf.pack(tf.decode_csv(img_csv, image_defaults)) # Normalize values to [0,1] norm = tf.constant(255, dtype=tf.float32, shape=(784,)) image = tf.div(img, norm) print_log(worker_num, "image: {0}".format(image)) # Setup reader for label queue label_reader = tf.TextLineReader(name="label_reader") _, label_csv = label_reader.read(label_queue) label_defaults = [ [1.0] for col in range(10) ] label = tf.pack(tf.decode_csv(label_csv, label_defaults)) print_log(worker_num, "label: {0}".format(label)) # Return a batch of examples return tf.train.batch([image,label], batch_size, num_threads=args.readers, name="batch_csv") def read_tfr_examples(path, batch_size=100, num_epochs=None, task_index=None, num_workers=None): print_log(worker_num, "num_epochs: {0}".format(num_epochs)) # Setup queue of TFRecord filenames tf_record_pattern = os.path.join(path, 'part-*') files = tf.gfile.Glob(tf_record_pattern) queue_name = "file_queue" # split input files across workers, if specified if task_index is not None and num_workers is not None: num_files = len(files) files = files[task_index:num_files:num_workers] queue_name = "file_queue_{0}".format(task_index) print_log(worker_num, "files: {0}".format(files)) file_queue = tf.train.string_input_producer(files, shuffle=False, capacity=1000, num_epochs=num_epochs, name=queue_name) # Setup reader for examples reader = tf.TFRecordReader(name="reader") _, serialized = reader.read(file_queue) feature_def = {'label': tf.FixedLenFeature([10], tf.int64), 'image': tf.FixedLenFeature([784], tf.int64) } features = tf.parse_single_example(serialized, feature_def) norm = tf.constant(255, dtype=tf.float32, shape=(784,)) image = tf.div(tf.to_float(features['image']), norm) print_log(worker_num, "image: {0}".format(image)) label = tf.to_float(features['label']) print_log(worker_num, "label: {0}".format(label)) # Return a batch of examples return tf.train.batch([image,label], 
batch_size, num_threads=args.readers, name="batch") if job_name == "ps": server.join() elif job_name == "worker": # Assigns ops to the local worker by default. with tf.device(tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % task_index, cluster=cluster)): # Variables of the hidden layer hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units], stddev=1.0 / IMAGE_PIXELS), name="hid_w") hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b") tf.summary.histogram("hidden_weights", hid_w) # Variables of the softmax layer sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10], stddev=1.0 / math.sqrt(hidden_units)), name="sm_w") sm_b = tf.Variable(tf.zeros([10]), name="sm_b") tf.summary.histogram("softmax_weights", sm_w) # Placeholders or QueueRunner/Readers for input data num_epochs = 1 if args.mode == "inference" else None if args.epochs == 0 else args.epochs index = task_index if args.mode == "inference" else None workers = num_workers if args.mode == "inference" else None if args.format == "csv": images = TFNode.hdfs_path(ctx, args.images) labels = TFNode.hdfs_path(ctx, args.labels) x, y_ = read_csv_examples(images, labels, 100, num_epochs, index, workers) elif args.format == "tfr": images = TFNode.hdfs_path(ctx, args.images) x, y_ = read_tfr_examples(images, 100, num_epochs, index, workers) else: raise ValueError("{0} format not supported for tf input mode".format(args.format)) x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1]) tf.summary.image("x_img", x_img) hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) hid = tf.nn.relu(hid_lin) y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) global_step = tf.Variable(0) loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0))) tf.summary.scalar("loss", loss) train_op = tf.train.AdagradOptimizer(0.01).minimize( loss, global_step=global_step) # Test trained model label = tf.argmax(y_, 1, name="label") prediction = tf.argmax(y, 1,name="prediction") correct_prediction = tf.equal(prediction, label) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy") tf.summary.scalar("acc", accuracy) saver = tf.train.Saver() summary_op = tf.summary.merge_all() init_op = tf.global_variables_initializer() # Create a "supervisor", which oversees the training process and stores model state into HDFS logdir = TFNode.hdfs_path(ctx, args.model) print("tensorflow model path: {0}".format(logdir)) summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph()) if args.mode == "train": sv = tf.train.Supervisor(is_chief=(task_index == 0), logdir=logdir, init_op=init_op, summary_op=None, saver=saver, global_step=global_step, stop_grace_secs=300, save_model_secs=10) else: sv = tf.train.Supervisor(is_chief=(task_index == 0), logdir=logdir, summary_op=None, saver=saver, global_step=global_step, stop_grace_secs=300, save_model_secs=0) output_dir = TFNode.hdfs_path(ctx, args.output) output_file = tf.gfile.Open("{0}/part-{1:05d}".format(output_dir, worker_num), mode='w') # The supervisor takes care of session initialization, restoring from # a checkpoint, and closing when done or an error occurs. with sv.managed_session(server.target) as sess: print("{0} session ready".format(datetime.now().isoformat())) # Loop until the supervisor shuts down or 1000000 steps have completed. step = 0 count = 0 while not sv.should_stop() and step < args.steps: # Run a training step asynchronously.
# See `tf.train.SyncReplicasOptimizer` for additional details on how to # perform *synchronous* training. # using QueueRunners/Readers if args.mode == "train": if (step % 100 == 0): print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy))) _, summary, step = sess.run([train_op, summary_op, global_step]) if sv.is_chief: summary_writer.add_summary(summary, step) else: # args.mode == "inference" labels, pred, acc = sess.run([label, prediction, accuracy]) #print("label: {0}, pred: {1}".format(labels, pred)) print("acc: {0}".format(acc)) for i in range(len(labels)): count += 1 output_file.write("{0} {1}\n".format(labels[i], pred[i])) print("count: {0}".format(count)) if args.mode == "inference": output_file.close() # Delay chief worker from shutting down supervisor during inference, since it can load model, start session, # run inference and request stop before the other workers even start/sync their sessions. if task_index == 0: time.sleep(60) # Ask for all the services to stop. print("{0} stopping supervisor".format(datetime.now().isoformat())) sv.stop()
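# read_tfr_examples() above splits its input files across workers with an
# extended slice so each task reads a disjoint subset during inference; the
# same slice on a toy list:
files = ['part-%05d' % i for i in range(10)]
task_index, num_workers = 1, 3
print(files[task_index:len(files):num_workers])
# ['part-00001', 'part-00004', 'part-00007']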
def map_fun(args, ctx): from tensorflowonspark import TFNode from datetime import datetime import math import numpy import tensorflow as tf import time worker_num = ctx.worker_num job_name = ctx.job_name task_index = ctx.task_index cluster_spec = ctx.cluster_spec if job_name == "ps": time.sleep((worker_num + 1) * 5) batch_size = args.batch_size cluster, server = TFNode.start_cluster_server(ctx, 1) def feed_dict(batch): images = [] labels = [] for item in batch: images.append(item[0]) labels.append(item[1]) x_initial = numpy.array(images) x_objdump = x_initial[:,519:719] x_cnn = numpy.empty((0, 200), dtype=numpy.float64) for i in xrange(len(images)): x_cnn_batch = numpy.zeros((200, 120), dtype=numpy.float64) for j in xrange(0, 200): x_cnn_batch[j, int(x_objdump[i, j])] = True x_cnn_batch = numpy.transpose(x_cnn_batch) x_cnn = numpy.append(x_cnn, x_cnn_batch, axis=0) x_peinfo = x_initial[:,0:519] ys = numpy.array(labels) return (x_peinfo.reshape(-1,519,1,1),x_cnn.reshape(-1, 200, 120, 1), ys) def conv2d(x, W): return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') def max_pool_1(x): return tf.nn.avg_pool(x, ksize=[1, 2,1, 1], strides=[1, 2, 1, 1], padding='SAME') def max_pool_2(x): return tf.nn.avg_pool(x, ksize=[1, 100,1, 1], strides=[1, 100, 1, 1], padding='SAME') if job_name == "ps": server.join() elif job_name == "worker": with tf.device(tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % task_index, cluster=cluster)): # Build NN-Network W_mlp_1 = tf.Variable(tf.truncated_normal([519,519],stddev=0.1), name="W_mlp_1") b_mlp_1 = tf.Variable(tf.constant(0.1, shape=[519]),name="b_mlp_1") tf.summary.histogram("W_mlp_1", W_mlp_1) W_mlp_2 = tf.Variable(tf.truncated_normal([519,519],stddev=0.1), name="W_mlp_2") b_mlp_2 = tf.Variable(tf.constant(0.1, shape=[519]),name="b_mlp_2") tf.summary.histogram("W_mlp_2", W_mlp_2) W_conv1 = tf.Variable(tf.truncated_normal([3,120,1,3],stddev=0.1), name="W_conv1") b_conv1 = tf.Variable(tf.constant(0.1, shape=[3]),name="b_conv1") tf.summary.histogram("W_conv1", W_conv1) W_conv2 = tf.Variable(tf.truncated_normal([3,120,3,6],stddev=0.1),name="W_conv2") b_conv2 = tf.Variable(tf.constant(0.1, shape=[6]),name="b_conv2") tf.summary.histogram("W_conv2", W_conv2) sm_w = tf.Variable(tf.truncated_normal([1239, 10], stddev= 0.1), name="sm_w") sm_b = tf.Variable(tf.constant(0.1, shape=[10]),name="sm_b") tf.summary.histogram("softmax_weights", sm_w) x_cnn = tf.placeholder(tf.float32, [None, 200,120,1], name="x_cnn") x_mlp = tf.placeholder(tf.float32, [None, 519,1,1], name="x_mlp") y_ = tf.placeholder(tf.float32, [None, 10], name="y_") tf.summary.image("x_cnn", x_cnn) tf.summary.image("x_mlp", x_mlp) x_mlp_new = tf.reshape(x_mlp, [-1, 519]) h_mlp_1 = tf.nn.xw_plus_b(x_mlp_new, W_mlp_1, b_mlp_1) h_mlp_2 = tf.nn.xw_plus_b(h_mlp_1, W_mlp_2, b_mlp_2) h_conv1 = tf.nn.relu(conv2d(x_cnn, W_conv1) + b_conv1) h_pool1 = max_pool_1(h_conv1) h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) h_pool2 = max_pool_2(h_conv2) h_conv2_flat = tf.reshape(h_pool2, [-1, 120*6]) h_inter = tf.concat([h_mlp_2, h_conv2_flat],1) y = tf.nn.softmax(tf.nn.xw_plus_b(h_inter, sm_w, sm_b)) global_step = tf.Variable(0) loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)) tf.summary.scalar("loss", loss) train_op = tf.train.AdagradOptimizer(0.001).minimize( loss, global_step=global_step) label = tf.argmax(y_, 1, name="label") prediction = tf.argmax(y, 1,name="prediction") correct_prediction = tf.equal(prediction, label) accuracy = 
tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy") tf.summary.scalar("acc", accuracy) saver = tf.train.Saver() summary_op = tf.summary.merge_all() init_op = tf.global_variables_initializer() logdir = TFNode.hdfs_path(ctx, args.model) print("tensorflow model path: {0}".format(logdir)) summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph()) if args.mode == "train": sv = tf.train.Supervisor(is_chief=(task_index == 0), logdir=logdir, init_op=init_op, summary_op=None, saver=saver, global_step=global_step, stop_grace_secs=300, save_model_secs=10) else: sv = tf.train.Supervisor(is_chief=(task_index == 0), logdir=logdir, summary_op=None, saver=saver, global_step=global_step, stop_grace_secs=300, save_model_secs=0) with sv.managed_session(server.target) as sess: print("{0} session ready".format(datetime.now().isoformat())) step = 0 tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train") while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps: batch_mlp, batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size)) feed = {x_mlp: batch_mlp, x_cnn: batch_xs, y_: batch_ys} if len(batch_xs) > 0: if args.mode == "train": _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed) if (step % 10 == 0): print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x_mlp: batch_mlp, x_cnn: batch_xs, y_: batch_ys}))) if sv.is_chief: summary_writer.add_summary(summary, step) elif args.mode == "inference": labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed) results = ["Label: {0}, Prediction: {1}".format(l, p) for l,p in zip(labels,preds)] tf_feed.batch_results(results) print("acc: {0}".format(acc)) else: preds= sess.run(prediction, feed_dict={x_mlp: batch_mlp, x_cnn: batch_xs}) results = ["Sha256: {0}, Prediction: {1}".format(l, p) for l,p in zip(batch_ys,preds)] tf_feed.batch_results(results) print(results) if sv.should_stop() or step >= args.steps: tf_feed.terminate() print("{0} stopping supervisor".format(datetime.now().isoformat())) sv.stop()
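# Shape bookkeeping for the pooling stack above: average pooling with stride
# 2 and then stride 100 along the 200-step opcode axis leaves one 120 x 6
# feature row per example, which is why h_pool2 flattens to 120 * 6 = 720 and
# why sm_w expects 519 + 720 = 1239 inputs after the concat with the MLP:
seq_len, vocab, conv2_channels = 200, 120, 6
after_pool1 = seq_len // 2            # 100
after_pool2 = after_pool1 // 100      # 1
flat = after_pool2 * vocab * conv2_channels
print(flat)        # 720
print(519 + flat)  # 1239, the input width of sm_w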
def main_fun(argv, ctx): import tensorflow as tf from tensorflow.python.ops import control_flow_ops from datasets import dataset_factory from deployment import model_deploy from nets import nets_factory from preprocessing import preprocessing_factory sys.argv = argv slim = tf.contrib.slim tf.app.flags.DEFINE_integer( 'num_gpus', '1', 'The number of GPUs to use per node') tf.app.flags.DEFINE_boolean('rdma', False, 'Whether to use rdma.') tf.app.flags.DEFINE_string( 'master', '', 'The address of the TensorFlow master to use.') tf.app.flags.DEFINE_string( 'train_dir', '/tmp/tfmodel/', 'Directory where checkpoints and event logs are written to.') tf.app.flags.DEFINE_integer('num_clones', 1, 'Number of model clones to deploy.') tf.app.flags.DEFINE_boolean('clone_on_cpu', False, 'Use CPUs to deploy clones.') tf.app.flags.DEFINE_integer('worker_replicas', 1, 'Number of worker replicas.') tf.app.flags.DEFINE_integer( 'num_ps_tasks', 0, 'The number of parameter servers. If the value is 0, then the parameters ' 'are handled locally by the worker.') tf.app.flags.DEFINE_integer( 'num_readers', 4, 'The number of parallel readers that read data from the dataset.') tf.app.flags.DEFINE_integer( 'num_preprocessing_threads', 4, 'The number of threads used to create the batches.') tf.app.flags.DEFINE_integer( 'log_every_n_steps', 10, 'The frequency with which logs are print.') tf.app.flags.DEFINE_integer( 'save_summaries_secs', 600, 'The frequency with which summaries are saved, in seconds.') tf.app.flags.DEFINE_integer( 'save_interval_secs', 600, 'The frequency with which the model is saved, in seconds.') tf.app.flags.DEFINE_integer( 'task', 0, 'Task id of the replica running the training.') ###################### # Optimization Flags # ###################### tf.app.flags.DEFINE_float( 'weight_decay', 0.00004, 'The weight decay on the model weights.') tf.app.flags.DEFINE_string( 'optimizer', 'rmsprop', 'The name of the optimizer, one of "adadelta", "adagrad", "adam",' '"ftrl", "momentum", "sgd" or "rmsprop".') tf.app.flags.DEFINE_float( 'adadelta_rho', 0.95, 'The decay rate for adadelta.') tf.app.flags.DEFINE_float( 'adagrad_initial_accumulator_value', 0.1, 'Starting value for the AdaGrad accumulators.') tf.app.flags.DEFINE_float( 'adam_beta1', 0.9, 'The exponential decay rate for the 1st moment estimates.') tf.app.flags.DEFINE_float( 'adam_beta2', 0.999, 'The exponential decay rate for the 2nd moment estimates.') tf.app.flags.DEFINE_float('opt_epsilon', 1.0, 'Epsilon term for the optimizer.') tf.app.flags.DEFINE_float('ftrl_learning_rate_power', -0.5, 'The learning rate power.') tf.app.flags.DEFINE_float( 'ftrl_initial_accumulator_value', 0.1, 'Starting value for the FTRL accumulators.') tf.app.flags.DEFINE_float( 'ftrl_l1', 0.0, 'The FTRL l1 regularization strength.') tf.app.flags.DEFINE_float( 'ftrl_l2', 0.0, 'The FTRL l2 regularization strength.') tf.app.flags.DEFINE_float( 'momentum', 0.9, 'The momentum for the MomentumOptimizer and RMSPropOptimizer.') tf.app.flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.') ####################### # Learning Rate Flags # ####################### tf.app.flags.DEFINE_string( 'learning_rate_decay_type', 'exponential', 'Specifies how the learning rate is decayed. 
One of "fixed", "exponential",' ' or "polynomial"') tf.app.flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.') tf.app.flags.DEFINE_float( 'end_learning_rate', 0.0001, 'The minimal end learning rate used by a polynomial decay learning rate.') tf.app.flags.DEFINE_float( 'label_smoothing', 0.0, 'The amount of label smoothing.') tf.app.flags.DEFINE_float( 'learning_rate_decay_factor', 0.94, 'Learning rate decay factor.') tf.app.flags.DEFINE_float( 'num_epochs_per_decay', 2.0, 'Number of epochs after which learning rate decays.') tf.app.flags.DEFINE_bool( 'sync_replicas', False, 'Whether or not to synchronize the replicas during training.') tf.app.flags.DEFINE_integer( 'replicas_to_aggregate', 1, 'The Number of gradients to collect before updating params.') tf.app.flags.DEFINE_float( 'moving_average_decay', None, 'The decay to use for the moving average.' 'If left as None, then moving averages are not used.') ####################### # Dataset Flags # ####################### tf.app.flags.DEFINE_string( 'dataset_name', 'imagenet', 'The name of the dataset to load.') tf.app.flags.DEFINE_string( 'dataset_split_name', 'train', 'The name of the train/test split.') tf.app.flags.DEFINE_string( 'dataset_dir', None, 'The directory where the dataset files are stored.') tf.app.flags.DEFINE_integer( 'labels_offset', 0, 'An offset for the labels in the dataset. This flag is primarily used to ' 'evaluate the VGG and ResNet architectures which do not use a background ' 'class for the ImageNet dataset.') tf.app.flags.DEFINE_string( 'model_name', 'inception_v3', 'The name of the architecture to train.') tf.app.flags.DEFINE_string( 'preprocessing_name', None, 'The name of the preprocessing to use. If left ' 'as `None`, then the model_name flag is used.') tf.app.flags.DEFINE_integer( 'batch_size', 32, 'The number of samples in each batch.') tf.app.flags.DEFINE_integer( 'train_image_size', None, 'Train image size') tf.app.flags.DEFINE_integer('max_number_of_steps', None, 'The maximum number of training steps.') ##################### # Fine-Tuning Flags # ##################### tf.app.flags.DEFINE_string( 'checkpoint_path', None, 'The path to a checkpoint from which to fine-tune.') tf.app.flags.DEFINE_string( 'checkpoint_exclude_scopes', None, 'Comma-separated list of scopes of variables to exclude when restoring ' 'from a checkpoint.') tf.app.flags.DEFINE_string( 'trainable_scopes', None, 'Comma-separated list of scopes to filter the set of variables to train.' 'By default, None would train all the variables.') tf.app.flags.DEFINE_boolean( 'ignore_missing_vars', False, 'When restoring a checkpoint would ignore missing variables.') FLAGS = tf.app.flags.FLAGS FLAGS.job_name = ctx.job_name FLAGS.task = ctx.task_index FLAGS.num_clones = FLAGS.num_gpus FLAGS.worker_replicas = len(ctx.cluster_spec['worker']) assert(FLAGS.num_ps_tasks == (len(ctx.cluster_spec['ps']) if 'ps' in ctx.cluster_spec else 0)) def _configure_learning_rate(num_samples_per_epoch, global_step): """Configures the learning rate. Args: num_samples_per_epoch: The number of samples in each epoch of training. global_step: The global_step tensor. Returns: A `Tensor` representing the learning rate. 
    Raises:
      ValueError: if FLAGS.learning_rate_decay_type is not recognized.
    """
    decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                      FLAGS.num_epochs_per_decay)
    if FLAGS.sync_replicas:
      decay_steps /= FLAGS.replicas_to_aggregate

    if FLAGS.learning_rate_decay_type == 'exponential':
      return tf.train.exponential_decay(FLAGS.learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True,
                                        name='exponential_decay_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'fixed':
      return tf.constant(FLAGS.learning_rate, name='fixed_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'polynomial':
      return tf.train.polynomial_decay(FLAGS.learning_rate,
                                       global_step,
                                       decay_steps,
                                       FLAGS.end_learning_rate,
                                       power=1.0,
                                       cycle=False,
                                       name='polynomial_decay_learning_rate')
    else:
      raise ValueError('learning_rate_decay_type [%s] was not recognized' %
                       FLAGS.learning_rate_decay_type)

  def _configure_optimizer(learning_rate):
    """Configures the optimizer used for training.

    Args:
      learning_rate: A scalar or `Tensor` learning rate.

    Returns:
      An instance of an optimizer.

    Raises:
      ValueError: if FLAGS.optimizer is not recognized.
    """
    if FLAGS.optimizer == 'adadelta':
      optimizer = tf.train.AdadeltaOptimizer(
          learning_rate,
          rho=FLAGS.adadelta_rho,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'adagrad':
      optimizer = tf.train.AdagradOptimizer(
          learning_rate,
          initial_accumulator_value=FLAGS.adagrad_initial_accumulator_value)
    elif FLAGS.optimizer == 'adam':
      optimizer = tf.train.AdamOptimizer(
          learning_rate,
          beta1=FLAGS.adam_beta1,
          beta2=FLAGS.adam_beta2,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'ftrl':
      optimizer = tf.train.FtrlOptimizer(
          learning_rate,
          learning_rate_power=FLAGS.ftrl_learning_rate_power,
          initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
          l1_regularization_strength=FLAGS.ftrl_l1,
          l2_regularization_strength=FLAGS.ftrl_l2)
    elif FLAGS.optimizer == 'momentum':
      optimizer = tf.train.MomentumOptimizer(
          learning_rate,
          momentum=FLAGS.momentum,
          name='Momentum')
    elif FLAGS.optimizer == 'rmsprop':
      optimizer = tf.train.RMSPropOptimizer(
          learning_rate,
          decay=FLAGS.rmsprop_decay,
          momentum=FLAGS.momentum,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'sgd':
      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    else:
      raise ValueError('Optimizer [%s] was not recognized' % FLAGS.optimizer)
    return optimizer

  def _add_variables_summaries(learning_rate):
    summaries = []
    for variable in slim.get_model_variables():
      summaries.append(tf.summary.histogram(variable.op.name, variable))
    summaries.append(tf.summary.scalar('training/Learning Rate', learning_rate))
    return summaries

  def _get_init_fn():
    """Returns a function run by the chief worker to warm-start the training.

    Note that the init_fn is only run when initializing the model during the
    very first global step.

    Returns:
      An init function run by the supervisor.
    """
    if FLAGS.checkpoint_path is None:
      return None

    # Warn the user if a checkpoint exists in the train_dir. Then we'll be
    # ignoring the checkpoint anyway.
if tf.train.latest_checkpoint(FLAGS.train_dir): tf.logging.info( 'Ignoring --checkpoint_path because a checkpoint already exists in %s' % FLAGS.train_dir) return None exclusions = [] if FLAGS.checkpoint_exclude_scopes: exclusions = [scope.strip() for scope in FLAGS.checkpoint_exclude_scopes.split(',')] # TODO(sguada) variables.filter_variables() variables_to_restore = [] for var in slim.get_model_variables(): excluded = False for exclusion in exclusions: if var.op.name.startswith(exclusion): excluded = True break if not excluded: variables_to_restore.append(var) if tf.gfile.IsDirectory(FLAGS.checkpoint_path): checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path) else: checkpoint_path = FLAGS.checkpoint_path tf.logging.info('Fine-tuning from %s' % checkpoint_path) return slim.assign_from_checkpoint_fn( checkpoint_path, variables_to_restore, ignore_missing_vars=FLAGS.ignore_missing_vars) def _get_variables_to_train(): """Returns a list of variables to train. Returns: A list of variables to train by the optimizer. """ if FLAGS.trainable_scopes is None: return tf.trainable_variables() else: scopes = [scope.strip() for scope in FLAGS.trainable_scopes.split(',')] variables_to_train = [] for scope in scopes: variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) variables_to_train.extend(variables) return variables_to_train # main cluster_spec, server = TFNode.start_cluster_server(ctx=ctx, num_gpus=FLAGS.num_gpus, rdma=FLAGS.rdma) if ctx.job_name == 'ps': # `ps` jobs wait for incoming connections from the workers. server.join() else: # `worker` jobs will actually do the work. if not FLAGS.dataset_dir: raise ValueError('You must supply the dataset directory with --dataset_dir') tf.logging.set_verbosity(tf.logging.INFO) with tf.Graph().as_default(): ####################### # Config model_deploy # ####################### deploy_config = model_deploy.DeploymentConfig( num_clones=FLAGS.num_clones, clone_on_cpu=FLAGS.clone_on_cpu, replica_id=FLAGS.task, num_replicas=FLAGS.worker_replicas, num_ps_tasks=FLAGS.num_ps_tasks) # Create global_step #with tf.device(deploy_config.variables_device()): # global_step = slim.create_global_step() with tf.device("/job:ps/task:0"): global_step = tf.Variable(0, name="global_step") ###################### # Select the dataset # ###################### dataset = dataset_factory.get_dataset( FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir) ###################### # Select the network # ###################### network_fn = nets_factory.get_network_fn( FLAGS.model_name, num_classes=(dataset.num_classes - FLAGS.labels_offset), weight_decay=FLAGS.weight_decay, is_training=True) ##################################### # Select the preprocessing function # ##################################### preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name image_preprocessing_fn = preprocessing_factory.get_preprocessing( preprocessing_name, is_training=True) ############################################################## # Create a dataset provider that loads data from the dataset # ############################################################## with tf.device(deploy_config.inputs_device()): provider = slim.dataset_data_provider.DatasetDataProvider( dataset, num_readers=FLAGS.num_readers, common_queue_capacity=20 * FLAGS.batch_size, common_queue_min=10 * FLAGS.batch_size) [image, label] = provider.get(['image', 'label']) label -= FLAGS.labels_offset train_image_size = FLAGS.train_image_size or network_fn.default_image_size image = 
image_preprocessing_fn(image, train_image_size, train_image_size) images, labels = tf.train.batch( [image, label], batch_size=FLAGS.batch_size, num_threads=FLAGS.num_preprocessing_threads, capacity=5 * FLAGS.batch_size) labels = slim.one_hot_encoding( labels, dataset.num_classes - FLAGS.labels_offset) batch_queue = slim.prefetch_queue.prefetch_queue( [images, labels], capacity=2 * deploy_config.num_clones) #################### # Define the model # #################### def clone_fn(batch_queue): """Allows data parallelism by creating multiple clones of network_fn.""" images, labels = batch_queue.dequeue() logits, end_points = network_fn(images) ############################# # Specify the loss function # ############################# if 'AuxLogits' in end_points: tf.losses.softmax_cross_entropy( logits=end_points['AuxLogits'], onehot_labels=labels, label_smoothing=FLAGS.label_smoothing, weights=0.4, scope='aux_loss') tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=labels, label_smoothing=FLAGS.label_smoothing, weights=1.0) return end_points # Gather initial summaries. summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue]) first_clone_scope = deploy_config.clone_scope(0) # Gather update_ops from the first clone. These contain, for example, # the updates for the batch_norm variables created by network_fn. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) # Add summaries for end_points. end_points = clones[0].outputs for end_point in end_points: x = end_points[end_point] summaries.add(tf.summary.histogram('activations/' + end_point, x)) summaries.add(tf.summary.scalar('sparsity/' + end_point, tf.nn.zero_fraction(x))) # Add summaries for losses. for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope): summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss)) # Add summaries for variables. for variable in slim.get_model_variables(): summaries.add(tf.summary.histogram(variable.op.name, variable)) ################################# # Configure the moving averages # ################################# if FLAGS.moving_average_decay: moving_average_variables = slim.get_model_variables() variable_averages = tf.train.ExponentialMovingAverage( FLAGS.moving_average_decay, global_step) else: moving_average_variables, variable_averages = None, None ######################################### # Configure the optimization procedure. # ######################################### with tf.device(deploy_config.optimizer_device()): learning_rate = _configure_learning_rate(dataset.num_samples, global_step) optimizer = _configure_optimizer(learning_rate) summaries.add(tf.summary.scalar('learning_rate', learning_rate)) if FLAGS.sync_replicas: # If sync_replicas is enabled, the averaging will be done in the chief # queue runner. optimizer = tf.train.SyncReplicasOptimizer( opt=optimizer, replicas_to_aggregate=FLAGS.replicas_to_aggregate, variable_averages=variable_averages, variables_to_average=moving_average_variables, replica_id=tf.constant(FLAGS.task, tf.int32, shape=()), total_num_replicas=FLAGS.worker_replicas) elif FLAGS.moving_average_decay: # Update ops executed locally by trainer. update_ops.append(variable_averages.apply(moving_average_variables)) # Variables to train. 
variables_to_train = _get_variables_to_train() # and returns a train_tensor and summary_op total_loss, clones_gradients = model_deploy.optimize_clones( clones, optimizer, var_list=variables_to_train) # Add total_loss to summary. summaries.add(tf.summary.scalar('total_loss', total_loss)) # Create gradient updates. grad_updates = optimizer.apply_gradients(clones_gradients, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops) train_tensor = control_flow_ops.with_dependencies([update_op], total_loss, name='train_op') # Add the summaries from the first clone. These contain the summaries # created by model_fn and either optimize_clones() or _gather_clone_loss(). summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope)) # Merge all summaries together. summary_op = tf.summary.merge(list(summaries), name='summary_op') ########################### # Kicks off the training. # ########################### summary_writer = tf.summary.FileWriter("tensorboard_%d" %(ctx.worker_num), graph=tf.get_default_graph()) slim.learning.train( train_tensor, logdir=FLAGS.train_dir, master=server.target, is_chief=(FLAGS.task == 0), init_fn=_get_init_fn(), summary_op=summary_op, number_of_steps=FLAGS.max_number_of_steps, log_every_n_steps=FLAGS.log_every_n_steps, save_summaries_secs=FLAGS.save_summaries_secs, save_interval_secs=FLAGS.save_interval_secs, summary_writer=summary_writer, sync_optimizer=optimizer if FLAGS.sync_replicas else None)
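The driver program that launches this Slim main_fun is not shown in this excerpt. Below is a minimal sketch of what it might look like with the TensorFlowOnSpark TFCluster API; the executor-count detection and num_ps value are assumptions, not part of the original script.

# Hypothetical driver-side launcher for the Slim main_fun above (a sketch,
# not the original script's argument handling).
import sys
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from tensorflowonspark import TFCluster

sc = SparkContext(conf=SparkConf().setAppName("slim_train"))
num_executors = int(sc._conf.get("spark.executor.instances"))  # assumed detection
num_ps = 1  # assumed; must match the --num_ps_tasks assertion in main_fun

# InputMode.TENSORFLOW: each worker reads its own data (here via slim's
# DatasetDataProvider), so no RDD is fed into the cluster.
cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps,
                        False, TFCluster.InputMode.TENSORFLOW)
cluster.shutdown()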
def map_fun(args, ctx): from tensorflowonspark import TFNode from datetime import datetime import math import numpy import tensorflow as tf import time worker_num = ctx.worker_num job_name = ctx.job_name task_index = ctx.task_index cluster_spec = ctx.cluster_spec IMAGE_PIXELS=28 # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict) if job_name == "ps": time.sleep((worker_num + 1) * 5) # Parameters hidden_units = 128 batch_size = args.batch_size # Get TF cluster and server instances cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma) def feed_dict(batch): # Convert from [(images, labels)] to two numpy arrays of the proper type images = [] labels = [] for item in batch: images.append(item[0]) labels.append(item[1]) xs = numpy.array(images) xs = xs.astype(numpy.float32) xs = xs/255.0 ys = numpy.array(labels) ys = ys.astype(numpy.uint8) return (xs, ys) if job_name == "ps": server.join() elif job_name == "worker": # Assigns ops to the local worker by default. with tf.device(tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % task_index, cluster=cluster)): # Variables of the hidden layer hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units], stddev=1.0 / IMAGE_PIXELS), name="hid_w") hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b") tf.summary.histogram("hidden_weights", hid_w) # Variables of the softmax layer sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10], stddev=1.0 / math.sqrt(hidden_units)), name="sm_w") sm_b = tf.Variable(tf.zeros([10]), name="sm_b") tf.summary.histogram("softmax_weights", sm_w) # Placeholders or QueueRunner/Readers for input data x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x") y_ = tf.placeholder(tf.float32, [None, 10], name="y_") x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1]) tf.summary.image("x_img", x_img) hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) hid = tf.nn.relu(hid_lin) y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) global_step = tf.Variable(0) loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0))) tf.summary.scalar("loss", loss) train_op = tf.train.AdagradOptimizer(0.01).minimize( loss, global_step=global_step) # Test trained model label = tf.argmax(y_, 1, name="label") prediction = tf.argmax(y, 1,name="prediction") correct_prediction = tf.equal(prediction, label) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy") tf.summary.scalar("acc", accuracy) saver = tf.train.Saver() summary_op = tf.summary.merge_all() init_op = tf.global_variables_initializer() # Create a "supervisor", which oversees the training process and stores model state into HDFS logdir = TFNode.hdfs_path(ctx, args.model) print("tensorflow model path: {0}".format(logdir)) summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph()) if args.mode == "train": sv = tf.train.Supervisor(is_chief=(task_index == 0), logdir=logdir, init_op=init_op, summary_op=None, saver=saver, global_step=global_step, stop_grace_secs=300, save_model_secs=10) else: sv = tf.train.Supervisor(is_chief=(task_index == 0), logdir=logdir, summary_op=None, saver=saver, global_step=global_step, stop_grace_secs=300, save_model_secs=0) # The supervisor takes care of session initialization, restoring from # a checkpoint, and closing when done or an error occurs. 
with sv.managed_session(server.target) as sess: print("{0} session ready".format(datetime.now().isoformat())) # Loop until the supervisor shuts down or 1000000 steps have completed. step = 0 tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train") while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps: # Run a training step asynchronously. # See `tf.train.SyncReplicasOptimizer` for additional details on how to # perform *synchronous* training. # using feed_dict batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size)) feed = {x: batch_xs, y_: batch_ys} if len(batch_xs) > 0: if args.mode == "train": _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed) # print accuracy and save model checkpoint to HDFS every 100 steps if (step % 100 == 0): print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys}))) if sv.is_chief: summary_writer.add_summary(summary, step) else: # args.mode == "inference" labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed) results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)] tf_feed.batch_results(results) print("acc: {0}".format(acc)) if sv.should_stop() or step >= args.steps: tf_feed.terminate() # Ask for all the services to stop. print("{0} stopping supervisor".format(datetime.now().isoformat())) sv.stop()
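For InputMode.SPARK functions like the one above, the feeding happens on the driver: partitions of an RDD are pushed into the tf_feed queue that next_batch() drains. A sketch of the corresponding driver code, following the MNIST example conventions (the CSV parsing is an assumption):

# Driver-side feeding for the InputMode.SPARK map_fun above (sketch).
from tensorflowonspark import TFCluster

images = sc.textFile(args.images).map(lambda ln: [float(v) for v in ln.split(',')])
labels = sc.textFile(args.labels).map(lambda ln: [float(v) for v in ln.split(',')])
dataRDD = images.zip(labels)  # [(image, label)] tuples, as feed_dict expects

cluster = TFCluster.run(sc, map_fun, args, args.cluster_size, args.num_ps,
                        args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train":
  cluster.train(dataRDD, args.epochs)      # pushes partitions into tf_feed
else:
  labelRDD = cluster.inference(dataRDD)    # collects tf_feed.batch_results()
  labelRDD.saveAsTextFile(args.output)
cluster.shutdown()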
def main_fun(args, ctx): import numpy import os import tensorflow as tf import tensorflow.contrib.keras as keras from tensorflow.contrib.keras.api.keras import backend as K from tensorflow.contrib.keras.api.keras.models import Sequential, load_model, save_model from tensorflow.contrib.keras.api.keras.layers import Dense, Dropout from tensorflow.contrib.keras.api.keras.optimizers import RMSprop from tensorflow.contrib.keras.python.keras.callbacks import LambdaCallback, TensorBoard from tensorflow.python.saved_model import builder as saved_model_builder from tensorflow.python.saved_model import tag_constants from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def from tensorflowonspark import TFNode cluster, server = TFNode.start_cluster_server(ctx) if ctx.job_name == "ps": server.join() elif ctx.job_name == "worker": def generate_rdd_data(tf_feed, batch_size): print("generate_rdd_data invoked") while True: batch = tf_feed.next_batch(batch_size) imgs = [] lbls = [] for item in batch: imgs.append(item[0]) lbls.append(item[1]) images = numpy.array(imgs).astype('float32') / 255 labels = numpy.array(lbls).astype('float32') yield (images, labels) with tf.device( tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % ctx.task_index, cluster=cluster)): IMAGE_PIXELS = 28 batch_size = 100 num_classes = 10 # the data, shuffled and split between train and test sets if args.input_mode == 'tf': from tensorflow.contrib.keras.api.keras.datasets import mnist (x_train, y_train), (x_test, y_test) = mnist.load_data() x_train = x_train.reshape(60000, 784) x_test = x_test.reshape(10000, 784) x_train = x_train.astype('float32') / 255 x_test = x_test.astype('float32') / 255 # convert class vectors to binary class matrices y_train = keras.utils.to_categorical(y_train, num_classes) y_test = keras.utils.to_categorical(y_test, num_classes) else: # args.mode == 'spark' x_train = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x_train") y_train = tf.placeholder(tf.float32, [None, 10], name="y_train") model = Sequential() model.add(Dense(512, activation='relu', input_shape=(784, ))) model.add(Dropout(0.2)) model.add(Dense(512, activation='relu')) model.add(Dropout(0.2)) model.add(Dense(10, activation='softmax')) model.summary() model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy']) saver = tf.train.Saver() with tf.Session(server.target) as sess: K.set_session(sess) def save_checkpoint(epoch, logs=None): if epoch == 1: tf.train.write_graph(sess.graph.as_graph_def(), args.model_dir, 'graph.pbtxt') saver.save(sess, os.path.join(args.model_dir, 'model.ckpt'), global_step=epoch * args.steps_per_epoch) ckpt_callback = LambdaCallback(on_epoch_end=save_checkpoint) tb_callback = TensorBoard(log_dir=args.model_dir, histogram_freq=1, write_graph=True, write_images=True) # add callbacks to save model checkpoint and tensorboard events (on worker:0 only) callbacks = [ckpt_callback, tb_callback ] if ctx.task_index == 0 else None if args.input_mode == 'tf': # train & validate on in-memory data history = model.fit(x_train, y_train, batch_size=batch_size, epochs=args.epochs, verbose=1, validation_data=(x_test, y_test), callbacks=callbacks) else: # args.input_mode == 'spark': # train on data read from a generator which is producing data from a Spark RDD tf_feed = TFNode.DataFeed(ctx.mgr) history = model.fit_generator( generator=generate_rdd_data(tf_feed, batch_size), steps_per_epoch=args.steps_per_epoch, epochs=args.epochs, verbose=1, 
callbacks=callbacks) if args.export_dir and ctx.job_name == 'worker' and ctx.task_index == 0: # save a local Keras model, so we can reload it with an inferencing learning_phase save_model(model, "tmp_model") # reload the model K.set_learning_phase(False) new_model = load_model("tmp_model") # export a saved_model for inferencing builder = saved_model_builder.SavedModelBuilder( args.export_dir) signature = predict_signature_def( inputs={'images': new_model.input}, outputs={'scores': new_model.output}) builder.add_meta_graph_and_variables( sess=sess, tags=[tag_constants.SERVING], signature_def_map={'predict': signature}, clear_devices=True) builder.save() if args.input_mode == 'spark': tf_feed.terminate()
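Once the chief has exported the saved_model, it can be reloaded in a plain TF1 session for scoring. A sketch, assuming export_dir points at the directory written by the builder above; the 'predict'/'images'/'scores' keys come from the signature defined in the code:

# Reload the exported Keras saved_model for inference (sketch, assumed path).
import numpy as np
import tensorflow as tf

export_dir = "/path/to/export_dir"  # wherever args.export_dir pointed
with tf.Session(graph=tf.Graph()) as sess:
  meta = tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                                    export_dir)
  sig = meta.signature_def['predict']
  x = sess.graph.get_tensor_by_name(sig.inputs['images'].name)
  scores = sess.graph.get_tensor_by_name(sig.outputs['scores'].name)
  preds = sess.run(scores, feed_dict={x: np.zeros((1, 784), dtype=np.float32)})
  print(preds.shape)  # (1, 10)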
def map_fun(args, ctx):
  from datetime import datetime
  from tensorflowonspark import TFNode
  import math
  import os
  import tensorflow as tf

  # IMAGE_PIXELS and hidden_units are not defined in this excerpt; the values
  # below are assumed from the sibling MNIST examples in this file.
  IMAGE_PIXELS = 28
  hidden_units = 128

  num_workers = args.cluster_size if args.driver_ps_nodes else args.cluster_size - args.num_ps
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.protocol == 'rdma')

  def _parse_tfr(example_proto):
    feature_def = {"label": tf.FixedLenFeature(10, tf.int64),
                   "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)}
    features = tf.parse_single_example(example_proto, feature_def)
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(tf.to_float(features['image']), norm)
    label = tf.to_float(features['label'])
    return (image, label)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                             stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # read from saved tf records
      images = TFNode.hdfs_path(ctx, args.tfrecord_dir)
      tf_record_pattern = os.path.join(images, 'part-*')
      tfr_files = tf.gfile.Glob(tf_record_pattern)
      ds = tf.data.TFRecordDataset(tfr_files)
      parse_fn = _parse_tfr
      ds = ds.shard(num_workers, task_index).repeat(args.epochs).shuffle(args.shuffle_size)
      ds = ds.map(parse_fn).batch(args.batch_size)
      iterator = ds.make_initializable_iterator()
      x, y_ = iterator.get_next()

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)
      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)
      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)
      train_op = tf.train.AdagradOptimizer(0.01).minimize(loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model_dir)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                           graph=tf.get_default_graph())
    sv = tf.train.Supervisor(is_chief=(task_index == 0),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             saver=saver,
                             global_step=global_step,
                             stop_grace_secs=300,
                             save_model_secs=10)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))
      sess.run(iterator.initializer)
      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      while not sv.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.
        # using QueueRunners/Readers
        if (step % 100 == 0):
          print("{0} step: {1} accuracy: {2}".format(
              datetime.now().isoformat(), step, sess.run(accuracy)))
        _, summary, step = sess.run([train_op, summary_op, global_step])
        if sv.is_chief:
          summary_writer.add_summary(summary, step)

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
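The _parse_tfr function above expects TFRecords whose image and label features are int64 lists. A hypothetical helper (not part of this file) showing how one MNIST example could be serialized in that layout:

# Sketch of producing Example protos matching _parse_tfr's feature_def:
# int64 "image" (784 values) and int64 "label" (10 one-hot values).
import tensorflow as tf

def to_tfr(image, label):
  # image: flat list of 784 ints; label: one-hot list of 10 ints
  example = tf.train.Example(features=tf.train.Features(feature={
      'image': tf.train.Feature(int64_list=tf.train.Int64List(value=image)),
      'label': tf.train.Feature(int64_list=tf.train.Int64List(value=label)),
  }))
  return example.SerializeToString()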
def map_fun(args, ctx):
  # from com.yahoo.ml.tf import TFNode
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  from tensorflow.contrib.layers.python.layers import batch_norm
  import time
  import os

  worker_num = ctx.worker_num      # number of workers
  job_name = ctx.job_name          # job name
  task_index = ctx.task_index      # task index
  cluster_spec = ctx.cluster_spec  # cluster spec

  IMAGE_PIXELS = 2  # image size; MNIST is 28x28x1 (adjust for your own image size)
  channels = 3
  num_class = 2
  # global dropout
  dropout = args.dropout

  # Parameters
  # hidden_units = 128            # NN hidden layer
  # training_epochs = args.epochs
  batch_size = args.batch_size    # number of samples per training batch
  # img_nums = 630000
  # global learning_rate
  # learning_rate = args.learning_rate
  INITIAL_LEARNING_RATE = args.learning_rate
  # flag = True
  # batch_size = 200
  num_examples_per_epoch_for_train = (4015 - 1)**2  # number of samples per iteration
  num_batches_per_epoch = int(num_examples_per_epoch_for_train / batch_size)
  num_epochs_per_decay = 1.2
  learning_rate_decay_rate = 0.8
  learning_rate_decay_steps = int(num_batches_per_epoch * num_epochs_per_decay)

  """
  # --------- set up a dynamic (decaying) learning rate
  # Constants describing the training process.
  # MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
  NUM_EPOCHS_PER_DECAY = batch_size   # Epochs after which learning rate decays.
  LEARNING_RATE_DECAY_FACTOR = 0.1    # Learning rate decay factor.
  INITIAL_LEARNING_RATE = 0.1         # Initial learning rate.
  global_step1 = training_epochs * (img_nums // batch_size)  # Integer Variable counting the number of training steps
  # Variables that affect learning rate.
  num_batches_per_epoch = img_nums / batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
  # Decay the learning rate exponentially based on the number of steps.
  learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                             global_step1,
                                             decay_steps,
                                             LEARNING_RATE_DECAY_FACTOR,
                                             staircase=True)
  # end of dynamic learning rate setup ----------
  """

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":  # ps (parameter server) node
    time.sleep((worker_num + 1) * 5)

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    if args.mode != 'inference':
      numpy.random.shuffle(batch)  # shuffle in place
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    # xs = xs/255.0  # simple rescaling
    # Z-score standardization:
    # mean = numpy.reshape(numpy.average(xs, 1), [numpy.shape(xs)[0], 1])
    # std = numpy.reshape(numpy.std(xs, 1), [numpy.shape(xs)[0], 1])
    # xs = (xs - mean) / std
    # min-max normalization
    max_ = numpy.reshape(numpy.max(xs, 1), [numpy.shape(xs)[0], 1])
    min_ = numpy.reshape(numpy.min(xs, 1), [numpy.shape(xs)[0], 1])
    xs = (xs - min_) / (max_ - min_)
    ys = numpy.array(labels)
    if args.mode != 'inference':
      ys = ys.astype(numpy.uint8)
    else:
      ys = ys.astype(numpy.uint16)
    return (xs, ys)

  def batch_norm_layer(inputT, is_training=True, scope=None):
    # Note: is_training is tf.placeholder(tf.bool) type
    return tf.cond(is_training,
                   lambda: batch_norm(inputT, is_training=True,
                                      center=True, scale=True, activation_fn=tf.nn.relu,
                                      decay=0.9, scope=scope),
                   lambda: batch_norm(inputT, is_training=False,
                                      center=True, scale=True, activation_fn=tf.nn.relu,
                                      decay=0.9, scope=scope))  # , reuse = True))

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Create some wrappers for simplicity
      def conv2d(x, W, b, strides=1):
        # Conv2D wrapper, with bias and relu activation
        x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
        x = tf.nn.bias_add(x, b)  # middle two strides of 1 mean no skipping in x/y
        return tf.nn.relu(x)

      def maxpool2d(x, k=2):
        # MaxPool2D wrapper
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='SAME')  # middle two strides of 2 sample every other pixel in x/y

      # Store layers weight & bias
      weights = {
          # conv kernel; color images have 3 input channels, grayscale images have 1
          'wc1': tf.get_variable('wc1', [3, 3, channels, 128], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,
                                 regularizer=tf.nn.l2_loss),  # 3x3 convolution kernel
          'wc2': tf.get_variable('wc2', [3, 3, 32, 64], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,
                                 regularizer=tf.nn.l2_loss),
          # fully connected
          'wd1': tf.Variable(tf.random_normal(
              [(IMAGE_PIXELS // 2) * (IMAGE_PIXELS // 2) * 128, 1024])),
          # 1024 inputs, num_class outputs (class prediction)
          'out': tf.Variable(tf.random_normal([1024, num_class]))
      }
      biases = {
          'bc1': tf.get_variable('bc1', [128], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,
                                 regularizer=tf.nn.l2_loss),
          'bc2': tf.get_variable('bc2', [64], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,
                                 regularizer=tf.nn.l2_loss),
          'bd1': tf.Variable(tf.random_normal([1024])),
          'out': tf.Variable(tf.random_normal([num_class]))
      }

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * channels],
                         name="x")  # mnist 28*28*1
      if args.mode != 'inference':
        y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
      else:
        y_ = tf.placeholder(tf.float32, [None, 4], name="y_")
      label = y_
      keep = tf.placeholder(tf.float32)
      is_training = tf.placeholder(tf.bool, name='MODE')

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])  # mnist data would be 28x28x1 (grayscale, single band)
      # x_img = batch_norm_layer(x_img, is_training)
      x_img = tf.nn.lrn(x_img, depth_radius=5, bias=2.0, alpha=1e-3, beta=0.75)  # LRN layer

      # convolutional model
      conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
      conv1 = maxpool2d(conv1, k=2)  # shape [N,1,1,32]
      conv1 = tf.nn.lrn(conv1, depth_radius=5, bias=2.0, alpha=1e-3, beta=0.75)  # LRN layer
      # conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
      # conv2 = maxpool2d(conv2, k=2)  # shape [N,1,1,32]
      # conv1 = tf.nn.dropout(conv1, keep+0.1)

      fc1 = tf.reshape(conv1, [-1, weights['wd1'].get_shape().as_list()[0]])
      fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
      # fc1 = batch_norm_layer(fc1, is_training)
      fc1 = tf.nn.relu(fc1)
      fc1 = tf.nn.dropout(fc1, keep)

      y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
      prediction = tf.argmax(y, 1, name="prediction")
      # y = tf.sigmoid(y)  # binary classification; add tf.nn.softmax() for multi-class

      global_step = tf.Variable(0, name="global_step", trainable=False)

      # loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      if args.mode != 'inference':
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
        # learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE, global_step,
        #                                            learning_rate_decay_steps, learning_rate_decay_rate,
        #                                            staircase=False)
        # learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
        #                                            global_step,
        #                                            10000,
        #                                            0.96,
        #                                            staircase=False)
        learning_rate = tf.train.polynomial_decay(
            INITIAL_LEARNING_RATE, global_step, 3000000, 1e-5, 0.8,
            True)  # ratio of run steps to decay_steps > 1000:1
        # train_op = tf.train.AdagradOptimizer(learning_rate).minimize(
        #     loss, global_step=global_step)
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
            loss, global_step=global_step)

        # Test trained model
        label = tf.argmax(y_, 1, name="label")
        # prediction = tf.argmax(y, 1, name="prediction")
        correct_prediction = tf.equal(prediction, label)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                  name="accuracy")
        # tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      # summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    # log.info("tensorflow model path: {0}".format(logdir))
    # summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               # summary_op=None,
                               saver=saver,  # saver=None disables automatic model saving
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    elif args.mode == "retrain":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # init_op=init_op,
                               # summary_op=None,
                               saver=saver,  # saver=None disables automatic model saving
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:  # open the session
      """
      # Check whether a checkpoint file was saved previously
      ckpt = tf.train.get_checkpoint_state(logdir)
      if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
      """
      # global_step = int(ckpt.model_checkpoint_path.rsplit('-', 1)[1])
      # else:
      #   sess.run(init_op)
      print("{0} session ready".format(datetime.now().isoformat()))
      # log.info("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      # acc1 = args.acc
      # n = 0
      tf_feed = TFNode.DataFeed(ctx.mgr,
                                args.mode == "train" or args.mode == "retrain")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.
        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys, keep: dropout, is_training: True}
        if len(batch_xs) > 0:
          if args.mode == "train" or args.mode == "retrain":
            # _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            _, step = sess.run([train_op, global_step], feed_dict=feed)
            '''
            if dropout > 0.2:
              if step % 10000 == 0: dropout = dropout * 0.85
            else:
              dropout = 0.7
            '''
            """
            acc = sess.run(accuracy, {x: batch_xs, y_: batch_ys, keep: 1.})
            if acc > acc1:
              if flag and acc > 0.9:
                os.popen('hdfs dfs -rm -r ' + logdir + '/*')  # wipe all files under the HDFS model folder
                flag = False
              # acc1 = acc  # enable once training has progressed far enough
              saver.save(sess, logdir + '/' + args.model_name, global_step=step)
              n = 0
              # learning_rate = 1e-3
              # dropout = .7
            else:
              n += 1
              if n > 100:
                ckpt1 = tf.train.get_checkpoint_state(logdir)
                if ckpt1 and ckpt1.model_checkpoint_path:
                  saver.restore(sess, ckpt1.model_checkpoint_path)
                if learning_rate > 1e-7:
                  # learning_rate = learning_rate * .96**(step/10)
                  learning_rate = learning_rate * .8
                else:
                  learning_rate = 1e-3
                if dropout > 0.2:
                  dropout = dropout * .85
                else:
                  dropout = .7
            """
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(
                  datetime.now().isoformat(), step,
                  sess.run(accuracy, {x: batch_xs, y_: batch_ys,
                                      keep: 1., is_training: False})))
              # log.info("{0} step: {1} accuracy: {2}".format(...))
            if sv.is_chief:
              pass
              # summary_writer.add_summary(summary, step)
          elif args.mode == 'test':
            feed2 = {x: batch_xs, y_: batch_ys, keep: 1., is_training: False}
            labels, preds, acc = sess.run([label, prediction, accuracy],
                                          feed_dict=feed2)
            results = ["{0} Label: {1}, Prediction: {2}".format(
                datetime.now().isoformat(), l, p) for l, p in zip(labels, preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))
          else:  # args.mode == "inference"
            feed2 = {x: batch_xs, y_: batch_ys, keep: 1., is_training: False}
            # labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed2)
            labels, preds = sess.run([label, prediction], feed_dict=feed2)
            # results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l, p in zip(labels, preds)]
            results = ["Label: {0}, Prediction: {1}".format(l, p)
                       for l, p in zip(labels, preds)]
            tf_feed.batch_results(results)
            # print("acc: {0}".format(acc))
            # log.info("acc: {0}".format(acc))

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    # log.info("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
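Note that the min-max normalization in feed_dict above divides by (max_ - min_), which is zero whenever a row is constant. A guarded standalone variant, as a sketch:

# Per-row min-max scaling with an epsilon guard so constant rows
# (max == min) don't produce NaNs, unlike the bare (xs - min) / (max - min).
import numpy as np

def minmax_rows(xs, eps=1e-8):
  xs = xs.astype(np.float32)
  mn = xs.min(axis=1, keepdims=True)
  mx = xs.max(axis=1, keepdims=True)
  return (xs - mn) / np.maximum(mx - mn, eps)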
def map_fun(args, ctx): from tensorflowonspark import TFNode from datetime import datetime import math import numpy import tensorflow as tf import time worker_num = ctx.worker_num job_name = ctx.job_name task_index = ctx.task_index IMAGE_PIXELS = 28 # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict) if job_name == "ps": time.sleep((worker_num + 1) * 5) # Parameters hidden_units = 128 batch_size = args.batch_size # Get TF cluster and server instances cluster, server = TFNode.start_cluster_server(ctx, 1, args.protocol == 'rdma') def feed_dict(batch): # Convert from dict of named arrays to two numpy arrays of the proper type images = batch['image'] labels = batch['label'] xs = numpy.array(images) xs = xs.astype(numpy.float32) xs = xs / 255.0 ys = numpy.array(labels) ys = ys.astype(numpy.uint8) return (xs, ys) if job_name == "ps": server.join() elif job_name == "worker": # Assigns ops to the local worker by default. with tf.device(tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % task_index, cluster=cluster)): # Variables of the hidden layer hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units], stddev=1.0 / IMAGE_PIXELS), name="hid_w") hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b") tf.summary.histogram("hidden_weights", hid_w) # Variables of the softmax layer sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10], stddev=1.0 / math.sqrt(hidden_units)), name="sm_w") sm_b = tf.Variable(tf.zeros([10]), name="sm_b") tf.summary.histogram("softmax_weights", sm_w) # Placeholders or QueueRunner/Readers for input data x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x") y_ = tf.placeholder(tf.float32, [None, 10], name="y_") x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1]) tf.summary.image("x_img", x_img) hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) hid = tf.nn.relu(hid_lin) y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) global_step = tf.Variable(0) loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0))) tf.summary.scalar("loss", loss) train_op = tf.train.AdagradOptimizer(0.01).minimize( loss, global_step=global_step) # Test trained model label = tf.argmax(y_, 1, name="label") prediction = tf.argmax(y, 1, name="prediction") correct_prediction = tf.equal(prediction, label) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy") tf.summary.scalar("acc", accuracy) saver = tf.train.Saver() summary_op = tf.summary.merge_all() init_op = tf.global_variables_initializer() # Create a "supervisor", which oversees the training process and stores model state into HDFS logdir = TFNode.hdfs_path(ctx, args.model_dir) print("tensorflow model path: {0}".format(logdir)) summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph()) sv = tf.train.Supervisor(is_chief=(task_index == 0), logdir=logdir, init_op=init_op, summary_op=None, saver=saver, global_step=global_step, stop_grace_secs=300, save_model_secs=10) # The supervisor takes care of session initialization, restoring from # a checkpoint, and closing when done or an error occurs. with sv.managed_session(server.target) as sess: print("{0} session ready".format(datetime.now().isoformat())) # Loop until the supervisor shuts down or 1000000 steps have completed. 
step = 0 tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping) while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps: # Run a training step asynchronously. # See `tf.train.SyncReplicasOptimizer` for additional details on how to # perform *synchronous* training. # using feed_dict batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size)) feed = {x: batch_xs, y_: batch_ys} if len(batch_xs) > 0: _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed) # print accuracy and save model checkpoint to HDFS every 100 steps if (step % 100 == 0): print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy, {x: batch_xs, y_: batch_ys}))) if sv.is_chief: summary_writer.add_summary(summary, step) if sv.should_stop() or step >= args.steps: tf_feed.terminate() if sv.is_chief and args.export_dir: print("{0} exporting saved_model to: {1}".format(datetime.now().isoformat(), args.export_dir)) # exported signatures defined in code signatures = { tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: { 'inputs': {'image': x}, 'outputs': {'prediction': prediction}, 'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME }, 'featurize': { 'inputs': {'image': x}, 'outputs': {'features': hid}, 'method_name': 'featurize' } } TFNode.export_saved_model(sess, args.export_dir, tf.saved_model.tag_constants.SERVING, signatures) else: # non-chief workers should wait for chief while not sv.should_stop(): print("Waiting for chief") time.sleep(5) # Ask for all the services to stop. print("{0} stopping supervisor".format(datetime.now().isoformat())) sv.stop()
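The extra 'featurize' signature exported above can be used for feature extraction rather than classification. A sketch of reloading it in a fresh session, assuming export_dir is the path that was passed as args.export_dir:

# Reload the exported model and call the secondary 'featurize' signature
# to get hidden-layer activations (sketch, assumed path).
import numpy as np
import tensorflow as tf

export_dir = "/path/to/export_dir"  # the args.export_dir used above
with tf.Session(graph=tf.Graph()) as sess:
  meta = tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                                    export_dir)
  sig = meta.signature_def['featurize']
  image = sess.graph.get_tensor_by_name(sig.inputs['image'].name)
  features = sess.graph.get_tensor_by_name(sig.outputs['features'].name)
  feats = sess.run(features, feed_dict={image: np.zeros((1, 784), dtype=np.float32)})
  print(feats.shape)  # (1, 128) -- one row per image, hidden_units columns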
def main_fun(argv, ctx): import tensorflow as tf import cifar10 sys.argv = argv FLAGS = tf.app.flags.FLAGS tf.app.flags.DEFINE_string('eval_dir', '/tmp/cifar10_eval', """Directory where to write event logs.""") tf.app.flags.DEFINE_string('eval_data', 'test', """Either 'test' or 'train_eval'.""") tf.app.flags.DEFINE_string( 'checkpoint_dir', '/tmp/cifar10_train', """Directory where to read model checkpoints.""") tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5, """How often to run the eval.""") tf.app.flags.DEFINE_integer('num_examples', 10000, """Number of examples to run.""") tf.app.flags.DEFINE_boolean('run_once', False, """Whether to run eval only once.""") tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""") cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma) def eval_once(saver, summary_writer, top_k_op, summary_op): """Run Eval once. Args: saver: Saver. summary_writer: Summary writer. top_k_op: Top K op. summary_op: Summary op. """ with tf.Session() as sess: ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) if ckpt and ckpt.model_checkpoint_path: # Restores from checkpoint saver.restore(sess, ckpt.model_checkpoint_path) # Assuming model_checkpoint_path looks something like: # /my-favorite-path/cifar10_train/model.ckpt-0, # extract global_step from it. global_step = ckpt.model_checkpoint_path.split('/')[-1].split( '-')[-1] else: print('No checkpoint file found') return # Start the queue runners. coord = tf.train.Coordinator() try: threads = [] for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS): threads.extend( qr.create_threads(sess, coord=coord, daemon=True, start=True)) num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size)) true_count = 0 # Counts the number of correct predictions. total_sample_count = num_iter * FLAGS.batch_size step = 0 while step < num_iter and not coord.should_stop(): predictions = sess.run([top_k_op]) true_count += np.sum(predictions) step += 1 # Compute precision @ 1. precision = true_count / total_sample_count print('%s: precision @ 1 = %.3f' % (datetime.now(), precision)) summary = tf.Summary() summary.ParseFromString(sess.run(summary_op)) summary.value.add(tag='Precision @ 1', simple_value=precision) summary_writer.add_summary(summary, global_step) except Exception as e: # pylint: disable=broad-except coord.request_stop(e) coord.request_stop() coord.join(threads, stop_grace_period_secs=10) def evaluate(): """Eval CIFAR-10 for a number of steps.""" with tf.Graph().as_default() as g: # Get images and labels for CIFAR-10. eval_data = FLAGS.eval_data == 'test' images, labels = cifar10.inputs(eval_data=eval_data) # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate predictions. top_k_op = tf.nn.in_top_k(logits, labels, 1) # Restore the moving average version of the learned variables for eval. variable_averages = tf.train.ExponentialMovingAverage( cifar10.MOVING_AVERAGE_DECAY) variables_to_restore = variable_averages.variables_to_restore() saver = tf.train.Saver(variables_to_restore) # Build the summary operation based on the TF collection of Summaries. 
summary_op = tf.summary.merge_all() summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g) while True: eval_once(saver, summary_writer, top_k_op, summary_op) if FLAGS.run_once: break time.sleep(FLAGS.eval_interval_secs) #cifar10.maybe_download_and_extract() if tf.gfile.Exists(FLAGS.eval_dir): tf.gfile.DeleteRecursively(FLAGS.eval_dir) tf.gfile.MakeDirs(FLAGS.eval_dir) evaluate()
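For reference, the global_step recovery in eval_once above is plain string surgery on the checkpoint path, as the inline comment describes:

# Worked example of the global_step extraction used in eval_once.
path = "/my-favorite-path/cifar10_train/model.ckpt-0"
global_step = path.split('/')[-1].split('-')[-1]
print(global_step)  # "0" -- the step suffix appended by Saver.save()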
def map_fun(args, ctx):
  # from com.yahoo.ml.tf import TFNode
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num      # number of workers
  job_name = ctx.job_name          # job name
  task_index = ctx.task_index      # task index
  cluster_spec = ctx.cluster_spec  # cluster spec

  IMAGE_PIXELS = 10  # image size; MNIST is 28x28x1 (adjust for your own image size)
  channels = 3
  num_class = 2
  dropout = 0.5
  learning_rate = 1e-6

  # Parameters
  hidden_units = 128  # NN hidden layer
  training_epochs = args.epochs
  img_nums = 630000
  # batch_size = args.batch_size  # number of samples per training batch
  batch_size = 200

  """
  # --------- set up a dynamic (decaying) learning rate
  # Constants describing the training process.
  # MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
  NUM_EPOCHS_PER_DECAY = batch_size   # Epochs after which learning rate decays.
  LEARNING_RATE_DECAY_FACTOR = 0.1    # Learning rate decay factor.
  INITIAL_LEARNING_RATE = 0.1         # Initial learning rate.
  global_step1 = training_epochs * (img_nums // batch_size)  # Integer Variable counting the number of training steps
  # Variables that affect learning rate.
  num_batches_per_epoch = img_nums / batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
  # Decay the learning rate exponentially based on the number of steps.
  learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                             global_step1,
                                             decay_steps,
                                             LEARNING_RATE_DECAY_FACTOR,
                                             staircase=True)
  # end of dynamic learning rate setup ----------
  """

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":  # ps (parameter server) node
    time.sleep((worker_num + 1) * 5)

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    numpy.random.shuffle(batch)  # shuffle in place
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    # xs = xs/255.0  # simple rescaling
    # Z-score standardization:
    # mean = numpy.reshape(numpy.average(xs, 1), [numpy.shape(xs)[0], 1])
    # std = numpy.reshape(numpy.std(xs, 1), [numpy.shape(xs)[0], 1])
    # xs = (xs - mean) / std
    # min-max normalization
    max_ = numpy.reshape(numpy.max(xs, 1), [numpy.shape(xs)[0], 1])
    min_ = numpy.reshape(numpy.min(xs, 1), [numpy.shape(xs)[0], 1])
    xs = (xs - min_) / (max_ - min_)
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Create some wrappers for simplicity
      def conv2d(x, W, b, strides=1):
        # Conv2D wrapper, with bias and relu activation
        x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
        x = tf.nn.bias_add(x, b)  # middle two strides of 1 mean no skipping in x/y
        return tf.nn.relu(x)

      def maxpool2d(x, k=2):
        # MaxPool2D wrapper
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='SAME')  # middle two strides of 2 sample every other pixel in x/y

      def maxpool2d2(x, k=2):
        # MaxPool2D wrapper
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='VALID')  # middle two strides of 2 sample every other pixel in x/y

      # Store layers weight & bias
      weights = {
          # conv kernel; color images have 3 input channels, grayscale images have 1
          'wc1': tf.get_variable('wc1', [3, 3, channels, 64], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,
                                 regularizer=tf.nn.l2_loss),  # 3x3 convolution kernel
          'wc2': tf.get_variable('wc2', [3, 3, 64, 128], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,
                                 regularizer=tf.nn.l2_loss),
          # 'wc3': tf.Variable(tf.random_normal([3, 3, 256, 128])),
          'wc4': tf.get_variable('wc4', [3, 3, 128, num_class], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,
                                 regularizer=tf.nn.l2_loss),
          # fully connected
          # 'wd1': tf.Variable(tf.random_normal([(1 + IMAGE_PIXELS // 4) * (1 + IMAGE_PIXELS // 4) * 64, 1024])),
          # 1024 inputs, 10 outputs (class prediction)
          # 'out': tf.Variable(tf.random_normal([1024, num_class]))
      }
      biases = {
          'bc1': tf.get_variable('bc1', [64], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,
                                 regularizer=tf.nn.l2_loss),
          'bc2': tf.get_variable('bc2', [128], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,
                                 regularizer=tf.nn.l2_loss),
          # 'bc3': tf.Variable(tf.random_normal([128])),
          'bc4': tf.get_variable('bc4', [num_class], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,
                                 regularizer=tf.nn.l2_loss),
          # 'bd1': tf.Variable(tf.random_normal([1024])),
          # 'out': tf.Variable(tf.random_normal([num_class]))
      }

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * channels],
                         name="x")  # mnist 28*28*1
      y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
      # keep = tf.placeholder(tf.float32)
      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])  # mnist data would be 28x28x1 (grayscale, single band)
      # tf.summary.image("x_img", x_img)

      # convolutional model
      conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
      conv1 = maxpool2d(conv1, k=2)
      # conv1 = tf.nn.dropout(conv1, keep)
      conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
      conv2 = maxpool2d(conv2, k=2)
      conv2 = tf.nn.dropout(conv2, dropout)
      # conv3 = conv2d(conv2, weights['wc3'], biases['bc3'])
      # conv3 = tf.nn.dropout(conv3, keep)
      conv4 = conv2d(conv2, weights['wc4'], biases['bc4'])
      conv4 = maxpool2d2(conv4, k=2)
      y = tf.reshape(conv4, [-1, num_class])

      # fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
      # fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
      # fc1 = tf.nn.relu(fc1)
      # if args.mode == "train" or args.mode == "retrain":
      #   fc1 = tf.nn.dropout(fc1, dropout)
      # y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])

      # global_step = tf.Variable(0)
      global_step = tf.Variable(0, name="global_step", trainable=False)

      # loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      loss = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
      # tf.summary.scalar("loss", loss)
      train_op = tf.train.AdagradOptimizer(learning_rate).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                name="accuracy")
      # tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      # summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    # log.info("tensorflow model path: {0}".format(logdir))
    # summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=1)
    elif args.mode == "retrain":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:  # open the session
      print("{0} session ready".format(datetime.now().isoformat()))
      # log.info("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr,
                                args.mode == "train" or args.mode == "retrain")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}
        if len(batch_xs) > 0:
          if args.mode == "train" or args.mode == "retrain":
            # _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            _, step = sess.run([train_op, global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(
                  datetime.now().isoformat(), step,
                  sess.run(accuracy, {x: batch_xs, y_: batch_ys})))
              # log.info("{0} step: {1} accuracy: {2}".format(...))
            if sv.is_chief:
              pass
              # summary_writer.add_summary(summary, step)
          else:  # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy],
                                          feed_dict=feed)
            results = ["{0} Label: {1}, Prediction: {2}".format(
                datetime.now().isoformat(), l, p) for l, p in zip(labels, preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))
            # log.info("acc: {0}".format(acc))

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
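The final tf.reshape(conv4, [-1, num_class]) in this fully-convolutional head only works because the three pooling stages collapse the 10x10 input to a 1x1 feature map. A quick shape walk-through under the sizes set above (IMAGE_PIXELS = 10):

# Pooled output sizes: SAME pooling rounds up, VALID pooling truncates.
import math

def same_pool(n, k=2):   # SAME pooling with stride k
  return int(math.ceil(n / float(k)))

def valid_pool(n, k=2):  # VALID pooling with stride k
  return (n - k) // k + 1

n = 10              # IMAGE_PIXELS
n = same_pool(n)    # after first maxpool2d: 5
n = same_pool(n)    # after second maxpool2d: 3
n = valid_pool(n)   # after maxpool2d2: 1 -> feature map is 1x1xnum_class
print(n)            # 1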
def main_fun(argv, ctx): import tensorflow as tf import cifar10 sys.argv = argv FLAGS = tf.app.flags.FLAGS tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train', """Directory where to write event logs """ """and checkpoint.""") tf.app.flags.DEFINE_integer('max_steps', 1000000, """Number of batches to run.""") tf.app.flags.DEFINE_integer('num_gpus', 1, """How many GPUs to use.""") tf.app.flags.DEFINE_boolean('log_device_placement', False, """Whether to log device placement.""") tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""") cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma) def tower_loss(scope): """Calculate the total loss on a single tower running the CIFAR model. Args: scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0' Returns: Tensor of shape [] containing the total loss for a batch of data """ # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build inference Graph. logits = cifar10.inference(images) # Build the portion of the Graph calculating the losses. Note that we will # assemble the total_loss using a custom function below. _ = cifar10.loss(logits, labels) # Assemble all of the losses for the current tower only. losses = tf.get_collection('losses', scope) # Calculate the total loss for the current tower. total_loss = tf.add_n(losses, name='total_loss') # Attach a scalar summary to all individual losses and the total loss; do the # same for the averaged version of the losses. for l in losses + [total_loss]: # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training # session. This helps the clarity of presentation on tensorboard. loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name) tf.summary.scalar(loss_name, l) return total_loss def average_gradients(tower_grads): """Calculate the average gradient for each shared variable across all towers. Note that this function provides a synchronization point across all towers. Args: tower_grads: List of lists of (gradient, variable) tuples. The outer list is over individual gradients. The inner list is over the gradient calculation for each tower. Returns: List of pairs of (gradient, variable) where the gradient has been averaged across all towers. """ average_grads = [] for grad_and_vars in zip(*tower_grads): # Note that each grad_and_vars looks like the following: # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) grads = [] for g, _ in grad_and_vars: # Add 0 dimension to the gradients to represent the tower. expanded_g = tf.expand_dims(g, 0) # Append on a 'tower' dimension which we will average over below. grads.append(expanded_g) # Average over the 'tower' dimension. grad = tf.concat(axis=0, values=grads) grad = tf.reduce_mean(grad, 0) # Keep in mind that the Variables are redundant because they are shared # across towers. So .. we will just return the first tower's pointer to # the Variable. v = grad_and_vars[0][1] grad_and_var = (grad, v) average_grads.append(grad_and_var) return average_grads def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to count the number of train() calls. This equals the # number of batches processed * FLAGS.num_gpus. global_step = tf.get_variable( 'global_step', [], initializer=tf.constant_initializer(0), trainable=False) # Calculate the learning rate schedule. 
      num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                               FLAGS.batch_size)
      decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

      # Decay the learning rate exponentially based on the number of steps.
      lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                      global_step,
                                      decay_steps,
                                      cifar10.LEARNING_RATE_DECAY_FACTOR,
                                      staircase=True)

      # Create an optimizer that performs gradient descent.
      opt = tf.train.GradientDescentOptimizer(lr)

      # Calculate the gradients for each model tower.
      tower_grads = []
      with tf.variable_scope(tf.get_variable_scope()):
        for i in xrange(FLAGS.num_gpus):
          with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
              # Calculate the loss for one tower of the CIFAR model. This
              # function constructs the entire CIFAR model but shares the
              # variables across all towers.
              loss = tower_loss(scope)

              # Reuse variables for the next tower.
              tf.get_variable_scope().reuse_variables()

              # Retain the summaries from the final tower.
              summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

              # Calculate the gradients for the batch of data on this CIFAR tower.
              grads = opt.compute_gradients(loss)

              # Keep track of the gradients across all towers.
              tower_grads.append(grads)

      # We must calculate the mean of each gradient. Note that this is the
      # synchronization point across all towers.
      grads = average_gradients(tower_grads)

      # Add a summary to track the learning rate.
      summaries.append(tf.summary.scalar('learning_rate', lr))

      # Add histograms for gradients.
      for grad, var in grads:
        if grad is not None:
          summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))

      # Apply the gradients to adjust the shared variables.
      apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

      # Add histograms for trainable variables.
      for var in tf.trainable_variables():
        summaries.append(tf.summary.histogram(var.op.name, var))

      # Track the moving averages of all trainable variables.
      variable_averages = tf.train.ExponentialMovingAverage(
          cifar10.MOVING_AVERAGE_DECAY, global_step)
      variables_averages_op = variable_averages.apply(tf.trainable_variables())

      # Group all updates into a single train op.
      train_op = tf.group(apply_gradient_op, variables_averages_op)

      # Create a saver.
      saver = tf.train.Saver(tf.global_variables())

      # Build the summary operation from the last tower summaries.
      summary_op = tf.summary.merge(summaries)

      # Build an initialization operation to run below.
      init = tf.global_variables_initializer()

      # Start running operations on the Graph. allow_soft_placement must be set
      # to True to build towers on GPU, as some of the ops do not have GPU
      # implementations.
      sess = tf.Session(config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement))
      sess.run(init)

      # Start the queue runners.
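      # Note: cifar10.distorted_inputs() builds its input pipeline on tf.train
      # queues; start_queue_runners launches the background threads that read
      # and preprocess images. Without it, the sess.run() calls in the loop
      # below would block indefinitely on an empty queue.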
      tf.train.start_queue_runners(sess=sess)

      summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

      for step in xrange(FLAGS.max_steps):
        start_time = time.time()
        _, loss_value = sess.run([train_op, loss])
        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = duration / FLAGS.num_gpus

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print(format_str % (datetime.now(), step, loss_value,
                              examples_per_sec, sec_per_batch))

        if step % 100 == 0:
          summary_str = sess.run(summary_op)
          summary_writer.add_summary(summary_str, step)

        # Save the model checkpoint periodically.
        if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
          checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
          saver.save(sess, checkpoint_path, global_step=step)

  # cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)
  train()
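For context, a per-executor function like main_fun is normally handed to TFCluster.run from a Spark driver script. A minimal sketch of such a driver, assuming InputMode.TENSORFLOW (the app name, executor count, and ps count below are illustrative, not taken from this example):

import sys
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from tensorflowonspark import TFCluster

sc = SparkContext(conf=SparkConf().setAppName("cifar10_multigpu"))  # illustrative app name
num_executors = int(sc._conf.get("spark.executor.instances", "4"))  # illustrative default
num_ps = 1  # illustrative

# Each executor invokes main_fun(argv, ctx). InputMode.TENSORFLOW means the TF
# graph reads its own data (here via cifar10.distorted_inputs), so no RDD is fed.
cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps,
                        tensorboard=False, input_mode=TFCluster.InputMode.TENSORFLOW)
cluster.shutdown()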