def end(self, session):
  if self.mode != 'train':
    return

  print("{} ======= Exporting to: {}".format(datetime.now().isoformat(), self.export_dir))
  signatures = {
    tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: {
      'inputs': {'image': self.input_tensor},
      'outputs': {'prediction': self.output_tensor},
      'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME
    }
  }
  # save and export the model
  TFNode.export_saved_model(session, self.export_dir, tf.saved_model.tag_constants.SERVING, signatures)
  print("{} ======= Done exporting".format(datetime.now().isoformat()))
def _tf_export(args):
  """Creates an inference graph w/ placeholder and loads weights from checkpoint"""
  import tensorflow as tf
  from tensorflowonspark import TFNode

  tf.reset_default_graph()                      # reset graph in case we're re-using a Spark python worker

  x = tf.placeholder(tf.float32, [None, 2], name='x')
  w = tf.Variable(tf.truncated_normal([2, 1]), name='w')
  y = tf.matmul(x, w, name='y')
  y2 = tf.square(y, name="y2")                  # extra/optional output for testing multiple output tensors
  saver = tf.train.Saver()

  with tf.Session() as sess:
    # load graph from a checkpoint
    ckpt = tf.train.get_checkpoint_state(args.model_dir)
    assert ckpt and ckpt.model_checkpoint_path, "Invalid model checkpoint path: {}".format(args.model_dir)
    saver.restore(sess, ckpt.model_checkpoint_path)

    # exported signatures defined in code
    signatures = {
      'test_key': {
        'inputs': {'features': x},
        'outputs': {'prediction': y, 'pred2': y2},
        'method_name': 'test'
      }
    }
    TFNode.export_saved_model(sess, export_dir=args.export_dir, tag_set='test_tag', signatures=signatures)
def sample(args, sc):
  defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
  working_dir = os.getcwd()

  config_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'config.p'), defaultFS, working_dir)
  saved_args = sc.pickleFile(config_file).collect()[0]
  chars_vocab_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'chars_vocab.p'), defaultFS, working_dir)
  chars, vocab = sc.pickleFile(chars_vocab_file).collect()

  model = Model(saved_args, training=False)
  with tf.Session() as sess:
    tf.global_variables_initializer().run()
    saver = tf.train.Saver()
    save_dir = TFNode.hdfs_path(os.path.join(args.save_dir, ''), defaultFS, working_dir)
    ckpt = tf.train.get_checkpoint_state(save_dir)
    if ckpt and ckpt.model_checkpoint_path:
      saver.restore(sess, ckpt.model_checkpoint_path)
      sample_ = model.sample(sess, chars, vocab, args.n, args.prime, args.sample)
      with hdfs.open(TFNode.hdfs_path(os.path.join(args.output_dir, 'output.txt'), defaultFS, working_dir), 'w') as f:
        f.write(sample_)
def _map_fun(args, ctx):
  import tensorflow as tf

  cluster, server = TFNode.start_cluster_server(ctx)
  if ctx.job_name == "ps":
    server.join()
  elif ctx.job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % ctx.task_index,
        cluster=cluster)):
      x = tf.placeholder(tf.int32, [None, 1])
      sq = tf.square(x)
      init_op = tf.global_variables_initializer()

    with tf.train.MonitoredTrainingSession(is_chief=(ctx.task_index == 0)) as sess:
      tf_feed = TFNode.DataFeed(ctx.mgr, False)
      while not sess.should_stop() and not tf_feed.should_stop():
        batch = tf_feed.next_batch(10)
        if len(batch) > 0:
          outputs = sess.run([sq], feed_dict={x: batch})
          tf_feed.batch_results(outputs[0])

      # simulate post-feed actions that raise an exception
      time.sleep(2)
      raise Exception("FAKE exception after feeding")
def __call__(self, args, ctx):
  self.task_index = ctx.task_index
  self.job_name = ctx.job_name
  self.cluster, self.server = TFNode.start_cluster_server(ctx)
  self.tf_feed = TFNode.DataFeed(ctx.mgr)

  if ctx.job_name == "ps":
    self.server.join()
  elif ctx.job_name == "worker":
    self.build_model()
    self.execute()
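# --- Illustrative driver-side launch for a callable worker object like the one above.
# A minimal sketch, assuming a hypothetical `Trainer` class that defines the
# __call__(args, ctx) shown above; `sc`, `args`, `num_executors`, and `data_rdd`
# are driver-side assumptions, not part of the original snippet.
from tensorflowonspark import TFCluster

def launch(sc, args, num_executors, data_rdd):
  trainer = Trainer()  # hypothetical; any object with __call__(args, ctx) works as a map function
  cluster = TFCluster.run(sc, trainer, args, num_executors, num_ps=1,
                          tensorboard=False, input_mode=TFCluster.InputMode.SPARK)
  cluster.train(data_rdd)   # pushes RDD partitions into the DataFeed 'input' queues
  cluster.shutdown()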
def _spark_train(args, ctx):
  """Basic linear regression in a distributed TF cluster using InputMode.SPARK"""
  import tensorflow as tf
  from tensorflowonspark import TFNode

  tf.reset_default_graph()                      # reset graph in case we're re-using a Spark python worker

  cluster, server = TFNode.start_cluster_server(ctx)
  if ctx.job_name == "ps":
    server.join()
  elif ctx.job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % ctx.task_index,
        cluster=cluster)):
      x = tf.placeholder(tf.float32, [None, 2], name='x')
      y_ = tf.placeholder(tf.float32, [None, 1], name='y_')
      w = tf.Variable(tf.truncated_normal([2, 1]), name='w')
      y = tf.matmul(x, w, name='y')
      y2 = tf.square(y, name="y2")              # extra/optional output for testing multiple output tensors
      cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
      optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cost)

    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver()
    sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0), init_op=init_op)
    with sv.managed_session(server.target) as sess:
      tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
      while not sv.should_stop() and not tf_feed.should_stop():
        batch = tf_feed.next_batch(10)
        if args.input_mapping:
          if len(batch['x']) > 0:
            feed = {x: batch['x'], y_: batch['y_']}
            opt = sess.run(optimizer, feed_dict=feed)

      if sv.is_chief:
        if args.model_dir:
          # manually save checkpoint
          ckpt_name = args.model_dir + "/model.ckpt"
          print("Saving checkpoint to: {}".format(ckpt_name))
          saver.save(sess, ckpt_name)
        elif args.export_dir:
          # export a saved_model
          signatures = {
            'test_key': {
              'inputs': {'features': x},
              'outputs': {'prediction': y},
              'method_name': 'test'
            }
          }
          TFNode.export_saved_model(sess, export_dir=args.export_dir, tag_set='test_tag', signatures=signatures)
        else:
          print("WARNING: model state not saved.")

    sv.stop()
def __call__(self, args, ctx):
  self.task_index = ctx.task_index
  self.job_name = ctx.job_name
  self.cluster, self.server = TFNode.start_cluster_server(ctx)
  self.tf_feed = TFNode.DataFeed(ctx.mgr)

  if ctx.job_name == "ps":
    self.server.join()
  elif ctx.job_name == "worker":
    self.create_tmp_dir()
    self.process()
    self.delete_tmp_dir()
def export_fun(args):
  """Define/export a single-node TF graph for inferencing"""
  # Input placeholder for inferencing
  x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")

  # Variables of the hidden layer
  hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                          stddev=1.0 / IMAGE_PIXELS), name="hid_w")
  hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
  tf.summary.histogram("hidden_weights", hid_w)

  # Variables of the softmax layer
  sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                         stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
  sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

  hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
  hid = tf.nn.relu(hid_lin)
  y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
  prediction = tf.argmax(y, 1, name="prediction")

  saver = tf.train.Saver()
  with tf.Session() as sess:
    # load graph from a checkpoint
    logging.info("model path: {}".format(args.model_dir))
    ckpt = tf.train.get_checkpoint_state(args.model_dir)
    logging.info("ckpt: {}".format(ckpt))
    assert ckpt and ckpt.model_checkpoint_path, "Invalid model checkpoint path: {}".format(args.model_dir)
    saver.restore(sess, ckpt.model_checkpoint_path)

    logging.info("Exporting saved_model to: {}".format(args.export_dir))
    # exported signatures defined in code
    signatures = {
      tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: {
        'inputs': {'image': x},
        'outputs': {'prediction': prediction},
        'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME
      },
      'featurize': {
        'inputs': {'image': x},
        'outputs': {'features': hid},
        'method_name': 'featurize'
      }
    }
    TFNode.export_saved_model(sess, args.export_dir, tf.saved_model.tag_constants.SERVING, signatures)
    logging.info("Exported saved_model")
def feed_dict(mgr, batch_size):
  tmp = TFNode.next_batch(mgr, batch_size)

  # extract TFRecords, since tmp array is [(TFRecord, None)]
  tfrecords = []
  for elem in tmp:
    tfrecords.append(str(elem[0]))
  return tfrecords
def test_datafeed(self): """TFNode.DataFeed basic operations""" mgr = TFManager.start('abc', ['input', 'output'], 'local') # insert 10 numbers followed by an end-of-feed marker q = mgr.get_queue('input') for i in range(10): q.put(i) q.put(None) # end-of-feed marker feed = TFNode.DataFeed(mgr) # [0,1] self.assertFalse(feed.done_feeding) batch = feed.next_batch(2) self.assertEqual(len(batch), 2) self.assertEqual(sum(batch), 1) # [2,3,4,5] self.assertFalse(feed.done_feeding) batch = feed.next_batch(4) self.assertEqual(len(batch), 4) self.assertEqual(sum(batch), 14) # [6,7,8,9] self.assertFalse(feed.done_feeding) batch = feed.next_batch(10) # ask for more than available self.assertEqual(len(batch), 4) self.assertEqual(sum(batch), 30) # should be done self.assertTrue(feed.should_stop())
def test_hdfs_path(self): """Normalization of absolution & relative string paths depending on filesystem""" cwd = os.getcwd() user = getpass.getuser() fs = ["file://", "hdfs://", "viewfs://"] paths = { "hdfs://foo/bar": ["hdfs://foo/bar", "hdfs://foo/bar", "hdfs://foo/bar"], "viewfs://foo/bar": ["viewfs://foo/bar", "viewfs://foo/bar", "viewfs://foo/bar"], "file://foo/bar": ["file://foo/bar", "file://foo/bar", "file://foo/bar"], "/foo/bar": ["file:///foo/bar", "hdfs:///foo/bar", "viewfs:///foo/bar"], "foo/bar": [ "file://{}/foo/bar".format(cwd), "hdfs:///user/{}/foo/bar".format(user), "viewfs:///user/{}/foo/bar".format(user) ], } for i in range(len(fs)): ctx = type('MockContext', (), { 'defaultFS': fs[i], 'working_dir': cwd }) for path, expected in paths.items(): final_path = TFNode.hdfs_path(ctx, path) self.assertEqual( final_path, expected[i], "fs({}) + path({}) => {}, expected {}".format( fs[i], path, final_path, expected[i]))
def _tf_train(args, ctx):
  """Basic linear regression in a distributed TF cluster using InputMode.TENSORFLOW"""
  import tensorflow as tf
  from tensorflowonspark import TFNode

  tf.reset_default_graph()                      # reset graph in case we're re-using a Spark python worker

  cluster, server = TFNode.start_cluster_server(ctx)

  def _get_examples(batch_size):
    """Generate test data (mocking a queue_runner of file inputs)"""
    features = tf.random_uniform([batch_size, 2])   # (batch_size, 2)
    weights = tf.constant([[3.14], [1.618]])        # (2, 1)
    labels = tf.matmul(features, weights)
    return features, labels

  if ctx.job_name == "ps":
    server.join()
  elif ctx.job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % ctx.task_index,
        cluster=cluster)):
      # no input placeholders, TF code reads (or in this case "generates") input
      x, y_ = _get_examples(10)
      w = tf.Variable(tf.truncated_normal([2, 1]), name='w')
      y = tf.matmul(x, w, name='y')
      global_step = tf.Variable(0)

      cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
      optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cost, global_step)

    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver()
    sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0), init_op=init_op)
    step = 0
    with sv.managed_session(server.target) as sess:
      while not sv.should_stop() and step < args.steps:
        opt, weights, step = sess.run([optimizer, w, global_step])
        if (step % 100 == 0):
          print("step: {}, weights: {}".format(step, weights))

      if sv.is_chief:
        if args.model_dir:
          # manually save checkpoint
          ckpt_name = args.model_dir + "/model.ckpt"
          print("Saving checkpoint to: {}".format(ckpt_name))
          saver.save(sess, ckpt_name)

    sv.stop()
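# --- Illustrative driver-side launch for the InputMode.TENSORFLOW function above.
# A minimal sketch; `sc`, `args`, and `num_executors` are assumptions. No RDD is fed
# in this mode, since the TF graph generates (or reads) its own input.
from tensorflowonspark import TFCluster

def launch_tf_train(sc, args, num_executors):
  cluster = TFCluster.run(sc, _tf_train, args, num_executors, num_ps=1,
                          input_mode=TFCluster.InputMode.TENSORFLOW)
  cluster.shutdown()  # blocks until the TF processes complete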
def _map_fun(args, ctx):
  import tensorflow as tf

  tf_feed = TFNode.DataFeed(ctx.mgr, False)
  while not tf_feed.should_stop():
    batch = tf_feed.next_batch(10)
    if len(batch) > 0:
      squares = tf.math.square(batch)
      tf_feed.batch_results(squares.numpy())
      raise Exception("FAKE exception during feeding")
def _map_fun(args, ctx):
  import tensorflow as tf

  tf_feed = TFNode.DataFeed(ctx.mgr, False)
  while not tf_feed.should_stop():
    batch = tf_feed.next_batch(batch_size=10)
    print("batch: {}".format(batch))
    squares = tf.math.square(batch)
    print("squares: {}".format(squares))
    tf_feed.batch_results(squares.numpy())
def end(self, session):
  print("{} ======= Exporting to: {}".format(datetime.now().isoformat(), self.export_dir))
  signatures = {
    "test_key": {
      'inputs': {'features': self.input_tensor},
      'outputs': {'prediction': self.output_tensor},
      'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME
    }
  }
  TFNode.export_saved_model(session, self.export_dir, "test_tag", signatures)
  print("{} ======= Done exporting".format(datetime.now().isoformat()))
def _map_fun(args, ctx):
  import tensorflow as tf

  tf_feed = TFNode.DataFeed(ctx.mgr, False)
  while not tf_feed.should_stop():
    batch = tf_feed.next_batch(10)
    if len(batch) > 0:
      squares = tf.math.square(batch)
      tf_feed.batch_results(squares.numpy())

  # simulate post-feed actions that raise an exception
  time.sleep(2)
  raise Exception("FAKE exception after feeding")
def end(self, session):
  logging.info("{} ======= Exporting to: {}".format(datetime.now().isoformat(), self.export_dir))
  signatures = {
    tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: {
      'inputs': {'image': self.input_tensor},
      'outputs': {'prediction': self.output_tensor},
      'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME
    }
  }
  TFNode.export_saved_model(session, self.export_dir + '_' + str(random.random()),
                            tf.saved_model.tag_constants.SERVING, signatures)
  logging.info("{} ======= Done exporting".format(datetime.now().isoformat()))
def _map_fun(args, ctx):
  import tensorflow as tf

  cluster, server = TFNode.start_cluster_server(ctx)
  if ctx.job_name == "ps":
    server.join()
  elif ctx.job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % ctx.task_index,
        cluster=cluster)):
      x = tf.placeholder(tf.int32, [None, 1])
      sq = tf.square(x)
      init_op = tf.global_variables_initializer()

    sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0), init_op=init_op)
    with sv.managed_session(server.target) as sess:
      tf_feed = TFNode.DataFeed(ctx.mgr, False)
      while not sv.should_stop() and not tf_feed.should_stop():
        outputs = sess.run([sq], feed_dict={x: tf_feed.next_batch(10)})
        tf_feed.batch_results(outputs[0])
    sv.stop()
def main(_):
  # restore graph/session from checkpoint
  sess = tf.Session(graph=tf.get_default_graph())
  ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
  saver = tf.train.import_meta_graph(ckpt + '.meta', clear_devices=True)
  saver.restore(sess, ckpt)
  g = sess.graph

  # if --show, dump out all operations in this graph
  if FLAGS.show:
    for o in g.get_operations():
      print("{:>64}\t{}".format(o.name, o.type))

  if FLAGS.export_dir and FLAGS.signatures:
    # load/parse JSON signatures
    if ':' in FLAGS.signatures:
      # assume JSON string, since unix filenames shouldn't contain colons
      signatures = json.loads(FLAGS.signatures)
    else:
      # assume JSON file
      with open(FLAGS.signatures) as f:
        signatures = json.load(f)

    # convert string input/output values with actual tensors from graph
    for name, sig in signatures.items():
      for k, v in sig['inputs'].items():
        tensor_name = v if v.endswith(':0') else v + ':0'
        sig['inputs'][k] = g.get_tensor_by_name(tensor_name)
      for k, v in sig['outputs'].items():
        tensor_name = v if v.endswith(':0') else v + ':0'
        sig['outputs'][k] = g.get_tensor_by_name(tensor_name)

    # export a saved model
    TFNode.export_saved_model(sess, FLAGS.export_dir, tf.saved_model.tag_constants.SERVING, signatures)
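# --- Illustrative --signatures value for the exporter above (an assumption for illustration,
# not from the source): either a JSON string or a path to a JSON file mapping signature names
# to tensor *names*; the ':0' suffix is optional, since main() appends it when missing, e.g.:
#   '{"serving_default": {"inputs": {"image": "x"},
#                         "outputs": {"prediction": "prediction"},
#                         "method_name": "tensorflow/serving/predict"}}'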
def save_model(sess, args, x, prediction):
  """Save and export the model"""
  pb_folder_dir = args.export_dir + constants.PATH_SEP + constants.PB_FOLDER_NAME

  # exported signatures defined in code
  signatures = {
    tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: {
      "inputs": {constants.SIG_INPUT: x},
      "outputs": {constants.SIG_OUTPUT: prediction},
      "method_name": tf.saved_model.signature_constants.PREDICT_METHOD_NAME
    }
  }
  TFNode.export_saved_model(sess, pb_folder_dir, tf.saved_model.tag_constants.SERVING, signatures)

  # convert to a single pb file
  t = Thread(target=tensorflow_utils.convert_as_single_pb,
             args=[pb_folder_dir, constants.PREDICT_NODE_NAME,
                   args.export_dir + constants.PATH_SEP + constants.PB_NAME])
  t.start()
  t.join()
def main_fun(argv, ctx):
  import tensorflow as tf

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  # ps/worker dispatch intentionally disabled in this "hello world" example:
  # if job_name == "ps":
  #   time.sleep((worker_num + 1) * 5)
  # if job_name == "ps":
  #   server.join()
  # elif job_name == "worker":

  hello = tf.constant('Hello, TensorFlow!')
  sess = tf.Session()
  print(sess.run(hello))
def __init__(self, sc, data_dir, batch_size, seq_length, encoding='utf-8'):
  self.data_dir = data_dir
  self.batch_size = batch_size
  self.seq_length = seq_length
  self.encoding = encoding

  defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
  working_dir = os.getcwd()
  input_file = TFNode.hdfs_path(os.path.join(data_dir, "input.txt"), defaultFS, working_dir)

  print("reading text file")
  self.preprocess(input_file)
  self.create_batches()
  self.reset_batch_pointer()
def main_fun(argv, ctx):
  from src import facenet_distributed_train
  from src import vipus_distributed_train
  import sys

  job_name = ctx.job_name
  assert job_name in ['ps', 'worker'], 'job_name must be ps or worker'
  print("argv:", argv)
  sys.argv = argv

  cluster, server = TFNode.start_cluster_server(ctx, num_gpus=1)
  if job_name == 'ps':
    server.join()
  else:
    if argv.model == 'FACENET':
      facenet_distributed_train.train(server, ctx.cluster_spec, argv, ctx)
    elif argv.model == 'VIPUS':
      vipus_distributed_train.train(server, ctx.cluster_spec, argv, ctx)
def main_fun(argv, ctx):
  import tensorflow as tf
  from inception import inception_eval
  from inception.imagenet_data import ImagenetData

  print("argv:", argv)
  sys.argv = argv

  FLAGS = tf.app.flags.FLAGS
  FLAGS._parse_flags()
  print("FLAGS:", FLAGS.__dict__['__flags'])

  dataset = ImagenetData(subset=FLAGS.subset)
  assert dataset.data_files()
  if tf.gfile.Exists(FLAGS.eval_dir):
    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  tf.gfile.MakeDirs(FLAGS.eval_dir)

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  inception_eval.evaluate(dataset)
def main_fun(argv, ctx):
  # extract node metadata from ctx
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  assert job_name in ['ps', 'worker'], 'job_name must be ps or worker'

  from inception import inception_distributed_train
  from inception.imagenet_data import ImagenetData
  import tensorflow as tf

  # instantiate FLAGS on workers using argv from driver and add job_name and task_id
  print("argv:", argv)
  sys.argv = argv

  FLAGS = tf.app.flags.FLAGS
  FLAGS.job_name = job_name
  FLAGS.task_id = task_index
  print("FLAGS:", FLAGS.__dict__['__flags'])

  # Get TF cluster and server instances
  cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    # Only the chief checks for or creates train_dir.
    if FLAGS.task_id == 0:
      if not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_distributed_train.train(server.target, dataset, cluster_spec, ctx)
def train(target, dataset, cluster_spec, ctx):
  """Train Inception on a dataset for a number of steps."""
  # Number of workers and parameter servers are inferred from the workers and ps
  # hosts string.
  num_workers = len(cluster_spec.as_dict()['worker'])
  num_parameter_servers = len(cluster_spec.as_dict()['ps'])
  # If no value is given, num_replicas_to_aggregate defaults to be the number of
  # workers.
  if FLAGS.num_replicas_to_aggregate == -1:
    num_replicas_to_aggregate = num_workers
  else:
    num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

  # Both should be greater than 0 in a distributed training.
  assert num_workers > 0 and num_parameter_servers > 0, ('num_workers and '
                                                         'num_parameter_servers'
                                                         ' must be > 0.')

  # Choose worker 0 as the chief. Note that any worker could be the chief
  # but there should be only one chief.
  is_chief = (FLAGS.task_id == 0)

  # Ops are assigned to worker by default.
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
    # Variables and their related init/assign ops are assigned to ps.
    with slim.scopes.arg_scope(
        [slim.variables.variable, slim.variables.global_step],
        device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
      # Create a variable to count the number of train() calls. This equals the
      # number of updates applied to the variables.
      global_step = slim.variables.global_step()

      # Calculate the learning rate schedule.
      num_batches_per_epoch = (dataset.num_examples_per_epoch() / FLAGS.batch_size)
      # Decay steps need to be divided by the number of replicas to aggregate.
      decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                        num_replicas_to_aggregate)

      # Decay the learning rate exponentially based on the number of steps.
      lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                      global_step,
                                      decay_steps,
                                      FLAGS.learning_rate_decay_factor,
                                      staircase=True)
      # Add a summary to track the learning rate.
      tf.summary.scalar('learning_rate', lr)

      # Create an optimizer that performs gradient descent.
      opt = tf.train.RMSPropOptimizer(lr,
                                      RMSPROP_DECAY,
                                      momentum=RMSPROP_MOMENTUM,
                                      epsilon=RMSPROP_EPSILON)

      if FLAGS.input_mode == 'spark':
        def feed_dict(mgr, batch_size):
          tmp = TFNode.next_batch(mgr, batch_size)
          # extract TFRecords, since tmp array is [(TFRecord, None)]
          tfrecords = []
          for elem in tmp:
            tfrecords.append(str(elem[0]))
          return tfrecords

        batch = tf.placeholder(tf.string, [FLAGS.batch_size / FLAGS.num_preprocess_threads])

        # The following is adapted from image_processing.py to remove Readers/QueueRunners.
        # Note: this removes the RandomShuffledQueue, so the incoming data is not shuffled.
        # Presumably, this could be done on the Spark side or done in additional TF code.
        examples = tf.unpack(batch)
        images, labels = [], []
        for example_serialized in examples:
          for thread_id in range(FLAGS.num_preprocess_threads):
            # Parse a serialized Example proto to extract the image and metadata.
            image_buffer, label_index, bbox, _ = image_processing.parse_example_proto(example_serialized)
            image = image_processing.image_preprocessing(image_buffer, bbox, train, thread_id)
            images.append(image)
            labels.append(label_index)
        height = FLAGS.image_size
        width = FLAGS.image_size
        depth = 3
        images = tf.cast(images, tf.float32)
        images = tf.reshape(images, shape=[FLAGS.batch_size, height, width, depth])
        tf.summary.image('images', images)
        labels = tf.reshape(labels, [FLAGS.batch_size])
      else:
        images, labels = image_processing.distorted_inputs(
            dataset,
            batch_size=FLAGS.batch_size,
            num_preprocess_threads=FLAGS.num_preprocess_threads)

      # Number of classes in the Dataset label set plus 1.
      # Label 0 is reserved for an (unused) background class.
      num_classes = dataset.num_classes() + 1
      logits = inception.inference(images, num_classes, for_training=True)
      # Add classification loss.
      inception.loss(logits, labels)

      # Gather all of the losses including regularization losses.
      losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
      losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
      total_loss = tf.add_n(losses, name='total_loss')

      if is_chief:
        # Compute the moving average of all individual losses and the
        # total loss.
        loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
        loss_averages_op = loss_averages.apply(losses + [total_loss])

        # Attach a scalar summary to all individual losses and the total loss;
        # do the same for the averaged version of the losses.
        for l in losses + [total_loss]:
          loss_name = l.op.name
          # Name each loss as '(raw)' and name the moving average version of the
          # loss as the original loss name.
          tf.summary.scalar(loss_name + ' (raw)', l)
          tf.summary.scalar(loss_name, loss_averages.average(l))

        # Add dependency to compute loss_averages.
        with tf.control_dependencies([loss_averages_op]):
          total_loss = tf.identity(total_loss)

      # Track the moving averages of all trainable variables.
      # Note that we maintain a 'double-average' of the BatchNormalization
      # global statistics.
      # This is not needed when the number of replicas are small but important
      # for synchronous distributed training with tens of workers/replicas.
      exp_moving_averager = tf.train.ExponentialMovingAverage(
          inception.MOVING_AVERAGE_DECAY, global_step)

      variables_to_average = (
          tf.trainable_variables() + tf.moving_average_variables())

      # Add histograms for model variables.
      for var in variables_to_average:
        tf.summary.histogram(var.op.name, var)

      # Create synchronous replica optimizer.
      opt = tf.train.SyncReplicasOptimizer(
          opt,
          replicas_to_aggregate=num_replicas_to_aggregate,
          total_num_replicas=num_workers,
          variable_averages=exp_moving_averager,
          variables_to_average=variables_to_average)

      batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
      assert batchnorm_updates, 'Batchnorm updates are missing'
      batchnorm_updates_op = tf.group(*batchnorm_updates)
      # Add dependency to compute batchnorm_updates.
      with tf.control_dependencies([batchnorm_updates_op]):
        total_loss = tf.identity(total_loss)

      # Compute gradients with respect to the loss.
      grads = opt.compute_gradients(total_loss)

      # Add histograms for gradients.
      for grad, var in grads:
        if grad is not None:
          tf.summary.histogram(var.op.name + '/gradients', grad)

      apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

      with tf.control_dependencies([apply_gradients_op]):
        train_op = tf.identity(total_loss, name='train_op')

      # Get chief queue_runners, init_tokens and clean_up_op, which is used to
      # synchronize replicas.
      # More details can be found in sync_replicas_optimizer.
      chief_queue_runners = [opt.get_chief_queue_runner()]
      init_tokens_op = opt.get_init_tokens_op()

      # Create a saver.
      saver = tf.train.Saver()

      # Build the summary operation based on the TF collection of Summaries.
      summary_op = tf.summary.merge_all()

      # Build an initialization operation to run below.
      init_op = tf.global_variables_initializer()

      # We run the summaries in the same thread as the training operations by
      # passing in None for summary_op to avoid a summary_thread being started.
      # Running summaries and training operations in parallel could run out of
      # GPU memory.
      summary_writer = tf.summary.FileWriter("tensorboard_%d" % (ctx.worker_num),
                                             graph=tf.get_default_graph())
      sv = tf.train.Supervisor(is_chief=is_chief,
                               logdir=FLAGS.train_dir,
                               init_op=init_op,
                               summary_op=None,
                               global_step=global_step,
                               summary_writer=summary_writer,
                               saver=saver,
                               save_model_secs=FLAGS.save_interval_secs)
      tf.logging.info('%s Supervisor' % datetime.now())

      sess_config = tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement)

      # Get a session.
      sess = sv.prepare_or_wait_for_session(target, config=sess_config)

      # Start the queue runners.
      queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
      sv.start_queue_runners(sess, queue_runners)
      tf.logging.info('Started %d queues for processing input data.',
                      len(queue_runners))

      if is_chief:
        sv.start_queue_runners(sess, chief_queue_runners)
        sess.run(init_tokens_op)

      # Train, checking for Nans. Concurrently run the summary operation at a
      # specified interval. Note that the summary_op and train_op never run
      # simultaneously in order to prevent running out of GPU memory.
      next_summary_time = time.time() + FLAGS.save_summaries_secs
      while not sv.should_stop():
        try:
          start_time = time.time()
          if FLAGS.input_mode == 'spark':
            tmp = feed_dict(ctx.mgr, FLAGS.batch_size / FLAGS.num_preprocess_threads)
            feed = {batch: tmp}
            loss_value, step = sess.run([train_op, global_step], feed_dict=feed)
          else:
            loss_value, step = sess.run([train_op, global_step])
          assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
          if step > FLAGS.max_steps:
            break
          duration = time.time() - start_time

          if step % 30 == 0:
            examples_per_sec = FLAGS.batch_size / float(duration)
            format_str = ('Worker %d: %s: step %d, loss = %.2f'
                          '(%.1f examples/sec; %.3f sec/batch)')
            tf.logging.info(format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))

          # Determine if the summary_op should be run on the chief worker.
          if FLAGS.input_mode == 'tf' and is_chief and next_summary_time < time.time():
            tf.logging.info('Running Summary operation on the chief.')
            summary_str = sess.run(summary_op)
            sv.summary_computed(sess, summary_str)
            tf.logging.info('Finished running Summary operation.')

            # Determine the next time for running the summary.
            next_summary_time += FLAGS.save_summaries_secs
        except:
          if is_chief:
            tf.logging.info('About to execute sync_clean_up_op!')
          raise

      # Stop the TFNode data feed
      if FLAGS.input_mode == 'spark':
        TFNode.terminate(ctx.mgr)

      # Stop the supervisor. This also waits for service threads to finish.
      sv.stop()

      # Save after the training ends.
      if is_chief:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import getpass
  import math
  import numpy
  import os
  import signal
  import tensorflow as tf
  import time

  IMAGE_PIXELS = 28
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec
  num_workers = len(cluster_spec['worker'])

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size = 100

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def read_csv_examples(image_dir, label_dir, batch_size=100, num_epochs=None, task_index=None, num_workers=None):
    print_log(worker_num, "num_epochs: {0}".format(num_epochs))

    # Setup queue of csv image filenames
    tf_record_pattern = os.path.join(image_dir, 'part-*')
    images = tf.gfile.Glob(tf_record_pattern)
    print_log(worker_num, "images: {0}".format(images))
    image_queue = tf.train.string_input_producer(images, shuffle=False, capacity=1000,
                                                 num_epochs=num_epochs, name="image_queue")

    # Setup queue of csv label filenames
    tf_record_pattern = os.path.join(label_dir, 'part-*')
    labels = tf.gfile.Glob(tf_record_pattern)
    print_log(worker_num, "labels: {0}".format(labels))
    label_queue = tf.train.string_input_producer(labels, shuffle=False, capacity=1000,
                                                 num_epochs=num_epochs, name="label_queue")

    # Setup reader for image queue
    img_reader = tf.TextLineReader(name="img_reader")
    _, img_csv = img_reader.read(image_queue)
    image_defaults = [[1.0] for col in range(784)]
    img = tf.pack(tf.decode_csv(img_csv, image_defaults))
    # Normalize values to [0,1]
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(img, norm)
    print_log(worker_num, "image: {0}".format(image))

    # Setup reader for label queue
    label_reader = tf.TextLineReader(name="label_reader")
    _, label_csv = label_reader.read(label_queue)
    label_defaults = [[1.0] for col in range(10)]
    label = tf.pack(tf.decode_csv(label_csv, label_defaults))
    print_log(worker_num, "label: {0}".format(label))

    # Return a batch of examples
    return tf.train.batch([image, label], batch_size, num_threads=args.readers, name="batch_csv")

  def read_tfr_examples(path, batch_size=100, num_epochs=None, task_index=None, num_workers=None):
    print_log(worker_num, "num_epochs: {0}".format(num_epochs))

    # Setup queue of TFRecord filenames
    tf_record_pattern = os.path.join(path, 'part-*')
    files = tf.gfile.Glob(tf_record_pattern)
    queue_name = "file_queue"

    # split input files across workers, if specified
    if task_index is not None and num_workers is not None:
      num_files = len(files)
      files = files[task_index:num_files:num_workers]
      queue_name = "file_queue_{0}".format(task_index)
    print_log(worker_num, "files: {0}".format(files))

    file_queue = tf.train.string_input_producer(files, shuffle=False, capacity=1000,
                                                num_epochs=num_epochs, name=queue_name)

    # Setup reader for examples
    reader = tf.TFRecordReader(name="reader")
    _, serialized = reader.read(file_queue)
    feature_def = {'label': tf.FixedLenFeature([10], tf.int64),
                   'image': tf.FixedLenFeature([784], tf.int64)}
    features = tf.parse_single_example(serialized, feature_def)
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(tf.to_float(features['image']), norm)
    print_log(worker_num, "image: {0}".format(image))
    label = tf.to_float(features['label'])
    print_log(worker_num, "label: {0}".format(label))

    # Return a batch of examples
    return tf.train.batch([image, label], batch_size, num_threads=args.readers, name="batch")

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                             stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # Placeholders or QueueRunner/Readers for input data
      num_epochs = 1 if args.mode == "inference" else None if args.epochs == 0 else args.epochs
      index = task_index if args.mode == "inference" else None
      workers = num_workers if args.mode == "inference" else None

      if args.format == "csv":
        images = TFNode.hdfs_path(ctx, args.images)
        labels = TFNode.hdfs_path(ctx, args.labels)
        x, y_ = read_csv_examples(images, labels, 100, num_epochs, index, workers)
      elif args.format == "tfr":
        images = TFNode.hdfs_path(ctx, args.images)
        x, y_ = read_tfr_examples(images, 100, num_epochs, index, workers)
      else:
        raise Exception("{0} format not supported for tf input mode".format(args.format))

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)
      train_op = tf.train.AdagradOptimizer(0.01).minimize(loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)
      output_dir = TFNode.hdfs_path(ctx, args.output)
      output_file = tf.gfile.Open("{0}/part-{1:05d}".format(output_dir, worker_num), mode='w')

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      count = 0
      while not sv.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using QueueRunners/Readers
        if args.mode == "train":
          if (step % 100 == 0):
            print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy)))
          _, summary, step = sess.run([train_op, summary_op, global_step])
          if sv.is_chief:
            summary_writer.add_summary(summary, step)
        else:  # args.mode == "inference"
          labels, pred, acc = sess.run([label, prediction, accuracy])
          # print("label: {0}, pred: {1}".format(labels, pred))
          print("acc: {0}".format(acc))
          for i in range(len(labels)):
            count += 1
            output_file.write("{0} {1}\n".format(labels[i], pred[i]))
          print("count: {0}".format(count))

      if args.mode == "inference":
        output_file.close()

        # Delay chief worker from shutting down supervisor during inference, since it can load model,
        # start session, run inference and request stop before the other workers even start/sync their sessions.
        if task_index == 0:
          time.sleep(60)

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def map_fun(args, ctx):
  # from com.yahoo.ml.tf import TFNode
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num        # number of workers
  job_name = ctx.job_name            # job name
  task_index = ctx.task_index        # task index
  cluster_spec = ctx.cluster_spec    # cluster spec

  IMAGE_PIXELS = 10                  # image size; mnist is 28x28x1 (adjust to your own image size)
  channels = 3
  num_class = 2
  dropout = 0.5
  learning_rate = 1e-6

  # Parameters
  hidden_units = 128                 # NN hidden-layer size
  training_epochs = args.epochs
  img_nums = 630000
  # batch_size = args.batch_size     # samples per training batch
  batch_size = 200

  """
  # --------- set up a decaying learning rate
  # Constants describing the training process.
  # MOVING_AVERAGE_DECAY = 0.9999      # The decay to use for the moving average.
  NUM_EPOCHS_PER_DECAY = batch_size    # Epochs after which learning rate decays.
  LEARNING_RATE_DECAY_FACTOR = 0.1     # Learning rate decay factor.
  INITIAL_LEARNING_RATE = 0.1          # Initial learning rate.

  global_step1 = training_epochs * (img_nums // batch_size)  # Integer counting the number of training steps

  # Variables that affect learning rate.
  num_batches_per_epoch = img_nums / batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

  # Decay the learning rate exponentially based on the number of steps.
  learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                             global_step1,
                                             decay_steps,
                                             LEARNING_RATE_DECAY_FACTOR,
                                             staircase=True)
  # set up a decaying learning rate ----------
  """

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":               # ps (parameter server) node
    time.sleep((worker_num + 1) * 5)

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    numpy.random.shuffle(batch)      # shuffle randomly in place
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    # xs = xs / 255.0                # simple data normalization

    # Z-score standardization
    # mean = numpy.reshape(numpy.average(xs, 1), [numpy.shape(xs)[0], 1])
    # std = numpy.reshape(numpy.std(xs, 1), [numpy.shape(xs)[0], 1])
    # xs = (xs - mean) / std

    # min-max normalization
    max_ = numpy.reshape(numpy.max(xs, 1), [numpy.shape(xs)[0], 1])
    min_ = numpy.reshape(numpy.min(xs, 1), [numpy.shape(xs)[0], 1])
    xs = (xs - min_) / (max_ - min_)

    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Create some wrappers for simplicity
      def conv2d(x, W, b, strides=1):
        # Conv2D wrapper, with bias and relu activation
        # (middle two strides of 1 mean no subsampling in x/y)
        x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
        x = tf.nn.bias_add(x, b)
        return tf.nn.relu(x)

      def maxpool2d(x, k=2):
        # MaxPool2D wrapper (middle two strides of 2 sample every other pixel in x/y)
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1], padding='SAME')

      def maxpool2d2(x, k=2):
        # MaxPool2D wrapper (middle two strides of 2 sample every other pixel in x/y)
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1], padding='VALID')

      # Store layers weight & bias
      weights = {
        # 3x3 conv, `channels` inputs, 64 outputs; color images have 3 input channels, grayscale has 1
        'wc1': tf.get_variable('wc1', [3, 3, channels, 64], dtype=tf.float32,
                               initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
        # 3x3 conv, 64 inputs, 128 outputs
        'wc2': tf.get_variable('wc2', [3, 3, 64, 128], dtype=tf.float32,
                               initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
        # 'wc3': tf.Variable(tf.random_normal([3, 3, 256, 128])),
        'wc4': tf.get_variable('wc4', [3, 3, 128, num_class], dtype=tf.float32,
                               initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
        # fully connected, 7*7*64 inputs, 1024 outputs
        # 'wd1': tf.Variable(tf.random_normal([(1+IMAGE_PIXELS // 4) * (1+IMAGE_PIXELS // 4) * 64, 1024])),
        # 1024 inputs, 10 outputs (class prediction)
        # 'out': tf.Variable(tf.random_normal([1024, num_class]))
      }

      biases = {
        'bc1': tf.get_variable('bc1', [64], dtype=tf.float32,
                               initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
        'bc2': tf.get_variable('bc2', [128], dtype=tf.float32,
                               initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
        # 'bc3': tf.Variable(tf.random_normal([128])),
        'bc4': tf.get_variable('bc4', [num_class], dtype=tf.float32,
                               initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
        # 'bd1': tf.Variable(tf.random_normal([1024])),
        # 'out': tf.Variable(tf.random_normal([num_class]))
      }

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * channels], name="x")  # mnist: 28*28*1
      y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
      # keep = tf.placeholder(tf.float32)

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])  # mnist data is 28x28x1 (grayscale, 1 band)
      # tf.summary.image("x_img", x_img)

      # convolutional model (replaces the fully-connected layers commented out below)
      conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
      conv1 = maxpool2d(conv1, k=2)
      # conv1 = tf.nn.dropout(conv1, keep)

      conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
      conv2 = maxpool2d(conv2, k=2)
      conv2 = tf.nn.dropout(conv2, dropout)

      # conv3 = conv2d(conv2, weights['wc3'], biases['bc3'])
      # conv3 = tf.nn.dropout(conv3, keep)

      conv4 = conv2d(conv2, weights['wc4'], biases['bc4'])
      conv4 = maxpool2d2(conv4, k=2)
      y = tf.reshape(conv4, [-1, num_class])

      # fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
      # fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
      # fc1 = tf.nn.relu(fc1)
      # if args.mode == "train" or args.mode == "retrain":
      #   fc1 = tf.nn.dropout(fc1, dropout)
      # y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])

      # global_step = tf.Variable(0)
      global_step = tf.Variable(0, name="global_step", trainable=False)

      # loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
      # tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(learning_rate).minimize(loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      # tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      # summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    # log.info("tensorflow model path: {0}".format(logdir))
    # summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=1)
    elif args.mode == "retrain":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:   # open the session
      print("{0} session ready".format(datetime.now().isoformat()))
      # log.info("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train" or args.mode == "retrain")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train" or args.mode == "retrain":
            # _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            _, step = sess.run([train_op, global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(
                  datetime.now().isoformat(), step, sess.run(accuracy, {x: batch_xs, y_: batch_ys})))
              # log.info("{0} step: {1} accuracy: {2}".format(...))
            if sv.is_chief:
              pass
              # summary_writer.add_summary(summary, step)
          else:  # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)
            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p)
                       for l, p in zip(labels, preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))
            # log.info("acc: {0}".format(acc))

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def _spark_train(args, ctx):
  """Basic linear regression in a distributed TF cluster using InputMode.SPARK"""
  import tensorflow as tf
  from tensorflowonspark import TFNode

  tf.compat.v1.reset_default_graph()

  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
  with strategy.scope():
    model = Sequential()
    model.add(Dense(1, activation='linear', input_shape=[2]))
    model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.2), loss='mse', metrics=['mse'])
    model.summary()

  tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)

  def rdd_generator():
    while not tf_feed.should_stop():
      batch = tf_feed.next_batch(1)
      if len(batch['x']) > 0:
        features = batch['x'][0]
        label = batch['y_'][0]
        yield (features, label)
      else:
        return

  ds = tf.data.Dataset.from_generator(rdd_generator, (tf.float32, tf.float32),
                                      (tf.TensorShape([2]), tf.TensorShape([1])))
  ds = ds.batch(args.batch_size)

  # disable auto-sharding of the dataset
  options = tf.data.Options()
  options.experimental_distribute.auto_shard = False
  ds = ds.with_options(options)

  # only train 90% of each epoch to account for uneven RDD partition sizes
  steps_per_epoch = 1000 * 0.9 // (args.batch_size * ctx.num_workers)

  tf.io.gfile.makedirs(args.model_dir)
  filepath = args.model_dir + "/weights-{epoch:04d}"
  callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1,
                                                  load_weights_on_restart=True,
                                                  save_weights_only=True)]

  model.fit(ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks)
  # This fails with: "NotImplementedError: `fit_generator` is not supported for models compiled with tf.distribute.Strategy"
  # model.fit_generator(ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks)

  if ctx.job_name == 'chief' and args.export_dir:
    print("exporting model to: {}".format(args.export_dir))
    tf.keras.experimental.export_saved_model(model, args.export_dir)

  tf_feed.terminate()
def main_fun(args, ctx):
  import numpy as np
  import tensorflow as tf
  import tensorflow_datasets as tfds
  from tensorflowonspark import TFNode

  tfds.disable_progress_bar()

  BUFFER_SIZE = args.buffer_size
  BATCH_SIZE = args.batch_size
  LEARNING_RATE = args.learning_rate

  tf_feed = TFNode.DataFeed(ctx.mgr)

  def rdd_generator():
    while not tf_feed.should_stop():
      batch = tf_feed.next_batch(1)
      if len(batch) > 0:
        example = batch[0]
        image = np.array(example[0]).astype(np.float32) / 255.0
        image = np.reshape(image, (28, 28, 1))
        label = np.array(example[1]).astype(np.float32)
        label = np.reshape(label, (1,))
        yield (image, label)
      else:
        return

  def input_fn(mode, input_context=None):
    if mode == tf.estimator.ModeKeys.TRAIN:
      # Note: Spark is responsible for feeding data via streaming RDD
      ds = tf.data.Dataset.from_generator(rdd_generator, (tf.float32, tf.float32),
                                          (tf.TensorShape([28, 28, 1]), tf.TensorShape([1])))
      return ds.batch(BATCH_SIZE)
    else:
      raise Exception("I'm evaluating: mode={}, input_context={}".format(mode, input_context))

      # unreachable eval path, retained from the original single-node example
      def scale(image, label):
        image = tf.cast(image, tf.float32) / 255.0
        return image, label

      mnist = tfds.load(name='mnist', with_info=True, as_supervised=True)
      ds = mnist['test']
      if input_context:
        ds = ds.shard(input_context.num_input_pipelines, input_context.input_pipeline_id)
      return ds.map(scale).batch(BATCH_SIZE)

  def serving_input_receiver_fn():
    features = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 28, 28, 1], name='features')
    receiver_tensors = {'features': features}
    return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors)

  def model_fn(features, labels, mode):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    logits = model(features, training=False)

    if mode == tf.estimator.ModeKeys.PREDICT:
      predictions = {'logits': logits}
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(labels, logits)
    loss = tf.reduce_sum(input_tensor=loss) * (1. / BATCH_SIZE)
    if mode == tf.estimator.ModeKeys.EVAL:
      return tf.estimator.EstimatorSpec(mode, loss=loss)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=optimizer.minimize(loss, tf.compat.v1.train.get_or_create_global_step()))

  # Note: the original example used MultiWorkerMirroredStrategy, which is a synchronous training strategy.
  # Since streaming data arrives irregularly, we must use the asynchronous ParameterServerStrategy
  # to allow data to be processed as it arrives and to avoid deadlocks.
  # strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
  strategy = tf.distribute.experimental.ParameterServerStrategy()
  config = tf.estimator.RunConfig(train_distribute=strategy, save_checkpoints_steps=100)

  classifier = tf.estimator.Estimator(model_fn=model_fn, model_dir=args.model_dir, config=config)

  # exporter = tf.estimator.FinalExporter("serving", serving_input_receiver_fn=serving_input_receiver_fn)
  tf.estimator.train_and_evaluate(
      classifier,
      train_spec=tf.estimator.TrainSpec(input_fn=input_fn),
      eval_spec=tf.estimator.EvalSpec(input_fn=input_fn)
      # eval_spec=tf.estimator.EvalSpec(input_fn=input_fn, exporters=exporter)
  )

  if ctx.job_name == 'chief':
    print("Exporting saved_model to {}".format(args.export_dir))
    classifier.export_saved_model(args.export_dir, serving_input_receiver_fn)
def main_fun(args, ctx):
  import numpy as np
  import tensorflow as tf
  from tensorflowonspark import compat, TFNode

  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

  def build_and_compile_cnn_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                  optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
                  metrics=['accuracy'])
    return model

  # single node
  # single_worker_model = build_and_compile_cnn_model()
  # single_worker_model.fit(x=train_datasets, epochs=3)

  tf_feed = TFNode.DataFeed(ctx.mgr, False)

  def rdd_generator():
    while not tf_feed.should_stop():
      batch = tf_feed.next_batch(1)
      if len(batch) > 0:
        example = batch[0]
        image = np.array(example[0]).astype(np.float32) / 255.0
        image = np.reshape(image, (28, 28, 1))
        label = np.array(example[1]).astype(np.float32)
        label = np.reshape(label, (1,))
        yield (image, label)
      else:
        return

  ds = tf.data.Dataset.from_generator(rdd_generator, (tf.float32, tf.float32),
                                      (tf.TensorShape([28, 28, 1]), tf.TensorShape([1])))
  ds = ds.batch(args.batch_size)

  # this fails
  # callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=args.model_dir)]
  tf.io.gfile.makedirs(args.model_dir)
  filepath = args.model_dir + "/weights-{epoch:04d}"
  callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=filepath, verbose=1, save_weights_only=True)]

  with strategy.scope():
    multi_worker_model = build_and_compile_cnn_model()

  # Note: MultiWorkerMirroredStrategy (CollectiveAllReduceStrategy) is synchronous,
  # so we need to ensure that all workers complete training before any of them run out of data from the RDD.
  # And given that Spark RDD partitions (and partition sizes) can be non-evenly divisible by num_workers,
  # we'll just stop training at 90% of the total expected number of steps.
  steps_per_epoch = 60000 / args.batch_size
  steps_per_epoch_per_worker = steps_per_epoch / ctx.num_workers
  max_steps_per_worker = steps_per_epoch_per_worker * 0.9

  multi_worker_model.fit(x=ds, epochs=args.epochs, steps_per_epoch=max_steps_per_worker, callbacks=callbacks)

  from tensorflow_estimator.python.estimator.export import export_lib
  export_dir = export_lib.get_timestamped_export_dir(args.export_dir)
  compat.export_saved_model(multi_worker_model, export_dir, ctx.job_name == 'chief')

  # terminating feed tells spark to skip processing further partitions
  tf_feed.terminate()
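# --- Illustrative spot-check of the export above (TF 2.x).
# A minimal sketch, assuming the saved_model was written with a default serving
# signature; `export_dir` is the timestamped path produced by export_lib above.
import numpy as np
import tensorflow as tf

def spot_check(export_dir):
  loaded = tf.saved_model.load(export_dir)
  infer = loaded.signatures['serving_default']                       # assumption: default signature key
  input_name = list(infer.structured_input_signature[1].keys())[0]   # discover the input tensor name
  sample = np.zeros((1, 28, 28, 1), dtype=np.float32)                # one blank MNIST-shaped image
  print(infer(**{input_name: tf.constant(sample)}))                  # dict of output tensors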
def main_fun(argv, ctx):
  import os
  import re
  import sys
  import time
  import numpy as np
  import tensorflow as tf
  from datetime import datetime
  from six.moves import xrange  # pylint: disable=redefined-builtin
  from tensorflowonspark import TFNode
  import cifar10

  sys.argv = argv
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                             """Directory where to write event logs """
                             """and checkpoint.""")
  tf.app.flags.DEFINE_integer('max_steps', 1000000,
                              """Number of batches to run.""")
  tf.app.flags.DEFINE_integer('num_gpus', 1,
                              """How many GPUs to use.""")
  tf.app.flags.DEFINE_boolean('log_device_placement', False,
                              """Whether to log device placement.""")
  tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

  cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma)

  def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

    Returns:
      Tensor of shape [] containing the total loss for a batch of data
    """
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build inference Graph.
    logits = cifar10.inference(images)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss; do
    # the same for the averaged version of the losses.
    for l in losses + [total_loss]:
      # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU
      # training session. This helps the clarity of presentation on tensorboard.
      loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
      tf.summary.scalar(loss_name, l)

    return total_loss

  def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        is over individual gradients. The inner list is over the gradient
        calculation for each tower.

    Returns:
      List of pairs of (gradient, variable) where the gradient has been
      averaged across all towers.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
      # Note that each grad_and_vars looks like the following:
      #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
      grads = []
      for g, _ in grad_and_vars:
        # Add 0 dimension to the gradients to represent the tower.
        expanded_g = tf.expand_dims(g, 0)

        # Append on a 'tower' dimension which we will average over below.
        grads.append(expanded_g)

      # Average over the 'tower' dimension.
      grad = tf.concat(axis=0, values=grads)
      grad = tf.reduce_mean(grad, 0)

      # Keep in mind that the Variables are redundant because they are shared
      # across towers. So .. we will just return the first tower's pointer to
      # the Variable.
      v = grad_and_vars[0][1]
      grad_and_var = (grad, v)
      average_grads.append(grad_and_var)
    return average_grads

  def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
      # Create a variable to count the number of train() calls. This equals the
      # number of batches processed * FLAGS.num_gpus.
      global_step = tf.get_variable(
          'global_step', [],
          initializer=tf.constant_initializer(0), trainable=False)

      # Calculate the learning rate schedule.
      num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                               FLAGS.batch_size)
      decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

      # Decay the learning rate exponentially based on the number of steps.
      lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                      global_step,
                                      decay_steps,
                                      cifar10.LEARNING_RATE_DECAY_FACTOR,
                                      staircase=True)

      # Create an optimizer that performs gradient descent.
      opt = tf.train.GradientDescentOptimizer(lr)

      # Calculate the gradients for each model tower.
      tower_grads = []
      with tf.variable_scope(tf.get_variable_scope()):
        for i in xrange(FLAGS.num_gpus):
          with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
              # Calculate the loss for one tower of the CIFAR model. This
              # function constructs the entire CIFAR model but shares the
              # variables across all towers.
              loss = tower_loss(scope)

              # Reuse variables for the next tower.
              tf.get_variable_scope().reuse_variables()

              # Retain the summaries from the final tower.
              summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

              # Calculate the gradients for the batch of data on this CIFAR tower.
              grads = opt.compute_gradients(loss)

              # Keep track of the gradients across all towers.
              tower_grads.append(grads)

      # We must calculate the mean of each gradient. Note that this is the
      # synchronization point across all towers.
      grads = average_gradients(tower_grads)

      # Add a summary to track the learning rate.
      summaries.append(tf.summary.scalar('learning_rate', lr))

      # Add histograms for gradients.
      for grad, var in grads:
        if grad is not None:
          summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))

      # Apply the gradients to adjust the shared variables.
      apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

      # Add histograms for trainable variables.
      for var in tf.trainable_variables():
        summaries.append(tf.summary.histogram(var.op.name, var))

      # Track the moving averages of all trainable variables.
      variable_averages = tf.train.ExponentialMovingAverage(
          cifar10.MOVING_AVERAGE_DECAY, global_step)
      variables_averages_op = variable_averages.apply(tf.trainable_variables())

      # Group all updates into a single train op.
      train_op = tf.group(apply_gradient_op, variables_averages_op)

      # Create a saver.
      saver = tf.train.Saver(tf.global_variables())

      # Build the summary operation from the last tower summaries.
      summary_op = tf.summary.merge(summaries)

      # Build an initialization operation to run below.
      init = tf.global_variables_initializer()

      # Start running operations on the Graph. allow_soft_placement must be set
      # to True to build towers on GPU, as some of the ops do not have GPU
      # implementations.
      sess = tf.Session(config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement))
      sess.run(init)

      # Start the queue runners.
      tf.train.start_queue_runners(sess=sess)

      summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

      for step in xrange(FLAGS.max_steps):
        start_time = time.time()
        _, loss_value = sess.run([train_op, loss])
        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = duration / FLAGS.num_gpus

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print(format_str % (datetime.now(), step, loss_value,
                              examples_per_sec, sec_per_batch))

        if step % 100 == 0:
          summary_str = sess.run(summary_op)
          summary_writer.add_summary(summary_str, step)

        # Save the model checkpoint periodically.
        if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
          checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
          saver.save(sess, checkpoint_path, global_step=step)

  # cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)
  train()
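# A small self-contained check (added for illustration) of the averaging that
# average_gradients() performs above: per-tower gradients are stacked along a
# new leading "tower" axis and then mean-reduced over it. Plain numpy stands in
# for tf.expand_dims/tf.concat/tf.reduce_mean; the values are made up.
import numpy as np

tower_grads = [np.array([1.0, 2.0]),   # gradient of one variable on tower 0
               np.array([3.0, 4.0])]   # same variable's gradient on tower 1
avg = np.mean(np.stack(tower_grads, axis=0), axis=0)
print(avg)  # [2. 3.] -- elementwise mean across towers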
def main_fun(argv, ctx):
  import math
  import sys
  import six
  import tensorflow as tf
  from tensorflowonspark import TFNode
  from datasets import dataset_factory
  from nets import nets_factory
  from preprocessing import preprocessing_factory

  sys.argv = argv

  slim = tf.contrib.slim

  tf.app.flags.DEFINE_integer(
      'batch_size', 100, 'The number of samples in each batch.')

  tf.app.flags.DEFINE_integer(
      'max_num_batches', None,
      'Max number of batches to evaluate; by default, use all.')

  tf.app.flags.DEFINE_string(
      'master', '', 'The address of the TensorFlow master to use.')

  tf.app.flags.DEFINE_string(
      'checkpoint_path', '/tmp/tfmodel/',
      'The directory where the model was written to or an absolute path to a '
      'checkpoint file.')

  tf.app.flags.DEFINE_string(
      'eval_dir', '/tmp/tfmodel/', 'Directory where the results are saved to.')

  tf.app.flags.DEFINE_integer(
      'num_preprocessing_threads', 4,
      'The number of threads used to create the batches.')

  tf.app.flags.DEFINE_string(
      'dataset_name', 'imagenet', 'The name of the dataset to load.')

  tf.app.flags.DEFINE_string(
      'dataset_split_name', 'test', 'The name of the train/test split.')

  tf.app.flags.DEFINE_string(
      'dataset_dir', None, 'The directory where the dataset files are stored.')

  tf.app.flags.DEFINE_integer(
      'labels_offset', 0,
      'An offset for the labels in the dataset. This flag is primarily used to '
      'evaluate the VGG and ResNet architectures which do not use a background '
      'class for the ImageNet dataset.')

  tf.app.flags.DEFINE_string(
      'model_name', 'inception_v3', 'The name of the architecture to evaluate.')

  tf.app.flags.DEFINE_string(
      'preprocessing_name', None, 'The name of the preprocessing to use. If left '
      'as `None`, then the model_name flag is used.')

  tf.app.flags.DEFINE_float(
      'moving_average_decay', None,
      'The decay to use for the moving average. '
      'If left as None, then moving averages are not used.')

  tf.app.flags.DEFINE_integer(
      'eval_image_size', None, 'Eval image size')

  FLAGS = tf.app.flags.FLAGS

  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    # tf_global_step = slim.get_or_create_global_step()
    tf_global_step = tf.Variable(0, name="global_step")

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    ####################
    # Select the model #
    ####################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        is_training=False)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=False,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    [image, label] = provider.get(['image', 'label'])
    label -= FLAGS.labels_offset

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name,
        is_training=False)

    eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

    image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

    images, labels = tf.train.batch(
        [image, label],
        batch_size=FLAGS.batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)

    ####################
    # Define the model #
    ####################
    logits, _ = network_fn(images)

    if FLAGS.moving_average_decay:
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, tf_global_step)
      variables_to_restore = variable_averages.variables_to_restore(
          slim.get_model_variables())
      variables_to_restore[tf_global_step.op.name] = tf_global_step
    else:
      variables_to_restore = slim.get_variables_to_restore()

    predictions = tf.argmax(logits, 1)
    labels = tf.squeeze(labels)

    # Define the metrics:
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        'Recall_5': slim.metrics.streaming_recall_at_k(
            logits, labels, 5),
    })

    # Print the summaries to screen.
    for name, value in six.iteritems(names_to_values):
      summary_name = 'eval/%s' % name
      op = tf.summary.scalar(summary_name, value, collections=[])
      op = tf.Print(op, [value], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    # TODO(sguada) use num_epochs=1
    if FLAGS.max_num_batches:
      num_batches = FLAGS.max_num_batches
    else:
      # This ensures that we make a single pass over all of the data.
      num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))

    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Evaluating %s' % checkpoint_path)

    slim.evaluation.evaluate_once(
        master=FLAGS.master,
        checkpoint_path=checkpoint_path,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        eval_op=list(names_to_updates.values()),
        variables_to_restore=variables_to_restore)
def main_fun(argv, ctx):
  import sys
  import time
  import tensorflow as tf
  from datetime import datetime
  from tensorflowonspark import TFNode
  import cifar10

  sys.argv = argv
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                             """Directory where to write event logs """
                             """and checkpoint.""")
  tf.app.flags.DEFINE_integer('max_steps', 1000000,
                              """Number of batches to run.""")
  tf.app.flags.DEFINE_boolean('log_device_placement', False,
                              """Whether to log device placement.""")
  tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

  # cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)

  cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

  # Train CIFAR-10 for a number of steps.
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print(format_str % (datetime.now(), self._step, loss_value,
                              examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec

  IMAGE_PIXELS = 28

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size = args.batch_size

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    xs = xs / 255.0
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                             stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
      y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(0.01).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                           graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(
                  datetime.now().isoformat(), step,
                  sess.run(accuracy, feed_dict={x: batch_xs, y_: batch_ys})))
            if sv.is_chief:
              summary_writer.add_summary(summary, step)
          else:  # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)
            results = ["{0} Label: {1}, Prediction: {2}".format(
                datetime.now().isoformat(), l, p) for l, p in zip(labels, preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
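# Driver-side sketch (added for context; names like `sc`, `args.cluster_size`,
# `args.num_ps`, `args.tensorboard`, `dataRDD`, and `args.output` are assumed):
# the map_fun above runs in "train" or "inference" mode depending on args.mode,
# and the strings pushed via tf_feed.batch_results() come back to Spark as an RDD.
from tensorflowonspark import TFCluster

cluster = TFCluster.run(sc, map_fun, args, args.cluster_size, args.num_ps,
                        args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train":
  cluster.train(dataRDD, args.epochs)     # feeds (image, label) tuples to the workers
else:
  labelRDD = cluster.inference(dataRDD)   # collects the batch_results() strings
  labelRDD.saveAsTextFile(args.output)
cluster.shutdown()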
def main_fun(argv, ctx):
  import sys
  import tensorflow as tf
  from tensorflow.python.ops import control_flow_ops
  from tensorflowonspark import TFNode
  from datasets import dataset_factory
  from deployment import model_deploy
  from nets import nets_factory
  from preprocessing import preprocessing_factory

  sys.argv = argv

  slim = tf.contrib.slim

  tf.app.flags.DEFINE_integer(
      'num_gpus', 1, 'The number of GPUs to use per node')

  tf.app.flags.DEFINE_boolean('rdma', False, 'Whether to use rdma.')

  tf.app.flags.DEFINE_string(
      'master', '', 'The address of the TensorFlow master to use.')

  tf.app.flags.DEFINE_string(
      'train_dir', '/tmp/tfmodel/',
      'Directory where checkpoints and event logs are written to.')

  tf.app.flags.DEFINE_integer('num_clones', 1,
                              'Number of model clones to deploy.')

  tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
                              'Use CPUs to deploy clones.')

  tf.app.flags.DEFINE_integer('worker_replicas', 1, 'Number of worker replicas.')

  tf.app.flags.DEFINE_integer(
      'num_ps_tasks', 0,
      'The number of parameter servers. If the value is 0, then the parameters '
      'are handled locally by the worker.')

  tf.app.flags.DEFINE_integer(
      'num_readers', 4,
      'The number of parallel readers that read data from the dataset.')

  tf.app.flags.DEFINE_integer(
      'num_preprocessing_threads', 4,
      'The number of threads used to create the batches.')

  tf.app.flags.DEFINE_integer(
      'log_every_n_steps', 10,
      'The frequency with which logs are printed.')

  tf.app.flags.DEFINE_integer(
      'save_summaries_secs', 600,
      'The frequency with which summaries are saved, in seconds.')

  tf.app.flags.DEFINE_integer(
      'save_interval_secs', 600,
      'The frequency with which the model is saved, in seconds.')

  tf.app.flags.DEFINE_integer(
      'task', 0, 'Task id of the replica running the training.')

  ######################
  # Optimization Flags #
  ######################

  tf.app.flags.DEFINE_float(
      'weight_decay', 0.00004, 'The weight decay on the model weights.')

  tf.app.flags.DEFINE_string(
      'optimizer', 'rmsprop',
      'The name of the optimizer, one of "adadelta", "adagrad", "adam", '
      '"ftrl", "momentum", "sgd" or "rmsprop".')

  tf.app.flags.DEFINE_float(
      'adadelta_rho', 0.95,
      'The decay rate for adadelta.')

  tf.app.flags.DEFINE_float(
      'adagrad_initial_accumulator_value', 0.1,
      'Starting value for the AdaGrad accumulators.')

  tf.app.flags.DEFINE_float(
      'adam_beta1', 0.9,
      'The exponential decay rate for the 1st moment estimates.')

  tf.app.flags.DEFINE_float(
      'adam_beta2', 0.999,
      'The exponential decay rate for the 2nd moment estimates.')

  tf.app.flags.DEFINE_float('opt_epsilon', 1.0, 'Epsilon term for the optimizer.')

  tf.app.flags.DEFINE_float('ftrl_learning_rate_power', -0.5,
                            'The learning rate power.')

  tf.app.flags.DEFINE_float(
      'ftrl_initial_accumulator_value', 0.1,
      'Starting value for the FTRL accumulators.')

  tf.app.flags.DEFINE_float(
      'ftrl_l1', 0.0, 'The FTRL l1 regularization strength.')

  tf.app.flags.DEFINE_float(
      'ftrl_l2', 0.0, 'The FTRL l2 regularization strength.')

  tf.app.flags.DEFINE_float(
      'momentum', 0.9,
      'The momentum for the MomentumOptimizer and RMSPropOptimizer.')

  tf.app.flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')

  #######################
  # Learning Rate Flags #
  #######################

  tf.app.flags.DEFINE_string(
      'learning_rate_decay_type',
      'exponential',
      'Specifies how the learning rate is decayed. One of "fixed", "exponential",'
      ' or "polynomial"')

  tf.app.flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')

  tf.app.flags.DEFINE_float(
      'end_learning_rate', 0.0001,
      'The minimal end learning rate used by a polynomial decay learning rate.')

  tf.app.flags.DEFINE_float(
      'label_smoothing', 0.0, 'The amount of label smoothing.')

  tf.app.flags.DEFINE_float(
      'learning_rate_decay_factor', 0.94, 'Learning rate decay factor.')

  tf.app.flags.DEFINE_float(
      'num_epochs_per_decay', 2.0,
      'Number of epochs after which learning rate decays.')

  tf.app.flags.DEFINE_bool(
      'sync_replicas', False,
      'Whether or not to synchronize the replicas during training.')

  tf.app.flags.DEFINE_integer(
      'replicas_to_aggregate', 1,
      'The number of gradients to collect before updating params.')

  tf.app.flags.DEFINE_float(
      'moving_average_decay', None,
      'The decay to use for the moving average. '
      'If left as None, then moving averages are not used.')

  #################
  # Dataset Flags #
  #################

  tf.app.flags.DEFINE_string(
      'dataset_name', 'imagenet', 'The name of the dataset to load.')

  tf.app.flags.DEFINE_string(
      'dataset_split_name', 'train', 'The name of the train/test split.')

  tf.app.flags.DEFINE_string(
      'dataset_dir', None, 'The directory where the dataset files are stored.')

  tf.app.flags.DEFINE_integer(
      'labels_offset', 0,
      'An offset for the labels in the dataset. This flag is primarily used to '
      'evaluate the VGG and ResNet architectures which do not use a background '
      'class for the ImageNet dataset.')

  tf.app.flags.DEFINE_string(
      'model_name', 'inception_v3', 'The name of the architecture to train.')

  tf.app.flags.DEFINE_string(
      'preprocessing_name', None, 'The name of the preprocessing to use. If left '
      'as `None`, then the model_name flag is used.')

  tf.app.flags.DEFINE_integer(
      'batch_size', 32, 'The number of samples in each batch.')

  tf.app.flags.DEFINE_integer(
      'train_image_size', None, 'Train image size')

  tf.app.flags.DEFINE_integer('max_number_of_steps', None,
                              'The maximum number of training steps.')

  #####################
  # Fine-Tuning Flags #
  #####################

  tf.app.flags.DEFINE_string(
      'checkpoint_path', None,
      'The path to a checkpoint from which to fine-tune.')

  tf.app.flags.DEFINE_string(
      'checkpoint_exclude_scopes', None,
      'Comma-separated list of scopes of variables to exclude when restoring '
      'from a checkpoint.')

  tf.app.flags.DEFINE_string(
      'trainable_scopes', None,
      'Comma-separated list of scopes to filter the set of variables to train. '
      'By default, None would train all the variables.')

  tf.app.flags.DEFINE_boolean(
      'ignore_missing_vars', False,
      'When restoring a checkpoint, ignore missing variables.')

  FLAGS = tf.app.flags.FLAGS

  FLAGS.job_name = ctx.job_name
  FLAGS.task = ctx.task_index
  FLAGS.num_clones = FLAGS.num_gpus
  FLAGS.worker_replicas = len(ctx.cluster_spec['worker'])
  assert FLAGS.num_ps_tasks == (len(ctx.cluster_spec['ps']) if 'ps' in ctx.cluster_spec else 0)

  def _configure_learning_rate(num_samples_per_epoch, global_step):
    """Configures the learning rate.

    Args:
      num_samples_per_epoch: The number of samples in each epoch of training.
      global_step: The global_step tensor.

    Returns:
      A `Tensor` representing the learning rate.

    Raises:
      ValueError: if FLAGS.learning_rate_decay_type is not recognized.
    """
    decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                      FLAGS.num_epochs_per_decay)
    if FLAGS.sync_replicas:
      decay_steps /= FLAGS.replicas_to_aggregate

    if FLAGS.learning_rate_decay_type == 'exponential':
      return tf.train.exponential_decay(FLAGS.learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True,
                                        name='exponential_decay_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'fixed':
      return tf.constant(FLAGS.learning_rate, name='fixed_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'polynomial':
      return tf.train.polynomial_decay(FLAGS.learning_rate,
                                       global_step,
                                       decay_steps,
                                       FLAGS.end_learning_rate,
                                       power=1.0,
                                       cycle=False,
                                       name='polynomial_decay_learning_rate')
    else:
      raise ValueError('learning_rate_decay_type [%s] was not recognized' %
                       FLAGS.learning_rate_decay_type)

  def _configure_optimizer(learning_rate):
    """Configures the optimizer used for training.

    Args:
      learning_rate: A scalar or `Tensor` learning rate.

    Returns:
      An instance of an optimizer.

    Raises:
      ValueError: if FLAGS.optimizer is not recognized.
    """
    if FLAGS.optimizer == 'adadelta':
      optimizer = tf.train.AdadeltaOptimizer(
          learning_rate,
          rho=FLAGS.adadelta_rho,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'adagrad':
      optimizer = tf.train.AdagradOptimizer(
          learning_rate,
          initial_accumulator_value=FLAGS.adagrad_initial_accumulator_value)
    elif FLAGS.optimizer == 'adam':
      optimizer = tf.train.AdamOptimizer(
          learning_rate,
          beta1=FLAGS.adam_beta1,
          beta2=FLAGS.adam_beta2,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'ftrl':
      optimizer = tf.train.FtrlOptimizer(
          learning_rate,
          learning_rate_power=FLAGS.ftrl_learning_rate_power,
          initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
          l1_regularization_strength=FLAGS.ftrl_l1,
          l2_regularization_strength=FLAGS.ftrl_l2)
    elif FLAGS.optimizer == 'momentum':
      optimizer = tf.train.MomentumOptimizer(
          learning_rate,
          momentum=FLAGS.momentum,
          name='Momentum')
    elif FLAGS.optimizer == 'rmsprop':
      optimizer = tf.train.RMSPropOptimizer(
          learning_rate,
          decay=FLAGS.rmsprop_decay,
          momentum=FLAGS.momentum,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'sgd':
      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    else:
      raise ValueError('Optimizer [%s] was not recognized' % FLAGS.optimizer)
    return optimizer

  def _add_variables_summaries(learning_rate):
    summaries = []
    for variable in slim.get_model_variables():
      summaries.append(tf.summary.histogram(variable.op.name, variable))
    summaries.append(tf.summary.scalar('training/Learning Rate', learning_rate))
    return summaries

  def _get_init_fn():
    """Returns a function run by the chief worker to warm-start the training.

    Note that the init_fn is only run when initializing the model during the
    very first global step.

    Returns:
      An init function run by the supervisor.
    """
    if FLAGS.checkpoint_path is None:
      return None

    # Warn the user if a checkpoint exists in the train_dir. Then we'll be
    # ignoring the checkpoint anyway.
    if tf.train.latest_checkpoint(FLAGS.train_dir):
      tf.logging.info(
          'Ignoring --checkpoint_path because a checkpoint already exists in %s'
          % FLAGS.train_dir)
      return None

    exclusions = []
    if FLAGS.checkpoint_exclude_scopes:
      exclusions = [scope.strip()
                    for scope in FLAGS.checkpoint_exclude_scopes.split(',')]

    # TODO(sguada) variables.filter_variables()
    variables_to_restore = []
    for var in slim.get_model_variables():
      excluded = False
      for exclusion in exclusions:
        if var.op.name.startswith(exclusion):
          excluded = True
          break
      if not excluded:
        variables_to_restore.append(var)

    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Fine-tuning from %s' % checkpoint_path)

    return slim.assign_from_checkpoint_fn(
        checkpoint_path,
        variables_to_restore,
        ignore_missing_vars=FLAGS.ignore_missing_vars)

  def _get_variables_to_train():
    """Returns a list of variables to train.

    Returns:
      A list of variables to train by the optimizer.
    """
    if FLAGS.trainable_scopes is None:
      return tf.trainable_variables()
    else:
      scopes = [scope.strip() for scope in FLAGS.trainable_scopes.split(',')]

    variables_to_train = []
    for scope in scopes:
      variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
      variables_to_train.extend(variables)
    return variables_to_train

  # main
  cluster_spec, server = TFNode.start_cluster_server(ctx=ctx, num_gpus=FLAGS.num_gpus, rdma=FLAGS.rdma)
  if ctx.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    if not FLAGS.dataset_dir:
      raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
      #######################
      # Config model_deploy #
      #######################
      deploy_config = model_deploy.DeploymentConfig(
          num_clones=FLAGS.num_clones,
          clone_on_cpu=FLAGS.clone_on_cpu,
          replica_id=FLAGS.task,
          num_replicas=FLAGS.worker_replicas,
          num_ps_tasks=FLAGS.num_ps_tasks)

      # Create global_step
      # with tf.device(deploy_config.variables_device()):
      #   global_step = slim.create_global_step()
      with tf.device("/job:ps/task:0"):
        global_step = tf.Variable(0, name="global_step")

      ######################
      # Select the dataset #
      ######################
      dataset = dataset_factory.get_dataset(
          FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

      ######################
      # Select the network #
      ######################
      network_fn = nets_factory.get_network_fn(
          FLAGS.model_name,
          num_classes=(dataset.num_classes - FLAGS.labels_offset),
          weight_decay=FLAGS.weight_decay,
          is_training=True)

      #####################################
      # Select the preprocessing function #
      #####################################
      preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
      image_preprocessing_fn = preprocessing_factory.get_preprocessing(
          preprocessing_name,
          is_training=True)

      ##############################################################
      # Create a dataset provider that loads data from the dataset #
      ##############################################################
      with tf.device(deploy_config.inputs_device()):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=FLAGS.num_readers,
            common_queue_capacity=20 * FLAGS.batch_size,
            common_queue_min=10 * FLAGS.batch_size)
        [image, label] = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        train_image_size = FLAGS.train_image_size or network_fn.default_image_size

        image = image_preprocessing_fn(image, train_image_size, train_image_size)

        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)
        labels = slim.one_hot_encoding(
            labels, dataset.num_classes - FLAGS.labels_offset)
        batch_queue = slim.prefetch_queue.prefetch_queue(
            [images, labels], capacity=2 * deploy_config.num_clones)

      ####################
      # Define the model #
      ####################
      def clone_fn(batch_queue):
        """Allows data parallelism by creating multiple clones of network_fn."""
        images, labels = batch_queue.dequeue()
        logits, end_points = network_fn(images)

        #############################
        # Specify the loss function #
        #############################
        if 'AuxLogits' in end_points:
          tf.losses.softmax_cross_entropy(
              logits=end_points['AuxLogits'], onehot_labels=labels,
              label_smoothing=FLAGS.label_smoothing, weights=0.4, scope='aux_loss')
        tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels,
            label_smoothing=FLAGS.label_smoothing, weights=1.0)
        return end_points

      # Gather initial summaries.
      summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

      clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
      first_clone_scope = deploy_config.clone_scope(0)
      # Gather update_ops from the first clone. These contain, for example,
      # the updates for the batch_norm variables created by network_fn.
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

      # Add summaries for end_points.
      end_points = clones[0].outputs
      for end_point in end_points:
        x = end_points[end_point]
        summaries.add(tf.summary.histogram('activations/' + end_point, x))
        summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                        tf.nn.zero_fraction(x)))

      # Add summaries for losses.
      for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
        summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

      # Add summaries for variables.
      for variable in slim.get_model_variables():
        summaries.add(tf.summary.histogram(variable.op.name, variable))

      #################################
      # Configure the moving averages #
      #################################
      if FLAGS.moving_average_decay:
        moving_average_variables = slim.get_model_variables()
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.moving_average_decay, global_step)
      else:
        moving_average_variables, variable_averages = None, None

      #########################################
      # Configure the optimization procedure. #
      #########################################
      with tf.device(deploy_config.optimizer_device()):
        learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
        optimizer = _configure_optimizer(learning_rate)
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))

      if FLAGS.sync_replicas:
        # If sync_replicas is enabled, the averaging will be done in the chief
        # queue runner.
        optimizer = tf.train.SyncReplicasOptimizer(
            opt=optimizer,
            replicas_to_aggregate=FLAGS.replicas_to_aggregate,
            variable_averages=variable_averages,
            variables_to_average=moving_average_variables,
            replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
            total_num_replicas=FLAGS.worker_replicas)
      elif FLAGS.moving_average_decay:
        # Update ops executed locally by trainer.
        update_ops.append(variable_averages.apply(moving_average_variables))

      # Variables to train.
      variables_to_train = _get_variables_to_train()

      # and returns a train_tensor and summary_op
      total_loss, clones_gradients = model_deploy.optimize_clones(
          clones,
          optimizer,
          var_list=variables_to_train)
      # Add total_loss to summary.
      summaries.add(tf.summary.scalar('total_loss', total_loss))

      # Create gradient updates.
      grad_updates = optimizer.apply_gradients(clones_gradients,
                                               global_step=global_step)
      update_ops.append(grad_updates)

      update_op = tf.group(*update_ops)
      train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                        name='train_op')

      # Add the summaries from the first clone. These contain the summaries
      # created by model_fn and either optimize_clones() or _gather_clone_loss().
      summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                         first_clone_scope))

      # Merge all summaries together.
      summary_op = tf.summary.merge(list(summaries), name='summary_op')

      ###########################
      # Kicks off the training. #
      ###########################
      summary_writer = tf.summary.FileWriter("tensorboard_%d" % (ctx.worker_num),
                                             graph=tf.get_default_graph())
      slim.learning.train(
          train_tensor,
          logdir=FLAGS.train_dir,
          master=server.target,
          is_chief=(FLAGS.task == 0),
          init_fn=_get_init_fn(),
          summary_op=summary_op,
          number_of_steps=FLAGS.max_number_of_steps,
          log_every_n_steps=FLAGS.log_every_n_steps,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs,
          summary_writer=summary_writer,
          sync_optimizer=optimizer if FLAGS.sync_replicas else None)
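# A hedged numeric sketch (added for illustration) of the exponential-decay
# schedule set up by _configure_learning_rate() above. The sample count is an
# assumption (the ImageNet train split size); the other values are the flag
# defaults from the function above.
num_samples_per_epoch = 1281167          # assumption: ImageNet train split
batch_size = 32                          # FLAGS.batch_size default
num_epochs_per_decay = 2.0               # FLAGS.num_epochs_per_decay default

decay_steps = int(num_samples_per_epoch / batch_size * num_epochs_per_decay)
print(decay_steps)  # 80072: with staircase=True, the LR is multiplied by the
                    # decay factor (0.94 by default) every 80072 global steps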
def main_fun(argv, ctx):
  import pprint
  import sys
  import numpy as np
  import tensorflow as tf
  from datetime import datetime
  from tensorflowonspark import TFNode
  import online_model
  import tfos_online_data_reader

  sys.argv = argv
  flags = tf.app.flags
  FLAGS = flags.FLAGS

  flags.DEFINE_integer('batch_size', 100, 'data batch size')
  flags.DEFINE_integer('num_epoch', 1, 'train epochs for dataset')
  flags.DEFINE_string('mapping_data',
                      'hdfs://appcluster-cdh/user/root/Adwin_Refactoring_Test/instance_build_txt/mix_dev_wx_interest2/20171022_map',
                      'id mapping path')
  flags.DEFINE_string('train_data',
                      'hdfs://appcluster-cdh/user/root/Adwin_Refactoring_Test/instance_build_txt/mix_dev_wx_interest2/20171022',
                      'train data path')
  # flags.DEFINE_string('mapping_data',
  #                     'hdfs://appcluster-cdh/user/root/tensorflow/app/online_train_distributed/mix_dev_wx_interest2/20171022_map',
  #                     'id mapping path')
  # flags.DEFINE_string('train_data',
  #                     'hdfs://appcluster-cdh/user/root/tensorflow/app/online_train_distributed/mix_dev_wx_interest2/20171022',
  #                     'train data path')
  flags.DEFINE_string('log_dir',
                      'hdfs://appcluster-cdh/user/root/tensorflow/app/online_train_distributed/model',
                      'log directory')
  flags.DEFINE_float('linear_lr', 0.1, 'wide part learning rate. default 0.1')
  flags.DEFINE_float('dnn_lr', 0.001, 'deep part learning rate. default 0.001')
  flags.DEFINE_string('linear_optimizer', 'ftrl',
                      'optimizer: adadelta | adagrad | sgd | adam | ftrl | momentum. default is ftrl')
  flags.DEFINE_string('dnn_optimizer', 'adagrad',
                      'optimizer: adadelta | adagrad | sgd | adam | ftrl | momentum. default is adagrad')
  flags.DEFINE_integer('input_dim', 13, 'input dimension')
  flags.DEFINE_string("model_network", "100,20", "The neural network of model, as 100,50,20")
  flags.DEFINE_string("model_type", "wide_deep", "model type: wide | deep | wide_deep")
  flags.DEFINE_integer('display_step', 200, 'display_step')
  flags.DEFINE_integer('ps_num', 64, 'number of parameter server tasks')
  flags.DEFINE_integer('task_num', 128, 'number of worker tasks')

  pprint.PrettyPrinter().pprint(FLAGS.__flags)

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  if ctx.job_name == "ps":
    server.join()
  elif ctx.job_name == "worker":
    # parse_files is presumably a module-level helper in the original example
    # (not shown here) that lists the input files under the given HDFS path.
    total_file_names = parse_files(FLAGS.train_data)
    print("total_file_names:")
    print(total_file_names)
    print("task_index: " + str(ctx.task_index))
    task_file_names = [name for idx, name in enumerate(total_file_names)
                       if idx % FLAGS.task_num == ctx.task_index]
    print("task_file_names:")
    print(task_file_names)

    train_reader = tfos_online_data_reader.Reader(
        task_file_names,
        FLAGS.mapping_data,
        batch_size=FLAGS.batch_size,
        delimiter='\t')
    wide_dim = train_reader.wide_dim

    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % ctx.task_index,
        cluster=cluster_spec)):
      config = {}
      config['num_ps'] = FLAGS.ps_num
      dnn_model = online_model.DNNModel(FLAGS, wide_dim, config)
      dnn_model.build()

      dense_inputs = dnn_model.dense_inputs
      sparse_inputs = dnn_model.sparse_inputs
      labels = dnn_model.labels
      global_step = dnn_model.global_step
      step_update_op = dnn_model.step_update_op
      train_op = dnn_model.train_op
      loss = dnn_model.loss
      auc_op = dnn_model.auc_op
      summary_op = dnn_model.summary_op

      saver = tf.train.Saver()
      init_op = [tf.global_variables_initializer(), tf.local_variables_initializer()]

    summary_writer = tf.summary.FileWriter("tensorboard_%d" % ctx.worker_num,
                                           graph=tf.get_default_graph())
    sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                             logdir=FLAGS.log_dir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=summary_writer,
                             global_step=global_step,
                             saver=saver,
                             save_model_secs=300)

    shape = np.array([FLAGS.batch_size, wide_dim + 1])
    begin_time = datetime.now()
    with sv.managed_session(server.target) as sess:
      if not sv.should_stop():
        for epoch in range(FLAGS.num_epoch):
          train_batches = train_reader.yieldBatches()
          print("Epoch: %d" % epoch)
          step = 0
          for dense_x, sparse_idx, sparse_values, y in train_batches:
            start_time = datetime.now()
            _, train_loss, train_auc, summ, _ = sess.run(
                [train_op, loss, auc_op, summary_op, step_update_op],
                feed_dict={dense_inputs: dense_x,
                           sparse_inputs: (sparse_idx, sparse_values, shape),
                           labels: y})
            step += 1
            assert not np.isnan(train_loss), 'Model diverged with loss = NaN'
            time_used = datetime.now() - start_time
            if step % FLAGS.display_step == 0:
              g_step, = sess.run([global_step])
              print("step: " + str(step) + ", global_step: " + str(g_step))
              summary_writer.add_summary(summ, g_step)
              print("Step = {}, Examples = {}, Time = {}, Minibatch Loss = {}, Auc = {}".format(
                  g_step, g_step * FLAGS.batch_size, time_used, train_loss, train_auc))
              sys.stdout.flush()

      total_time = datetime.now() - begin_time
      print("Training Done!!")
      print("Total time used: {}".format(total_time))
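# Minimal standalone sketch (added; TF 1.x) of the (indices, values, shape)
# triple fed to `sparse_inputs` above: a tf.sparse_placeholder accepts exactly
# such a tuple via feed_dict. The values here are made up for illustration.
import numpy as np
import tensorflow as tf

sp = tf.sparse_placeholder(tf.float32)
total = tf.sparse_reduce_sum(sp)

idx = np.array([[0, 1], [1, 0]], dtype=np.int64)    # coordinates of nonzeros
vals = np.array([3.0, 4.0], dtype=np.float32)       # values at those coordinates
shape = np.array([2, 2], dtype=np.int64)            # dense shape of the tensor

with tf.Session() as sess:
  print(sess.run(total, feed_dict={sp: (idx, vals, shape)}))  # 7.0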
def main_fun(argv, ctx):
  import math
  import sys
  import time
  import numpy as np
  import tensorflow as tf
  from datetime import datetime
  from tensorflowonspark import TFNode
  import cifar10

  sys.argv = argv
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('eval_dir', '/tmp/cifar10_eval',
                             """Directory where to write event logs.""")
  tf.app.flags.DEFINE_string('eval_data', 'test',
                             """Either 'test' or 'train_eval'.""")
  tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train',
                             """Directory where to read model checkpoints.""")
  tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                              """How often to run the eval.""")
  tf.app.flags.DEFINE_integer('num_examples', 10000,
                              """Number of examples to run.""")
  tf.app.flags.DEFINE_boolean('run_once', False,
                              """Whether to run eval only once.""")
  tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

  cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

  def eval_once(saver, summary_writer, top_k_op, summary_op):
    """Run Eval once.

    Args:
      saver: Saver.
      summary_writer: Summary writer.
      top_k_op: Top K op.
      summary_op: Summary op.
    """
    with tf.Session() as sess:
      ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
      if ckpt and ckpt.model_checkpoint_path:
        # Restores from checkpoint
        saver.restore(sess, ckpt.model_checkpoint_path)
        # Assuming model_checkpoint_path looks something like:
        #   /my-favorite-path/cifar10_train/model.ckpt-0,
        # extract global_step from it.
        global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
      else:
        print('No checkpoint file found')
        return

      # Start the queue runners.
      coord = tf.train.Coordinator()
      try:
        threads = []
        for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
          threads.extend(qr.create_threads(sess, coord=coord, daemon=True,
                                           start=True))

        num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
        true_count = 0  # Counts the number of correct predictions.
        total_sample_count = num_iter * FLAGS.batch_size
        step = 0
        while step < num_iter and not coord.should_stop():
          predictions = sess.run([top_k_op])
          true_count += np.sum(predictions)
          step += 1

        # Compute precision @ 1.
        precision = true_count / total_sample_count
        print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))

        summary = tf.Summary()
        summary.ParseFromString(sess.run(summary_op))
        summary.value.add(tag='Precision @ 1', simple_value=precision)
        summary_writer.add_summary(summary, global_step)
      except Exception as e:  # pylint: disable=broad-except
        coord.request_stop(e)

      coord.request_stop()
      coord.join(threads, stop_grace_period_secs=10)

  def evaluate():
    """Eval CIFAR-10 for a number of steps."""
    with tf.Graph().as_default() as g:
      # Get images and labels for CIFAR-10.
      eval_data = FLAGS.eval_data == 'test'
      images, labels = cifar10.inputs(eval_data=eval_data)

      # Build a Graph that computes the logits predictions from the
      # inference model.
      logits = cifar10.inference(images)

      # Calculate predictions.
      top_k_op = tf.nn.in_top_k(logits, labels, 1)

      # Restore the moving average version of the learned variables for eval.
      variable_averages = tf.train.ExponentialMovingAverage(
          cifar10.MOVING_AVERAGE_DECAY)
      variables_to_restore = variable_averages.variables_to_restore()
      saver = tf.train.Saver(variables_to_restore)

      # Build the summary operation based on the TF collection of Summaries.
      summary_op = tf.summary.merge_all()

      summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)

      while True:
        eval_once(saver, summary_writer, top_k_op, summary_op)
        if FLAGS.run_once:
          break
        time.sleep(FLAGS.eval_interval_secs)

  # cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.eval_dir):
    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  evaluate()