def evaluate(): """Eval CIFAR-10 for a number of steps.""" with tf.Graph().as_default() as g: # Get images and labels for CIFAR-10. eval_data = FLAGS.eval_data == 'test' images, labels = cifar10.inputs(eval_data=eval_data) # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate predictions. top_k_op = tf.nn.in_top_k(logits, labels, 1) # Restore the moving average version of the learned variables for eval. variable_averages = tf.train.ExponentialMovingAverage( cifar10.MOVING_AVERAGE_DECAY) variables_to_restore = variable_averages.variables_to_restore() saver = tf.train.Saver(variables_to_restore) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, g) while True: for i in range(20): eval_once(saver, summary_writer, top_k_op, summary_op,i) if FLAGS.run_once: break time.sleep(FLAGS.eval_interval_secs)
def tower_loss(scope): """Calculate the total loss on a single tower running the CIFAR model. Args: scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0' Returns: Tensor of shape [] containing the total loss for a batch of data """ # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build inference Graph. logits = cifar10.inference(images) # Build the portion of the Graph calculating the losses. Note that we will # assemble the total_loss using a custom function below. _ = cifar10.loss(logits, labels) # Assemble all of the losses for the current tower only. losses = tf.get_collection('losses', scope) # Calculate the total loss for the current tower. total_loss = tf.add_n(losses, name='total_loss') # Compute the moving average of all individual losses and the total loss. loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') loss_averages_op = loss_averages.apply(losses + [total_loss]) # Attach a scalar summary to all individual losses and the total loss; do the # same for the averaged version of the losses. for l in losses + [total_loss]: # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training # session. This helps the clarity of presentation on tensorboard. loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name) # Name each loss as '(raw)' and name the moving average version of the loss # as the original loss name. tf.scalar_summary(loss_name +' (raw)', l) tf.scalar_summary(loss_name, loss_averages.average(l)) with tf.control_dependencies([loss_averages_op]): total_loss = tf.identity(total_loss) return total_loss
def tower_loss(scope): """Calculate the total loss on a single tower running the CIFAR model. Args: scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0' Returns: Tensor of shape [] containing the total loss for a batch of data """ # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build inference Graph. logits = cifar10.inference(images) # Build the portion of the Graph calculating the losses. Note that we will # assemble the total_loss using a custom function below. _ = cifar10.loss(logits, labels) # Assemble all of the losses for the current tower only. losses = tf.get_collection('losses', scope) # Calculate the total loss for the current tower. total_loss = tf.add_n(losses, name='total_loss') # Attach a scalar summary to all individual losses and the total loss; do the # same for the averaged version of the losses. for l in losses + [total_loss]: # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training # session. This helps the clarity of presentation on tensorboard. loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name) tf.contrib.deprecated.scalar_summary(loss_name, l) return total_loss
def evaluate(): """Eval CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): # Get images and labels for CIFAR-10. eval_data = eval_data == 'test' images, labels = cifar10.inputs(eval_data=eval_data) # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate predictions. top_k_op = tf.nn.in_top_k(logits, labels, 1) # Restore the moving average version of the learned variables for eval. variable_averages = tf.train.ExponentialMovingAverage( cifar10.MOVING_AVERAGE_DECAY) variables_to_restore = {} for v in tf.all_variables(): if v in tf.trainable_variables(): restore_name = variable_averages.average_name(v) else: restore_name = v.op.name variables_to_restore[restore_name] = v saver = tf.train.Saver(variables_to_restore) while True: eval_once(saver, top_k_op) if run_once: break time.sleep(eval_interval_secs)
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.contrib.framework.get_or_create_global_step() # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() #images, labels = cifar10.inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) class _LoggerHook(tf.train.SessionRunHook): """Logs loss and runtime.""" def begin(self): self._step = -1 def before_run(self, run_context): self._step += 1 self._start_time = time.time() return tf.train.SessionRunArgs(loss) # Asks for loss value. def after_run(self, run_context, run_values): duration = time.time() - self._start_time loss_value = run_values.results if self._step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch)) with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.train_dir, hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps), tf.train.NanTensorHook(loss), _LoggerHook()], config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) as mon_sess: while not mon_sess.should_stop(): mon_sess.run(train_op)
def evaluate():
  images, labels = cifar10.inputs(eval_data=True)
  logits = cifar10.inference(images)
  top_k_op = tf.nn.in_top_k(logits, labels, 1)

  variable_averages = tf.train.ExponentialMovingAverage(
      cifar10.MOVING_AVERAGE_DECAY)
  variables_to_restore = {}
  for v in tf.all_variables():
    if v in tf.trainable_variables():
      restore_name = variable_averages.average_name(v)
    else:
      restore_name = v.op.name
    variables_to_restore[restore_name] = v
  saver = tf.train.Saver(variables_to_restore)

  eval_once(saver, top_k_op)

def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, graph_def=sess.graph_def) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def train():
  # Build the ops.
  global_step = tf.Variable(0, trainable=False)
  images, labels = cifar10.distorted_inputs()
  logits = cifar10.inference(
      tf.image.resize_images(images, cifar10.IMAGE_SIZE, cifar10.IMAGE_SIZE))
  loss = cifar10.loss(logits, labels)
  train_op = cifar10.train(loss, global_step)
  summary_op = tf.merge_all_summaries()

  with tf.Session() as sess:
    saver = tf.train.Saver(tf.all_variables(), max_to_keep=21)
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

    # Restore from the latest checkpoint if one exists, otherwise initialize.
    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    if ckpt and ckpt.model_checkpoint_path:
      saver.restore(sess, ckpt.model_checkpoint_path)
    else:
      sess.run(tf.initialize_all_variables())

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    # Resume step counting from the restored global_step.
    start = sess.run(global_step)
    for step in xrange(start, FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      print('%d: %f (%.3f sec/batch)' % (step, loss_value, duration))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      if step % 500 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)

def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() eval_data = FLAGS.eval_data == 'test' #timages, tlabels = cifar10.inputs(eval_data=eval_data) # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) #tlogits = cifar10.inference(timages) # Calculate loss. top_k_op = tf.nn.in_top_k(logits, labels, 1) loss = cifar10.loss(logits, labels) #precision = tf.Variable(0.8, name='precision') #tf.scalar_summary('accuracy', precision) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) sess.graph.finalize() summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 100 == 0: # Build a Graph that computes the logits predictions from the # inference model. # Calculate predictions. num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size)) true_count = 0 # Counts the number of correct predictions. total_sample_count = num_iter * FLAGS.batch_size i_step = 0 while i_step < num_iter: predictions = sess.run([top_k_op]) true_count += np.sum(predictions) i_step += 1 #Compute precision @ 1. #sess.run(precision.assign(true_count / total_sample_count)) prec = true_count / total_sample_count print(prec) summary = tf.Summary() summary.ParseFromString(sess.run(summary_op)) summary.value.add(tag='accuracy', simple_value=prec) summary_writer.add_summary(summary, step) #summary_str = sess.run(summary_op) #summary_writer.add_summary(summary_str, step) #summary_writer.flush() # Save the model checkpoint periodically. if step % 100 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def train(): """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) # Get images and labels for CIFAR-10. images, labels = cifar10.distorted_inputs() # Build a Graph that computes the logits predictions from the # inference model. logits = cifar10.inference(images) # Calculate loss. loss = cifar10.loss(logits, labels) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. train_op = cifar10.train(loss, global_step) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_all_summaries() # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. sess = tf.Session(config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) sess.run(init) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter(FLAGS.train_dir) summary_writer0 = tf.train.SummaryWriter(FLAGS.train_dir0) summary_writer1= tf.train.SummaryWriter(FLAGS.train_dir1) summary_writer2 = tf.train.SummaryWriter(FLAGS.train_dir2) summary_writer3 = tf.train.SummaryWriter(FLAGS.train_dir3) summary_writer4 = tf.train.SummaryWriter(FLAGS.train_dir4) summary_writer5 = tf.train.SummaryWriter(FLAGS.train_dir5) summary_writer6 = tf.train.SummaryWriter(FLAGS.train_dir6) summary_writer7 = tf.train.SummaryWriter(FLAGS.train_dir7) summary_writer8 = tf.train.SummaryWriter(FLAGS.train_dir8) summary_writer9 = tf.train.SummaryWriter(FLAGS.train_dir9) summary_writer10 = tf.train.SummaryWriter(FLAGS.train_dir10) summary_writer11 = tf.train.SummaryWriter(FLAGS.train_dir11) summary_writer12 = tf.train.SummaryWriter(FLAGS.train_dir12) summary_writer13 = tf.train.SummaryWriter(FLAGS.train_dir13) summary_writer14 = tf.train.SummaryWriter(FLAGS.train_dir14) summary_writer15 = tf.train.SummaryWriter(FLAGS.train_dir15) summary_writer16 = tf.train.SummaryWriter(FLAGS.train_dir16) summary_writer17 = tf.train.SummaryWriter(FLAGS.train_dir17) summary_writer18 = tf.train.SummaryWriter(FLAGS.train_dir18) summary_writer19 = tf.train.SummaryWriter(FLAGS.train_dir19) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) summary_writer0.add_summary(summary_str, step) summary_writer1.add_summary(summary_str, step) summary_writer2.add_summary(summary_str, step) summary_writer3.add_summary(summary_str, step) summary_writer4.add_summary(summary_str, step) summary_writer5.add_summary(summary_str, step) summary_writer6.add_summary(summary_str, step) summary_writer7.add_summary(summary_str, step) summary_writer8.add_summary(summary_str, step) summary_writer9.add_summary(summary_str, step) summary_writer10.add_summary(summary_str, step) summary_writer11.add_summary(summary_str, step) summary_writer12.add_summary(summary_str, step) summary_writer13.add_summary(summary_str, step) 
summary_writer14.add_summary(summary_str, step) summary_writer15.add_summary(summary_str, step) summary_writer16.add_summary(summary_str, step) summary_writer17.add_summary(summary_str, step) summary_writer18.add_summary(summary_str, step) summary_writer19.add_summary(summary_str, step) # Save the model checkpoint periodically. # if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: # checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') # saver.save(sess, checkpoint_path, global_step=step/100) # hard cord here!!! if step==100: checkpoint_path = os.path.join(FLAGS.train_dir0, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==200: checkpoint_path = os.path.join(FLAGS.train_dir1, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==300: checkpoint_path = os.path.join(FLAGS.train_dir2, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==400: checkpoint_path = os.path.join(FLAGS.train_dir3, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==500: checkpoint_path = os.path.join(FLAGS.train_dir4, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==600: checkpoint_path = os.path.join(FLAGS.train_dir5, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==700: checkpoint_path = os.path.join(FLAGS.train_dir6, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==800: checkpoint_path = os.path.join(FLAGS.train_dir7, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==900: checkpoint_path = os.path.join(FLAGS.train_dir8, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1000: checkpoint_path = os.path.join(FLAGS.train_dir9, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1100: checkpoint_path = os.path.join(FLAGS.train_dir10, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1200: checkpoint_path = os.path.join(FLAGS.train_dir11, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1300: checkpoint_path = os.path.join(FLAGS.train_dir12, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1400: checkpoint_path = os.path.join(FLAGS.train_dir13, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1500: checkpoint_path = os.path.join(FLAGS.train_dir14, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1600: checkpoint_path = os.path.join(FLAGS.train_dir15, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1700: checkpoint_path = os.path.join(FLAGS.train_dir16, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1800: checkpoint_path = os.path.join(FLAGS.train_dir17, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==1900: checkpoint_path = os.path.join(FLAGS.train_dir18, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) if step==2000: checkpoint_path = os.path.join(FLAGS.train_dir19, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
def train():
  """Train CIFAR-10 for a number of steps."""
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  print ('PS hosts are: %s' % ps_hosts)
  print ('Worker hosts are: %s' % worker_hosts)

  server = tf.train.Server(
      {'ps': ps_hosts, 'worker': worker_hosts},
      job_name=FLAGS.job_name,
      task_index=FLAGS.task_id)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()

  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

  cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
  device_setter = tf.train.replica_device_setter(cluster=cluster)
  with tf.device(device_setter):
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)
    tf.scalar_summary('learning_rate', lr)
    opt = tf.train.GradientDescentOptimizer(lr)

    # Track the moving averages of all trainable variables.
    exp_moving_averager = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step)
    variables_to_average = (
        tf.trainable_variables() + tf.moving_average_variables())

    opt = tf.train.SyncReplicasOptimizer(
        opt,
        replicas_to_aggregate=len(worker_hosts),
        replica_id=FLAGS.task_id,
        total_num_replicas=len(worker_hosts),
        variable_averages=exp_moving_averager,
        variables_to_average=variables_to_average)

    # Compute gradients with respect to the loss.
    grads = opt.compute_gradients(loss)

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        tf.histogram_summary(var.op.name + '/gradients', grad)

    apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

    with tf.control_dependencies([apply_gradients_op]):
      train_op = tf.identity(loss, name='train_op')

    chief_queue_runners = [opt.get_chief_queue_runner()]
    init_tokens_op = opt.get_init_tokens_op()

    saver = tf.train.Saver()

    # We run the summaries in the same thread as the training operations by
    # passing in None for summary_op to avoid a summary_thread being started.
    # Running summaries and training operations in parallel could run out of
    # GPU memory.
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=FLAGS.train_dir,
                             init_op=tf.initialize_all_variables(),
                             summary_op=tf.merge_all_summaries(),
                             global_step=global_step,
                             saver=saver,
                             save_model_secs=60)
    tf.logging.info('%s Supervisor' % datetime.now())

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)

    print ("Before session init")
    # Get a session.
    sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
    print ("Before session init done")

    # Start the queue runners.
    queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
    sv.start_queue_runners(sess, queue_runners)
    print ('Started %d queues for processing input data.'
           % len(queue_runners))

    sv.start_queue_runners(sess, chief_queue_runners)
    sess.run(init_tokens_op)

    print ('Start training')
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, gs = sess.run([train_op, loss, global_step])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d (global_step %d), loss = %.2f '
                      '(%.1f examples/sec; %.3f sec/batch)')
        print (format_str % (datetime.now(), step, gs, loss_value,
                             examples_per_sec, sec_per_batch))

    if is_chief:
      saver.save(sess,
                 os.path.join(FLAGS.train_dir, 'model.ckpt'),
                 global_step=global_step)

reader = tf.TextLineReader()
key, value = reader.read(filename_queue)

batch_size = 128
min_fraction_of_examples_in_queue = 0.4
num_examples_per_epoch = 50000
min_queue_examples = int(num_examples_per_epoch *
                         min_fraction_of_examples_in_queue)

images_batch, label_batch = \
    tf.train.shuffle_batch_join([read_example(value) for _ in range(9)],
                                batch_size=batch_size,
                                capacity=min_queue_examples + 3 * batch_size,
                                min_after_dequeue=min_queue_examples)

logits = cifar10.inference(images_batch)
loss = cifar10.loss(logits, label_batch)

global_step = tf.Variable(0, trainable=False)
train_op = cifar10.train(loss, global_step)

saver = tf.train.Saver(tf.all_variables())
summary_op = tf.merge_all_summaries()
init = tf.initialize_all_variables()

sess = tf.Session()
summary_writer = tf.train.SummaryWriter('./train',

import os
import urllib

import tensorflow as tf
from flask import Flask

import cifar10

cifar10.NUM_CLASSES = 6

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('checkpoint_path', '/tmp/model.ckpt',
                           """Directory where to read model checkpoints.""")
tf.app.flags.DEFINE_string('download_url', 'http://',
                           """URL to download the model checkpoint from.""")
tf.app.flags.DEFINE_integer('image_size', 32,
                            """Image size.""")
tf.app.flags.DEFINE_integer('port', 5000,
                            """Application port.""")

images = tf.placeholder(tf.float32,
                        shape=(1, FLAGS.image_size, FLAGS.image_size, 3))
logits = tf.nn.softmax(cifar10.inference(images))

sess = tf.Session()
saver = tf.train.Saver(tf.all_variables())
if not os.path.isfile(FLAGS.checkpoint_path):
  print('No checkpoint file found')
  print(urllib.urlretrieve(FLAGS.download_url, FLAGS.checkpoint_path))
saver.restore(sess, FLAGS.checkpoint_path)

app = Flask(__name__)
app.debug = True


@app.route('/', methods=['POST'])
def api():
  results = []

def train():
  """Train CIFAR-10 for a number of steps."""
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  print ('PS hosts are: %s' % ps_hosts)
  print ('Worker hosts are: %s' % worker_hosts)

  server = tf.train.Server(
      {'ps': ps_hosts, 'worker': worker_hosts},
      job_name=FLAGS.job_name,
      task_index=FLAGS.task_id)

  if FLAGS.job_name == 'ps':
    server.join()

  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

  device_setter = tf.train.replica_device_setter(ps_tasks=1)
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
    with tf.device(device_setter):
      global_step = tf.Variable(0, trainable=False)

      # Get images and labels for CIFAR-10.
      images, labels = cifar10.distorted_inputs()

      # Build a Graph that computes the logits predictions from the
      # inference model.
      logits = cifar10.inference(images)

      # Calculate loss.
      loss = cifar10.loss(logits, labels)
      train_op = cifar10.train(loss, global_step)
      saver = tf.train.Saver()

      # We run the summaries in the same thread as the training operations by
      # passing in None for summary_op to avoid a summary_thread being started.
      # Running summaries and training operations in parallel could run out of
      # GPU memory.
      sv = tf.train.Supervisor(is_chief=is_chief,
                               logdir=FLAGS.train_dir,
                               init_op=tf.initialize_all_variables(),
                               summary_op=tf.merge_all_summaries(),
                               global_step=global_step,
                               saver=saver,
                               save_model_secs=60)
      tf.logging.info('%s Supervisor' % datetime.now())

      sess_config = tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement)

      print ("Before session init")
      # Get a session.
      sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
      print ("Session init done")

      # Start the queue runners.
      queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
      sv.start_queue_runners(sess, queue_runners)
      print ('Started %d queues for processing input data.'
             % len(queue_runners))

      for step in xrange(FLAGS.max_steps):
        start_time = time.time()
        _, loss_value, gs = sess.run([train_op, loss, global_step])
        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d (global_step %d), loss = %.2f '
                        '(%.1f examples/sec; %.3f sec/batch)')
          print (format_str % (datetime.now(), step, gs, loss_value,
                               examples_per_sec, sec_per_batch))

      if is_chief:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)

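# Example invocation for the distributed variants above. The script name and
# host names are placeholders (not from the snippets); the flag names match
# the ones the code reads via FLAGS:
#
#   python cifar10_distributed_train.py --job_name=ps --task_id=0 \
#       --ps_hosts=ps0.example.com:2222 \
#       --worker_hosts=worker0.example.com:2222,worker1.example.com:2222
#
#   python cifar10_distributed_train.py --job_name=worker --task_id=0 \
#       --ps_hosts=ps0.example.com:2222 \
#       --worker_hosts=worker0.example.com:2222,worker1.example.com:2222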