def evaluate(dataset_path):
    """Evaluate model on Dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        train_dir = Path(FLAGS.checkpoint_dir)
        reference_shape = mio.import_pickle(train_dir / 'reference_shape.pkl')

        images, gt_truth, inits, _ = data_provider.batch_inputs(
            [dataset_path], reference_shape,
            batch_size=FLAGS.batch_size, is_training=False)

        mirrored_images, _, mirrored_inits, shapes = data_provider.batch_inputs(
            [dataset_path], reference_shape,
            batch_size=FLAGS.batch_size, is_training=False, mirror_image=True)

        print('Loading model...')
        # Build a Graph that computes the logits predictions from the
        # inference model.
        with tf.device(FLAGS.device):
            patch_shape = (FLAGS.patch_size, FLAGS.patch_size)
            pred, _, _ = mdm_model.model(images, inits, patch_shape=patch_shape)

            tf.get_variable_scope().reuse_variables()

            pred_mirrored, _, _ = mdm_model.model(
                mirrored_images, mirrored_inits, patch_shape=patch_shape)

        pred_images, = tf.py_func(utils.batch_draw_landmarks,
                                  [images, pred], [tf.float32])
        gt_images, = tf.py_func(utils.batch_draw_landmarks,
                                [images, gt_truth], [tf.float32])

        summaries = []
        summaries.append(tf.image_summary('images',
                                          tf.concat(2, [gt_images, pred_images]),
                                          max_images=5))

        avg_pred = pred + tf.py_func(flip_predictions,
                                     (pred_mirrored, shapes), (tf.float32, ))[0]
        avg_pred /= 2.

        # Calculate predictions.
        norm_error = mdm_model.normalized_rmse(avg_pred, gt_truth)

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            mdm_train.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_summary(summaries)

        graph_def = tf.get_default_graph().as_graph_def()
        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir,
                                                graph_def=graph_def)

        while True:
            _eval_once(saver, summary_writer, norm_error, summary_op)
            if FLAGS.run_once:
                break
            time.sleep(FLAGS.eval_interval_secs)

def evaluate(dataset_path):
    """Evaluate model on Dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Get images and labels from the dataset.
        # reference_shape = mio.import_pickle(
        #     FLAGS.checkpoint_dir + '/reference_shape.pkl')
        reference_shape = mio.import_pickle('reference_shape.pkl')
        print(reference_shape.shape)

        images, gt_truth, inits = data_provider.batch_inputs(
            [dataset_path], reference_shape,
            batch_size=FLAGS.batch_size, is_training=False)

        print('Loading model...')
        # Build a Graph that computes the logits predictions from the
        # inference model.
        with tf.device(FLAGS.device):
            pred, _, _ = mdm_model.model(images, inits)

        pred_images, = tf.py_func(utils.batch_draw_landmarks,
                                  [images, pred], [tf.float32])
        gt_images, = tf.py_func(utils.batch_draw_landmarks,
                                [images, gt_truth], [tf.float32])

        summaries = []
        summaries.append(
            tf.image_summary('images',
                             tf.concat(2, [gt_images, pred_images]),
                             max_images=5))

        # Calculate predictions.
        norm_error = mdm_model.normalized_rmse(pred, gt_truth)

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            mdm_train.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_summary(summaries)

        graph_def = tf.get_default_graph().as_graph_def()
        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir,
                                                graph_def=graph_def)

        while True:
            _eval_once(saver, summary_writer, norm_error, summary_op)
            if FLAGS.run_once:
                break
            time.sleep(FLAGS.eval_interval_secs)

def train(scope=''):
    """Train on dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals
        # the number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        train_dirs = FLAGS.datasets.split(':')

        # Calculate the learning rate schedule.
        num_batches_per_epoch = 100
        num_epochs_per_decay = 5
        decay_steps = int(num_batches_per_epoch * num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create an Adam optimizer with the decayed learning rate.
        opt = tf.train.AdamOptimizer(lr)

        # Override the number of preprocessing threads to account for the
        # increased number of GPU towers.
        num_preprocess_threads = FLAGS.num_preprocess_threads

        _images, _shapes, _reference_shape, pca_model = \
            data_provider.load_images(train_dirs)

        reference_shape = tf.constant(_reference_shape,
                                      dtype=tf.float32,
                                      name='reference_shape')

        image_shape = _images[0].shape
        lms_shape = _shapes[0].points.shape

        def get_random_sample(rotation_stddev=10):
            # Pick a random training image and apply mirroring / rotation
            # augmentation before it enters the input queue.
            idx = np.random.randint(low=0, high=len(_images))
            im = menpo.image.Image(_images[idx].transpose(2, 0, 1), copy=False)
            lms = _shapes[idx]
            im.landmarks['PTS'] = lms

            if np.random.rand() < .5:
                im = utils.mirror_image(im)

            if np.random.rand() < .5:
                theta = np.random.normal(scale=rotation_stddev)
                rot = menpo.transform.rotate_ccw_about_centre(lms, theta)
                im = im.warp_to_shape(im.shape, rot)

            pixels = im.pixels.transpose(1, 2, 0).astype('float32')
            shape = im.landmarks['PTS'].lms.points.astype('float32')
            return pixels, shape

        image, shape = tf.py_func(get_random_sample, [],
                                  [tf.float32, tf.float32])
        initial_shape = data_provider.random_shape(shape, reference_shape,
                                                   pca_model)
        image.set_shape(image_shape)
        shape.set_shape(lms_shape)
        initial_shape.set_shape(lms_shape)

        image = data_provider.distort_color(image)

        images, lms, inits = tf.train.batch([image, shape, initial_shape],
                                            FLAGS.batch_size,
                                            dynamic_pad=False,
                                            capacity=5000,
                                            enqueue_many=False,
                                            num_threads=num_preprocess_threads,
                                            name='batch')

        print('Defining model...')
        with tf.device(FLAGS.train_device):
            # Retain the summaries from the final tower.
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

            predictions, dxs, _ = mdm_model.model(images, inits)

            total_loss = 0
            for i, dx in enumerate(dxs):
                norm_error = mdm_model.normalized_rmse(dx + inits, lms)
                tf.histogram_summary('errors', norm_error)
                loss = tf.reduce_mean(norm_error)
                total_loss += loss
                summaries.append(
                    tf.scalar_summary('losses/step_{}'.format(i), loss))

            # Calculate the gradients for the batch of data.
            grads = opt.compute_gradients(total_loss)

            summaries.append(tf.scalar_summary('losses/total', total_loss))

            pred_images, = tf.py_func(utils.batch_draw_landmarks,
                                      [images, predictions], [tf.float32])
            gt_images, = tf.py_func(utils.batch_draw_landmarks,
                                    [images, lms], [tf.float32])

            summary = tf.image_summary('images',
                                       tf.concat(2, [gt_images, pred_images]),
                                       max_images=5)
            summaries.append(tf.histogram_summary('dx', predictions - inits))
            summaries.append(summary)

            batchnorm_updates = tf.get_collection(
                slim.ops.UPDATE_OPS_COLLECTION, scope)

        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.histogram_summary(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than it needs to be, but
        # we keep it for backward compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        # NOTE: Currently we are not using batchnorm in MDM.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()
        print('Initializing variables...')
        sess.run(init)
        print('Initialized variables.')

        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

        print('Starting training...')
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, total_loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, duration))

            if step % 10 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 50 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

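# train() above reads all of its configuration from tf.app.flags. The
# repository defines these flags elsewhere; the block below is only an
# illustrative sketch of the flags the function depends on, and every
# default value is an assumption rather than the authors' setting.
import tensorflow as tf  # assumed to already be imported at the top of the file

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('datasets', '',
                           'Colon-separated list of training datasets.')
tf.app.flags.DEFINE_string('train_dir', 'ckpt/train',
                           'Directory for checkpoints and summaries.')
tf.app.flags.DEFINE_string('train_device', '/gpu:0',
                           'Device on which the model graph is built.')
tf.app.flags.DEFINE_string('pretrained_model_checkpoint_path', '',
                           'Optional checkpoint to restore before training.')
tf.app.flags.DEFINE_integer('batch_size', 32, 'Images per training batch.')
tf.app.flags.DEFINE_integer('num_preprocess_threads', 4,
                            'Threads feeding the input queue.')
tf.app.flags.DEFINE_integer('max_steps', 100000, 'Number of training steps.')
tf.app.flags.DEFINE_float('initial_learning_rate', 0.001,
                          'Initial learning rate for the Adam optimizer.')
tf.app.flags.DEFINE_float('learning_rate_decay_factor', 0.97,
                          'Factor applied to the learning rate every '
                          'decay_steps steps.')
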
def train(scope=''):
    """Train on dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        # Create a variable to count the number of train() calls. This equals
        # the number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        train_dirs = FLAGS.datasets.split(':')

        # Calculate the learning rate schedule.
        decay_steps = 15000

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create an Adam optimizer with the decayed learning rate.
        opt = tf.train.AdamOptimizer(lr)

        # Override the number of preprocessing threads to account for the
        # increased number of GPU towers.
        num_preprocess_threads = FLAGS.num_preprocess_threads

        _images, _shapes, _reference_shape, pca_model = \
            data_provider.load_images(train_dirs)

        reference_shape = tf.constant(_reference_shape,
                                      dtype=tf.float32,
                                      name='reference_shape')

        image_shape = _images[0].shape
        lms_shape = _shapes[0].points.shape

        def get_random_sample(rotation_stddev=10):
            # Pick a random training image and apply mirroring / rotation
            # augmentation before it enters the input queue.
            idx = np.random.randint(low=0, high=len(_images))
            im = menpo.image.Image(_images[idx].transpose(2, 0, 1), copy=False)
            lms = _shapes[idx]
            im.landmarks['PTS'] = lms

            if np.random.rand() < .5:
                im = utils.mirror_image(im)

            if np.random.rand() < .5:
                theta = np.random.normal(scale=rotation_stddev)
                rot = menpo.transform.rotate_ccw_about_centre(lms, theta)
                im = im.warp_to_shape(im.shape, rot)

            pixels = im.pixels.transpose(1, 2, 0).astype('float32')
            shape = im.landmarks['PTS'].lms.points.astype('float32')
            return pixels, shape

        image, shape = tf.py_func(get_random_sample, [],
                                  [tf.float32, tf.float32], stateful=True)
        initial_shape = data_provider.random_shape(shape, reference_shape,
                                                   pca_model)
        image.set_shape(image_shape)
        shape.set_shape(lms_shape)
        initial_shape.set_shape(lms_shape)

        image = data_provider.distort_color(image)

        images, lms, inits = tf.train.batch([image, shape, initial_shape],
                                            FLAGS.batch_size,
                                            dynamic_pad=False,
                                            capacity=5000,
                                            enqueue_many=False,
                                            num_threads=num_preprocess_threads,
                                            name='batch')

        print('Defining model...')
        with tf.device(FLAGS.train_device):
            # Retain the summaries from the final tower.
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

            predictions, dxs, _ = mdm_model.model(
                images, inits,
                patch_shape=(FLAGS.patch_size, FLAGS.patch_size))

            total_loss = 0
            for i, dx in enumerate(dxs):
                norm_error = mdm_model.normalized_rmse(dx + inits, lms)
                tf.histogram_summary('errors', norm_error)
                loss = tf.reduce_mean(norm_error)
                total_loss += loss
                summaries.append(
                    tf.scalar_summary('losses/step_{}'.format(i), loss))

            # Calculate the gradients for the batch of data.
            grads = opt.compute_gradients(total_loss)

            summaries.append(tf.scalar_summary('losses/total', total_loss))

            pred_images, = tf.py_func(utils.batch_draw_landmarks,
                                      [images, predictions], [tf.float32])
            gt_images, = tf.py_func(utils.batch_draw_landmarks,
                                    [images, lms], [tf.float32])

            summary = tf.image_summary('images',
                                       tf.concat(2, [gt_images, pred_images]),
                                       max_images=5)
            summaries.append(tf.histogram_summary('dx', predictions - inits))
            summaries.append(summary)

            batchnorm_updates = tf.get_collection(
                slim.ops.UPDATE_OPS_COLLECTION, scope)

        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.histogram_summary(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than it needs to be, but
        # we keep it for backward compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        # NOTE: Currently we are not using batchnorm in MDM.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()
        print('Initializing variables...')
        sess.run(init)
        print('Initialized variables.')

        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

        print('Starting training...')
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, total_loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, duration))

            if step % 20 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 50 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

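# A hypothetical entry point for the training script, assuming it is run as a
# standalone module; the actual main() in the repository may differ.
def main(_):
    train()


if __name__ == '__main__':
    tf.app.run()
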
def evaluate(dataset_path):
    """Evaluate model on Dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        train_dir = Path(FLAGS.checkpoint_dir)
        reference_shape = mio.import_pickle(train_dir / 'reference_shape.pkl')

        images, gt_truth, inits, _ = data_provider.batch_inputs(
            [dataset_path], reference_shape,
            batch_size=FLAGS.batch_size, is_training=False)

        mirrored_images, _, mirrored_inits, shapes = data_provider.batch_inputs(
            [dataset_path], reference_shape,
            batch_size=FLAGS.batch_size, is_training=False, mirror_image=True)

        print('Loading model...')
        # Build a Graph that computes the logits predictions from the
        # inference model.
        with tf.device(FLAGS.device):
            patch_shape = (FLAGS.patch_size, FLAGS.patch_size)
            pred, _, _ = mdm_model.model(images, inits, patch_shape=patch_shape)

            tf.get_variable_scope().reuse_variables()

            pred_mirrored, _, _ = mdm_model.model(
                mirrored_images, mirrored_inits, patch_shape=patch_shape)

        pred_images, = tf.py_func(utils.batch_draw_landmarks,
                                  [images, pred], [tf.float32])
        gt_images, = tf.py_func(utils.batch_draw_landmarks,
                                [images, gt_truth], [tf.float32])

        summaries = []
        summaries.append(
            tf.summary.image('images',
                             tf.concat([gt_images, pred_images], 2),
                             max_outputs=5))

        avg_pred = pred + \
            tf.py_func(flip_predictions, (pred_mirrored, shapes),
                       (tf.float32, ))[0]
        avg_pred /= 2.

        # Calculate predictions.
        norm_error = mdm_model.normalized_rmse(avg_pred, gt_truth)

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            mdm_train.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge(summaries)

        graph_def = tf.get_default_graph().as_graph_def()
        summary_writer = tf.summary.FileWriter(FLAGS.eval_dir,
                                               graph_def=graph_def)

        while True:
            _eval_once(saver, summary_writer, norm_error, summary_op)
            if FLAGS.run_once:
                break
            time.sleep(FLAGS.eval_interval_secs)

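# flip_predictions is referenced by evaluate() but not defined in this file.
# Conceptually it maps the landmarks predicted on the horizontally mirrored
# images back into the original image frame so they can be averaged with
# `pred`. The function below is only an assumed stand-in: it reflects the
# x-coordinate about the image width, whereas a full implementation for
# 68-point annotations would also permute left/right symmetric landmark
# indices after the reflection.
import numpy as np  # assumed to already be imported at the top of the file


def flip_predictions(predictions, shapes):
    # `predictions` arrives from tf.py_func as a float32 array of
    # (batch, num_landmarks, 2) points in (y, x) order; each entry of
    # `shapes` is assumed to hold the (height, width) of the original image.
    flipped = []
    for pred, shape in zip(predictions, shapes):
        pred = pred.copy()
        pred[:, 1] = shape[1] - pred[:, 1]  # reflect x about the image width
        flipped.append(pred)
    return np.array(flipped, dtype=np.float32)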