def testMovingAverageVariables(self):
    """Checks that `ops.batch_norm` registers its moving statistics.

    Builds a batch-norm op (with `scale=True`) on a random image batch and
    verifies that the first two entries of `tf.moving_average_variables()`
    are named 'BatchNorm/moving_mean' and 'BatchNorm/moving_variance'.
    """
    height, width = 3, 3
    with self.test_session():
        images = tf.random_uniform((5, height, width, 3), seed=1)
        ops.batch_norm(images, scale=True)
        # Query the collection once instead of once per variable.
        moving_average_vars = tf.moving_average_variables()
        moving_mean = moving_average_vars[0]
        moving_variance = moving_average_vars[1]
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(moving_mean.op.name, 'BatchNorm/moving_mean')
        self.assertEqual(moving_variance.op.name,
                         'BatchNorm/moving_variance')
def testCreateVariablesWithoutCenterWithoutScale(self):
    """Checks the variables created by batch_norm without center and scale.

    With `center=False` and `scale=False` no 'beta' or 'gamma' variable may
    exist, while the first two moving-average variables must still be
    'BatchNorm/moving_mean' and 'BatchNorm/moving_variance'.
    """
    height, width = 3, 3
    with self.test_session():
        images = tf.random_uniform((5, height, width, 3), seed=1)
        ops.batch_norm(images, center=False, scale=False)
        # assertEqual replaces the deprecated assertEquals alias.
        beta = variables.get_variables_by_name('beta')
        self.assertEqual(beta, [])
        gamma = variables.get_variables_by_name('gamma')
        self.assertEqual(gamma, [])
        # Query the collection once instead of once per variable.
        moving_average_vars = tf.moving_average_variables()
        moving_mean = moving_average_vars[0]
        moving_variance = moving_average_vars[1]
        self.assertEqual(moving_mean.op.name, 'BatchNorm/moving_mean')
        self.assertEqual(moving_variance.op.name,
                         'BatchNorm/moving_variance')
def _CheckDecay(self, ema, actual_decay, dim):
    """Verifies that `ema` averages two variables and one tensor correctly.

    Args:
        ema: the moving-average object under test; must provide `apply`
            and `average`.
        actual_decay: decay expected to be in effect for each update.
        dim: size argument forwarded to `_Repeat` when building values.
    """

    def _decayed(previous, value):
        # One moving-average step using the decay under test.
        return previous * actual_decay + value * (1 - actual_decay)

    tens = _Repeat(10.0, dim)
    thirties = _Repeat(30.0, dim)
    var0 = tf.Variable(tens, name="v0")
    var1 = tf.Variable(thirties, name="v1")
    tf.initialize_all_variables().run()
    # tensor2 is a plain Tensor produced by the add op, not a Variable.
    tensor2 = var0 + var1
    update = ema.apply([var0, var1, tensor2])
    avg0 = ema.average(var0)
    avg1 = ema.average(var1)
    avg2 = ema.average(tensor2)

    self.assertItemsEqual([var0, var1], tf.moving_average_variables())

    # None of the shadow variables may be trainable.
    for shadow in (avg0, avg1, avg2):
        self.assertNotIn(shadow, tf.trainable_variables())
    tf.initialize_all_variables().run()

    expected_names = (
        ("v0/ExponentialMovingAverage:0", avg0),
        ("v1/ExponentialMovingAverage:0", avg1),
        ("add/ExponentialMovingAverage:0", avg2),
    )
    for expected_name, shadow in expected_names:
        self.assertEqual(expected_name, shadow.name)

    # Initial values of the sources.
    self.assertAllClose(tens, var0.eval())
    self.assertAllClose(thirties, var1.eval())
    self.assertAllClose(_Repeat(10.0 + 30.0, dim), tensor2.eval())

    # Averages of variables start at the variable's value; the average of a
    # Tensor starts at zeros because the producing op has not run yet.
    self.assertAllClose(tens, avg0.eval())
    self.assertAllClose(thirties, avg1.eval())
    self.assertAllClose(_Repeat(0.0, dim), avg2.eval())

    # Run two updates, checking every average after each one.
    expected0, expected1, expected2 = 10.0, 30.0, 0.0
    for _ in range(2):
        update.run()
        expected0 = _decayed(expected0, 10.0)
        self.assertAllClose(_Repeat(expected0, dim), avg0.eval())
        expected1 = _decayed(expected1, 30.0)
        self.assertAllClose(_Repeat(expected1, dim), avg1.eval())
        expected2 = _decayed(expected2, 10.0 + 30.0)
        self.assertAllClose(_Repeat(expected2, dim), avg2.eval())
def get_other_op(global_step):
    """Builds the variable-averaging op and the grouped batch-norm update op.

    Args:
        global_step: passed as the second argument of
            `tf.train.ExponentialMovingAverage`.

    Returns:
        A tuple `(variables_averages_op, batchnorm_updates_op)`.
    """
    # Ops registered under the UPDATE_OPS collection.
    pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    # Average every trainable variable plus the already-registered
    # moving-average variables.
    averager = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY,
                                                 global_step)
    tracked_variables = tf.trainable_variables()
    tracked_variables = tracked_variables + tf.moving_average_variables()
    averages_op = averager.apply(tracked_variables)

    updates_op = tf.group(*pending_updates)
    return averages_op, updates_op
def create_init_fn_to_restore(self, master_checkpoint,
                              inception_checkpoint=None):
    """Creates an init function that restores weights from checkpoints.

    Args:
        master_checkpoint: path to a checkpoint which contains all weights
            for the whole model.
        inception_checkpoint: path to a checkpoint which contains weights
            for the inception part only.

    Returns:
        A function taking a session that runs all collected assign ops.
    """
    pending_assign_ops = []
    merged_feed_dict = {}

    def _schedule_restore(restore_vars, checkpoint_path):
        # Record the assign op and feed values for one checkpoint; exits the
        # process when there is nothing to restore.
        logging.info('Request to re-store %d weights from %s',
                     len(restore_vars), checkpoint_path)
        if not restore_vars:
            logging.error("Can't find any variables to restore.")
            sys.exit(1)
        op, feeds = slim.assign_from_checkpoint(checkpoint_path,
                                                restore_vars)
        pending_assign_ops.append(op)
        merged_feed_dict.update(feeds)

    logging.info('variables_to_restore:\n%s' %
                 utils.variables_to_restore().keys())
    moving_average_names = [v.op.name for v in tf.moving_average_variables()]
    logging.info('moving_average_variables:\n%s' % moving_average_names)
    trainable_names = [v.op.name for v in tf.trainable_variables()]
    logging.info('trainable_variables:\n%s' % trainable_names)

    if master_checkpoint:
        _schedule_restore(utils.variables_to_restore(), master_checkpoint)

    if inception_checkpoint:
        inception_variables = utils.variables_to_restore(
            'AttentionOcr_v1/conv_tower_fn/INCE', strip_scope=True)
        _schedule_restore(inception_variables, inception_checkpoint)

    def init_assign_fn(sess):
        logging.info('Restoring checkpoint(s)')
        sess.run(pending_assign_ops, merged_feed_dict)

    return init_assign_fn
def add_train_step(self):
    """Builds the losses and the grouped training op.

    Reads `self.logits`, `self.ground_truth` and `self.optimizer`; stores
    the total loss in `self.total_loss` and the combined op (gradient
    update, variable averaging, batch-norm updates) in `self.train_step`.
    """
    with tf.variable_scope('taining'):
        # Main and auxiliary classification losses.
        main_loss = slim.losses.cross_entropy_loss(
            self.logits[0], self.ground_truth,
            label_smoothing=0.1, weight=1.0)
        aux_loss = slim.losses.cross_entropy_loss(
            self.logits[1], self.ground_truth,
            label_smoothing=0.1, weight=0.4, scope='aux_loss')
        classification_losses = [main_loss, aux_loss]

        reg_terms = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        summed_loss = tf.add_n(classification_losses + reg_terms,
                               name='total_loss')

        # Moving averages of the individual losses and of their sum.
        loss_ema = tf.train.ExponentialMovingAverage(0.9, name='avg')
        loss_ema_op = loss_ema.apply(classification_losses + [summed_loss])
        with tf.control_dependencies([loss_ema_op]):
            self.total_loss = tf.identity(summed_loss)

        gradient_step = self.optimizer.minimize(self.total_loss)

        # Moving averages of trainable and moving-average variables.
        var_ema = tf.train.ExponentialMovingAverage(
            inception.MOVING_AVERAGE_DECAY, num_updates=None)
        tracked_variables = tf.trainable_variables()
        tracked_variables = tracked_variables + tf.moving_average_variables()
        var_ema_op = var_ema.apply(tracked_variables)

        bn_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
        bn_updates_op = tf.group(*bn_updates)

        self.train_step = tf.group(gradient_step, var_ema_op, bn_updates_op)
def build_graph(self, filenames, labels, subset, feed_hypes=None):
    """Builds the input pipeline, model, loss and (for training) train op.

    As a side effect the list of graph operations is written to
    '/tmp/graph_ops.txt'.

    Args:
        filenames: image file names, fed to `tf.train.slice_input_producer`.
        labels: labels produced alongside `filenames`.
        subset: when equal to 'train', inputs are shuffled and randomly
            flipped and the optimizer/train op is built; any other value
            builds the inference and loss graph only.
        feed_hypes: optional iterable of hyper-parameter names. Each named
            entry of the (copied) hypes dict is replaced by a scalar
            float32 placeholder named after the key.

    Returns:
        A dict of fetches with keys 'loss', 'filenames' and 'logits', plus
        'reg_loss', 'train_op' and 'global_step' when `subset == 'train'`.
    """
    hypes = self.hypes.copy()
    if feed_hypes:
        with tf.name_scope(None):
            for i in feed_hypes:
                hypes[i] = tf.placeholder('float32', name=i)
                hypes[i].set_shape([])

    with tf.name_scope('inputs'):
        filenames, labels = tf.train.slice_input_producer(
            tensor_list=[filenames, labels],
            capacity=hypes['batch_size'] * 2,
            shuffle=(subset == 'train'))
        filenames, labels = tf.train.batch(
            tensor_list=[filenames, labels],
            capacity=hypes['batch_size'] * 2,
            batch_size=hypes['batch_size'])
        images0 = [
            tf.image.decode_jpeg(tf.read_file(i[0]), channels=3)
            for i in tf.split(0, hypes['batch_size'], filenames)
        ]
        images0 = [skin.util.square_pad(i) for i in images0]
        if subset == 'train':
            images0 = [tf.image.random_flip_left_right(i) for i in images0]
            images0 = [tf.image.random_flip_up_down(i) for i in images0]
        if hypes['spatial_transformer']:
            images = skin.util.spatial_tranform(
                images0, hypes['batch_size'], subset,
                hypes['loc_net'], hypes['xform_reg'])
        else:
            images = tf.pack(
                [tf.image.resize_images(i, 299, 299) for i in images0])

    # Re-export the batch under the un-scoped name 'input'.
    with tf.name_scope(None):
        images = tf.identity(images, name='input')

    logits, logits_aux = inception_model.inference(
        images=(images - 128) / 128.,
        num_classes=len(self.labels),
        for_training=(subset == 'train'),
        restore_logits=(subset != 'train'))

    with tf.name_scope(None):
        logits = tf.identity(logits, name='logits')
    tf.histogram_summary('logits', logits)

    with tf.name_scope('loss'):
        batch_size, num_classes = logits.get_shape().as_list()
        # Dense one-hot targets built from (row, label) index pairs.
        labels_sparse = tf.sparse_to_dense(
            sparse_indices=tf.transpose(
                tf.pack([tf.range(batch_size), labels])),
            output_shape=[batch_size, num_classes],
            sparse_values=np.ones(batch_size, dtype='float32'))
        loss = tf.nn.softmax_cross_entropy_with_logits(
            logits, labels_sparse)
        loss = tf.reduce_mean(loss, name='loss')
        loss_aux = tf.nn.softmax_cross_entropy_with_logits(
            logits_aux, labels_sparse)
        loss_aux = tf.reduce_mean(loss_aux, name='loss_aux')
        loss = 0.7 * loss + 0.3 * loss_aux
    tf.scalar_summary('loss', loss)

    fetches = {'loss': loss, 'filenames': filenames, 'logits': logits}

    def print_graph_ops():
        # Dump "<op type>\t<op name>" for every op in the default graph.
        with open('/tmp/graph_ops.txt', 'w') as f:
            for op in tf.get_default_graph().get_operations():
                f.write(op.type.ljust(35) + '\t' + op.name + '\n')

    if subset == 'train':
        reg_losses = tf.get_collection('regularization_losses')
        # Rescale the regularization terms whose name mentions 'loc_net'.
        for i, j in enumerate(reg_losses):
            if 'loc_net' in j.name:
                reg_losses[i] *= hypes['loc_net_reg']
        reg_loss = tf.add_n(reg_losses)
        tf.scalar_summary('reg_loss', reg_loss)
        with tf.variable_scope('reg_loss'):
            loss += reg_loss
        print_graph_ops()

        global_step = tf.Variable(0, name='global_step', trainable=False)
        # Direct attribute access replaces the former
        # eval('tf.train.{}Optimizer'.format('Adam')), which always resolved
        # to this same class.
        opt = tf.train.AdamOptimizer(
            learning_rate=hypes['learning_rate'],
            epsilon=hypes['epsilon'],
            beta1=hypes['beta1'],
            beta2=hypes['beta2'])
        grads = opt.compute_gradients(loss)
        apply_grads = opt.apply_gradients(grads, global_step)

        variable_averages = tf.train.ExponentialMovingAverage(
            hypes['variable_averages_decay'], global_step)
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(
            variables_to_average)
        batchnorm_updates_op = tf.group(*tf.get_collection('_update_ops_'))
        train_op = tf.group(apply_grads, variables_averages_op,
                            batchnorm_updates_op)

        for grad, var in grads:
            tf.histogram_summary(var.op.name, var)
            try:
                tf.histogram_summary(var.op.name + '/gradients', grad)
            except Exception:  # best effort: report the variable, move on
                # print() with a single argument behaves the same on
                # Python 2 and Python 3.
                print(var.op.name)

        fetches.update({
            'reg_loss': reg_loss,
            'train_op': train_op,
            'global_step': global_step
        })
    else:
        print_graph_ops()

    return fetches
def inception_model_fn(features, labels, mode, params):
    """Inception v3 model using Estimator API.

    Args:
        features: input tensor, or a dict holding it under key 'feature'.
        labels: label tensor; used to build the loss and evaluation metrics.
        mode: a `tf.estimator.ModeKeys` value.
        params: dict; `params['input_perm']` is forwarded to
            `tensor_transform_fn`.

    Returns:
        A `tf.estimator.EstimatorSpec` in PREDICT mode, otherwise a
        `contrib_tpu.TPUEstimatorSpec`.

    Raises:
        ValueError: if `FLAGS.precision` or `FLAGS.optimizer` holds an
            unsupported value.
    """
    num_classes = FLAGS.num_classes
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    is_eval = (mode == tf.estimator.ModeKeys.EVAL)

    if isinstance(features, dict):
        features = features['feature']

    features = tensor_transform_fn(features, params['input_perm'])

    # This nested function allows us to avoid duplicating the logic which
    # builds the network, for different values of --precision.
    def build_network():
        if FLAGS.precision == 'bfloat16':
            with contrib_tpu.bfloat16_scope():
                logits, end_points = inception.inception_v3(
                    features, num_classes, is_training=is_training)
            logits = tf.cast(logits, tf.float32)
        elif FLAGS.precision == 'float32':
            logits, end_points = inception.inception_v3(
                features, num_classes, is_training=is_training)
        else:
            # Previously this fell through to an unbound-local error.
            raise ValueError('Unsupported precision: %s' % FLAGS.precision)
        return logits, end_points

    if FLAGS.clear_update_collections:
        # updates_collections must be set to None in order to use fused
        # batchnorm
        with arg_scope(
                inception.inception_v3_arg_scope(
                    weight_decay=0.0,
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON,
                    updates_collections=None)):
            logits, end_points = build_network()
    else:
        with arg_scope(
                inception.inception_v3_arg_scope(
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON)):
            logits, end_points = build_network()

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
            not FLAGS.use_tpu):
        with tf.control_dependencies([
                tf.Print(predictions['classes'], [predictions['classes']],
                         summarize=FLAGS.eval_batch_size,
                         message='prediction: ')
        ]):
            labels = tf.Print(labels, [labels],
                              summarize=FLAGS.eval_batch_size,
                              message='label: ')

    one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)

    if 'AuxLogits' in end_points:
        tf.losses.softmax_cross_entropy(
            onehot_labels=one_hot_labels,
            logits=tf.cast(end_points['AuxLogits'], tf.float32),
            weights=0.4,
            label_smoothing=0.1,
            scope='aux_loss')

    tf.losses.softmax_cross_entropy(
        onehot_labels=one_hot_labels,
        logits=logits,
        weights=1.0,
        label_smoothing=0.1)

    losses = tf.add_n(tf.losses.get_losses())
    # L2 penalty over variables whose name contains 'weights' but not
    # 'BatchNorm'.
    l2_loss = []
    for v in tf.trainable_variables():
        if 'BatchNorm' not in v.name and 'weights' in v.name:
            l2_loss.append(tf.nn.l2_loss(v))
    loss = losses + WEIGHT_DECAY * tf.add_n(l2_loss)

    initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
    if FLAGS.use_learning_rate_warmup:
        # Adjust initial learning rate to match final warmup rate
        warmup_decay = FLAGS.learning_rate_decay**(
            (FLAGS.warmup_epochs + FLAGS.cold_epochs) /
            FLAGS.learning_rate_decay_epochs)
        adj_initial_learning_rate = initial_learning_rate * warmup_decay

    final_learning_rate = 0.0001 * initial_learning_rate

    host_call = None
    train_op = None
    if is_training:
        batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size
        global_step = tf.train.get_or_create_global_step()
        current_epoch = tf.cast(
            (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32)

        learning_rate = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=int(FLAGS.learning_rate_decay_epochs *
                            batches_per_epoch),
            decay_rate=FLAGS.learning_rate_decay,
            staircase=True)

        if FLAGS.use_learning_rate_warmup:
            wlr = 0.1 * adj_initial_learning_rate
            wlr_height = tf.cast(
                0.9 * adj_initial_learning_rate /
                (FLAGS.warmup_epochs + FLAGS.learning_rate_decay_epochs - 1),
                tf.float32)
            epoch_offset = tf.cast(FLAGS.cold_epochs - 1, tf.int32)
            exp_decay_start = (FLAGS.warmup_epochs + FLAGS.cold_epochs +
                               FLAGS.learning_rate_decay_epochs)
            lin_inc_lr = tf.add(
                wlr,
                tf.multiply(
                    tf.cast(tf.subtract(current_epoch, epoch_offset),
                            tf.float32),
                    wlr_height))
            # wlr before cold_epochs, the linear ramp until exp_decay_start,
            # the exponential schedule afterwards.
            learning_rate = tf.where(
                tf.greater_equal(current_epoch, FLAGS.cold_epochs),
                (tf.where(tf.greater_equal(current_epoch, exp_decay_start),
                          learning_rate, lin_inc_lr)),
                wlr)

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(learning_rate, final_learning_rate,
                                   name='learning_rate')

        if FLAGS.optimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif FLAGS.optimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate, momentum=0.9)
        elif FLAGS.optimizer == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate,
                RMSPROP_DECAY,
                momentum=RMSPROP_MOMENTUM,
                epsilon=RMSPROP_EPSILON)
        else:
            # The message needs a %s placeholder; without it the logging
            # call fails to format and the optimizer name is lost.
            tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)
            # Fail explicitly in case the fatal log does not abort.
            raise ValueError('Unknown optimizer: %s' % FLAGS.optimizer)

        if FLAGS.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)
        if FLAGS.moving_average:
            ema = tf.train.ExponentialMovingAverage(
                decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            with tf.control_dependencies([train_op]), tf.name_scope(
                    'moving_average'):
                train_op = ema.apply(variables_to_average)

        # To log the loss, current learning rate, and epoch for Tensorboard,
        # the summary op needs to be run on the host CPU via host_call.
        # host_call expects [batch_size, ...] Tensors, thus reshape to
        # introduce a batch dimension. These Tensors are implicitly
        # concatenated to [params['batch_size']].
        gs_t = tf.reshape(global_step, [1])
        loss_t = tf.reshape(loss, [1])
        lr_t = tf.reshape(learning_rate, [1])
        ce_t = tf.reshape(current_epoch, [1])

        if not FLAGS.skip_host_call:

            def host_call_fn(gs, loss, lr, ce):
                """Training host call. Creates scalar summaries for metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `metric_fn`, provide them as
                part of the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                    gs: `Tensor` with shape `[batch]` for the global_step.
                    loss: `Tensor` with shape `[batch]` for the training loss.
                    lr: `Tensor` with shape `[batch]` for the learning_rate.
                    ce: `Tensor` with shape `[batch]` for the current_epoch.

                Returns:
                    List of summary ops to run on the CPU host.
                """
                gs = gs[0]
                with summary.create_file_writer(
                        FLAGS.model_dir).as_default():
                    with summary.always_record_summaries():
                        summary.scalar('loss', tf.reduce_mean(loss), step=gs)
                        summary.scalar('learning_rate', tf.reduce_mean(lr),
                                       step=gs)
                        summary.scalar('current_epoch', tf.reduce_mean(ce),
                                       step=gs)
                        return summary.all_summary_ops()

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    eval_metrics = None
    if is_eval:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly
            reference any Tensors in the rest of the `model_fn`. To pass
            Tensors from the model to the `metric_fn`, provide as part of
            the `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the
            second element in the tuple passed to `eval_metrics`.

            Args:
                labels: `Tensor` with shape `[batch, ]`.
                logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
                A dict of the metrics to return from evaluation.
            """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'accuracy': top_1_accuracy,
                'accuracy@5': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=loss,
                                        train_op=train_op,
                                        host_call=host_call,
                                        eval_metrics=eval_metrics)
def train(target, dataset, cluster_spec):
    """Train Inception on a dataset for a number of steps.

    Builds the model, a synchronous-replica optimizer and a Supervisor, then
    runs the training loop until the supervisor asks to stop or the step
    exceeds `FLAGS.max_steps`. The chief worker (task 0) additionally runs
    summaries and saves a final checkpoint.

    Args:
        target: session target passed to
            `Supervisor.prepare_or_wait_for_session`.
        dataset: object providing `num_examples_per_epoch()` and
            `num_classes()`; also passed to
            `image_processing.distorted_inputs`.
        cluster_spec: object whose `as_dict()` has 'worker' and 'ps' entries.
    """
    # Number of workers and parameter servers are inferred from the workers
    # and ps hosts string.
    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])
    # If no value is given, num_replicas_to_aggregate defaults to be the
    # number of workers.
    if FLAGS.num_replicas_to_aggregate == -1:
        num_replicas_to_aggregate = num_workers
    else:
        num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

    # Both should be greater than 0 in a distributed training.
    assert num_workers > 0 and num_parameter_servers > 0, (' num_workers and '
                                                           'num_parameter_servers'
                                                           ' must be > 0.')

    # Choose worker 0 as the chief. Note that any worker could be the chief
    # but there should be only one chief.
    is_chief = (FLAGS.task_id == 0)

    # Ops are assigned to worker by default.
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        # Variables and its related init/assign ops are assigned to ps.
        with slim.scopes.arg_scope(
                [slim.variables.variable, slim.variables.global_step],
                device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
            # Create a variable to count the number of train() calls. This
            # equals the number of updates applied to the variables.
            global_step = slim.variables.global_step()

            # Calculate the learning rate schedule.
            num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                     FLAGS.batch_size)
            # Decay steps need to be divided by the number of replicas to
            # aggregate.
            decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                              num_replicas_to_aggregate)

            # Decay the learning rate exponentially based on the number of
            # steps.
            lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                            global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            # Add a summary to track the learning rate.
            tf.scalar_summary('learning_rate', lr)

            # Create an optimizer that performs gradient descent.
            opt = tf.train.RMSPropOptimizer(lr,
                                            RMSPROP_DECAY,
                                            momentum=RMSPROP_MOMENTUM,
                                            epsilon=RMSPROP_EPSILON)

            images, labels = image_processing.distorted_inputs(
                dataset,
                batch_size=FLAGS.batch_size,
                num_preprocess_threads=FLAGS.num_preprocess_threads)

            # Number of classes in the Dataset label set plus 1.
            # Label 0 is reserved for an (unused) background class.
            num_classes = dataset.num_classes() + 1
            logits = inception.inference(images, num_classes, for_training=True)
            # Add classification loss.
            inception.loss(logits, labels)

            # Gather all of the losses including regularization losses.
            losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
            losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)

            total_loss = tf.add_n(losses, name='total_loss')

            if is_chief:
                # Compute the moving average of all individual losses and the
                # total loss.
                loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
                loss_averages_op = loss_averages.apply(losses + [total_loss])

                # Attach a scalar summary to all individual losses and the
                # total loss; do the same for the averaged version of the
                # losses.
                for l in losses + [total_loss]:
                    loss_name = l.op.name
                    # Name each loss as '(raw)' and name the moving average
                    # version of the loss as the original loss name.
                    tf.scalar_summary(loss_name + ' (raw)', l)
                    tf.scalar_summary(loss_name, loss_averages.average(l))

                # Add dependency to compute loss_averages.
                with tf.control_dependencies([loss_averages_op]):
                    total_loss = tf.identity(total_loss)

            # Track the moving averages of all trainable variables.
            # Note that we maintain a 'double-average' of the
            # BatchNormalization global statistics.
            # This is not needed when the number of replicas are small but
            # important for synchronous distributed training with tens of
            # workers/replicas.
            exp_moving_averager = tf.train.ExponentialMovingAverage(
                inception.MOVING_AVERAGE_DECAY, global_step)

            variables_to_average = (
                tf.trainable_variables() + tf.moving_average_variables())

            # Add histograms for model variables.
            for var in variables_to_average:
                tf.histogram_summary(var.op.name, var)

            # Create synchronous replica optimizer.
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                replica_id=FLAGS.task_id,
                total_num_replicas=num_workers,
                variable_averages=exp_moving_averager,
                variables_to_average=variables_to_average)

            batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
            assert batchnorm_updates, 'Batchnorm updates are missing'
            batchnorm_updates_op = tf.group(*batchnorm_updates)
            # Add dependency to compute batchnorm_updates.
            with tf.control_dependencies([batchnorm_updates_op]):
                total_loss = tf.identity(total_loss)

            # Compute gradients with respect to the loss.
            grads = opt.compute_gradients(total_loss)

            # Add histograms for gradients.
            for grad, var in grads:
                if grad is not None:
                    tf.histogram_summary(var.op.name + '/gradients', grad)

            apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

            # Fetching train_op yields the loss value and forces the
            # gradient update to run first.
            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(total_loss, name='train_op')

            # Get chief queue_runners, init_tokens and clean_up_op, which is
            # used to synchronize replicas.
            # More details can be found in sync_replicas_optimizer.
            chief_queue_runners = [opt.get_chief_queue_runner()]
            init_tokens_op = opt.get_init_tokens_op()
            clean_up_op = opt.get_clean_up_op()

            # Create a saver.
            saver = tf.train.Saver()

            # Build the summary operation based on the TF collection of
            # Summaries.
            summary_op = tf.merge_all_summaries()

            # Build an initialization operation to run below.
            init_op = tf.initialize_all_variables()

            # We run the summaries in the same thread as the training
            # operations by passing in None for summary_op to avoid a
            # summary_thread being started. Running summaries and training
            # operations in parallel could run out of GPU memory.
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=init_op,
                                     summary_op=None,
                                     global_step=global_step,
                                     saver=saver,
                                     save_model_secs=FLAGS.save_interval_secs)

            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)

            # Get a session.
            sess = sv.prepare_or_wait_for_session(target, config=sess_config)

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            tf.logging.info('Started %d queues for processing input data.',
                            len(queue_runners))

            if is_chief:
                sv.start_queue_runners(sess, chief_queue_runners)
                sess.run(init_tokens_op)

            # Train, checking for Nans. Concurrently run the summary
            # operation at a specified interval. Note that the summary_op and
            # train_op never run simultaneously in order to prevent running
            # out of GPU memory.
            next_summary_time = time.time() + FLAGS.save_summaries_secs
            while not sv.should_stop():
                try:
                    start_time = time.time()
                    loss_value, step = sess.run([train_op, global_step])
                    assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
                    if step > FLAGS.max_steps:
                        break
                    duration = time.time() - start_time

                    # Log throughput every 30th step.
                    if step % 30 == 0:
                        examples_per_sec = FLAGS.batch_size / float(duration)
                        format_str = ('Worker %d: %s: step %d, loss = %.2f'
                                      '(%.1f examples/sec; %.3f sec/batch)')
                        tf.logging.info(format_str %
                                        (FLAGS.task_id, datetime.now(), step, loss_value,
                                         examples_per_sec, duration))

                    # Determine if the summary_op should be run on the chief
                    # worker.
                    if is_chief and next_summary_time < time.time():
                        tf.logging.info('Running Summary operation on the chief.')
                        summary_str = sess.run(summary_op)
                        sv.summary_computed(sess, summary_str)
                        tf.logging.info('Finished running Summary operation.')

                        # Determine the next time for running the summary.
                        next_summary_time += FLAGS.save_summaries_secs
                except:
                    # Catch-all on purpose: the chief runs the clean-up op
                    # for any failure, then the exception is re-raised.
                    if is_chief:
                        tf.logging.info('About to execute sync_clean_up_op!')
                        sess.run(clean_up_op)
                    raise

            # Stop the supervisor. This also waits for service threads to
            # finish.
            sv.stop()

            # Save after the training ends.
            if is_chief:
                saver.save(sess,
                           os.path.join(FLAGS.train_dir, 'model.ckpt'),
                           global_step=global_step)
def main(argv=None):
    """Starts this task's server and, on workers, runs the training loop.

    Reads the cluster layout from FLAGS, starts a `tf.train.Server`, and
    for the 'ps' job joins the server (task 0 also starts the RPC server
    created by `SspManager`). The remaining code builds the Inception
    training graph with a feedable batch size and runs `FLAGS.max_steps`
    steps, calling `rpcClient.check_staleness` after each step.

    Args:
        argv: unused.
    """
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    tf.logging.info('PS hosts are: %s' % ps_hosts)
    tf.logging.info('Worker hosts are: %s' % worker_hosts)

    cluster_spec = tf.train.ClusterSpec({
        'ps': ps_hosts,
        'worker': worker_hosts
    })
    server = tf.train.Server({
        'ps': ps_hosts,
        'worker': worker_hosts
    }, job_name=FLAGS.job_name, task_index=FLAGS.task_id, protocol=FLAGS.protocol)

    # NOTE(review): the meaning of the second SspManager argument (5) is not
    # visible here - presumably a staleness bound; confirm.
    sspManager = SspManager(len(worker_hosts), 5)
    if FLAGS.job_name == 'ps':
        if FLAGS.task_id == 0:
            # The RPC server is bound to the host part of the first ps
            # address.
            rpcServer = sspManager.create_rpc_server(ps_hosts[0].split(':')[0])
            rpcServer.serve()
        server.join()

    # NOTE(review): presumably gives the RPC server time to come up before
    # the client connects; confirm.
    time.sleep(5)
    rpcClient = sspManager.create_rpc_client(ps_hosts[0].split(':')[0])

    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()

    # Worker 0 is the chief and creates the training directory if needed.
    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if not tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.MakeDirs(FLAGS.train_dir)

    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])

    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with slim.scopes.arg_scope(
                [slim.variables.variable, slim.variables.global_step],
                device=slim.variables.VariableDeviceChooser(
                    num_parameter_servers)):
            '''Prepare Input'''
            global_step = slim.variables.global_step()
            # The batch size is a scalar placeholder fed on every step.
            batch_size = tf.placeholder(dtype=tf.int32, shape=(), name='batch_size')
            images, labels = image_processing.distorted_inputs(
                dataset, batch_size, num_preprocess_threads=FLAGS.num_preprocess_threads)
            num_classes = dataset.num_classes() + 1

            '''Inference'''
            logits = inception.inference(images, num_classes, for_training=True)

            '''Loss'''
            inception.loss(logits, labels, batch_size)
            losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
            losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            total_loss = tf.add_n(losses, name='total_loss')
            if is_chief:
                # On the chief, evaluating total_loss also updates the loss
                # moving averages.
                loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
                loss_averages_op = loss_averages.apply(losses + [total_loss])
                with tf.control_dependencies([loss_averages_op]):
                    total_loss = tf.identity(total_loss)

            '''Optimizer'''
            # NOTE(review): exp_moving_averager and variables_to_average are
            # not used again in the visible code - confirm whether they are
            # still needed.
            exp_moving_averager = tf.train.ExponentialMovingAverage(
                inception.MOVING_AVERAGE_DECAY, global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                     FLAGS.batch_size)
            decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                              num_workers)
            lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                            global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            opt = tf.train.RMSPropOptimizer(lr,
                                            RMSPROP_DECAY,
                                            momentum=RMSPROP_MOMENTUM,
                                            epsilon=RMSPROP_EPSILON)

            '''Train Operation'''
            batchnorm_updates = tf.get_collection(
                slim.ops.UPDATE_OPS_COLLECTION)
            assert batchnorm_updates, 'Batchnorm updates are missing'
            batchnorm_updates_op = tf.group(*batchnorm_updates)
            with tf.control_dependencies([batchnorm_updates_op]):
                total_loss = tf.identity(total_loss)
            naive_grads = opt.compute_gradients(total_loss)
            # Scale every gradient by fed batch size / FLAGS.batch_size.
            # NOTE(review): batch_size is an int32 placeholder, so this ratio
            # may truncate depending on division semantics, and a None
            # gradient is not skipped here - confirm both.
            grads = [(tf.scalar_mul(
                tf.cast(batch_size / FLAGS.batch_size, tf.float32), grad), var)
                     for grad, var in naive_grads]
            apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)
            # Fetching train_op yields the loss value and forces the gradient
            # update to run first.
            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(total_loss, name='train_op')

            '''Supervisor and Session'''
            saver = tf.train.Saver()
            init_op = tf.global_variables_initializer()
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=init_op,
                                     summary_op=None,
                                     global_step=global_step,
                                     recovery_wait_secs=1,
                                     saver=saver,
                                     save_model_secs=FLAGS.save_interval_secs)
            tf.logging.info('%s Supervisor' % datetime.now())
            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)

            '''Start Training'''
            sv.start_queue_runners(sess, queue_runners)
            tf.logging.info('Started %d queues for processing input data.',
                            len(queue_runners))
            batch_size_num = FLAGS.batch_size
            for step in range(FLAGS.max_steps):
                start_time = time.time()
                # NOTE(review): full tracing is requested on every step, but
                # run_metadata is not read in the visible code - confirm it
                # is intended.
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                loss_value, gs = sess.run(
                    [train_op, global_step],
                    feed_dict={batch_size: batch_size_num},
                    options=run_options,
                    run_metadata=run_metadata)
                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'
                duration = time.time() - start_time
                examples_per_sec = batch_size_num / float(duration)
                sec_per_batch = float(duration)
                format_str = (
                    "time: " + str(time.time()) +
                    '; %s: step %d (gs %d), loss= %.2f (%.1f samples/s; %.3f s/batch)'
                )
                tf.logging.info(format_str %
                                (datetime.now(), step, gs, loss_value,
                                 examples_per_sec, sec_per_batch))
                # Report the local step of this task to the staleness
                # manager.
                rpcClient.check_staleness(FLAGS.task_id, step)
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) # Add histograms for trainable variables. for var in tf.trainable_variables(): summaries.append(tf.histogram_summary(var.op.name, var)) # Track the moving averages of all trainable variables. # Note that we maintain a "double-average" of the BatchNormalization # global statistics. This is more complicated then need be but we employ # this for backward-compatibility with our previous models. variable_averages = tf.train.ExponentialMovingAverage( inception.MOVING_AVERAGE_DECAY, global_step) # Another possiblility is to use tf.slim.get_variables(). variables_to_average = (tf.trainable_variables() + tf.moving_average_variables()) variables_averages_op = variable_averages.apply(variables_to_average) # Group all updates to into a single train op. batchnorm_updates_op = tf.group(*batchnorm_updates) train_op = tf.group(apply_gradient_op, variables_averages_op, batchnorm_updates_op) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation from the last tower summaries. summary_op = tf.merge_summary(summaries) # Build an initialization operation to run below. init = tf.initialize_all_variables()
def model_fn(features, labels, mode, params):
    """Mobilenet v1 model using Estimator API.

    Args:
        features: Input tensor, or a dict holding it under the 'feature' key.
        labels: Class labels; they are one-hot encoded here with
            `params['num_classes']` classes.
        mode: A `tf.estimator.ModeKeys` value.
        params: Dict of hyper-parameters. Keys read by this function:
            'num_classes', 'input_perm', 'clear_update_collections',
            'depth_multiplier', 'use_tpu', 'eval_batch_size',
            'learning_rate', 'train_batch_size', 'num_train_images',
            'learning_rate_decay_epochs', 'learning_rate_decay',
            'optimizer', 'moving_average' and 'use_logits'.

    Returns:
        A `tf.estimator.EstimatorSpec` in PREDICT mode, otherwise a
        `tf.contrib.tpu.TPUEstimatorSpec`.

    Raises:
        ValueError: In TRAIN mode, if `params['optimizer']` is not one of
            'sgd', 'momentum' or 'RMS'.
    """
    num_classes = params['num_classes']
    training_active = (mode == tf.estimator.ModeKeys.TRAIN)
    eval_active = (mode == tf.estimator.ModeKeys.EVAL)

    if isinstance(features, dict):
        features = features['feature']

    features = supervised_images.tensor_transform_fn(
        features, params['input_perm'])

    # NOTE(review): both branches below build exactly the same network; the
    # `updates_collections=None` setting mentioned in the comment is not
    # applied anywhere in this function - confirm whether that is intended.
    if params['clear_update_collections']:
        # updates_collections must be set to None in order to use fused
        # batchnorm
        with arg_scope(mobilenet_v1.mobilenet_v1_arg_scope()):
            logits, end_points = mobilenet_v1.mobilenet_v1(
                features,
                num_classes,
                is_training=training_active,
                depth_multiplier=params['depth_multiplier'])
    else:
        with arg_scope(mobilenet_v1.mobilenet_v1_arg_scope()):
            logits, end_points = mobilenet_v1.mobilenet_v1(
                features,
                num_classes,
                is_training=training_active,
                depth_multiplier=params['depth_multiplier'])

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
            not params['use_tpu']):
        # Print predictions and labels while evaluating (debugging aid).
        with tf.control_dependencies([
                tf.Print(predictions['classes'], [predictions['classes']],
                         summarize=params['eval_batch_size'],
                         message='prediction: ')
        ]):
            labels = tf.Print(labels, [labels],
                              summarize=params['eval_batch_size'],
                              message='label: ')

    one_hot_labels = tf.one_hot(labels, params['num_classes'], dtype=tf.int32)

    # The cross-entropy term is registered in the losses collection and
    # picked up, together with regularization, by get_total_loss().
    tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                    logits=logits,
                                    weights=1.0,
                                    label_smoothing=0.1)
    loss = tf.losses.get_total_loss(add_regularization_losses=True)

    # The base learning rate is scaled by train_batch_size / 256.
    initial_learning_rate = (
        params['learning_rate'] * params['train_batch_size'] / 256)
    final_learning_rate = 0.0001 * initial_learning_rate

    train_op = None
    if training_active:
        batches_per_epoch = (
            params['num_train_images'] // params['train_batch_size'])
        global_step = tf.train.get_or_create_global_step()
        learning_rate = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=(params['learning_rate_decay_epochs'] *
                         batches_per_epoch),
            decay_rate=params['learning_rate_decay'],
            staircase=True)

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(learning_rate,
                                   final_learning_rate,
                                   name='learning_rate')

        if params['optimizer'] == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif params['optimizer'] == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate, momentum=0.9)
        elif params['optimizer'] == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  RMSPROP_DECAY,
                                                  momentum=RMSPROP_MOMENTUM,
                                                  epsilon=RMSPROP_EPSILON)
        else:
            # The message needs a %s placeholder for its argument, and the
            # function must not continue with `optimizer` unbound.
            tf.logging.fatal('Unknown optimizer: %s', params['optimizer'])
            raise ValueError(
                'Unknown optimizer: %s' % params['optimizer'])

        if params['use_tpu']:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        # Run the UPDATE_OPS collection before each optimizer step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)

        if params['moving_average']:
            ema = tf.train.ExponentialMovingAverage(
                decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            # The averaging op becomes the train op and runs after the
            # optimizer step.
            with tf.control_dependencies([train_op]), tf.name_scope(
                    'moving_average'):
                train_op = ema.apply(variables_to_average)

    eval_metrics = None
    if eval_active:

        def metric_fn(labels, predictions):
            """Return the accuracy metric for the given predictions."""
            accuracy = tf.metrics.accuracy(
                labels, tf.argmax(input=predictions, axis=1))
            return {'accuracy': accuracy}

        if params['use_logits']:
            eval_predictions = logits
        else:
            eval_predictions = end_points['Predictions']

        eval_metrics = (metric_fn, [labels, eval_predictions])

    return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                           loss=loss,
                                           train_op=train_op,
                                           eval_metrics=eval_metrics)
def resnet_model_fn(features, labels, mode, params):
    """Returns the model function.

    Args:
        features: Dict holding the input tensor under the 'feature' key.
        labels: Dict holding the labels under the 'label' key.
        mode: A `tf.estimator.ModeKeys` value.
        params: Dict of hyper-parameters. Keys read here: 'batch_size',
            'label_smoothing' and 'momentum'. The dict is also passed on to
            `model_utils.get_label` and `model.resnet_v1_model`.

    Returns:
        A `tf.estimator.EstimatorSpec` carrying the loss, the train op (TRAIN
        mode only) and the eval metric ops (EVAL mode only).

    Raises:
        ValueError: If `FLAGS.model_type` is not 'resnet', or, in TRAIN mode,
            if `FLAGS.optimizer` is not one of 'momentum', 'RMS' or 'adam'.
    """
    global_step = tf.train.get_global_step()
    feature = features['feature']
    labels = labels['label']
    one_hot_labels = model_utils.get_label(labels,
                                           params,
                                           bird_num_classes,
                                           batch_size=params['batch_size'])

    def get_logits():
        """Return the logits."""
        end_points, aux_logits = None, None
        if FLAGS.model_type == 'resnet':
            avg_pool = model.resnet_v1_model(feature, labels, mode, params)
        else:
            # Raised explicitly: a bare `assert False` is stripped under -O
            # and would leave `avg_pool` unbound.
            raise ValueError(
                'Unsupported model_type: %s' % FLAGS.model_type)

        name = 'final_dense_dst'
        with tf.variable_scope('target_CLS'):
            logits = tf.layers.dense(
                inputs=avg_pool,
                units=bird_num_classes,
                kernel_initializer=tf.random_normal_initializer(stddev=.01),
                name=name)
            # `end_points` is never set above, so this auxiliary head is
            # currently not built.
            if end_points is not None:
                aux_pool = end_points['AuxLogits_Pool']
                aux_logits = tf.layers.dense(
                    inputs=aux_pool,
                    units=bird_num_classes,
                    kernel_initializer=tf.random_normal_initializer(
                        stddev=.001),
                    name='Aux{}'.format(name))
        return logits, aux_logits, end_points

    logits, _, _ = get_logits()
    logits = tf.cast(logits, tf.float32)

    if FLAGS.model_type == 'resnet':
        dst_loss = tf.losses.softmax_cross_entropy(
            logits=logits,
            weights=1.,
            onehot_labels=one_hot_labels,
            label_smoothing=params['label_smoothing'])
        # L2 penalty over all trainable variables except batch-norm ones.
        dst_l2_loss = FLAGS.weight_decay * tf.add_n([
            tf.nn.l2_loss(v)
            for v in tf.trainable_variables()
            if 'batch_normalization' not in v.name
        ])
        loss = dst_loss + dst_l2_loss

    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        cur_finetune_step = tf.train.get_global_step()
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            # The same schedule is used for every model type.
            finetune_learning_rate = rampcosine()

            if FLAGS.optimizer == 'momentum':
                optimizer = tf.train.MomentumOptimizer(
                    learning_rate=finetune_learning_rate,
                    momentum=params['momentum'],
                    use_nesterov=True)
            elif FLAGS.optimizer == 'RMS':
                optimizer = tf.train.RMSPropOptimizer(
                    finetune_learning_rate,
                    RMSPROP_DECAY,
                    momentum=RMSPROP_MOMENTUM,
                    epsilon=RMSPROP_EPSILON)
            elif FLAGS.optimizer == 'adam':
                optimizer = tf.train.AdamOptimizer(finetune_learning_rate)
            else:
                # Without this branch `optimizer` would be unbound below.
                raise ValueError(
                    'Unsupported optimizer: %s' % FLAGS.optimizer)

            optimizer = tf.SyncReplicasOptimizer(
                optimizer,
                replicas_to_aggregate=FLAGS.sync_replicas,
                total_num_replicas=run_config.num_worker_replicas)

            # NOTE(review): this train op is immediately replaced by the
            # `minimize` call below; kept because building it may affect
            # optimizer/graph state - confirm whether it can be dropped.
            train_op = tf.contrib.training.create_train_op(loss, optimizer)
            with tf.variable_scope('finetune'):
                train_op = optimizer.minimize(loss, cur_finetune_step)

            if FLAGS.moving_average:
                ema = tf.train.ExponentialMovingAverage(
                    decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
                variables_to_average = (tf.trainable_variables() +
                                        tf.moving_average_variables())
                # The averaging op becomes the train op and runs after the
                # optimizer step.
                with tf.control_dependencies([train_op]):
                    with tf.name_scope('moving_average'):
                        train_op = ema.apply(variables_to_average)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics = model_utils.metric_fn(labels, logits)

    if mode == tf.estimator.ModeKeys.TRAIN:
        with tf.control_dependencies([train_op]):
            tf.summary.scalar('classifier/finetune_loss', loss)
            tf.summary.scalar('classifier/finetune_lr',
                              finetune_learning_rate)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metrics,
    )
def train():
    """Train CIFAR-10 for a number of steps.

    Starts a `tf.train.Server` for this task as described by
    `FLAGS.ps_hosts`, `FLAGS.worker_hosts`, `FLAGS.job_name` and
    `FLAGS.task_id`. Parameter-server tasks block in `server.join()`.
    Worker tasks build the CIFAR-10 graph, wrap a gradient-descent optimizer
    in a `tf.train.SyncReplicasOptimizer` and run `FLAGS.max_steps` steps
    under a `tf.train.Supervisor`. Task 0 is the chief: it recreates
    `FLAGS.train_dir` and writes a final checkpoint.
    """
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print('PS hosts are: %s' % ps_hosts)
    print('Worker hosts are: %s' % worker_hosts)

    server = tf.train.Server(
        {'ps': ps_hosts, 'worker': worker_hosts},
        job_name=FLAGS.job_name,
        task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        # `ps` jobs wait for incoming connections from the workers.
        server.join()

    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        # The chief starts from an empty training directory.
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)

    cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
    device_setter = tf.train.replica_device_setter(cluster=cluster)

    with tf.device(device_setter):
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        num_batches_per_epoch = (
            NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)
        tf.scalar_summary('learning_rate', lr)

        opt = tf.train.GradientDescentOptimizer(lr)

        # Track the moving averages of all trainable variables.
        exp_moving_averager = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())

        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=len(worker_hosts),
            replica_id=FLAGS.task_id,
            total_num_replicas=len(worker_hosts),
            variable_averages=exp_moving_averager,
            variables_to_average=variables_to_average)

        # Compute gradients with respect to the loss.
        grads = opt.compute_gradients(loss)

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                tf.histogram_summary(var.op.name + '/gradients', grad)

        apply_gradients_op = opt.apply_gradients(
            grads, global_step=global_step)

        # Fetching `train_op` returns the loss and forces the update to run.
        with tf.control_dependencies([apply_gradients_op]):
            train_op = tf.identity(loss, name='train_op')

        # These must be created before the session, since they add ops to
        # the graph.
        chief_queue_runners = [opt.get_chief_queue_runner()]
        init_tokens_op = opt.get_init_tokens_op()

        saver = tf.train.Saver()

        # We run the summaries in the same thread as the training operations
        # by passing in None for summary_op to avoid a summary_thread being
        # started. Running summaries and training operations in parallel
        # could run out of GPU memory.
        # NOTE(review): the call below actually passes
        # tf.merge_all_summaries() as summary_op, not None - confirm intent.
        sv = tf.train.Supervisor(is_chief=is_chief,
                                 logdir=FLAGS.train_dir,
                                 init_op=tf.initialize_all_variables(),
                                 summary_op=tf.merge_all_summaries(),
                                 global_step=global_step,
                                 saver=saver,
                                 save_model_secs=60)

    tf.logging.info('%s Supervisor' % datetime.now())

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)

    print("Before session init")
    # Get a session.
    sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
    print("Before session init done")

    # Start the queue runners.
    queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
    sv.start_queue_runners(sess, queue_runners)
    print('Started %d queues for processing input data.' % len(queue_runners))

    # Only the chief drives the synchronization queues and seeds the initial
    # tokens; doing this on every worker would enqueue extra tokens and
    # start redundant chief queue runners.
    if is_chief:
        sv.start_queue_runners(sess, chief_queue_runners)
        sess.run(init_tokens_op)

    print('Start training')
    for step in xrange(FLAGS.max_steps):
        start_time = time.time()
        _, loss_value, gs = sess.run([train_op, loss, global_step])
        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
            num_examples_per_step = FLAGS.batch_size
            examples_per_sec = num_examples_per_step / duration
            sec_per_batch = float(duration)

            format_str = ('%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f '
                          'sec/batch)')
            print(format_str % (datetime.now(), step, gs, loss_value,
                                examples_per_sec, sec_per_batch))

    # NOTE(review): assumed to be a final save after the loop (the
    # Supervisor already saves every 60 seconds) - confirm against the
    # original layout.
    if is_chief:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)
def test_restore_ema(self):
    """Check that EMA shadow values can be restored into the plain variables.

    Fits a line to synthetic data while tracking exponential moving
    averages of `W` and `b`, saves a checkpoint, re-imports the graph and
    restores the checkpoint through a name map that points each
    `<var>/ExponentialMovingAverage` entry at the plain variable. The
    restored `W` and `b` must then equal the averaged values recorded before
    saving.
    """
    # Create 100 phony x, y data points in NumPy, y = x * 0.1 + 0.3
    x_data = np.random.rand(100).astype(np.float32)
    y_data = x_data * 0.1 + 0.3

    # Try to find values for W and b that compute y_data = W * x_data + b
    # (We know that W should be 0.1 and b 0.3, but TensorFlow will
    # figure that out for us.)
    W = tf.Variable(tf.random_uniform([1], -1.0, 1.0), name='W')
    b = tf.Variable(tf.zeros([1]), name='b')
    y = W * x_data + b

    # Minimize the mean squared errors.
    loss = tf.reduce_mean(tf.square(y - y_data))
    optimizer = tf.train.GradientDescentOptimizer(0.5)
    opt_op = optimizer.minimize(loss)

    # Track the moving averages of all trainable variables.
    ema = tf.train.ExponentialMovingAverage(decay=0.9999)
    averages_op = ema.apply(tf.trainable_variables())
    with tf.control_dependencies([opt_op]):
        train_op = tf.group(averages_op)

    # Before starting, initialize the variables. We will 'run' this first.
    init = tf.global_variables_initializer()

    # Save every variable: the EMA shadow variables are not trainable, so a
    # saver restricted to tf.trainable_variables() would leave them out of
    # the checkpoint and the EMA restore below could not find them.
    saver = tf.train.Saver()

    # Launch the graph.
    sess = tf.Session()
    sess.run(init)

    # Fit the line.
    for _ in range(201):
        sess.run(train_op)

    w_reference = sess.run('W/ExponentialMovingAverage:0')
    b_reference = sess.run('b/ExponentialMovingAverage:0')

    saver.save(sess, os.path.join(self.tmp_dir, "model_ex1"))
    sess.close()

    # Start over from the exported meta graph.
    tf.reset_default_graph()
    tf.train.import_meta_graph(os.path.join(self.tmp_dir, "model_ex1.meta"))

    with tf.Session() as sess:
        print('------------------------------------------------------')
        for var in tf.global_variables():
            print('all variables: ' + var.op.name)
        for var in tf.trainable_variables():
            print('normal variable: ' + var.op.name)
        for var in tf.moving_average_variables():
            print('ema variable: ' + var.op.name)
        print('------------------------------------------------------')

        # Two ways of building the "checkpoint name -> variable" map;
        # mode 1 spells out the name that mode 0 asks the EMA object for.
        mode = 1
        restore_vars = {}
        if mode == 0:
            ema = tf.train.ExponentialMovingAverage(1.0)
            for var in tf.trainable_variables():
                print('%s: %s' % (ema.average_name(var), var.op.name))
                restore_vars[ema.average_name(var)] = var
        elif mode == 1:
            for var in tf.trainable_variables():
                ema_name = var.op.name + '/ExponentialMovingAverage'
                print('%s: %s' % (ema_name, var.op.name))
                restore_vars[ema_name] = var

        saver = tf.train.Saver(restore_vars, name='ema_restore')
        saver.restore(sess, os.path.join(self.tmp_dir, "model_ex1"))

        w_restored = sess.run('W:0')
        b_restored = sess.run('b:0')

    # The failure message must be passed as `msg`; the third positional
    # argument of assertAlmostEqual is `places`.
    self.assertAlmostEqual(
        w_reference, w_restored,
        msg='Restored model modes not use the EMA filtered weight')
    self.assertAlmostEqual(
        b_reference, b_restored,
        msg='Restored model modes not use the EMA filtered bias')
def train(*args, **kwargs):
    """Build the training/validation graph and run the training loop.

    All configuration is taken from `kwargs`; positional arguments are
    ignored.

    Required keys: 'model_graph' (callable building the network), 'loss'
    (callable building the loss), 'train_iter', 'valid_iter', 'image_shape',
    'learning_rate', 'optimizer', 'gpus', 'model_name', 'train_scopes',
    'MOVING_AVERAGE_DECAY', 'gpu_fraction', 'log_device_placement',
    'pretrained_checkpoint_dir', 'max_steps', 'tr_num_examples',
    'va_num_examples' and 'batch_size'. Depending on the optimizer,
    'momentum', 'rho', 'num_epochs_per_decay', 'learning_rate_decay_factor',
    'RMSPROP_DECAY' and 'RMSPROP_EPSILON' are read as well.

    Optional keys: 'metrics', 'output_dir' (a timestamped default is used
    when missing or empty), 'UPDATE_OPS_COLLECTION', 'evaluation_step',
    'summary_step', 'save_step', 'train_image_summary' and
    'validation_image_summary'.

    Raises:
        KeyError: If 'model_graph' or 'loss' is missing, or if 'optimizer'
            does not name a supported optimizer.
    """
    # Get all necessary parameters from kwargs.
    if 'model_graph' not in kwargs:
        logging.error('(model_graph) was not provided!')
        raise KeyError('(model_graph) was not provided!')
    my_model_graph = kwargs['model_graph']

    if 'loss' not in kwargs:
        logging.error('(losses) was not provided!')
        raise KeyError('(losses) was not provided!')
    my_loss = kwargs['loss']

    # Metric operations are optional.
    my_metric_ops = kwargs.get('metrics')

    # Fall back to a timestamped output directory. `.get` is used so that a
    # missing key does not raise before the default can be applied.
    if not kwargs.get('output_dir'):
        kwargs['output_dir'] = 'output_dir/train_dir/%s' % datetime.now(
        ).strftime('%Y_%m_%d_%H.%M')
    logging.info('Saving evaluation results to: {}'.format(
        kwargs['output_dir']))

    # Add train iterator
    train_iter = kwargs['train_iter']
    train_iter.initialize()
    train_iter.load_img_lst()
    train_data = train_iter.data_batch()
    train_label = train_iter.label_batch()

    # Add validation iterator
    valid_iter = kwargs['valid_iter']
    valid_iter.initialize()
    valid_iter.load_img_lst()
    valid_data = valid_iter.data_batch()
    valid_label = valid_iter.label_batch()

    # Define global step
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)

    # Placeholders and summaries used to log example images.
    test_image_to_log = tf.placeholder(
        tf.uint8,
        [40, kwargs['image_shape'][-3], kwargs['image_shape'][-2], 3])
    log_image_test = tf.summary.image("Test examples",
                                      test_image_to_log,
                                      max_outputs=40)
    train_image_to_log = tf.placeholder(
        tf.uint8,
        [40, kwargs['image_shape'][-3], kwargs['image_shape'][-2], 3])
    log_image_train = tf.summary.image("Train examples",
                                       train_image_to_log,
                                       max_outputs=40)

    # Select optimizer
    lr = kwargs['learning_rate']
    if kwargs['optimizer'] == 'GradientDescentOptimizer':
        opt = tf.train.GradientDescentOptimizer(kwargs['learning_rate'])
    elif kwargs['optimizer'] == 'MomentumOptimizer':
        opt = tf.train.MomentumOptimizer(kwargs['learning_rate'],
                                         kwargs['momentum'])
    elif kwargs['optimizer'] == 'AdamOptimizer':
        opt = tf.train.AdamOptimizer(kwargs['learning_rate'])
    elif kwargs['optimizer'] == 'AdadeltaOptimizer':
        opt = tf.train.AdadeltaOptimizer(kwargs['learning_rate'],
                                         kwargs['rho'])
    elif kwargs['optimizer'] == 'RMSPropOptimizer':
        # Only this optimizer uses a decaying learning rate.
        decay_steps = int(kwargs['tr_num_examples'] / kwargs['batch_size'] *
                          kwargs['num_epochs_per_decay'])
        lr = tf.train.exponential_decay(kwargs['learning_rate'],
                                        global_step,
                                        decay_steps,
                                        kwargs['learning_rate_decay_factor'],
                                        staircase=True)
        opt = tf.train.RMSPropOptimizer(lr,
                                        kwargs['RMSPROP_DECAY'],
                                        momentum=kwargs['momentum'],
                                        epsilon=kwargs['RMSPROP_EPSILON'])
    else:
        logging.error('Hyperparameter "optimizer" was not provided!')
        raise KeyError('Hyperparameter "optimizer" was not provided!')

    logging.info('Selected Optimizer: {}'.format(kwargs['optimizer']))

    # Only the first GPU id is used.
    gpu_id = kwargs['gpus']
    if not isinstance(gpu_id, list):
        gpu_id = [gpu_id]
    gpu_id = gpu_id[0]

    with tf.device('/gpu:%d' % gpu_id):
        logging.info('Training on gpu:{}'.format(gpu_id))

        # Get endpoint / or get tensor from session.graph
        out_, train_eps_ = my_model_graph(train_data,
                                          restore_logits=False,
                                          is_training=True,
                                          reuse=None,
                                          scope=kwargs['model_name'],
                                          **kwargs)

        # Add loss operation
        loss_op = my_loss(out_, train_label, 'train', **kwargs)

        # Add loss-averages for training
        tr_loss_averages_op = metrics.add_loss_averages(
            tf.get_collection('train'), 'train_summaries')

        # Add learning rate to summary
        train_summaries = [tf.summary.scalar('learning_rate', lr)]
        train_summaries += tf.get_collection('train_summaries')

        # Calculate and apply selected gradients
        if kwargs['train_scopes']:
            ws = []
            # Find all parameters in the train scopes
            for tr_scope in kwargs['train_scopes']:
                logging.info('Add to training endpoints: {}'.format(tr_scope))
                with tf.variable_scope(tr_scope, reuse=True) as scope:
                    # Drop the leading scope component and the trailing
                    # two-character output suffix from each variable name.
                    w_names = [
                        '/'.join(i.name.split('/')[1:])[:-2]
                        for i in tf.get_collection(
                            key=tf.GraphKeys.TRAINABLE_VARIABLES,
                            scope=scope.name)
                    ]
                    ws += [tf.get_variable(w_name) for w_name in w_names]
                    for w_name in w_names:
                        logging.info('({})-paramter: {}'.format(
                            tr_scope, w_name))
            # Compute gradients for the selected parameters only
            grads = opt.compute_gradients(loss_op, ws)
            apply_gradient_op = opt.apply_gradients(grads,
                                                    global_step=global_step)
        else:
            # Update all parameters
            logging.info('Adding all parameters to training endpoints')
            grads = opt.compute_gradients(loss_op)
            apply_gradient_op = opt.apply_gradients(grads,
                                                    global_step=global_step)

        # Get batchnorm moving mean and variance updates
        if 'UPDATE_OPS_COLLECTION' in kwargs:
            logging.debug('add batchnorm updates')
            batchnorm_updates = tf.get_collection(
                kwargs['UPDATE_OPS_COLLECTION'])
            batchnorm_updates_op = tf.group(*batchnorm_updates)

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than need be but we
        # employ this for backward-compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            kwargs['MOVING_AVERAGE_DECAY'], global_step)

        # Update moving averages of all parameters
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates
        if 'UPDATE_OPS_COLLECTION' in kwargs:
            logging.debug('batchnorm updates in train_op')
            train_op = tf.group(apply_gradient_op, variables_averages_op,
                                batchnorm_updates_op)
        else:
            logging.debug('no batchnorm updates in train_op')
            train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Add evaluation graph after training step
        test_out_, _ = my_model_graph(valid_data,
                                      restore_logits=False,
                                      is_training=False,
                                      reuse=True,
                                      scope=kwargs['model_name'],
                                      **kwargs)

        # Add validation metrics and averages
        test_loss_op = my_loss(test_out_, valid_label, 'validation', **kwargs)
        if my_metric_ops is not None:
            test_metric_ops_list = my_metric_ops(test_out_, valid_label,
                                                 'validation', **kwargs)

        # Add loss-averages for validation
        va_loss_averages_op = metrics.add_loss_averages(
            tf.get_collection('validation'), 'validation_summaries')
        validation_summaries = tf.get_collection('validation_summaries')

        # Build summary operation
        train_summary_op = tf.summary.merge(train_summaries)
        validation_summary_op = tf.summary.merge(validation_summaries)

        # Build an initialization operation to run below.
        init_op = tf.initialize_all_variables()

    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=kwargs['gpu_fraction'])

    # Define a session
    sess = tf.Session(config=tf.ConfigProto(
        gpu_options=gpu_options,
        allow_soft_placement=True,
        log_device_placement=kwargs['log_device_placement']))

    # initialize all variables
    sess.run(init_op)

    # restore checkpoint and create saver
    if kwargs['pretrained_checkpoint_dir']:
        ckpt = tf.train.get_checkpoint_state(
            kwargs['pretrained_checkpoint_dir'])
        ignore_missing_vars = True
        print(
            '----------------\nrestoring checkpoint: {} ignore_missing_vars={}'
            .format(ckpt.model_checkpoint_path, ignore_missing_vars))
        init_fn, _ = restore_checkpoint(
            sess,
            ckpt.model_checkpoint_path,
            var_list=tf.all_variables(),
            ignore_missing_vars=ignore_missing_vars,
            reshape_variables=False)
        init_fn(sess)
        print(
            'checkpoint restored: {} ignoring missing vars={}\n------------------------'
            .format(ckpt.model_checkpoint_path, ignore_missing_vars))

    saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)

    # Start the queue runners.
    coord = None
    if train_iter.need_queue_runners() or valid_iter.need_queue_runners():
        logging.debug('Create coordinator, start queue runners...')
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    summary_writer = tf.summary.FileWriter(kwargs['output_dir'],
                                           graph=sess.graph)

    # Summary ops fed through placeholders with the mean losses computed in
    # Python once per epoch (not per batch).
    tr_summary_op_train, tr_loss_placeholder = create_summary_op(
        loss_op.name.split(':')[0])
    if my_metric_ops is None:
        va_summary_op_train, va_loss_placeholder = create_summary_op(
            test_loss_op.name.split(':')[0])
    else:
        # One summary op / placeholder pair per metric.
        va_summary_op_train_list, va_loss_placeholder_list = [], []
        for cnt, m in enumerate(test_metric_ops_list):
            if cnt == 10:
                va_summary_op_train, va_loss_placeholder = create_summary_op(
                    'va/' + test_loss_op.name.split(':')[0] + '_average_' +
                    str(cnt))
            else:
                va_summary_op_train, va_loss_placeholder = create_summary_op(
                    'va/' + test_loss_op.name.split(':')[0] + '_channelid_' +
                    str(cnt))
            va_summary_op_train_list.append(va_summary_op_train)
            va_loss_placeholder_list.append(va_loss_placeholder)

    # Each `step` of this loop is one pass over the training data.
    for step in range(kwargs['max_steps']):
        epoch_start = time.time()

        # Training step
        if step % 1 == 0:
            num_iter = int(
                math.ceil(
                    float(kwargs['tr_num_examples']) / kwargs['batch_size']))
            train_step = 0
            train_loss = []
            start_time = time.time()
            while train_step < num_iter and not should_stop(coord):
                train_iter.read_batch()
                train_data_feed = train_iter.get_data_batch()
                train_label_feed = train_iter.get_label_batch()
                if (train_data_feed is not None and
                        train_label_feed is not None):
                    # Merge data and label dicts
                    train_data.update(train_label)
                    train_data_feed.update(train_label_feed)
                    data_keys = train_data.keys()
                    data_feed_keys = train_data_feed.keys()
                    assert data_keys == data_feed_keys
                    train_loss_, _ = sess.run([loss_op, train_op],
                                              feed_dict={
                                                  train_data[k]:
                                                  train_data_feed[k]
                                                  for k in data_keys
                                              })
                else:
                    # The iterator feeds the graph itself.
                    train_loss_, _ = sess.run([loss_op, train_op])

                assert not np.isnan(
                    train_loss_), 'Model diverged with training-loss = NaN'
                train_loss += [train_loss_]
                train_step += 1

            mean_loss_tr = np.mean(train_loss)
            summary_str = sess.run(
                tr_summary_op_train,
                feed_dict={tr_loss_placeholder: mean_loss_tr})
            summary_writer.add_summary(summary_str,
                                       (step * kwargs['tr_num_examples']))

            duration = time.time() - start_time
            examples_per_sec = kwargs['batch_size'] / float(duration)
            format_str = ('Epoch %d, tr_loss = %.5f (%.1f examples/sec; %.3f '
                          'sec/epoch)')
            logging.info(format_str %
                         (step, mean_loss_tr, examples_per_sec, duration))

        # Evaluation step
        evaluation_step = 1 if 'evaluation_step' not in kwargs else kwargs[
            'evaluation_step']
        if step % evaluation_step == 0:
            num_iter = int(
                math.ceil(
                    float(kwargs['va_num_examples']) / kwargs['batch_size']))
            test_step = 0
            test_loss = []
            start_time = time.time()
            while test_step < num_iter and not should_stop(coord):
                valid_iter.read_batch()
                valid_data_feed = valid_iter.get_data_batch()
                valid_label_feed = valid_iter.get_label_batch()
                if (valid_data_feed is not None and
                        valid_label_feed is not None):
                    # Merge data and label dicts
                    valid_data.update(valid_label)
                    valid_data_feed.update(valid_label_feed)
                    data_keys = valid_data.keys()
                    data_feed_keys = valid_data_feed.keys()
                    assert data_keys == data_feed_keys
                    if my_metric_ops is None:
                        test_loss_ = sess.run([test_loss_op],
                                              feed_dict={
                                                  valid_data[k]:
                                                  valid_data_feed[k]
                                                  for k in data_keys
                                              })
                        assert not np.isnan(
                            test_loss_
                        ), 'Model diverged with validation-loss = NaN'
                    else:
                        test_loss_ = sess.run(test_metric_ops_list,
                                              feed_dict={
                                                  valid_data[k]:
                                                  valid_data_feed[k]
                                                  for k in data_keys
                                              })
                else:
                    if my_metric_ops is None:
                        test_loss_ = sess.run([test_loss_op])
                        assert not np.isnan(
                            test_loss_
                        ), 'Model diverged with validation-loss = NaN'
                    else:
                        test_loss_ = sess.run(test_metric_ops_list)

                test_loss += [test_loss_]
                test_step += 1

            if my_metric_ops is None:
                mean_loss_va = np.mean(test_loss)
                summary_str = sess.run(
                    va_summary_op_train,
                    feed_dict={va_loss_placeholder: mean_loss_va})
                summary_writer.add_summary(summary_str,
                                           (step * kwargs['tr_num_examples']))

                duration = time.time() - start_time
                examples_per_sec = kwargs['batch_size'] / float(duration)
                format_test_str = (
                    'Epoch %d, va_loss = %.5f (%.1f examples/sec, %.3f '
                    'sec/epoch)')
                logging.info(
                    format_test_str %
                    (step, mean_loss_va, examples_per_sec, duration))
            else:
                # Average every metric over the validation batches.
                test_loss = np.array(test_loss)
                test_loss = np.mean(test_loss, axis=0)
                for cnt, l in enumerate(test_loss):
                    summary_str = sess.run(
                        va_summary_op_train_list[cnt],
                        feed_dict={va_loss_placeholder_list[cnt]: l})
                    summary_writer.add_summary(
                        summary_str, (step * kwargs['tr_num_examples']))

                duration = time.time() - start_time
                examples_per_sec = kwargs['batch_size'] / float(duration)
                format_test_str = (
                    'Epoch %d, va_loss_total = %.5f (%.1f examples/sec, %.3f '
                    'sec/epoch)')
                logging.info(
                    format_test_str %
                    (step, test_loss[0], examples_per_sec, duration))

        # Image summaries
        summary_step = 1 if 'summary_step' not in kwargs else kwargs[
            'summary_step']
        if step % summary_step == 0:
            logging.debug('Add summary string...')
            if train_data_feed is not None and train_label_feed is not None:
                # Run all output operations and summary ops
                out_.update({'train_summary_op': train_summary_op})
                out = sess.run(out_,
                               feed_dict={
                                   train_data[k]: train_data_feed[k]
                                   for k in data_keys
                               })
                # Add image summaries
                if 'train_image_summary' in kwargs:
                    out.update(train_data_feed)
                    out.update({'step': step, 'mode': 'train'})
                    img_logs = kwargs['train_image_summary'](kwargs, **out)
                    list_of_log_images = [img for img, _ in img_logs]
                    feed = {train_image_to_log: np.array(list_of_log_images)}
                    train_image_summary_str = sess.run(log_image_train,
                                                       feed_dict=feed)
                    summary_writer.add_summary(train_image_summary_str)
            else:
                # Run all output operations and summary ops
                out_.update({'train_summary_op': train_summary_op})
                # Add input data and labels to this run
                out_.update(train_data)
                out_.update(train_label)
                out = sess.run(out_)
                if 'train_image_summary' in kwargs:
                    # Add current step and mode to image summary function
                    # input
                    out.update({'step': step, 'mode': 'train'})
                    img_logs = kwargs['train_image_summary'](kwargs, **out)
                    list_of_log_images = [img for img, _ in img_logs]
                    feed = {train_image_to_log: np.array(list_of_log_images)}
                    train_image_summary_str = sess.run(log_image_train,
                                                       feed_dict=feed)
                    summary_writer.add_summary(train_image_summary_str)

            if valid_data_feed is not None and valid_label_feed is not None:
                test_out_.update(
                    {'validation_summary_op': validation_summary_op})
                out = sess.run(test_out_,
                               feed_dict={
                                   valid_data[k]: valid_data_feed[k]
                                   for k in data_keys
                               })
                # Add image summaries
                if 'validation_image_summary' in kwargs:
                    out.update(valid_data_feed)
                    out.update({'step': step, 'mode': 'validation'})
                    img_logs = kwargs['validation_image_summary'](kwargs,
                                                                  **out)
                    list_of_log_images = [img for img, _ in img_logs]
                    feed = {test_image_to_log: np.array(list_of_log_images)}
                    test_image_summary_str = sess.run(log_image_test,
                                                      feed_dict=feed)
                    summary_writer.add_summary(test_image_summary_str)
            else:
                # Run all output operations and the summary ops
                test_out_.update(
                    {'validation_summary_op': validation_summary_op})
                # Add input data and labels to summary run
                test_out_.update(valid_data)
                test_out_.update(valid_label)
                out = sess.run(test_out_)
                if 'validation_image_summary' in kwargs:
                    out.update({'step': step, 'mode': 'validation'})
                    img_logs = kwargs['validation_image_summary'](kwargs,
                                                                  **out)
                    list_of_log_images = [img for img, _ in img_logs]
                    feed = {test_image_to_log: np.array(list_of_log_images)}
                    test_image_summary_str = sess.run(log_image_test,
                                                      feed_dict=feed)
                    summary_writer.add_summary(test_image_summary_str)

        # Save the model checkpoint periodically (never at step 0).
        save_step = 1 if 'save_step' not in kwargs else kwargs['save_step']
        if (step % save_step == 0 or
                (step + 1) == kwargs['max_steps']) and step != 0:
            logging.info('Saving checkpoint to: {}, step: {}'.format(
                kwargs['output_dir'], step))
            checkpoint_path = os.path.join(kwargs['output_dir'],
                                           'new-model.ckpt')
            saver.save(sess, checkpoint_path, global_step=global_step)

        logging.info('Time per Epoch: {}'.format(time.time() - epoch_start))

    # Join threads and close session
    if train_iter.need_queue_runners() or valid_iter.need_queue_runners():
        logging.debug('request coordinater stop, joining threads...')
        coord.request_stop()
        coord.join(threads)
    sess.close()
def batch_normalization(
        x,
        training,
        name="batch_normalization",
        decay=0.99,
        epsilon=1e-5,
        global_norm=True):
    """Batch-normalize `x`, switching on `training` via `tf.cond`.

    Args:
        x: Input tensor; its static shape is read with `get_shape()`.
        training: Predicate passed to `tf.cond`. When true, the batch
            statistics are used and the moving statistics are updated;
            otherwise the moving statistics are used.
        name: Variable scope holding beta, gamma and the moving statistics.
        decay: Decay of the moving mean / variance update.
        epsilon: Value passed as variance epsilon to
            `tf.nn.batch_normalization`.
        global_norm: If True, statistics are computed over all axes except
            the last one. If False, they are computed over axis 0 only, with
            the reduced axis kept.

    Returns:
        The output tensor of `tf.cond` over the two normalization branches.
    """
    input_dims = x.get_shape().as_list()

    if global_norm:
        # One statistic per entry of the last axis.
        stat_shape = input_dims[-1:]
        reduce_axes = list(range(len(input_dims) - 1))
        keep_reduced = False
    else:
        # One statistic per position of every non-leading axis.
        stat_shape = [1] + input_dims[1:]
        reduce_axes = [0]
        keep_reduced = True

    with tf.variable_scope(name):
        beta = tf.get_variable(
            name="beta",
            shape=stat_shape,
            initializer=tf.constant_initializer(0.0),
        )
        gamma = tf.get_variable(
            name="gamma",
            shape=stat_shape,
            initializer=tf.random_normal_initializer(1.0, 0.02),
        )
        moving_mean = tf.get_variable(
            name="moving_mean",
            shape=stat_shape,
            initializer=tf.constant_initializer(0.0),
            trainable=False,
        )
        moving_var = tf.get_variable(
            name="moving_var",
            shape=stat_shape,
            initializer=tf.constant_initializer(1.0),
            trainable=False,
        )

    # Register the moving statistics once, so that they show up in
    # tf.moving_average_variables().
    if moving_mean not in tf.moving_average_variables():
        for statistic in (moving_mean, moving_var):
            tf.add_to_collection(
                tf.GraphKeys.MOVING_AVERAGE_VARIABLES, statistic)

    def train_mode():
        """Normalize with batch statistics and update the moving ones."""
        batch_mean, batch_var = tf.nn.moments(
            x,
            axes=reduce_axes,
            keep_dims=keep_reduced,
        )
        mean_step = (1-decay) * (moving_mean-batch_mean)
        update_mean = tf.assign_sub(moving_mean, mean_step)
        var_step = (1-decay) * (moving_var-batch_var)
        update_var = tf.assign_sub(moving_var, var_step)

        # The output depends on both updates, so they run with it.
        with tf.control_dependencies([update_mean, update_var]):
            normalized = tf.nn.batch_normalization(
                x, batch_mean, batch_var, beta, gamma, epsilon)
        return normalized

    def test_mode():
        """Normalize with the stored moving statistics."""
        return tf.nn.batch_normalization(
            x, moving_mean, moving_var, beta, gamma, epsilon)

    return tf.cond(training, train_mode, test_mode)
def main():
    """Build the landmark model graph and run a live webcam tracker.

    The function builds input queues, the model (under the ``scopernn``
    variable scope, once for training and once reusing the variables for
    validation), losses, summaries, the optimizer and the moving-average
    ops. It then initializes all variables and, when
    ``FLAGS.pretrained_model_checkpoint_path`` is set, restores variables
    from that checkpoint.

    Afterwards it reads frames from camera 0. The first usable frame is
    searched for a face with ``face_cascade``; from then on each frame is
    initialized from the previous prediction. ``predictions_val`` is
    evaluated by feeding the cropped frame and the initial shape directly,
    and the result is drawn on the frame.

    The training ops are only built here; no training step is run. The loop
    ends when ``q`` is pressed or the camera stops delivering frames.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Step counter consumed by the learning-rate schedule, the optimizer
        # and the moving-average decay.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = 100
        num_epochs_per_decay = 5
        decay_steps = int(num_batches_per_epoch * num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create the optimizer.
        opt = tf.train.AdamOptimizer(lr)

        # Set the number of preprocessing threads
        num_preprocess_threads = FLAGS.num_preprocess_threads

        # Only the first 10 samples of each split are read.
        with h5py.File("../data/training_celeba_FaceTracker.h5") as hf:
            _images_train = hf["training_data"][:10]
            _landmarks_train = hf["training_landmarks"][:10]
            mean_landmarks = hf["mean_landmarks"][:]

        with h5py.File("../data/validation_celeba_FaceTracker.h5") as hf:
            _images_val = hf["validation_data"][:10]
            _landmarks_val = hf["validation_landmarks"][:10]

        # Load the mean vector and std of (true_landmark - perturbed_landmark)
        try:
            delta_mean = np.load("../data/delta_mean.npy")
            delta_std = np.load("../data/delta_std.npy")
        except Exception:
            # Best effort: any failure to read the cached statistics falls
            # back to recomputing them. KeyboardInterrupt and SystemExit are
            # deliberately not swallowed.
            delta_mean, delta_std = get_perturbation_statistics()

        image_shape = _images_train[0].shape
        lms_shape = _landmarks_train[0].shape

        def get_random_sample():
            # Draw one training image with its landmarks and a perturbed
            # starting shape.
            idx = np.random.randint(0, len(_images_train))
            shape = _landmarks_train[idx].astype("float32")
            initial_shape = sample_perturbation(
                shape, mean_landmarks).astype("float32")
            return _images_train[idx].astype("float32"), shape, initial_shape

        image, shape, initial_shape = tf.py_func(
            get_random_sample, [],
            [tf.float32, tf.float32, tf.float32],
            name="random_sample_train")

        # py_func outputs have no static shape; restore it for batching.
        image.set_shape(image_shape)
        shape.set_shape(lms_shape)
        initial_shape.set_shape(lms_shape)

        images, lms, inits = tf.train.batch([image, shape, initial_shape],
                                            FLAGS.batch_size,
                                            dynamic_pad=False,
                                            capacity=1000,
                                            enqueue_many=False,
                                            num_threads=num_preprocess_threads,
                                            name='train_img_batch')

        def get_random_sample_val():
            # Same as get_random_sample, drawing from the validation split.
            idx = np.random.randint(0, len(_images_val))
            shape = _landmarks_val[idx].astype("float32")
            initial_shape = sample_perturbation(
                shape, mean_landmarks).astype("float32")
            return _images_val[idx].astype("float32"), shape, initial_shape

        image_val, shape_val, initial_shape_val = tf.py_func(
            get_random_sample_val, [],
            [tf.float32, tf.float32, tf.float32],
            name="random_sample_val")
        image_val.set_shape(image_shape)
        shape_val.set_shape(lms_shape)
        initial_shape_val.set_shape(lms_shape)

        images_val, lms_val, inits_val = tf.train.batch(
            [image_val, shape_val, initial_shape_val],
            FLAGS.batch_size,
            dynamic_pad=False,
            capacity=1000,
            enqueue_many=False,
            num_threads=num_preprocess_threads,
            name='val_img_batch')

        print('Defining model...')
        with tf.device(FLAGS.train_device):
            # Retain the summaries from the final tower.
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, "")

            with tf.variable_scope("scopernn") as scopernn:
                predictions, dxs, _ = models.model(images, inits,
                                                   is_training=True)
                # The validation model shares the training variables.
                scopernn.reuse_variables()
                predictions_val, dxs_val, _ = models.model(
                    images_val, inits_val, is_training=False)

            total_loss_train = 0
            total_loss_val = 0
            list_train_loss, list_val_loss = [], []
            # One weight per entry of dxs / dxs_val.
            loss_weights = [1, 1, 1, 1]

            with tf.name_scope("Error_train"):
                for i, dx in enumerate(dxs):
                    loss_norm, loss = models.normalized_rmse(
                        inits, dx, lms, delta_mean, delta_std)
                    tf.histogram_summary('errors', loss)
                    list_train_loss.append(loss)
                    total_loss_train += loss_norm * loss_weights[i]
                    summaries.append(tf.scalar_summary(
                        'losses_train/step_{}'.format(i), loss))

            with tf.name_scope("Error_val"):
                for i, dx in enumerate(dxs_val):
                    loss_norm_val, loss_val = models.normalized_rmse(
                        inits_val, dx, lms_val, delta_mean, delta_std)
                    tf.histogram_summary('errors', loss_val)
                    list_val_loss.append(loss_val)
                    total_loss_val += loss_norm_val * loss_weights[i]
                    summaries.append(tf.scalar_summary(
                        'losses_val/step_{}'.format(i), loss_val))

            # Calculate the gradients for the batch of data
            grads = opt.compute_gradients(total_loss_train)

        summaries.append(tf.scalar_summary('losses/total_train',
                                           total_loss_train))
        summaries.append(tf.scalar_summary('losses/total_val',
                                           total_loss_val))

        gt_images_val, = tf.py_func(utils.batch_draw_landmarks_green,
                                    [images_val, lms_val], [tf.float32],
                                    name="gt_img_visu")
        init_images_val, = tf.py_func(utils.batch_draw_landmarks_red,
                                      [images_val, inits_val], [tf.float32],
                                      name="init_img_visu")
        pred_images_val, = tf.py_func(utils.batch_draw_landmarks_green,
                                      [images_val, predictions_val],
                                      [tf.float32],
                                      name="pred_img_visu")

        summary = tf.image_summary(
            'images_val',
            tf.concat(2, [gt_images_val, init_images_val, pred_images_val]),
            max_images=8)

        summaries.append(tf.histogram_summary('dx_train',
                                              predictions - inits))
        summaries.append(tf.histogram_summary('dx_val',
                                              predictions_val - inits_val))

        summaries.append(summary)

        batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                                              "")

        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(tf.histogram_summary(
                    var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads,
                                                global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated then need be but we
        # employ this for backward-compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates to into a single train op.
        # NOTE: Currently we are not using batchnorm in MDM.
        # The train op, saver and merged summary are never run below. They
        # are still built so the graph holds the same ops and variables
        # (including the moving-average variables) as before.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()
        print('Initializing variables...')
        sess.run(init)
        print('Initialized variables.')

        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        #################
        # APP
        #################
        cap = cv2.VideoCapture(0)
        mode = 0  # 0: detect a face; 1: track from the previous prediction
        init_shape = []
        print('\n\nPRESS q/Q to QUIT\n')

        try:
            while True:
                # Capture frame-by-frame
                ret, frame = cap.read()
                if ret is not True:
                    break

                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

                # detect face with Haar cascade
                if mode == 0:
                    faces = face_cascade.detectMultiScale(gray, 1.1, 3)
                    if len(faces) == 0:
                        continue

                # Face detection successful. Start tracking!
                if mode == 0:
                    init_shape = compute_init_shape(mean_landmarks, faces)
                else:
                    # Need to realign init_shape with mean shape
                    a, b, tx, ty = utils.CalcSimT(
                        np.ravel(mean_landmarks, order='F'),
                        init_shape.ravel('F'))
                    init_shape = utils.SimT(
                        np.ravel(mean_landmarks, order='F'), a, b, tx, ty)
                init_shape = np.reshape(init_shape, (5, 2), order='F')

                # Affine map between frame and crop coordinates, derived from
                # the first two landmarks (named as the left and right eye).
                leyex, leyey = init_shape[0]
                reyex, reyey = init_shape[1]
                ax, bx = (44. / (reyex - leyex),
                          44. * (1 - leyex / (reyex - leyex)))
                ay, by = (44. / (reyex - leyex),
                          44. * (1 - leyey / (reyex - leyex)))

                # Format image to 128 x 128 and rescale the init shape
                gray_cropped, init_cropped = format_img(gray,
                                                        init_shape.copy())
                gray_cropped = gray_cropped.reshape(
                    (1, 128, 128, 1)).astype(np.float32)
                init_cropped = init_cropped.reshape((1, 5, 2))

                # Feeding the batch tensors bypasses the input queues.
                preds = sess.run(predictions_val,
                                 feed_dict={images_val: gray_cropped,
                                            inits_val: init_cropped})
                preds = preds[0]

                # Convert preds to the big image scale
                preds[:, 0] = (preds[:, 0] - bx) / (ax)
                preds[:, 1] = (preds[:, 1] - by) / (ay)

                mode = 1

                # NOTE(review): mode was set to 1 just above, so the else
                # branch below cannot run. Frames without a detected face are
                # skipped by the `continue` above and are never displayed.
                if mode != 0:
                    generate_overlay(frame, preds, init_shape)
                    cv2.imshow('Face Tracker (q/Q: Quit)', frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
                    # for next frame
                    init_shape = preds
                else:
                    cv2.imshow('Face Tracker (q/Q: Quit)', frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
        finally:
            # When everything is done (or an error occurred), release the
            # capture and close the windows.
            cap.release()
            cv2.destroyAllWindows()
def _make_graph(self):
    """Build the multi-GPU training graph and return its train op.

    One tower is built per GPU (``self.cfg.nr_gpus``). Each tower calls
    ``self.net.make_network(is_train=True)`` inside slim arg scopes that set
    the initializers and regularizers, records the network inputs in
    ``self._input_list`` and computes gradients with ``self._optimizer``.
    Variables are reused by every tower after the first one is built.

    Only the last tower asks for the loss with ``include_wd=True``. Update
    ops (``tf.GraphKeys.UPDATE_OPS``) are collected from the first tower's
    name scope only. Bias gradients are doubled when
    ``self.cfg.double_bias`` is set.

    Returns:
        An op grouping the gradient application with the update ops of the
        first tower.
    """
    self.logger.info("Generating training graph on {} GPUs ...".format(
        self.cfg.nr_gpus))

    weights_initializer = slim.xavier_initializer()
    biases_initializer = tf.constant_initializer(0.)
    biases_regularizer = tf.no_regularizer
    weights_regularizer = tf.contrib.layers.l2_regularizer(
        self.cfg.weight_decay)

    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope()):
        for i in range(self.cfg.nr_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('tower_%d' % i) as name_scope:
                    # Force all Variables to reside on the CPU.
                    with slim.arg_scope(
                            [slim.model_variable, slim.variable],
                            device='/device:CPU:0'):
                        with slim.arg_scope(
                                [slim.conv2d, slim.conv2d_in_plane,
                                 slim.conv2d_transpose,
                                 slim.separable_conv2d,
                                 slim.fully_connected],
                                weights_regularizer=weights_regularizer,
                                biases_regularizer=biases_regularizer,
                                weights_initializer=weights_initializer,
                                biases_initializer=biases_initializer):
                            # loss over single GPU
                            self.net.make_network(is_train=True)
                            # Only the last tower requests include_wd=True.
                            if i == self.cfg.nr_gpus - 1:
                                loss = self.net.get_loss(include_wd=True)
                            else:
                                loss = self.net.get_loss()
                            self._input_list.append(self.net.get_inputs())

                    # Every later tower shares the variables created above.
                    tf.get_variable_scope().reuse_variables()

                    if i == 0:
                        if self.cfg.nr_gpus > 1 and self.cfg.bn_train is True:
                            self.logger.warning(
                                "BN is calculated only on single GPU.")
                        # Update ops are taken from the first tower only.
                        extra_update_ops = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS, name_scope)
                        with tf.control_dependencies(extra_update_ops):
                            grads = self._optimizer.compute_gradients(loss)
                    else:
                        grads = self._optimizer.compute_gradients(loss)

                    final_grads = []
                    with tf.variable_scope('Gradient_Mult'):
                        for grad, var in grads:
                            scale = 1.
                            if self.cfg.double_bias and '/biases:' in var.name:
                                scale *= 2.
                            # Skip the multiply op when nothing is scaled.
                            if not np.allclose(scale, 1.):
                                grad = tf.multiply(grad, scale)
                            final_grads.append((grad, var))
                    tower_grads.append(final_grads)

    if len(tower_grads) > 1:
        grads = sum_gradients(tower_grads)
    else:
        grads = tower_grads[0]

    # The exponential-moving-average variant that used to sit behind an
    # `if False:` guard could never run and has been removed.
    apply_gradient_op = self._optimizer.apply_gradients(grads)
    train_op = tf.group(apply_gradient_op, *extra_update_ops)

    return train_op
def train(scope=''):
    """Train on dataset for a number of steps.

    Builds the input pipeline, the model, the per-step losses, summaries,
    the Adam optimizer and the moving-average ops inside a fresh graph, then
    runs ``FLAGS.max_steps`` training steps in a session.

    Every 10 steps the loss and throughput are printed and the merged
    summaries are written to ``FLAGS.train_dir``. Every 50 steps, and on the
    last step, a checkpoint named ``model.ckpt`` is saved in the same
    directory.

    Args:
        scope: Scope filter passed to ``tf.get_collection`` when reading the
            summaries and batch-norm update collections.

    Raises:
        AssertionError: If the loss becomes NaN, or if
            ``FLAGS.pretrained_model_checkpoint_path`` is set but does not
            exist.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Step counter. It is incremented by opt.apply_gradients below and
        # drives the learning-rate schedule and the moving-average decay.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # FLAGS.datasets is a ':'-separated list.
        train_dirs = FLAGS.datasets.split(':')

        # Calculate the learning rate schedule.
        num_batches_per_epoch = 100
        num_epochs_per_decay = 5
        decay_steps = int(num_batches_per_epoch * num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create the optimizer (Adam) driven by the decayed learning rate.
        opt = tf.train.AdamOptimizer(lr)

        # Number of threads used by the batching queue below.
        num_preprocess_threads = FLAGS.num_preprocess_threads

        _images, _shapes, _reference_shape, pca_model = \
            data_provider.load_images(train_dirs)

        reference_shape = tf.constant(_reference_shape,
                                      dtype=tf.float32,
                                      name='reference_shape')

        # Static shapes taken from the first sample; they are re-attached to
        # the py_func outputs further down.
        image_shape = _images[0].shape
        lms_shape = _shapes[0].points.shape

        def get_random_sample(rotation_stddev=10):
            # Pick a random sample and augment it: mirror with probability
            # 0.5, then rotate with probability 0.5 by an angle drawn from a
            # normal distribution with scale rotation_stddev.
            idx = np.random.randint(low=0, high=len(_images))
            # Stored images are transposed to channel-first for menpo.
            im = menpo.image.Image(_images[idx].transpose(2, 0, 1), copy=False)
            lms = _shapes[idx]
            im.landmarks['PTS'] = lms

            if np.random.rand() < .5:
                im = utils.mirror_image(im)

            if np.random.rand() < .5:
                theta = np.random.normal(scale=rotation_stddev)
                rot = menpo.transform.rotate_ccw_about_centre(lms, theta)
                im = im.warp_to_shape(im.shape, rot)

            # Back to channel-last float32 for TensorFlow.
            pixels = im.pixels.transpose(1, 2, 0).astype('float32')
            shape = im.landmarks['PTS'].lms.points.astype('float32')
            return pixels, shape

        image, shape = tf.py_func(get_random_sample, [],
                                  [tf.float32, tf.float32])

        initial_shape = data_provider.random_shape(shape, reference_shape,
                                                   pca_model)
        # py_func outputs carry no static shape, and tf.train.batch is called
        # with dynamic_pad=False, so the shapes are set explicitly.
        image.set_shape(image_shape)
        shape.set_shape(lms_shape)
        initial_shape.set_shape(lms_shape)

        image = data_provider.distort_color(image)

        images, lms, inits = tf.train.batch([image, shape, initial_shape],
                                            FLAGS.batch_size,
                                            dynamic_pad=False,
                                            capacity=5000,
                                            enqueue_many=False,
                                            num_threads=num_preprocess_threads,
                                            name='batch')

        print('Defining model...')
        with tf.device(FLAGS.train_device):
            # Retain the summaries from the final tower.
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

            predictions, dxs, _ = mdm_model.model(images, inits)

            total_loss = 0

            # One loss term per entry of dxs; the total is their plain sum.
            for i, dx in enumerate(dxs):
                norm_error = mdm_model.normalized_rmse(dx + inits, lms)
                tf.histogram_summary('errors', norm_error)
                loss = tf.reduce_mean(norm_error)
                total_loss += loss
                summaries.append(tf.scalar_summary('losses/step_{}'.format(i),
                                                   loss))

            # Calculate the gradients for the batch of data
            grads = opt.compute_gradients(total_loss)

        summaries.append(tf.scalar_summary('losses/total', total_loss))

        # Image summary: ground truth and prediction drawings concatenated
        # along axis 2.
        pred_images, = tf.py_func(utils.batch_draw_landmarks,
                                  [images, predictions], [tf.float32])
        gt_images, = tf.py_func(utils.batch_draw_landmarks, [images, lms],
                                [tf.float32])

        summary = tf.image_summary('images',
                                   tf.concat(2, [gt_images, pred_images]),
                                   max_images=5)
        summaries.append(tf.histogram_summary('dx', predictions - inits))

        summaries.append(summary)

        batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                                              scope)

        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(tf.histogram_summary(var.op.name +
                                                      '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated then need be but we
        # employ this for backward-compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates to into a single train op.
        # NOTE: Currently we are not using batchnorm in MDM.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()
        print('Initializing variables...')
        sess.run(init)
        print('Initialized variables.')

        # Optional warm start. It runs after init, so restored values
        # overwrite the fresh initialization.
        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

        print('Starting training...')
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, total_loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            # Progress line every 10 steps.
            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / float(duration)
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, duration))

            # Summaries every 10 steps. This is a separate sess.run call on
            # summary_op, not part of the training step above.
            if step % 10 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 50 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def __init__(self):
    """
    Initialize the bisenetv2 trainer.

    Reads the solver and training settings from ``CFG``, opens a TensorFlow
    session and builds the training graph: input batch, model loss and
    prediction, optional mIoU metric, learning-rate schedule with optional
    warm-up, moving-average op, train op, saver/loader and summaries.

    Side effects visible in this method: an existing tensorboard directory
    for this model is deleted and recreated, and the configuration is dumped
    into it as a JSON file.

    Raises:
        ValueError: If ``CFG.SOLVER.OPTIMIZER`` is neither ``sgd`` nor
            ``adam`` (case-insensitive).
    """
    # define solver params and dataset
    self._carla_io = carla_tf_io.CarlaTfIO()
    self._train_dataset = self._carla_io.train_dataset_reader
    self._steps_per_epoch = len(self._train_dataset)

    self._model_name = CFG.MODEL.MODEL_NAME

    self._train_epoch_nums = CFG.TRAIN.EPOCH_NUMS
    self._batch_size = CFG.TRAIN.BATCH_SIZE
    self._snapshot_epoch = CFG.TRAIN.SNAPSHOT_EPOCH
    # NOTE(review): `ops` is presumably an alias of os.path - confirm
    # against the file's imports.
    self._model_save_dir = ops.join(CFG.TRAIN.MODEL_SAVE_DIR, self._model_name)
    self._tboard_save_dir = ops.join(CFG.TRAIN.TBOARD_SAVE_DIR, self._model_name)
    self._enable_miou = CFG.TRAIN.COMPUTE_MIOU.ENABLE
    if self._enable_miou:
        self._record_miou_epoch = CFG.TRAIN.COMPUTE_MIOU.EPOCH
    # Half of each configured crop dimension.
    self._input_tensor_size = [
        int(tmp / 2) for tmp in CFG.AUG.TRAIN_CROP_SIZE
    ]

    self._init_learning_rate = CFG.SOLVER.LR
    self._moving_ave_decay = CFG.SOLVER.MOVING_AVE_DECAY
    self._momentum = CFG.SOLVER.MOMENTUM
    self._lr_polynimal_decay_power = CFG.SOLVER.LR_POLYNOMIAL_POWER
    self._optimizer_mode = CFG.SOLVER.OPTIMIZER.lower()

    if CFG.TRAIN.RESTORE_FROM_SNAPSHOT.ENABLE:
        self._initial_weight = CFG.TRAIN.RESTORE_FROM_SNAPSHOT.SNAPSHOT_PATH
    else:
        self._initial_weight = None
    if CFG.TRAIN.WARM_UP.ENABLE:
        self._warmup_epoches = CFG.TRAIN.WARM_UP.EPOCH_NUMS
        self._warmup_init_learning_rate = self._init_learning_rate / 1000.0
    else:
        # Zero warm-up epochs give warmup_steps == 0 below. The step counter
        # starts at 1.0, so the warm-up branch of the schedule is never
        # selected.
        self._warmup_epoches = 0

    # define tensorflow session
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.GPU.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.GPU.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'
    self._sess = tf.Session(config=sess_config)

    # define graph input tensor
    with tf.variable_scope(name_or_scope='graph_input_node'):
        self._input_src_image, self._input_label_image = self._train_dataset.next_batch(
            batch_size=self._batch_size)

    # define model loss
    self._model = bisenet_v2.BiseNetV2(phase='train', cfg=CFG)
    loss_set = self._model.compute_loss(
        input_tensor=self._input_src_image,
        label_tensor=self._input_label_image,
        name='BiseNetV2',
        reuse=False)
    # Same scope name with reuse=True, so the prediction shares the
    # variables created for the loss.
    self._prediciton = self._model.inference(
        input_tensor=self._input_src_image,
        name='BiseNetV2',
        reuse=True)
    self._loss = loss_set['total_loss']
    self._l2_loss = loss_set['l2_loss']

    # define miou
    if self._enable_miou:
        with tf.variable_scope('miou'):
            # Flatten predictions and labels.
            pred = tf.reshape(self._prediciton, [-1, ])
            gt = tf.reshape(self._input_label_image, [-1, ])
            # Keep only positions whose label is a valid class id
            # (<= NUM_CLASSES - 1); larger label values are dropped.
            indices = tf.squeeze(
                tf.where(tf.less_equal(gt, CFG.DATASET.NUM_CLASSES - 1)), 1)
            gt = tf.gather(gt, indices)
            pred = tf.gather(pred, indices)
            self._miou, self._miou_update_op = tf.metrics.mean_iou(
                labels=gt,
                predictions=pred,
                num_classes=CFG.DATASET.NUM_CLASSES)

    # define learning rate
    with tf.variable_scope('learning_rate'):
        # The step counter is a float variable that starts at 1.0.
        self._global_step = tf.Variable(1.0,
                                        dtype=tf.float32,
                                        trainable=False,
                                        name='global_step')
        warmup_steps = tf.constant(self._warmup_epoches * self._steps_per_epoch,
                                   dtype=tf.float32,
                                   name='warmup_steps')
        train_steps = tf.constant(self._train_epoch_nums * self._steps_per_epoch,
                                  dtype=tf.float32,
                                  name='train_steps')
        # Warm-up schedule before warmup_steps, polynomial decay afterwards.
        self._learn_rate = tf.cond(
            pred=self._global_step < warmup_steps,
            true_fn=lambda: self._compute_warmup_lr(
                warmup_steps=warmup_steps, name='warmup_lr'),
            false_fn=lambda: tf.train.polynomial_decay(
                learning_rate=self._init_learning_rate,
                global_step=self._global_step,
                decay_steps=train_steps,
                end_learning_rate=0.000001,
                power=self._lr_polynimal_decay_power))
        self._learn_rate = tf.identity(self._learn_rate, 'lr')
        global_step_update = tf.assign_add(self._global_step, 1.0)

    # define moving average op
    with tf.variable_scope(name_or_scope='moving_avg'):
        if CFG.TRAIN.FREEZE_BN.ENABLE:
            # With frozen BN, variables whose name contains 'beta' or
            # 'gamma' are left out.
            train_var_list = [
                v for v in tf.trainable_variables()
                if 'beta' not in v.name and 'gamma' not in v.name
            ]
        else:
            train_var_list = tf.trainable_variables()
        moving_ave_op = tf.train.ExponentialMovingAverage(
            self._moving_ave_decay).apply(train_var_list +
                                          tf.moving_average_variables())

    # define training op
    with tf.variable_scope(name_or_scope='train_step'):
        # Same variable selection as in the moving-average scope above.
        if CFG.TRAIN.FREEZE_BN.ENABLE:
            train_var_list = [
                v for v in tf.trainable_variables()
                if 'beta' not in v.name and 'gamma' not in v.name
            ]
        else:
            train_var_list = tf.trainable_variables()
        if self._optimizer_mode == 'sgd':
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=self._learn_rate, momentum=self._momentum)
        elif self._optimizer_mode == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self._learn_rate, )
        else:
            raise ValueError('Not support optimizer: {:s}'.format(
                self._optimizer_mode))
        optimize_op = optimizer.minimize(self._loss, var_list=train_var_list)
        # The train op is a no-op that depends on the UPDATE_OPS collection,
        # the optimizer step, the step increment and the moving-average
        # update, so running it runs all of them.
        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            with tf.control_dependencies([optimize_op, global_step_update]):
                with tf.control_dependencies([moving_ave_op]):
                    self._train_op = tf.no_op()

    # define saver and loader
    with tf.variable_scope('loader_and_saver'):
        # The loader skips every global variable whose name contains 'lr'.
        # NOTE(review): this is a substring match, so it also drops any other
        # variable that merely has 'lr' in its name - confirm that is
        # intended.
        self._net_var = [
            vv for vv in tf.global_variables() if 'lr' not in vv.name
        ]
        self._loader = tf.train.Saver(self._net_var)
        self._saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

    # define summary
    with tf.variable_scope('summary'):
        summary_merge_list = [
            tf.summary.scalar("learn_rate", self._learn_rate),
            tf.summary.scalar("total", self._loss),
            tf.summary.scalar('l2_loss', self._l2_loss)
        ]
        if self._enable_miou:
            # The mIoU summaries are created under a control dependency on
            # the metric's update op.
            with tf.control_dependencies([self._miou_update_op]):
                summary_merge_list_with_miou = [
                    tf.summary.scalar("learn_rate", self._learn_rate),
                    tf.summary.scalar("total", self._loss),
                    tf.summary.scalar('l2_loss', self._l2_loss),
                    tf.summary.scalar('miou', self._miou)
                ]
                self._write_summary_op_with_miou = tf.summary.merge(
                    summary_merge_list_with_miou)
        # Start from an empty tensorboard directory: an existing one is
        # deleted, including whatever it contained.
        if ops.exists(self._tboard_save_dir):
            shutil.rmtree(self._tboard_save_dir)
        os.makedirs(self._tboard_save_dir, exist_ok=True)
        # Store the configuration next to the event files.
        model_params_file_save_path = ops.join(
            self._tboard_save_dir, CFG.TRAIN.MODEL_PARAMS_CONFIG_FILE_NAME)
        with open(model_params_file_save_path, 'w', encoding='utf-8') as f_obj:
            CFG.dump_to_json_file(f_obj)
        self._write_summary_op = tf.summary.merge(summary_merge_list)
        self._summary_writer = tf.summary.FileWriter(
            self._tboard_save_dir, graph=self._sess.graph)

    LOG.info('Initialize carla bisenetv2 trainner complete')
def test_restore_ema(self):
    """Check that EMA shadow values can be restored into the raw variables.

    A linear model ``y = W * x + b`` is fitted while an
    ``ExponentialMovingAverage`` tracks ``W`` and ``b``. The model is saved,
    the graph is re-imported from the meta file, and a saver that maps each
    ``<var>/ExponentialMovingAverage`` checkpoint entry onto ``<var>`` is
    used for the restore. The restored ``W`` and ``b`` must then equal the
    shadow values read before saving.
    """
    # Create 100 phony x, y data points in NumPy, y = x * 0.1 + 0.3
    x_data = np.random.rand(100).astype(np.float32)
    y_data = x_data * 0.1 + 0.3

    # Try to find values for W and b that compute y_data = W * x_data + b
    # (We know that W should be 0.1 and b 0.3, but TensorFlow will
    # figure that out for us.)
    W = tf.Variable(tf.random_uniform([1], -1.0, 1.0), name='W')
    b = tf.Variable(tf.zeros([1]), name='b')
    y = W * x_data + b

    # Minimize the mean squared errors.
    loss = tf.reduce_mean(tf.square(y - y_data))
    optimizer = tf.train.GradientDescentOptimizer(0.5)
    opt_op = optimizer.minimize(loss)

    # Track the moving averages of all trainable variables.
    ema = tf.train.ExponentialMovingAverage(decay=0.9999)
    averages_op = ema.apply(tf.trainable_variables())
    with tf.control_dependencies([opt_op]):
        train_op = tf.group(averages_op)

    # Before starting, initialize the variables. We will 'run' this first.
    init = tf.global_variables_initializer()

    # The shadow variables are not trainable, so a saver limited to
    # tf.trainable_variables() would leave them out of the checkpoint and
    # the name-mapped restore below would find nothing to read.
    saver = tf.train.Saver(tf.global_variables())

    checkpoint_path = os.path.join(self.tmp_dir, "model_ex1")

    # Launch the graph. The context manager closes the session even when a
    # step fails.
    with tf.Session() as sess:
        sess.run(init)

        # Fit the line.
        for _ in range(201):
            sess.run(train_op)

        w_reference = sess.run('W/ExponentialMovingAverage:0')
        b_reference = sess.run('b/ExponentialMovingAverage:0')

        saver.save(sess, checkpoint_path)

    # Resetting the default graph is only legal outside the session's
    # context, which is why the block above is closed first.
    tf.reset_default_graph()

    tf.train.import_meta_graph(checkpoint_path + ".meta")

    with tf.Session() as sess:
        print('------------------------------------------------------')
        for var in tf.global_variables():
            print('all variables: ' + var.op.name)
        for var in tf.trainable_variables():
            print('normal variable: ' + var.op.name)
        for var in tf.moving_average_variables():
            print('ema variable: ' + var.op.name)
        print('------------------------------------------------------')

        # Two equivalent ways of building the checkpoint-name -> variable
        # map; mode selects which one is exercised.
        mode = 1
        restore_vars = {}
        if mode == 0:
            ema = tf.train.ExponentialMovingAverage(1.0)
            for var in tf.trainable_variables():
                print('%s: %s' % (ema.average_name(var), var.op.name))
                restore_vars[ema.average_name(var)] = var
        elif mode == 1:
            for var in tf.trainable_variables():
                ema_name = var.op.name + '/ExponentialMovingAverage'
                print('%s: %s' % (ema_name, var.op.name))
                restore_vars[ema_name] = var

        saver = tf.train.Saver(restore_vars, name='ema_restore')

        saver.restore(sess, checkpoint_path)

        w_restored = sess.run('W:0')
        b_restored = sess.run('b:0')

    # W and b hold a single element each; compare plain floats so that a
    # mismatch produces a normal assertion failure. The message must be
    # passed as msg=, because the third positional parameter of
    # assertAlmostEqual is `places`.
    self.assertAlmostEqual(
        float(w_reference[0]), float(w_restored[0]),
        msg='Restored model modes not use the EMA filtered weight')
    self.assertAlmostEqual(
        float(b_reference[0]), float(b_restored[0]),
        msg='Restored model modes not use the EMA filtered bias')
def train_shadownet_multi_gpu(dataset_dir, weights_path, char_dict_path,
                              ord_map_dict_path):
    """Build a multi-tower ShadowNet training graph and run the training loop.

    One training tower and one validation tower are created for every index
    in ``range(CFG.TRAIN.GPU_NUM)``.  Tower gradients are combined with
    ``average_gradients`` and applied together with an exponential moving
    average update and the update ops collected from the first tower.
    Summaries are written to ``tboard/crnn_syn90k_multi_gpu`` and checkpoints
    to ``model/crnn_syn90k_multi_gpu`` (every 5000 iterations).

    :param dataset_dir: forwarded to ``CrnnDataFeeder`` as ``dataset_dir``
    :param weights_path: checkpoint to restore from; ``None`` trains from
        scratch with freshly initialized variables
    :param char_dict_path: forwarded to ``CrnnDataFeeder``
    :param ord_map_dict_path: forwarded to ``CrnnDataFeeder``
    :return: None
    :raises ValueError: if the averaged training loss becomes NaN
    """
    # prepare dataset information
    train_dataset = shadownet_data_feed_pipline.CrnnDataFeeder(
        dataset_dir=dataset_dir,
        char_dict_path=char_dict_path,
        ord_map_dict_path=ord_map_dict_path,
        flags='train')
    val_dataset = shadownet_data_feed_pipline.CrnnDataFeeder(
        dataset_dir=dataset_dir,
        char_dict_path=char_dict_path,
        ord_map_dict_path=ord_map_dict_path,
        flags='val')

    train_images, train_labels, train_images_paths = train_dataset.inputs(
        batch_size=CFG.TRAIN.BATCH_SIZE)
    val_images, val_labels, val_images_paths = val_dataset.inputs(
        batch_size=CFG.TRAIN.BATCH_SIZE)

    # set crnn net (one instance per phase; both are reused by every tower)
    shadownet = crnn_net.ShadowNet(phase='train',
                                   hidden_nums=CFG.ARCH.HIDDEN_UNITS,
                                   layers_nums=CFG.ARCH.HIDDEN_LAYERS,
                                   num_classes=CFG.ARCH.NUM_CLASSES)
    shadownet_val = crnn_net.ShadowNet(phase='test',
                                       hidden_nums=CFG.ARCH.HIDDEN_UNITS,
                                       layers_nums=CFG.ARCH.HIDDEN_LAYERS,
                                       num_classes=CFG.ARCH.NUM_CLASSES)

    # set average container
    tower_grads = []
    train_tower_loss = []
    val_tower_loss = []
    batchnorm_updates = None
    train_summary_op_updates = None

    # set lr
    global_step = tf.Variable(0, name='global_step', trainable=False)
    learning_rate = tf.train.exponential_decay(
        learning_rate=CFG.TRAIN.LEARNING_RATE,
        global_step=global_step,
        decay_steps=CFG.TRAIN.LR_DECAY_STEPS,
        decay_rate=CFG.TRAIN.LR_DECAY_RATE,
        staircase=CFG.TRAIN.LR_STAIRCASE)

    # set up optimizer
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=0.9)

    # set distributed train op
    with tf.variable_scope(tf.get_variable_scope()):
        # False only for the very first compute_net_gradients call; every
        # later call (including the first validation tower) receives True.
        is_network_initialized = False
        for i in range(CFG.TRAIN.GPU_NUM):
            with tf.device('/gpu:{:d}'.format(i)):
                with tf.name_scope('tower_{:d}'.format(i)) as _:
                    train_loss, grads = compute_net_gradients(
                        train_images,
                        train_labels,
                        shadownet,
                        optimizer,
                        is_net_first_initialized=is_network_initialized)

                    is_network_initialized = True

                    # Only use the mean and var in the first gpu tower to update the parameter
                    # TODO: implement batch normalization for distributed devices
                    if i == 0:
                        # Snapshot the collections right after tower 0 is
                        # built, before later towers add to them.
                        batchnorm_updates = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS)
                        train_summary_op_updates = tf.get_collection(
                            tf.GraphKeys.SUMMARIES)

                    tower_grads.append(grads)
                    train_tower_loss.append(train_loss)
                with tf.name_scope('validation_{:d}'.format(i)) as _:
                    # Gradients of the validation tower are discarded; only
                    # its loss is tracked.
                    val_loss, _ = compute_net_gradients(
                        val_images,
                        val_labels,
                        shadownet_val,
                        optimizer,
                        is_net_first_initialized=is_network_initialized)
                    val_tower_loss.append(val_loss)

    grads = average_gradients(tower_grads)
    avg_train_loss = tf.reduce_mean(train_tower_loss)
    avg_val_loss = tf.reduce_mean(val_tower_loss)

    # Track the moving averages of all trainable variables
    variable_averages = tf.train.ExponentialMovingAverage(
        CFG.TRAIN.MOVING_AVERAGE_DECAY, num_updates=global_step)
    variables_to_average = tf.trainable_variables(
    ) + tf.moving_average_variables()
    variables_averages_op = variable_averages.apply(variables_to_average)

    # Group all the op needed for training
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    apply_gradient_op = optimizer.apply_gradients(grads,
                                                  global_step=global_step)
    train_op = tf.group(apply_gradient_op, variables_averages_op,
                        batchnorm_updates_op)

    # set tensorflow summary
    tboard_save_path = 'tboard/crnn_syn90k_multi_gpu'
    os.makedirs(tboard_save_path, exist_ok=True)

    summary_writer = tf.summary.FileWriter(tboard_save_path)

    avg_train_loss_scalar = tf.summary.scalar(name='average_train_loss',
                                              tensor=avg_train_loss)
    avg_val_loss_scalar = tf.summary.scalar(name='average_val_loss',
                                            tensor=avg_val_loss)
    learning_rate_scalar = tf.summary.scalar(name='learning_rate_scalar',
                                             tensor=learning_rate)
    train_merge_summary_op = tf.summary.merge(
        [avg_train_loss_scalar, learning_rate_scalar] +
        train_summary_op_updates)
    val_merge_summary_op = tf.summary.merge([avg_val_loss_scalar])

    # set tensorflow saver
    saver = tf.train.Saver()
    model_save_dir = 'model/crnn_syn90k_multi_gpu'
    os.makedirs(model_save_dir, exist_ok=True)
    # The start time is embedded in the checkpoint name.
    train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S',
                                     time.localtime(time.time()))
    model_name = 'shadownet_{:s}.ckpt'.format(str(train_start_time))
    model_save_path = ops.join(model_save_dir, model_name)

    # set sess config
    sess_config = tf.ConfigProto(device_count={'GPU': CFG.TRAIN.GPU_NUM},
                                 allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TRAIN.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.TRAIN.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'

    # Set the training parameters
    train_epochs = CFG.TRAIN.EPOCHS

    logger.info('Global configuration is as follows:')
    logger.info(CFG)

    sess = tf.Session(config=sess_config)

    summary_writer.add_graph(sess.graph)

    with sess.as_default():
        tf.train.write_graph(
            graph_or_graph_def=sess.graph,
            logdir='',
            name='{:s}/shadownet_model.pb'.format(model_save_dir))

        if weights_path is None:
            logger.info('Training from scratch')
            init = tf.global_variables_initializer()
            sess.run(init)
        else:
            logger.info('Restore model from last model checkpoint {:s}'.format(
                weights_path))
            saver.restore(sess=sess, save_path=weights_path)

        # Per-iteration wall-clock times; cleared each time they are logged.
        train_cost_time_mean = []
        val_cost_time_mean = []

        # NOTE: each "epoch" of this loop is a single sess.run of train_op
        # followed by a single validation run.
        for epoch in range(train_epochs):

            # training part
            t_start = time.time()

            _, train_loss_value, train_summary, lr = \
                sess.run(fetches=[train_op,
                                  avg_train_loss,
                                  train_merge_summary_op,
                                  learning_rate])

            if math.isnan(train_loss_value):
                raise ValueError('Train loss is nan')

            cost_time = time.time() - t_start
            train_cost_time_mean.append(cost_time)

            summary_writer.add_summary(summary=train_summary,
                                       global_step=epoch)

            # validation part
            t_start_val = time.time()

            val_loss_value, val_summary = \
                sess.run(fetches=[avg_val_loss,
                                  val_merge_summary_op])

            summary_writer.add_summary(val_summary, global_step=epoch)

            cost_time_val = time.time() - t_start_val
            val_cost_time_mean.append(cost_time_val)

            if epoch % CFG.TRAIN.DISPLAY_STEP == 0:
                logger.info('Epoch_Train: {:d} total_loss= {:6f} '
                            'lr= {:6f} mean_cost_time= {:5f}s '.format(
                                epoch + 1, train_loss_value, lr,
                                np.mean(train_cost_time_mean)))
                train_cost_time_mean.clear()

            if epoch % CFG.TRAIN.VAL_DISPLAY_STEP == 0:
                logger.info('Epoch_Val: {:d} total_loss= {:6f} '
                            ' mean_cost_time= {:5f}s '.format(
                                epoch + 1, val_loss_value,
                                np.mean(val_cost_time_mean)))
                val_cost_time_mean.clear()

            if epoch % 5000 == 0:
                saver.save(sess=sess,
                           save_path=model_save_path,
                           global_step=epoch)
    sess.close()

    return
def train(retrain=False, retrain_list=None):
    """Train CIFAR-10 for a number of steps.

    Args:
      retrain: when true, ``cifar10.train`` is called with an explicit list
        of scope names as a third argument; otherwise it is called with the
        loss and global step only.
      retrain_list: top-level scope names (the part of a variable name before
        the first ``/``).  When ``FLAGS.retrain`` is set, variables in these
        scopes are freshly initialized and all other global variables are
        restored from the latest checkpoint in ``FLAGS.checkpoint_dir``.

    Returns:
      None.  Returns early (after printing a message) when ``FLAGS.retrain``
      is set and no checkpoint is found.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        if not retrain:
            train_op = cifar10.train(loss, global_step)
        else:
            # NOTE(review): `retrain_count` is not defined in this function;
            # presumably a module-level name - confirm it exists.
            if retrain_count == 1:
                train_op = cifar10.train(loss, global_step, ["softmax_linear"])
            else:
                train_op = cifar10.train(loss, global_step,
                                         ["softmax_linear", "local4"])

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        ### RETRAINING START
        # NOTE(review): this branch is keyed on FLAGS.retrain while the
        # train-op selection above is keyed on the `retrain` argument -
        # confirm the two are always kept in sync by the caller.
        if FLAGS.retrain:
            if FLAGS.debug:
                print(
                    "GLOBAL ============================================================================="
                )
                for v in tf.all_variables():
                    print(v.name)
                print(
                    "TRAINABLE ============================================================================="
                )
                for v in tf.trainable_variables():
                    print(v.name)
                print(
                    "MOVING AVERAGES ============================================================================="
                )
                for v in tf.moving_average_variables():
                    print(v.name)

            # Split the global variables by their top-level scope name.
            # NOTE(review): `retrain_list` defaults to None, which makes the
            # membership tests below raise TypeError - callers must pass a
            # list whenever FLAGS.retrain is set.
            variables_to_restore = [
                v for v in tf.global_variables()
                if not v.name.split('/')[0] in retrain_list
            ]
            variables_to_initialize = [
                v for v in tf.global_variables()
                if v.name.split('/')[0] in retrain_list
            ]

            if FLAGS.debug:
                print(
                    "RESTORE ============================================================================="
                )
                for v in variables_to_restore:
                    print(v.name)
                print(
                    "INITIALIZE ============================================================================="
                )
                for v in variables_to_initialize:
                    print(v.name)

            saver_retrain = tf.train.Saver(variables_to_restore)
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if not (ckpt and ckpt.model_checkpoint_path):
                print('Yikes! No checkpoint file found at %s to retrain :-(' %
                      (FLAGS.checkpoint_dir))
                return

            # Build an initialization operation to run below.
            init = tf.variables_initializer(variables_to_initialize)
        else:
            # Build an initialization operation to run below.
            init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        if FLAGS.retrain:
            # Restores from checkpoint
            saver_retrain.restore(sess, ckpt.model_checkpoint_path)

        # When retraining, `init` only touches the variables that were not
        # restored above.
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        if FLAGS.print_params:
            # Dumps three variables selected by their position in
            # tf.all_variables().
            print(tf.all_variables()[2].name)
            print(tf.all_variables()[2].eval(session=sess))
            print(tf.all_variables()[9].name)
            print(tf.all_variables()[9].eval(session=sess))
            print(tf.all_variables()[10].name)
            print(tf.all_variables()[10].eval(session=sess))
            print("-------------------------------------------")

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

                if FLAGS.print_params:
                    print(tf.all_variables()[2].name)
                    print(tf.all_variables()[2].eval(session=sess))
                    print(tf.all_variables()[9].name)
                    print(tf.all_variables()[9].eval(session=sess))
                    print(tf.all_variables()[10].name)
                    print(tf.all_variables()[10].eval(session=sess))
                    print("-------------------------------------------")

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def model_fn(self, features, labels, mode, params):
    """Construct the estimator spec for the requested mode.

    Args:
      features: Input tensor; it is transposed with permutation
        ``[3, 0, 1, 2]`` (HWCN to NHWC) before being fed to the network.
      labels: Labels tensor passed to the network builder and the metrics.
      mode: One of the ``tf.estimator.ModeKeys`` values.
      params: Ignored.

    Returns:
      A ``tf.contrib.tpu.TPUEstimatorSpec`` when ``self.hparams.use_tpu`` is
      set, otherwise a ``tf.estimator.EstimatorSpec``.
    """
    del params
    mode_keys = tf.estimator.ModeKeys

    # HWCN to NHWC
    features = tf.transpose(features, [3, 0, 1, 2])
    loss, logits = self._build_network(features, labels, mode)

    if mode == mode_keys.PREDICT:
        spec_cls = (tf.contrib.tpu.TPUEstimatorSpec
                    if self.hparams.use_tpu else tf.estimator.EstimatorSpec)
        return spec_cls(mode=mode, predictions={'logits': logits})

    train_op = None
    host_call = None
    if mode == mode_keys.TRAIN:
        step_var = tf.train.get_or_create_global_step()
        step_vec = tf.reshape(tf.cast(step_var, tf.int32), [1])

        # Learning-rate schedule, then the optimizer built on top of it.
        lr_tensor = self._build_learning_rate_schedule(step_var)
        opt = self._build_optimizer(lr_tensor)

        pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(pending_updates):
            train_op = self._build_train_op(opt, loss, global_step=step_var)

        if self.hparams.moving_average_decay > 0:
            averager = tf.train.ExponentialMovingAverage(
                decay=self.hparams.moving_average_decay,
                num_updates=step_var)
            averaged_vars = (tf.trainable_variables() +
                             tf.moving_average_variables())
            # The EMA update becomes the train op and runs after the
            # optimizer step.
            with tf.control_dependencies([train_op]), \
                    tf.name_scope('moving_average'):
                train_op = averager.apply(averaged_vars)

        lr_vec = tf.reshape(lr_tensor, [1])

        if self.hparams.enable_hostcall:

            def host_call_fn(gs, lr):
                # Outfeed supports int32 but global_step is expected to be
                # int64.
                gs = tf.cast(tf.reduce_mean(gs), tf.int64)
                writer = tf.contrib.summary.create_file_writer(self.model_dir)
                with writer.as_default():
                    with tf.contrib.summary.always_record_summaries():
                        tf.contrib.summary.scalar('learning_rate',
                                                  tf.reduce_mean(lr),
                                                  step=gs)
                        return tf.contrib.summary.all_summary_ops()

            host_call = (host_call_fn, [step_vec, lr_vec])

    eval_metrics = None
    eval_metric_ops = None
    if mode == mode_keys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric fn. Performed on CPU, do not reference TPU ops."""
            predicted = tf.argmax(logits, axis=1)
            top_1 = tf.metrics.accuracy(labels, predicted)
            hit_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5 = tf.metrics.mean(hit_top_5)
            return {
                'top_1_accuracy': top_1,
                'top_5_accuracy': top_5,
            }

        # Both forms are produced; which one is used depends on the spec
        # type returned below.
        eval_metrics = (metric_fn, [labels, logits])
        eval_metric_ops = metric_fn(labels, logits)

    if not self.hparams.use_tpu:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=eval_metric_ops)
    return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                           loss=loss,
                                           train_op=train_op,
                                           host_call=host_call,
                                           eval_metrics=eval_metrics)
def train(scope=''):
    """Train on dataset for a number of steps.

    Builds the input pipeline, the MDM model, the loss, and the Adam update
    inside a fresh graph, restores the latest checkpoint from
    ``g_config['train_dir']`` when one exists, and then runs the training
    loop up to ``g_config['max_steps']``, writing summaries every 200 steps
    and checkpoints every 1000 steps (and on the last step).

    Args:
      scope: scope filter passed to ``tf.get_collection`` when gathering the
        ``UPDATE_OPS`` that are grouped into the train op.
    """
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        # Global steps
        tf_global_step = tf.get_variable(
            'GlobalStep', [],
            initializer=tf.constant_initializer(0),
            trainable=False)

        # Learning rate
        tf_lr = tf.train.exponential_decay(g_config['learning_rate'],
                                           tf_global_step,
                                           g_config['learning_rate_step'],
                                           g_config['learning_rate_decay'],
                                           staircase=True,
                                           name='LearningRate')
        tf.summary.scalar('learning_rate', tf_lr)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.AdamOptimizer(tf_lr)

        data_provider.prepare_images(g_config['train_dataset'].split(':'),
                                     num_patches=g_config['num_patches'],
                                     verbose=True)
        # Auxiliary files live two directory levels above the first dataset
        # entry.
        path_base = Path(g_config['train_dataset'].split(':')[0]).parent.parent
        _mean_shape = mio.import_pickle(path_base / 'reference_shape.pkl')
        with Path(path_base / 'meta.txt').open('r') as ifs:
            _image_shape = [int(x) for x in ifs.read().split(' ')]
        assert (isinstance(_mean_shape, np.ndarray))
        _pca_shapes = []
        _pca_bbs = []
        for item in tf.io.tf_record_iterator(str(path_base / 'pca.bin')):
            example = tf.train.Example()
            example.ParseFromString(item)
            _pca_shape = np.array(example.features.feature['pca/shape'].
                                  float_list.value).reshape((-1, 2))
            _pca_bb = np.array(
                example.features.feature['pca/bb'].float_list.value).reshape(
                    (-1, 2))
            _pca_shapes.append(PointCloud(_pca_shape))
            _pca_bbs.append(PointCloud(_pca_bb))
        # The result is not used below; the call is kept as in the original.
        _pca_model = detect.create_generator(_pca_shapes, _pca_bbs)
        assert (_mean_shape.shape[0] == g_config['num_patches'])

        tf_mean_shape = tf.constant(_mean_shape,
                                    dtype=tf.float32,
                                    name='MeanShape')

        def decode_feature(serialized):
            """Parse one serialized example into (image, landmark shape)."""
            feature = {
                'train/image': tf.FixedLenFeature([], tf.string),
                'train/shape': tf.VarLenFeature(tf.float32),
            }
            features = tf.parse_single_example(serialized, features=feature)
            decoded_image = tf.decode_raw(features['train/image'], tf.float32)
            decoded_image = tf.reshape(decoded_image, _image_shape)
            decoded_shape = tf.sparse.to_dense(features['train/shape'])
            decoded_shape = tf.reshape(decoded_shape,
                                       (g_config['num_patches'], 2))
            return decoded_image, decoded_shape

        def get_random_sample(image, shape, rotation_stddev=10):
            """Randomly mirror/rotate a sample, crop around it, resize to 112x112."""
            # Read a random image with landmarks and bb
            image = menpo.image.Image(image.transpose((2, 0, 1)), copy=False)
            image.landmarks['PTS'] = PointCloud(shape)

            if np.random.rand() < .5:
                image = utils.mirror_image(image)

            if np.random.rand() < .5:
                theta = np.random.normal(scale=rotation_stddev)
                rot = menpo.transform.rotate_ccw_about_centre(
                    image.landmarks['PTS'], theta)
                image = image.warp_to_shape(image.shape, rot)

            # Square bounding box centred on the landmarks' bounding box.
            bb = image.landmarks['PTS'].bounding_box().points
            miny, minx = np.min(bb, 0)
            maxy, maxx = np.max(bb, 0)
            bbsize = max(maxx - minx, maxy - miny)
            center = [(miny + maxy) / 2., (minx + maxx) / 2.]
            image.landmarks['bb'] = PointCloud([
                [center[0] - bbsize * 0.5, center[1] - bbsize * 0.5],
                [center[0] + bbsize * 0.5, center[1] + bbsize * 0.5],
            ]).bounding_box()

            proportion = float(np.random.rand() / 3)
            image = image.crop_to_landmarks_proportion(proportion, group='bb')
            image = image.resize((112, 112))

            random_image = image.pixels.transpose(1, 2, 0).astype('float32')
            random_shape = image.landmarks['PTS'].points.astype('float32')
            return random_image, random_shape

        def get_init_shape(image, shape, mean_shape):
            """Append the mean shape, rescaled and centred on the 112x112 frame."""

            def norm(x):
                return tf.sqrt(
                    tf.reduce_sum(tf.square(x - tf.reduce_mean(x, 0))))

            with tf.name_scope('align_shape_to_bb', values=[mean_shape]):
                min_xy = tf.reduce_min(mean_shape, 0)
                max_xy = tf.reduce_max(mean_shape, 0)
                min_x, min_y = min_xy[0], min_xy[1]
                max_x, max_y = max_xy[0], max_xy[1]
                mean_shape_bb = tf.stack([[min_x, min_y], [max_x, min_y],
                                          [max_x, max_y], [min_x, max_y]])
                bb = tf.stack([[0.0, 0.0], [112.0, 0.0], [112.0, 112.0],
                               [0.0, 112.0]])
                ratio = norm(bb) / norm(mean_shape_bb)
                initial_shape = tf.add(
                    (mean_shape - tf.reduce_mean(mean_shape_bb, 0)) * ratio,
                    tf.reduce_mean(bb, 0),
                    name='initial_shape')
                initial_shape.set_shape(tf_mean_shape.get_shape())
            return image, shape, initial_shape

        def distort_color(image, shape, init_shape):
            """Apply the data provider's colour distortion to the image only."""
            return data_provider.distort_color(image), shape, init_shape

        with tf.name_scope('DataProvider', values=[tf_mean_shape]):
            tf_dataset = tf.data.TFRecordDataset(
                [str(path_base / 'train.bin')])
            tf_dataset = tf_dataset.repeat()
            tf_dataset = tf_dataset.map(decode_feature)
            tf_dataset = tf_dataset.map(lambda x, y: tf.py_func(
                get_random_sample, [x, y], [tf.float32, tf.float32],
                stateful=True,
                name='RandomSample'))
            tf_dataset = tf_dataset.map(
                partial(get_init_shape, mean_shape=tf_mean_shape))
            tf_dataset = tf_dataset.map(distort_color)
            tf_dataset = tf_dataset.batch(g_config['batch_size'], True)
            tf_dataset = tf_dataset.prefetch(7500)
            tf_iterator = tf_dataset.make_one_shot_iterator()
            tf_images, tf_shapes, tf_initial_shapes = tf_iterator.get_next(
                name='Batch')
            # py_func drops static shape information, so it is re-declared
            # here.  The landmark count comes from the configuration (the
            # same value used by decode_feature and asserted against the
            # mean shape above) instead of a hard-coded 73.
            tf_images.set_shape([g_config['batch_size'], 112, 112, 3])
            tf_shapes.set_shape(
                [g_config['batch_size'], g_config['num_patches'], 2])
            tf_initial_shapes.set_shape(
                [g_config['batch_size'], g_config['num_patches'], 2])

        print('Defining model...')
        with tf.device(g_config['train_device']):
            tf_model = mdm_model.MDMModel(
                tf_images,
                tf_shapes,
                tf_initial_shapes,
                batch_size=g_config['batch_size'],
                num_iterations=g_config['num_iterations'],
                num_patches=g_config['num_patches'],
                patch_shape=(g_config['patch_size'], g_config['patch_size']),
                num_channels=3)
            with tf.name_scope('Losses',
                               values=[tf_model.prediction, tf_shapes]):
                tf_norm_error = tf_model.normalized_rmse(
                    tf_model.prediction, tf_shapes)
                tf_loss = tf.reduce_mean(tf_norm_error)
            tf.summary.scalar('losses/total', tf_loss)
            # Calculate the gradients for the batch of data
            tf_grads = opt.compute_gradients(tf_loss)
        tf.summary.histogram('dx', tf_model.prediction - tf_shapes)

        bn_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope)

        # Add histograms for gradients.
        for grad, var in tf_grads:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradients', grad)

        # Apply the gradients to adjust the shared variables.
        with tf.name_scope('Optimizer', values=[tf_grads, tf_global_step]):
            apply_gradient_op = opt.apply_gradients(tf_grads,
                                                    global_step=tf_global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than need be but we
        # employ this for backward-compatibility with our previous models.
        with tf.name_scope('MovingAverage', values=[tf_global_step]):
            variable_averages = tf.train.ExponentialMovingAverage(
                g_config['MOVING_AVERAGE_DECAY'], tf_global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            variables_averages_op = variable_averages.apply(
                variables_to_average)

        # Group all updates into a single train op.
        bn_updates_op = tf.group(*bn_updates, name='BNGroup')
        train_op = tf.group(apply_gradient_op,
                            variables_averages_op,
                            bn_updates_op,
                            name='TrainGroup')

        # Create a saver.
        saver = tf.train.Saver()

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge_all()

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()
        print('Initializing variables...')
        sess.run(init)
        print('Initialized variables.')

        start_step = 0
        ckpt = tf.train.get_checkpoint_state(g_config['train_dir'])
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Assuming model_checkpoint_path looks something like:
            #   /ckpt/train/model.ckpt-0,
            # extract global_step from it.
            start_step = int(
                ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) + 1
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), g_config['train_dir']))

        summary_writer = tf.summary.FileWriter(g_config['train_dir'],
                                               sess.graph)

        print('Starting training...')
        for step in range(start_step, g_config['max_steps']):
            start_time = time.time()
            _, loss_value = sess.run([train_op, tf_loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 100 == 0:
                examples_per_sec = g_config['batch_size'] / float(duration)
                format_str = (
                    '%s: step %d, loss = %.4f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, duration))

            if step % 200 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == g_config['max_steps']:
                checkpoint_path = os.path.join(g_config['train_dir'],
                                               'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def Test_Mixup_ResNet():
    """Evaluate every selected mixup checkpoint on the evaluation records.

    Checkpoint directories are the entries of ``/scratch/mixup`` whose name
    contains ``'Iter'`` but not ``'_preactivation'``.  For each of them the
    moving-average weights are restored and the network is evaluated one
    image at a time; the mean loss and accuracy are printed.
    """
    width, height, channels = 32, 32, 3
    root_dir = "/scratch/mixup"
    selected_dirs = [
        entry for entry in os.listdir(root_dir)
        if 'Iter' in entry and '_preactivation' not in entry
    ]
    print(selected_dirs)

    ema_decay = 0.999
    class_count = 10
    record_files = ["/scratch/mixup/eval.tfrecords"]
    batch_size = 1
    image_count = 10000

    # Fed unchanged on every evaluation step.
    layer_index_value = 3
    lambda_value = 1

    with tf.Graph().as_default():
        # Placeholders are created in a fixed order because they are
        # auto-named.
        images_ph = tf.placeholder(tf.float32,
                                   [batch_size, width, height, channels])
        labels_ph = tf.placeholder(tf.int64, [batch_size])
        is_training_ph = tf.placeholder(tf.bool, shape=None)
        layer_index_ph = tf.placeholder(tf.int64)
        lambda_ph = tf.placeholder(tf.float32, shape=[1])

        image_batch, label_batch = Read_Data_from_Record(record_files,
                                                         batch_size,
                                                         augmentation=False,
                                                         shuffle=False)

        logits, target = build_tower_basic(images_ph, labels_ph, lambda_ph,
                                           class_count, is_training_ph,
                                           layer_index_ph)

        # The third returned value (re-weighted accuracy) is not evaluated.
        loss_op, accuracy_op, _ = Calc_Loss(logits, labels_ph, target,
                                            lambda_ph)

        # Restore the moving-average shadow values into the variables.
        averager = tf.train.ExponentialMovingAverage(ema_decay)
        averager.apply(tf.trainable_variables())
        restore_map = averager.variables_to_restore(
            tf.moving_average_variables())
        saver = tf.train.Saver(restore_map)

        for dir_name in selected_dirs:
            ckpt_dir = os.path.join(root_dir, dir_name)
            print("\n================================")
            with tf.Session() as sess:
                state = tf.train.get_checkpoint_state(ckpt_dir)
                if state and state.model_checkpoint_path:
                    saver.restore(sess, state.model_checkpoint_path)
                    print("restore parameter from ",
                          state.model_checkpoint_path)

                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord)

                # One row per image: [loss, accuracy].
                stats = np.zeros([image_count, 2])
                print(
                    "-------------------------Start Training-------------------------------------"
                )
                for idx in range(image_count):
                    batch_images, batch_labels = sess.run(
                        [image_batch, label_batch])
                    step_loss, step_accuracy = sess.run(
                        [loss_op, accuracy_op],
                        feed_dict={
                            images_ph: batch_images,
                            labels_ph: batch_labels,
                            is_training_ph: False,
                            layer_index_ph: layer_index_value,
                            lambda_ph: [lambda_value],
                        })
                    stats[idx, :] = [step_loss, step_accuracy]

                print("-------Test Error, Test Accu------------",
                      np.mean(stats, axis=0))
                coord.request_stop()
                coord.join(threads)
def model_fn(features, labels, mode, params):
    """Mobilenet v1 model using Estimator API.

    Args:
      features: input tensor; permuted with ``params['input_perm']`` through
        ``tensor_transform_fn`` before it is fed to the network.
      labels: integer class labels (one-hot encoded internally).
      mode: one of the ``tf.estimator.ModeKeys`` values.
      params: dict that must provide ``'input_perm'``.

    Returns:
      A ``tf.estimator.EstimatorSpec`` in PREDICT mode, otherwise a
      ``tpu_estimator.TPUEstimatorSpec``.

    Raises:
      ValueError: in TRAIN mode, if ``FLAGS.optimizer`` is not one of
        ``'sgd'``, ``'momentum'`` or ``'RMS'``.
    """
    num_classes = FLAGS.num_classes
    training_active = (mode == tf.estimator.ModeKeys.TRAIN)
    eval_active = (mode == tf.estimator.ModeKeys.EVAL)

    features = tensor_transform_fn(features, params['input_perm'])

    # NOTE(review): the original code branched on
    # FLAGS.clear_update_collections here, but both branches were
    # byte-identical (the comment said "updates_collections must be set to
    # None in order to use fused batchnorm", yet nothing was set).  They are
    # merged; re-introduce the branch if the two cases should differ.
    with tf.variable_scope('cg', custom_getter=get_custom_getter()):
        with arg_scope(mobilenet_v1.mobilenet_v1_arg_scope()):
            logits, end_points = mobilenet_v1.mobilenet_v1(
                features,
                num_classes,
                is_training=training_active,
                depth_multiplier=FLAGS.depth_multiplier)
            logits = tf.cast(logits, tf.float32)

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
            not FLAGS.use_tpu):
        # Route `labels` through Print ops so predictions and labels are
        # logged whenever the labels are consumed.
        with tf.control_dependencies([
                tf.Print(
                    predictions['classes'], [predictions['classes']],
                    summarize=FLAGS.eval_batch_size,
                    message='prediction: ')
        ]):
            labels = tf.Print(
                labels, [labels],
                summarize=FLAGS.eval_batch_size,
                message='label: ')

    one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)
    cross_entropy = tf.losses.softmax_cross_entropy(
        onehot_labels=one_hot_labels,
        logits=logits,
        weights=1.0,
        label_smoothing=0.1)
    # L2 regularization on every trainable variable whose name does not
    # contain 'batch_normalization'.
    loss = cross_entropy + 1e-4 * tf.add_n(
        [tf.nn.l2_loss(v) for v in tf.trainable_variables()
         if 'batch_normalization' not in v.name])

    initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
    final_learning_rate = 0.0001 * initial_learning_rate

    train_op = None
    if training_active:
        batches_per_epoch = _NUM_TRAIN_IMAGES // FLAGS.train_batch_size
        global_step = tf.train.get_or_create_global_step()
        learning_rate = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=FLAGS.learning_rate_decay_epochs * batches_per_epoch,
            decay_rate=FLAGS.learning_rate_decay,
            staircase=True)

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(
            learning_rate, final_learning_rate, name='learning_rate')

        if FLAGS.optimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif FLAGS.optimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate, momentum=0.9)
        elif FLAGS.optimizer == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate,
                RMSPROP_DECAY,
                momentum=RMSPROP_MOMENTUM,
                epsilon=RMSPROP_EPSILON)
        else:
            # The log call needs a %s placeholder for its argument, and
            # logging alone does not stop execution: without the raise the
            # code below would fail on the unbound `optimizer`.
            tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)
            raise ValueError('Unknown optimizer: %s' % FLAGS.optimizer)

        if FLAGS.use_tpu:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)
        if FLAGS.moving_average:
            ema = tf.train.ExponentialMovingAverage(
                decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            # The EMA update becomes the train op and runs after the
            # optimizer step.
            with tf.control_dependencies([train_op]), tf.name_scope('moving_average'):
                train_op = ema.apply(variables_to_average)

    eval_metrics = None
    if eval_active:
        def metric_fn(labels, predictions):
            accuracy = tf.metrics.accuracy(labels, tf.argmax(
                input=predictions, axis=1))
            return {'accuracy': accuracy}

        if FLAGS.use_logits:
            eval_predictions = logits
        else:
            eval_predictions = end_points['Predictions']

        eval_metrics = (metric_fn, [labels, eval_predictions])

    # The profiler results are not used; the calls are kept as in the
    # original, with the unused local names dropped.
    tf.profiler.profile(
        tf.get_default_graph(),
        options=ProfileOptionBuilder.trainable_variables_parameter())
    tf.profiler.profile(
        tf.get_default_graph(),
        options=tf.profiler.ProfileOptionBuilder.float_operation())

    return tpu_estimator.TPUEstimatorSpec(
        mode=mode, loss=loss, train_op=train_op, eval_metrics=eval_metrics)
def train():
    """Run one parameter-server or worker task of a distributed CIFAR-10 job.

    The cluster is described by the comma-separated ``FLAGS.ps_hosts`` and
    ``FLAGS.worker_hosts``.  A task whose ``FLAGS.job_name`` is ``'ps'``
    joins the server.  A worker builds an AlexNet-v2 graph wrapped in a
    ``SyncReplicasOptimizer`` and runs ``FLAGS.max_steps`` training steps,
    logging the loss after every step.  The task with ``FLAGS.task_id == 0``
    acts as chief and recreates ``FLAGS.train_dir`` on start-up.
    """
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print('PS hosts are: %s' % ps_hosts)
    print('Worker hosts are: %s' % worker_hosts)

    server = tf.train.Server({
        'ps': ps_hosts,
        'worker': worker_hosts
    },
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        server.join()

    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        # The chief starts from an empty training directory.
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)

    device_setter = tf.train.replica_device_setter(ps_tasks=len(ps_hosts))
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        partitioner = tf.fixed_size_partitioner(len(ps_hosts), axis=0)
        with tf.variable_scope('partitioned_space', partitioner=partitioner):
            with tf.device(device_setter):
                global_step = tf.Variable(0, trainable=False)

                decay_steps = 50000 * 350.0 / FLAGS.batch_size
                # The batch size is fed at run time through this placeholder.
                batch_size = tf.placeholder(dtype=tf.int32,
                                            shape=(),
                                            name='batch_size')
                images, labels = cifar10.distorted_inputs(batch_size)

                inputs = tf.reshape(images, [-1, _HEIGHT, _WIDTH, _DEPTH])
                labels = tf.one_hot(labels, 10, 1, 0)

                # network_fn = nets_factory.get_network_fn('alexnet_v2',num_classes=10)
                # (logits,_) = network_fn(inputs)
                # with slim.arg_scope(alexnet.alexnet_v2_arg_scope(weight_decay=0.0)):
                (logits, _) = alexnet.alexnet_v2(inputs,
                                                 num_classes=10,
                                                 is_training=True)

                cross_entropy = tf.losses.softmax_cross_entropy(
                    logits=logits, onehot_labels=labels)
                loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
                    [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

                # Decay the learning rate exponentially based on the number of steps.
                lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE *
                                                len(worker_hosts),
                                                global_step,
                                                decay_steps,
                                                LEARNING_RATE_DECAY_FACTOR,
                                                staircase=True)
                opt = tf.train.GradientDescentOptimizer(lr)

                # Track the moving averages of all trainable variables.
                exp_moving_averager = tf.train.ExponentialMovingAverage(
                    MOVING_AVERAGE_DECAY, global_step)
                variables_to_average = (tf.trainable_variables() +
                                        tf.moving_average_variables())

                opt = tf.train.SyncReplicasOptimizer(
                    opt,
                    replicas_to_aggregate=len(worker_hosts),
                    total_num_replicas=len(worker_hosts),
                    variable_averages=exp_moving_averager,
                    variables_to_average=variables_to_average)

                naive_grads = opt.compute_gradients(loss)
                # Scale every gradient by (fed batch size / FLAGS.batch_size).
                grads = [(tf.scalar_mul(
                    tf.cast(batch_size / FLAGS.batch_size, tf.float32), grad),
                          var) for grad, var in naive_grads]
                apply_gradients_op = opt.apply_gradients(
                    grads, global_step=global_step)

                # Fetching `train_op` yields the loss after the gradients
                # have been applied.
                with tf.control_dependencies([apply_gradients_op]):
                    train_op = tf.identity(loss, name='train_op')

                chief_queue_runners = [opt.get_chief_queue_runner()]
                init_tokens_op = opt.get_init_tokens_op()

                saver = tf.train.Saver()
                sv = tf.train.Supervisor(is_chief=is_chief,
                                         logdir=FLAGS.train_dir,
                                         init_op=tf.group(
                                             tf.global_variables_initializer(),
                                             tf.local_variables_initializer()),
                                         summary_op=None,
                                         global_step=global_step,
                                         saver=saver,
                                         recovery_wait_secs=1,
                                         save_model_secs=60)

                tf.logging.info('%s Supervisor' % datetime.now())
                sess_config = tf.ConfigProto(
                    allow_soft_placement=True,
                    log_device_placement=FLAGS.log_device_placement)
                sess_config.gpu_options.allow_growth = True

                sess = sv.prepare_or_wait_for_session(server.target,
                                                      config=sess_config)
                queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
                sv.start_queue_runners(sess, queue_runners)

                # NOTE(review): the chief queue runners and the init-tokens
                # op are started on every task here, not only on the chief -
                # confirm this is intended.
                sv.start_queue_runners(sess, chief_queue_runners)
                sess.run(init_tokens_op)
                """Train CIFAR-10 for a number of steps."""
                # NOTE(review): time0, num_batches_per_epoch, decay_steps_num
                # and b are assigned below but never read in this function.
                time0 = time.time()
                batch_size_num = FLAGS.batch_size
                for step in range(FLAGS.max_steps):
                    start_time = time.time()

                    # Full tracing is requested on every step; the collected
                    # run_metadata is not used afterwards.
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size_num
                    decay_steps_num = int(num_batches_per_epoch *
                                          NUM_EPOCHS_PER_DECAY)

                    _, loss_value, gs = sess.run(
                        [train_op, loss, global_step],
                        feed_dict={batch_size: batch_size_num},
                        options=run_options,
                        run_metadata=run_metadata)
                    b = time.time()

                    # `step % 1` is always 0, so this logs on every step.
                    if step % 1 == 0:
                        duration = time.time() - start_time
                        num_examples_per_step = batch_size_num
                        examples_per_sec = num_examples_per_step / duration
                        sec_per_batch = float(duration)
                        format_str = (
                            "time: " + str(time.time()) +
                            '; %s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                        )
                        tf.logging.info(format_str %
                                        (datetime.now(), step, gs, loss_value,
                                         examples_per_sec, sec_per_batch))
def begin(self):
    """Create the restore ops before the graph is finalized.

    For each variable returned by ``tf.moving_average_variables()`` an
    assign op is built that writes the average tracked by ``self._ema``
    back into that variable.  The resulting ops are kept on
    ``self._restore_ops``.
    """
    restore_ops = []
    for variable in tf.moving_average_variables():
        averaged_value = self._ema.average(variable)
        restore_ops.append(tf.assign(variable, averaged_value))
    self._restore_ops = restore_ops
def model_fn(self, features, labels, mode, params):
    """Build the model based on features, labels, and mode.

    The network is built once per micro-batch through
    `self._build_network`.  Gradients are then derived slice by slice,
    from the last pipeline device back to the first, with `tf.gradients`,
    and the per-micro-batch gradients of each variable are summed before
    being handed to `self._build_train_op`.

    Args:
      features: The features dictionary containing the data Tensor
        and the number of examples.  NOTE(review): the body passes this
        value straight to `tf.transpose`, so it looks like a single Tensor
        here -- confirm against the input pipeline.
      labels: The labels Tensor resulting from calling the model.
      mode: A string indicating the training mode.
      params: A dictionary of hyperparameters.  Unused; deleted on entry.

    Returns:
      A tf.estimator.EstimatorSpec.
    """
    del params
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    if is_training:
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC
    total_loss, outputs = self._build_network(features, labels, mode)

    devices = cluster_utils.get_pipeline_devices(FLAGS.pipeline_device_num)
    slice_num = len(devices)
    micro_batch_num = FLAGS.micro_batch_num
    losses = []
    all_outputs = []
    losses.append(total_loss)
    all_outputs.append(outputs)
    # layer_grads[j][i]: gradients of micro-batch j for pipeline slice i.
    layer_grads = [[[] for i in xrange(slice_num)]
                   for j in xrange(micro_batch_num)]
    # layer_vars[i]: variables of slice i that received a gradient.
    layer_vars = [[] for i in xrange(slice_num)]
    remained_vars = tf.trainable_variables()
    ys = losses[0]
    prev_grads=None
    # layers-1 ~ 1 compute grads
    for i in xrange(slice_num - 1, 0, -1):
        # Variables are assigned to a slice by comparing their device string
        # with the slice's device.
        vars_i = [v for v in remained_vars if v.device==devices[i]]
        remained_vars = [v for v in remained_vars if v not in vars_i]
        prev_y = all_outputs[0][i-1]
        prev_y = prev_y if isinstance(prev_y, list) else [prev_y]
        num_tensors = len(prev_y)
        # One tf.gradients call yields both the gradients w.r.t. the previous
        # slice's outputs (first num_tensors entries) and w.r.t. this
        # slice's variables (the rest).
        y_grads = tf.gradients(ys=ys, xs=prev_y+vars_i, grad_ys=prev_grads,
                               colocate_gradients_with_ops=True)
        ys = prev_y
        prev_grads = y_grads[0:num_tensors]
        grads_i = y_grads[num_tensors:]
        layer_grads[0][i] = [g for g in grads_i if g is not None]
        layer_vars[i] = [v for (g, v) in zip(grads_i, vars_i)
                         if g is not None]
    # layer 0 compute grads
    # Everything not claimed by slices 1..slice_num-1 is treated as slice 0.
    grads_0 = tf.gradients(ys=ys, xs=remained_vars, grad_ys=prev_grads,
                           colocate_gradients_with_ops=True)
    layer_grads[0][0] = [g for g in grads_0 if g is not None]
    layer_vars[0] = [v for (g, v) in zip(grads_0, remained_vars)
                     if g is not None]

    # other micro_batch_num
    for j in xrange(1, micro_batch_num):
        # dep_outputs[i] is either the previous micro-batch's output for
        # slice i or an earlier micro-batch's gradients for that slice.
        # NOTE(review): presumably used by _build_network to order this
        # micro-batch after that work -- confirm in _build_network.
        dep_outputs = []
        for i in xrange(slice_num):
            dep_outputs.append(all_outputs[j-1][i]
                               if i+j < 2*slice_num-1
                               else layer_grads[i+j-2*slice_num+1][i])
        loss, outputs = self._build_network(features, labels, mode,
                                            dep_outputs=dep_outputs)
        losses.append(loss)
        all_outputs.append(outputs)
        ys = losses[j]
        prev_grads=None
        for i in xrange(slice_num - 1, 0, -1):
            prev_y = all_outputs[j][i-1]
            prev_y = prev_y if isinstance(prev_y, list) else [prev_y]
            num_tensors = len(prev_y)
            # Reuses the variable partition found for micro-batch 0.
            y_grads = tf.gradients(ys=ys, xs=prev_y+layer_vars[i],
                                   grad_ys=prev_grads,
                                   colocate_gradients_with_ops=True)
            ys = prev_y
            prev_grads = y_grads[0:num_tensors]
            grads_i = y_grads[num_tensors:]
            layer_grads[j][i] = [g for g in grads_i if g is not None]
        grads_0 = tf.gradients(ys=ys, xs=layer_vars[0], grad_ys=prev_grads,
                               colocate_gradients_with_ops=True)
        layer_grads[j][0] = [g for g in grads_0 if g is not None]

    # Sum the gradients of every variable over all micro-batches.
    grads_set = []
    vars_set = []
    for i in xrange(slice_num):
        for j in xrange(len(layer_grads[0][i])):
            grad_i_set = [layer_grads[m][i][j] for m in range(micro_batch_num)]
            #print (grad_i_set)
            if micro_batch_num == 1:
                with tf.device(grad_i_set[0].device):
                    acc_grads = grad_i_set[0]
            else:
                with tf.control_dependencies(grad_i_set), tf.device(grad_i_set[0].device): # replica
                    if isinstance(grad_i_set[0], tf.IndexedSlices):
                        acc_grads = tf.add_n(grad_i_set)
                    else:
                        acc_grads = tf.accumulate_n(grad_i_set)
            grads_set.append(acc_grads)
            vars_set.append(layer_vars[i][j])
    # NOTE(review): under Python 3 zip() is a one-shot iterator; confirm
    # _build_train_op consumes grads_and_vars only once.
    grads_and_vars = zip(grads_set, vars_set)
    #######################

    train_op = None

    if is_training:
        global_step = tf.train.get_or_create_global_step()
        gs_t = tf.reshape(tf.cast(global_step, tf.int32), [1])

        # Setup learning rate schedule
        learning_rate = self._build_learning_rate_schedule(global_step)

        # Setup optimizer.
        optimizer = self._build_optimizer(learning_rate)

        # update_ops is collected but not used below: the control dependency
        # was replaced by None (see the trailing comment on the next line).
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(None):  # original is update_ops
            train_op = self._build_train_op(optimizer, grads_and_vars,
                                            global_step=global_step)

        if self.hparams.moving_average_decay > 0:
            ema = tf.train.ExponentialMovingAverage(
                decay=self.hparams.moving_average_decay,
                num_updates=global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            # The EMA update becomes the train op and runs after the
            # optimizer step.
            with tf.control_dependencies([train_op]):
                with tf.name_scope('moving_average'):
                    train_op = ema.apply(variables_to_average)

        lr_t = tf.reshape(learning_rate, [1])
        # NOTE(review): host_call is built here but is not passed to the
        # EstimatorSpec returned below.
        host_call = None
        if self.hparams.enable_hostcall:
            def host_call_fn(gs, lr):
                # Outfeed supports int32 but global_step is expected to be int64.
                gs = tf.cast(tf.reduce_mean(gs), tf.int64)
                with tf.contrib.summary.create_file_writer(
                        self.model_dir).as_default():
                    with tf.contrib.summary.always_record_summaries():
                        tf.contrib.summary.scalar('learning_rate',
                                                  tf.reduce_mean(lr), step=gs)
                        return tf.contrib.summary.all_summary_ops()
            host_call = (host_call_fn, [gs_t, lr_t])

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op)
def main(argv=None):
    """Train the pose-estimation network described by a configuration file.

    Every option of the ``[Train]`` section of the config file is evaluated
    into ``params``.  The function then builds one tower per visible GPU
    (or a single CPU tower on macOS), averages the tower gradients, trains
    for ``num_train_samples * max_epoch // (batchsize * gpus)`` steps and
    periodically writes summaries and checkpoints.

    Args:
        argv: Command-line style argument list.  When it holds more than one
            element, ``argv[1]`` is the config file path; otherwise
            (including ``None``) ``configurations/mv2_cpm.cfg`` is used.
    """
    # load config file and setup
    params = {}
    parser = configparser.ConfigParser()
    config_file = "configurations/mv2_cpm.cfg"
    # Decide once whether the caller supplied a config path.  The original
    # code indexed argv[1] again further down, which raised IndexError when
    # no path was given (and TypeError on len(None) for the default argv).
    config_given = argv is not None and len(argv) > 1
    if config_given:
        config_file = argv[1]
    parser.read(config_file)
    for option in parser.options("Train"):
        # SECURITY NOTE(review): eval() executes arbitrary expressions from
        # the config file; only use config files from trusted sources.
        params[option] = eval(parser.get("Train", option))

    os.environ['CUDA_VISIBLE_DEVICES'] = params['visible_devices']

    gpus_index = params['visible_devices'].split(",")
    params['gpus'] = len(gpus_index)

    if not os.path.exists(params['modelpath']):
        os.makedirs(params['modelpath'])
    if not os.path.exists(params['logpath']):
        os.makedirs(params['logpath'])

    src.dataloaders.dataset.set_config(params)
    set_network_input_wh(params['input_width'], params['input_height'])

    if config_given:
        set_network_scale(params['scale'])
    else:
        ## For the hourglass model the last layer outputs a 32 times smaller output
        ## which is upsampled later
        ## TODO : Understand the architecture and make necessary changes
        ## For now work with a scale value of 4 for the Hourglass model
        set_network_scale(4)

    ## Train on cpus for MAC
    gpus = 'gpus'
    if platform.system() == 'Darwin':
        gpus = 'cpu'
    training_name = '{}_batch-{}_lr-{}_{}-{}_{}x{}_{}'.format(
        params['model'],
        params['batchsize'],
        params['lr'],
        gpus,
        params['gpus'],
        params['input_width'],
        params['input_height'],
        config_file.replace("/", "-").replace(".cfg", ""))

    ## Processing for CPU
    ## Obtaining the dataset pipeline from dataloaders.datasets
    ## Define the learning rate and optimizer function
    with tf.Graph().as_default(), tf.device("/cpu:0"):
        input_image, input_heat = get_input(
            params['batchsize'], params['max_epoch'], is_train=True)
        valid_input_image, valid_input_heat = get_input(
            params['batchsize'], params['max_epoch'], is_train=False)

        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(
            float(params['lr']), global_step,
            decay_steps=10000,
            decay_rate=float(params['decay_rate']),
            staircase=True)
        opt = tf.train.AdamOptimizer(learning_rate, epsilon=1e-8)
        tower_grads = []
        reuse_variable = False

        if platform.system() == 'Darwin':
            # cpu (mac only)
            with tf.device("/cpu:0"):
                with tf.name_scope("CPU_0"):
                    loss, last_heat_loss, pred_heat = get_loss_and_output(
                        params['model'], params['batchsize'], input_image,
                        input_heat, reuse_variable, params['scale'])
                    reuse_variable = True
                    grads = opt.compute_gradients(loss)
                    tower_grads.append(grads)

                    # NOTE(review): unlike the GPU branch, no scale argument
                    # is passed here; kept as in the original -- confirm.
                    valid_loss, valid_last_heat_loss, valid_pred_heat = get_loss_and_output(
                        params['model'], params['batchsize'],
                        valid_input_image, valid_input_heat, reuse_variable)
        else:
            # multiple gpus
            for i in range(params['gpus']):
                with tf.device("/gpu:%d" % i):
                    with tf.name_scope("GPU_%d" % i):
                        loss, last_heat_loss, pred_heat = get_loss_and_output(
                            params['model'], params['batchsize'], input_image,
                            input_heat, reuse_variable, params['scale'])
                        reuse_variable = True
                        grads = opt.compute_gradients(loss)
                        tower_grads.append(grads)

                        valid_loss, valid_last_heat_loss, valid_pred_heat = get_loss_and_output(
                            params['model'], params['batchsize'],
                            valid_input_image, valid_input_heat,
                            reuse_variable, params['scale'])

        grads = average_gradients(tower_grads)
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram(
                    "gradients_on_average/%s" % var.op.name, grad)

        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        ## Update model parameters based on moving average rather than final values
        ## Better performace
        MOVING_AVERAGE_DECAY = 0.99
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variable_to_average = (tf.trainable_variables() +
                               tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variable_to_average)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = tf.group(apply_gradient_op, variables_averages_op)

        saver = tf.train.Saver(max_to_keep=100)

        tf.summary.scalar("learning_rate", learning_rate)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("loss_lastlayer_heat", last_heat_loss)
        summary_merge_op = tf.summary.merge_all()

        pred_result_image = tf.placeholder(
            tf.float32, shape=[params['batchsize'], 480, 640, 3])
        pred_result__summary = tf.summary.image(
            "pred_result_image", pred_result_image, params['batchsize'])

        init = tf.global_variables_initializer()
        # Named sess_config so it no longer shadows the config parser.
        sess_config = tf.ConfigProto()
        # occupy gpu gracefully
        sess_config.gpu_options.allow_growth = True
        with tf.Session(config=sess_config) as sess:
            init.run()
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.summary.FileWriter(
                os.path.join(params['logpath'], training_name), sess.graph)
            total_step_num = params['num_train_samples'] * params[
                'max_epoch'] // (params['batchsize'] * params['gpus'])
            print("Start training...")
            for step in range(total_step_num):
                start_time = time.time()
                _, loss_value, lh_loss, in_image, in_heat, p_heat = sess.run([
                    train_op, loss, last_heat_loss, input_image, input_heat,
                    pred_heat
                ])
                duration = time.time() - start_time

                if step != 0 and step % params[
                        'per_update_tensorboard_step'] == 0:
                    # False will speed up the training time.
                    if params['pred_image_on_tensorboard'] is True:
                        valid_loss_value, valid_lh_loss, valid_in_image, valid_in_heat, valid_p_heat = sess.run(
                            [
                                valid_loss, valid_last_heat_loss,
                                valid_input_image, valid_input_heat,
                                valid_pred_heat
                            ])
                        ## TODO: Check why for the third iteration only 12 images are passed in validation batch
                        result = []
                        for index in range(params['batchsize']):
                            r = CocoPose.display_image(
                                valid_in_image[index, :, :, :],
                                valid_in_heat[index, :, :, :],
                                valid_p_heat[index, :, :, :], True)
                            result.append(r.astype(np.float32))

                        comparsion_of_pred_result = sess.run(
                            pred_result__summary,
                            feed_dict={pred_result_image: np.array(result)})
                        summary_writer.add_summary(
                            comparsion_of_pred_result, step)

                    # print train info
                    num_examples_per_step = params['batchsize'] * params['gpus']
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / params['gpus']
                    format_str = (
                        '%s: step %d, loss = %.2f, last_heat_loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                    )
                    print(format_str % (datetime.now(), step, loss_value,
                                        lh_loss, examples_per_sec,
                                        sec_per_batch))

                    # tensorboard visualization
                    merge_op = sess.run(summary_merge_op)
                    summary_writer.add_summary(merge_op, step)

                # save model
                if step % params['per_saved_model_step'] == 0:
                    checkpoint_path = os.path.join(
                        params['modelpath'], training_name, 'model')
                    saver.save(sess, checkpoint_path, global_step=step)
            coord.request_stop()
            coord.join(threads)
def train(target, dataset, cluster_spec):
    """Train Inception on a dataset for a number of steps.

    Builds a synchronous-replica training graph (RMSProp wrapped in
    ``tf.train.SyncReplicasOptimizer``) whose input batch size is fed at
    run time through a placeholder, then runs the training loop under a
    ``tf.train.Supervisor`` until ``FLAGS.max_steps`` is exceeded or the
    supervisor asks to stop.

    Args:
        target: Session target passed to
            ``Supervisor.prepare_or_wait_for_session``.
        dataset: Object providing ``num_examples_per_epoch()`` and
            ``num_classes()``; also passed to
            ``image_processing.distorted_inputs``.
        cluster_spec: Object whose ``as_dict()`` contains the ``'worker'``
            and ``'ps'`` host lists.

    Raises:
        AssertionError: If the cluster has no worker or no parameter server.
    """
    # Number of workers and parameter servers are inferred from the workers
    # and ps hosts string.
    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])
    # If no value is given, num_replicas_to_aggregate defaults to be the
    # number of workers.
    if FLAGS.num_replicas_to_aggregate == -1:
        num_replicas_to_aggregate = num_workers
    else:
        num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

    # Both should be greater than 0 in a distributed training.
    assert num_workers > 0 and num_parameter_servers > 0, (
        ' num_workers and '
        'num_parameter_servers'
        ' must be > 0.')

    # Choose worker 0 as the chief. Note that any worker could be the chief
    # but there should be only one chief.
    is_chief = (FLAGS.task_id == 0)

    # Ops are assigned to worker by default.
    tf.logging.info('cccc-num_parameter_servers:'+str(num_parameter_servers))
    partitioner = tf.fixed_size_partitioner(num_parameter_servers, 0)
    device_setter = tf.train.replica_device_setter(
        ps_tasks=num_parameter_servers)
    slim = tf.contrib.slim
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with tf.variable_scope('root', partitioner=partitioner):
            # Variables and its related init/assign ops are assigned to ps.
            with tf.device(device_setter):
                # Create a variable to count the number of train() calls.
                # This equals the number of updates applied to the variables.
                global_step = tf.Variable(0, trainable=False)

                # Calculate the learning rate schedule.
                batch_size = tf.placeholder(dtype=tf.int32, shape=(),
                                            name='batch_size')
                num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                         FLAGS.batch_size)
                # Decay steps need to be divided by the number of replicas
                # to aggregate.
                decay_steps = int(num_batches_per_epoch *
                                  FLAGS.num_epochs_per_decay /
                                  num_replicas_to_aggregate)

                # Decay the learning rate exponentially based on the number
                # of steps.
                lr = tf.train.exponential_decay(
                    FLAGS.initial_learning_rate*num_workers,
                    global_step,
                    decay_steps,
                    FLAGS.learning_rate_decay_factor,
                    staircase=True)

                # Create an optimizer that performs gradient descent.
                opt = tf.train.RMSPropOptimizer(lr,
                                                RMSPROP_DECAY,
                                                momentum=RMSPROP_MOMENTUM,
                                                epsilon=RMSPROP_EPSILON)

                images, labels = image_processing.distorted_inputs(
                    dataset,
                    batch_size,
                    num_preprocess_threads=FLAGS.num_preprocess_threads)
                print(images.get_shape())
                print(labels.get_shape())

                num_classes = dataset.num_classes()
                print(num_classes)

                network_fn = nets_factory.get_network_fn(
                    'inception_v3', num_classes=num_classes)
                (logits, _) = network_fn(images)
                print(logits.get_shape())

                # Add classification loss.  The one-hot depth follows
                # num_classes so that it always matches the logits width
                # (it used to be hard-coded to 1000).
                labels = tf.one_hot(labels, num_classes, 1, 0)
                cross_entropy = tf.losses.softmax_cross_entropy(
                    logits=logits, onehot_labels=labels)

                losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
                total_loss = cross_entropy + _WEIGHT_DECAY * tf.add_n(
                    [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

                if is_chief:
                    # Compute the moving average of all individual losses
                    # and the total loss.
                    loss_averages = tf.train.ExponentialMovingAverage(
                        0.9, name='avg')
                    loss_averages_op = loss_averages.apply(
                        losses + [total_loss])

                    # Add dependency to compute loss_averages.
                    with tf.control_dependencies([loss_averages_op]):
                        total_loss = tf.identity(total_loss)

                # Track the moving averages of all trainable variables.
                # Note that we maintain a 'double-average' of the
                # BatchNormalization global statistics.
                # This is not needed when the number of replicas are small
                # but important for synchronous distributed training with
                # tens of workers/replicas.
                exp_moving_averager = tf.train.ExponentialMovingAverage(
                    MOVING_AVERAGE_DECAY, global_step)

                variables_to_average = (
                    tf.trainable_variables() + tf.moving_average_variables())

                # Create synchronous replica optimizer.
                opt = tf.train.SyncReplicasOptimizer(
                    opt,
                    replicas_to_aggregate=num_replicas_to_aggregate,
                    total_num_replicas=num_workers,
                    variable_averages=exp_moving_averager,
                    variables_to_average=variables_to_average)

                # Compute gradients with respect to the loss, then rescale
                # them by the ratio of the fed batch size to
                # FLAGS.batch_size.
                grads0 = opt.compute_gradients(total_loss)
                grads = [(tf.scalar_mul(
                    tf.cast(batch_size/FLAGS.batch_size, tf.float32), grad),
                          var) for grad, var in grads0]

                apply_gradients_op = opt.apply_gradients(
                    grads, global_step=global_step)

                with tf.control_dependencies([apply_gradients_op]):
                    train_op = tf.identity(total_loss, name='train_op')

                # Get chief queue_runners and init_tokens, which is used to
                # synchronize replicas. More details can be found in
                # SyncReplicasOptimizer.
                chief_queue_runners = [opt.get_chief_queue_runner()]
                init_tokens_op = opt.get_init_tokens_op()

                # Create a saver.
                # NOTE(review): this saver is not handed to the Supervisor
                # below (saver=None), kept as in the original.
                saver = tf.train.Saver()

                # Build an initialization operation to run below.
                init_op = tf.global_variables_initializer()

                # We run the summaries in the same thread as the training
                # operations by passing in None for summary_op to avoid a
                # summary_thread being started. Running summaries and
                # training operations in parallel could run out of GPU
                # memory.
                sv = tf.train.Supervisor(
                    is_chief=is_chief,
                    logdir=FLAGS.train_dir,
                    init_op=init_op,
                    summary_op=None,
                    global_step=global_step,
                    recovery_wait_secs=1,
                    saver=None,
                    save_model_secs=FLAGS.save_interval_secs)

                tf.logging.info('%s Supervisor' % datetime.now())

                sess_config = tf.ConfigProto(
                    allow_soft_placement=True,
                    log_device_placement=FLAGS.log_device_placement)

                # Get a session.
                sess = sv.prepare_or_wait_for_session(target,
                                                      config=sess_config)

                # Start the queue runners.
                queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
                sv.start_queue_runners(sess, queue_runners)
                tf.logging.info('Started %d queues for processing input data.',
                                len(queue_runners))

                if is_chief:
                    sv.start_queue_runners(sess, chief_queue_runners)
                    sess.run(init_tokens_op)

                # Train. `step` is replaced by the fetched global_step after
                # every run.
                step = 0
                time0 = time.time()
                while not sv.should_stop():
                    try:
                        start_time = time.time()
                        # Batch-size schedule: starts at 16 and grows by 2
                        # every 5 global steps.
                        batch_size_num = 2*int(step/5)+16

                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()

                        my_images, loss_value, step = sess.run(
                            [images, train_op, global_step],
                            feed_dict={batch_size: batch_size_num},
                            options=run_options,
                            run_metadata=run_metadata)
                        b = time.time()

                        if step > FLAGS.max_steps:
                            break
                        duration = time.time() - start_time
                        thread = threading2.Thread(
                            target=get_computation_time,
                            name="get_computation_time",
                            args=(run_metadata.step_stats, step,))
                        thread.start()
                        c0 = time.time()

                        if step % 1 == 0:
                            # Throughput is based on the batch size that was
                            # actually fed for this step.
                            examples_per_sec = batch_size_num / float(duration)
                            c = time.time()
                            tf.logging.info(
                                "time statistics" +
                                " - train_time: " + str(b-start_time) +
                                " - get_batch_time: " + str(c0-b) +
                                " - get_bs_time: " + str(c-c0) +
                                " - accum_time: " + str(c-time0) +
                                " - batch_size: " + str(batch_size_num))
                            format_str = ('Worker %d: %s: step %d, loss = %.2f'
                                          '(%.1f examples/sec; %.3f sec/batch)')
                            tf.logging.info(format_str %
                                            (FLAGS.task_id, datetime.now(),
                                             step, loss_value,
                                             examples_per_sec, duration))
                    except:
                        # Log on the chief, then always re-raise.
                        if is_chief:
                            tf.logging.info(
                                'Chief got exception while running!')
                        raise

                # Stop the supervisor.  This also waits for service threads
                # to finish.
                sv.stop()
def train(dataset):
    """Train on dataset for a number of steps.

    Builds one model tower per GPU, averages the tower gradients, and runs
    ``FLAGS.max_steps`` training steps.  Before every step the per-layer
    values reported by the norm monitor are fed through 18 int32
    placeholders; after every step the monitor is updated with the fetched
    norms.  Layer info snapshots are pickled to a ``layerinfo_*.dat`` file
    when training ends.

    Args:
        dataset: Object providing ``num_examples_per_epoch()`` and
            ``num_classes()``; also passed to
            ``image_processing.distorted_inputs``.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Fixed graph-level seed.  (A time-based seed used to be set just
        # before this one and was immediately overwritten.)
        tf.set_random_seed(198918)

        # Create a variable to count the number of train() calls. This
        # equals the number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        bits_ph = [tf.placeholder(tf.int32) for _ in range(18)]

        nm = norm_monitor.norm_monitor(FLAGS.digits, len(bits_ph),
                                       FLAGS.rel_res, FLAGS.interval,
                                       FLAGS.stride)
        if FLAGS.layerinfo_file:
            assert tf.gfile.Exists(FLAGS.layerinfo_file)
            # SECURITY NOTE(review): pickle.load runs arbitrary code from
            # the file; only load layerinfo files from trusted sources.
            with open(FLAGS.layerinfo_file, 'rb') as layerinfo_in:
                tmp = pickle.load(layerinfo_in)
            nm.set_layerinfo(tmp[-1])
            print("Restore layerinfo")
            print(nm.get_layerinfo())

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)
        print("num_batches_per_epoch: {}".format(num_batches_per_epoch))
        print("use bitpack: {}".format(FLAGS.use_bitpack))
        print("learning rate: {}".format(FLAGS.initial_learning_rate))
        print("produce trace: {}".format(FLAGS.profile))
        print("digits: {}".format(FLAGS.digits))
        print("rel_res: {}".format(FLAGS.rel_res))
        print("interval: {}".format(FLAGS.interval))
        print("stride: {}".format(FLAGS.stride))

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        # Get images and labels for ImageNet and split the batch across GPUs.
        assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
            'Batch size must be divisible by number of GPUs')

        # Override the number of preprocessing threads to account for the
        # increased number of GPU towers.
        num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
        images, labels = image_processing.distorted_inputs(
            dataset,
            num_preprocess_threads=num_preprocess_threads)

        input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Number of classes in the Dataset label set plus 1.
        # Label 0 is reserved for an (unused) background class.
        num_classes = dataset.num_classes() + 1

        # Split the batch of images and labels for towers.
        images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus,
                                 value=images)
        labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus,
                                 value=labels)

        # Calculate the gradients for each model tower.
        tower_norms = []
        tower_grads = []
        tower_preds_1 = []
        tower_preds_5 = []
        tower_losses = []
        reuse_variables = None
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope(
                        '%s_%d' % (inception.TOWER_NAME, i)) as scope:
                    # Calculate the loss for one tower of the ImageNet
                    # model. This function constructs the entire ImageNet
                    # model but shares the variables across all towers.
                    loss, norms, logits_split = _tower_loss(
                        images_splits[i], labels_splits[i], num_classes,
                        scope, reuse_variables, bits_ph)
                    top_1_correct = tf.nn.in_top_k(
                        logits_split, labels_splits[i], 1)
                    top_5_correct = tf.nn.in_top_k(
                        logits_split, labels_splits[i], 5)

                    # Reuse variables for the next tower.
                    reuse_variables = True

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                  scope)

                    # Retain the Batch Normalization updates operations.
                    # The collection is re-read on every tower, so the value
                    # left after the loop is the one read on the last tower.
                    batchnorm_updates = tf.get_collection(
                        tf.GraphKeys.UPDATE_OPS)

                    # Calculate the gradients for the batch of data on this
                    # ImageNet tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)
                    tower_norms.append(norms)
                    tower_preds_1.append(
                        tf.reduce_sum(tf.cast(top_1_correct, tf.int32)))
                    tower_preds_5.append(
                        tf.reduce_sum(tf.cast(top_5_correct, tf.int32)))
                    tower_losses.append(loss)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = _average_gradients(tower_grads)
        top_1_sum = tf.add_n(tower_preds_1)
        top_5_sum = tf.add_n(tower_preds_5)
        losses_sum = tf.add_n(tower_losses)

        # Add a summaries for the input processing and global_step.
        summaries.extend(input_summaries)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads,
                                                global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated then need be but we
        # employ this for backward-compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            inception.MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates to into a single train op.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must
        # be set to True to build towers on GPU, as some of the ops do not
        # have GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            restorer = tf.train.Saver(tf.global_variables(), max_to_keep=100)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(
            FLAGS.train_dir,
            graph=sess.graph)

        bits_dict = dict()
        elapse = []
        glayerinfo = []
        lip_name = 'layerinfo_{}_{}_{}_{}_{}_{}_{}.dat'.format(
            9, 4096, 0, FLAGS.digits, FLAGS.stride, FLAGS.interval,
            FLAGS.use_bitpack)
        for step in range(FLAGS.max_steps):
            run_metadata = tf.RunMetadata()
            start_time = time.time()
            # Feed the first field of each layer's info entry to the
            # matching placeholder.
            info = nm.get_layerinfo()
            for i, bits in enumerate(bits_ph):
                bits_dict[bits] = info[i][0]
            if FLAGS.profile is False:
                _, loss_value, norms, top_1, top_5 = sess.run(
                    [train_op, losses_sum, tower_norms, top_1_sum, top_5_sum],
                    feed_dict=bits_dict)
            else:
                _, loss_value, norms = sess.run(
                    [train_op, loss, tower_norms],
                    feed_dict=bits_dict,
                    options=tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE),
                    run_metadata=run_metadata)
                # Accuracy counts are not fetched while profiling; these
                # fixed values only fill the log line below.
                top_1 = 5
                top_5 = 25
            nm.adjust_digits(norms)
            duration = time.time() - start_time
            elapse.append(duration)

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                glayerinfo.append(copy.deepcopy(nm.get_layerinfo()))
                # Print layerinfo
                print(info)
                examples_per_sec = FLAGS.batch_size / float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch) elapse %.5f s top_1 %.5f top_5 %.5f')
                # NOTE(review): the input batch of FLAGS.batch_size is split
                # across the towers, so this denominator looks num_gpus
                # times too large -- confirm before relying on these values.
                pred_1 = top_1 / (FLAGS.batch_size*FLAGS.num_gpus)
                pred_5 = top_5 / (FLAGS.batch_size*FLAGS.num_gpus)
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, duration, sum(elapse),
                                    pred_1, pred_5))
                sys.stdout.flush()

            # NOTE(review): the original indentation was lost; the trace is
            # written on every step here (file names are per step) rather
            # than only every 10th step -- confirm.
            if FLAGS.profile is True:
                tl = timeline.Timeline(run_metadata.step_stats)
                if FLAGS.use_bitpack is False:
                    trace_name = 'timeline%03d.json' % step
                else:
                    trace_name = 'bitpack_timeline%03d.json' % step
                # The context manager closes (and flushes) the trace file;
                # it used to be left open.
                with tf.gfile.Open(name=trace_name, mode='w') as trace_file:
                    trace_file.write(
                        tl.generate_chrome_trace_format(show_memory=True))

            if step % 100 == 0:
                summary_str = sess.run(summary_op, feed_dict=bits_dict)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 4000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

        glayerinfo.append(copy.deepcopy(nm.get_layerinfo()))
        # Close the output file deterministically so the pickle is fully
        # written to disk.
        with open(lip_name, 'wb') as layerinfo_out:
            pickle.dump(glayerinfo, layerinfo_out)
def train():
    """Run this process as a parameter server or as a synchronous worker.

    The cluster is described by ``FLAGS.ps_hosts`` and ``FLAGS.worker_hosts``.
    A ``ps`` job only joins the server.  A ``worker`` job builds the
    ``image_two_stream`` model, wraps gradient descent in a
    ``SyncReplicasOptimizer`` and keeps stepping until the supervisor asks
    to stop or ``global_step`` exceeds ``FLAGS.max_steps``.  The worker whose
    ``FLAGS.task_id`` is 0 is the chief: it tracks loss averages, runs the
    summary op and writes the final checkpoint.
    """
    assert FLAGS.job_name in ['ps', 'worker'], 'job_name must be ps or worker'

    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    tf.logging.info('PS hosts are %s ' % ps_hosts)
    tf.logging.info('Worker hosts are %s ' % worker_hosts)

    cluster_spec = tf.train.ClusterSpec({'ps': ps_hosts,
                                         'worker': worker_hosts})
    server = tf.train.Server(cluster_spec,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        server.join()
    else:
        # Cluster sizes come straight from the host lists.
        cluster_layout = cluster_spec.as_dict()
        num_workers = len(cluster_layout['worker'])
        num_parameter_servers = len(cluster_layout['ps'])

        # -1 means "aggregate gradients from every worker".
        num_replicas_to_aggregate = (
            num_workers if FLAGS.num_replicas_to_aggregate == -1
            else FLAGS.num_replicas_to_aggregate)

        assert num_workers > 0 and num_parameter_servers > 0, (
            ' num_workers and '
            'num_parameter_servers'
            ' must be > 0.')

        # Exactly one worker may be the chief; task 0 is picked here.
        is_chief = FLAGS.task_id == 0

        # Ops default to this worker; the device setter decides where the
        # variables go.
        device_fn = tf.train.replica_device_setter(
            worker_device='/job:worker/task:%d' % FLAGS.task_id,
            cluster=cluster_spec)
        with tf.device(device_fn):
            # Counts the updates applied to the variables.
            global_step = tf.Variable(0, name='global_step', trainable=False)

            # Staircase exponential decay of the learning rate.
            batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 FLAGS.batch_size)
            decay_steps = int(batches_per_epoch * NUM_EPOCHS_PER_DECAY)
            learning_rate = tf.train.exponential_decay(
                INITIAL_LEARNING_RATE,
                global_step,
                decay_steps,
                LEARNING_RATE_DECAY_FACTOR,
                staircase=True)
            tf.scalar_summary('learning_rate', learning_rate)
            base_optimizer = tf.train.GradientDescentOptimizer(learning_rate)

            # Model: inputs -> logits -> total loss.
            images, labels = image_two_stream.distorted_inputs()
            logits = image_two_stream.inference_final(images)
            total_loss = image_two_stream.loss(logits, labels)

            if is_chief:
                # Moving averages of every individual loss and the total.
                loss_averager = tf.train.ExponentialMovingAverage(
                    0.9, name='avg')
                losses = tf.get_collection('losses')
                loss_averages_op = loss_averager.apply(losses + [total_loss])
                for loss_tensor in losses + [total_loss]:
                    # The raw value gets the ' (raw)' suffix; the smoothed
                    # value keeps the original loss name.
                    tf.scalar_summary(loss_tensor.op.name + ' (raw)',
                                      loss_tensor)
                    tf.scalar_summary(loss_tensor.op.name,
                                      loss_averager.average(loss_tensor))
                # Evaluating the loss also refreshes the averages.
                with tf.control_dependencies([loss_averages_op]):
                    total_loss = tf.identity(total_loss)

            variable_averages = tf.train.ExponentialMovingAverage(
                MOVING_AVERAGE_DECAY, global_step)
            averaged_variables = (tf.trainable_variables() +
                                  tf.moving_average_variables())
            for averaged_var in averaged_variables:
                tf.histogram_summary(averaged_var.op.name, averaged_var)

            # Synchronous wrapper around the plain optimizer.
            sync_optimizer = tf.train.SyncReplicasOptimizer(
                base_optimizer,
                replicas_to_aggregate=num_replicas_to_aggregate,
                replica_id=FLAGS.task_id,
                total_num_replicas=num_workers,
                variable_averages=variable_averages,
                variables_to_average=averaged_variables)

            # Gradients of the loss, with one histogram per gradient.
            grads_and_vars = sync_optimizer.compute_gradients(total_loss)
            for gradient, variable in grads_and_vars:
                if gradient is not None:
                    tf.histogram_summary(variable.op.name + '/gradients',
                                         gradient)

            apply_gradients_op = sync_optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(total_loss, name='train_op')

            # Chief-only ops that the sync optimizer uses to coordinate the
            # replicas.
            chief_queue_runners = [sync_optimizer.get_chief_queue_runner()]
            init_tokens_op = sync_optimizer.get_init_tokens_op()
            clean_up_op = sync_optimizer.get_clean_up_op()

            saver = tf.train.Saver()
            summary_op = tf.merge_all_summaries()
            init_op = tf.initialize_all_variables()

            # summary_op=None keeps the supervisor from starting its own
            # summary thread; summaries are run from the training loop below
            # so they never overlap with a training step.
            supervisor = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=FLAGS.train_dir,
                init_op=init_op,
                summary_op=None,
                global_step=global_step,
                saver=saver,
                save_model_secs=FLAGS.save_interval_secs)
            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)
            sess = supervisor.prepare_or_wait_for_session(
                server.target, config=sess_config)

            # Input queues first, then the chief's synchronization queues.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            supervisor.start_queue_runners(sess, queue_runners)
            tf.logging.info('Started %d queues for processing input data.',
                            len(queue_runners))
            if is_chief:
                supervisor.start_queue_runners(sess, chief_queue_runners)
                sess.run(init_tokens_op)

            # Training loop; the chief interleaves a summary run whenever the
            # summary deadline has passed.
            next_summary_time = time.time() + FLAGS.save_summaries_secs
            while not supervisor.should_stop():
                try:
                    tick = time.time()
                    loss_value, step = sess.run([train_op, global_step])
                    assert not np.isnan(loss_value), (
                        'Model diverged with loss = NaN')
                    if step > FLAGS.max_steps:
                        break
                    duration = time.time() - tick

                    if step % 30 == 0:
                        examples_per_sec = FLAGS.batch_size / float(duration)
                        format_str = ('Worker %d: %s: step %d, loss = %.2f'
                                      '(%.1f examples/sec; %.3f sec/batch)')
                        tf.logging.info(
                            format_str % (FLAGS.task_id, datetime.now(), step,
                                          loss_value, examples_per_sec,
                                          duration))

                    if is_chief and next_summary_time < time.time():
                        tf.logging.info(
                            'Running Summary operation on the chief.')
                        summary_str = sess.run(summary_op)
                        supervisor.summary_computed(sess, summary_str)
                        tf.logging.info(
                            'Finished running Summary operation.')
                        next_summary_time += FLAGS.save_summaries_secs
                except:
                    # Deliberately catches everything: the chief runs the
                    # clean-up op, then the error is re-raised unchanged.
                    if is_chief:
                        tf.logging.info('About to execute sync_clean_up_op!')
                        sess.run(clean_up_op)
                    raise

            # Also waits for the supervisor's service threads.
            supervisor.stop()

            # Final checkpoint once training is over.
            if is_chief:
                saver.save(sess,
                           os.path.join(FLAGS.train_dir, 'model.ckpt'),
                           global_step=global_step)
    print("end")
def train(dataset):
    """Train on ``dataset`` for ``FLAGS.max_steps`` steps using GPU towers.

    One model tower is built per GPU with variables shared between towers.
    The per-tower gradients are averaged and applied with RMSProp, and an
    exponential moving average of the variables is maintained.  Progress is
    printed every 10 steps, summaries are written every 100 steps and a
    checkpoint is saved every 5000 steps and at the last step.

    Args:
        dataset: object providing ``num_examples_per_epoch()`` and
            ``num_classes()``; it is also handed to
            ``image_processing.distorted_inputs``.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Step counter, advanced by apply_gradients below.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0),
            trainable=False)

        # Learning-rate schedule: staircase exponential decay.
        batches_per_epoch = (dataset.num_examples_per_epoch() /
                             FLAGS.batch_size)
        decay_steps = int(batches_per_epoch * FLAGS.num_epochs_per_decay)
        learning_rate = tf.train.exponential_decay(
            FLAGS.initial_learning_rate,
            global_step,
            decay_steps,
            FLAGS.learning_rate_decay_factor,
            staircase=True)

        optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                              RMSPROP_DECAY,
                                              momentum=RMSPROP_MOMENTUM,
                                              epsilon=RMSPROP_EPSILON)

        # The global batch is cut into equal per-GPU pieces.
        assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
            'Batch size must be divisible by number of GPUs')
        # Per-tower batch size (not referenced again below).
        split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

        # More towers means more preprocessing threads.
        num_preprocess_threads = (FLAGS.num_preprocess_threads *
                                  FLAGS.num_gpus)
        images, labels = image_processing.distorted_inputs(
            dataset, num_preprocess_threads=num_preprocess_threads)

        # Snapshot the summaries created by the input pipeline.
        input_summaries = copy.copy(
            tf.get_collection(tf.GraphKeys.SUMMARIES))

        # One extra class: label 0 is an (unused) background class.
        num_classes = dataset.num_classes() + 1

        images_splits = tf.split(axis=0,
                                 num_or_size_splits=FLAGS.num_gpus,
                                 value=images)
        labels_splits = tf.split(axis=0,
                                 num_or_size_splits=FLAGS.num_gpus,
                                 value=labels)

        # Build every tower and collect its gradients.
        tower_grads = []
        reuse_variables = None
        for gpu_index in range(FLAGS.num_gpus):
            tower_name = '%s_%d' % (inception.TOWER_NAME, gpu_index)
            with tf.device('/gpu:%d' % gpu_index):
                with tf.name_scope(tower_name) as scope:
                    # Variables live on the CPU and are shared by all towers.
                    with slim.arg_scope([slim.variables.variable],
                                        device='/cpu:0'):
                        loss = _tower_loss(images_splits[gpu_index],
                                           labels_splits[gpu_index],
                                           num_classes,
                                           scope,
                                           reuse_variables)

                    # Every tower after the first reuses the variables.
                    reuse_variables = True

                    # Rebound on each pass, so after the loop these hold the
                    # summaries and batch-norm updates of the last tower only.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                  scope)
                    batchnorm_updates = tf.get_collection(
                        slim.ops.UPDATE_OPS_COLLECTION, scope)

                    tower_grads.append(optimizer.compute_gradients(loss))

        # Synchronization point: mean gradient across towers.
        grads = _average_gradients(tower_grads)

        # Summaries: input pipeline, learning rate, gradients, variables.
        summaries.extend(input_summaries)
        summaries.append(tf.summary.scalar('learning_rate', learning_rate))
        for gradient, variable in grads:
            if gradient is not None:
                summaries.append(
                    tf.summary.histogram(variable.op.name + '/gradients',
                                         gradient))

        apply_gradient_op = optimizer.apply_gradients(
            grads, global_step=global_step)

        for variable in tf.trainable_variables():
            summaries.append(tf.summary.histogram(variable.op.name, variable))

        # Moving averages cover the trainable variables and the existing
        # moving-average variables (hence a "double average" of those).
        variable_averages = tf.train.ExponentialMovingAverage(
            inception.MOVING_AVERAGE_DECAY, global_step)
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # One op that performs every per-step update.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op,
                            variables_averages_op,
                            batchnorm_updates_op)

        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge(summaries)
        init = tf.global_variables_initializer()

        # Soft placement lets ops without a GPU kernel fall back to the CPU.
        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_config)
        sess.run(init)

        # Optionally warm-start from a pre-trained checkpoint.
        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=sess.graph)

        for step in range(FLAGS.max_steps):
            tick = time.time()
            # `loss` is the loss tensor of the last tower built above.
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - tick

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, duration))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Periodic checkpoint, plus one at the very last step.
            is_last_step = (step + 1) == FLAGS.max_steps
            if step % 5000 == 0 or is_last_step:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def main(_):
    """Train an ImageNet classifier on ``FLAGS.gpu_nums`` GPU towers.

    One model replica is built per GPU with shared variables.  The tower
    gradients are averaged and applied with RMSProp together with the
    moving-average and batch-norm update ops.  The loop runs
    ``FLAGS.num_steps`` steps, writes summaries every 100 steps and saves a
    checkpoint to ``FLAGS.SNAPSHOT_DIR`` every 50000 steps (step 0 excluded).

    Args:
        _: Unused positional argument.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        dataset = ImagenetData(subset=FLAGS.subset)
        assert dataset.data_files()

        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0),
            trainable=False)

        # Learning-rate schedule: staircase exponential decay.
        num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)
        learning_rate = tf.train.exponential_decay(
            FLAGS.learning_rate,
            global_step,
            decay_steps,
            FLAGS.learning_rate_decay_factor,
            staircase=True)
        tf.summary.scalar('lr', learning_rate)

        # Fed at run time; the training loop below always feeds True.
        is_training = tf.placeholder(tf.bool)

        opt = tf.train.RMSPropOptimizer(learning_rate,
                                        RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        # NOTE(review): the extent of this name scope was reconstructed from
        # a whitespace-mangled source; confirm it ends after the splits.
        with tf.name_scope("create_inputs"):
            # The global batch is cut into equal per-GPU pieces.
            assert FLAGS.batch_size % FLAGS.gpu_nums == 0, (
                'Batch size must be divisible by number of GPUs')

            # More towers means more preprocessing threads.
            num_preprocess_threads = (FLAGS.num_preprocess_threads *
                                      FLAGS.gpu_nums)
            images, labels = image_processing.distorted_inputs(
                dataset, num_preprocess_threads=num_preprocess_threads)

            images_splits = tf.split(axis=0,
                                     num_or_size_splits=FLAGS.gpu_nums,
                                     value=images)
            # Labels are one-hot encoded before being split.
            labels_splits = tf.split(
                axis=0,
                num_or_size_splits=FLAGS.gpu_nums,
                value=tf.one_hot(indices=labels, depth=FLAGS.num_classes))

        multi_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            # `range` (not the Python-2-only `xrange`) matches the training
            # loop further down and works on both Python 2 and 3.
            for i in range(FLAGS.gpu_nums):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ('ImageNet', i)):
                        graph = Model_Graph(num_class=FLAGS.num_classes,
                                            is_training=is_training)
                        model = graph._build_defaut_graph(
                            images=images_splits[i])

                        # Top-1 accuracy
                        top1acc = tf.reduce_mean(tf.cast(
                            tf.nn.in_top_k(
                                model.logits,
                                tf.argmax(labels_splits[i], axis=1),
                                1),
                            tf.float32))
                        # Top-n accuracy
                        topnacc = tf.reduce_mean(tf.cast(
                            tf.nn.in_top_k(
                                model.logits,
                                tf.argmax(labels_splits[i], axis=1),
                                FLAGS.top_k),
                            tf.float32))
                        tf.summary.scalar('top1acc_{}'.format(i), top1acc)
                        tf.summary.scalar('topkacc_{}'.format(i), topnacc)

                        all_trainable = list(tf.trainable_variables())

                        # Cross-entropy plus L2 weight decay on every
                        # trainable variable whose name contains 'weights'.
                        loss = tf.nn.softmax_cross_entropy_with_logits(
                            logits=model.logits, labels=labels_splits[i])
                        l2_losses = [
                            FLAGS.weight_decay * tf.nn.l2_loss(v)
                            for v in tf.trainable_variables()
                            if 'weights' in v.name
                        ]
                        reduced_loss = (tf.reduce_mean(loss) +
                                        tf.add_n(l2_losses))
                        tf.summary.scalar('loss_{}'.format(i), reduced_loss)

                        # Towers built after this one share the variables.
                        tf.get_variable_scope().reuse_variables()

                        # Not filtered by scope, so this holds the update
                        # ops registered so far by every tower.
                        batchnorm_updates = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS)

                        grads = opt.compute_gradients(reduced_loss,
                                                      all_trainable)
                        multi_grads.append(grads)

        grads = average_gradients(multi_grads)

        # Moving averages cover the trainable variables and the existing
        # moving-average variables (a "double average" of the latter).
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.MOVING_AVERAGE_DECAY, global_step)
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # One op that performs every per-step update.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(opt.apply_gradients(grads, global_step),
                            variables_averages_op,
                            batchnorm_updates_op)

        summary_op = tf.summary.merge_all()

        # Set up the session and initialize variables.
        config = tf.ConfigProto()
        config.allow_soft_placement = True
        sess = tf.Session(config=config)
        init = tf.global_variables_initializer()
        sess.run(init)

        # Saver for storing checkpoints of the model.
        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=2)

        # Restored on resume: trainable variables, batch-norm statistics and
        # the step counter.
        restore_var = list(tf.trainable_variables()) + [
            v for v in tf.global_variables()
            if 'moving_mean' in v.name
            or 'moving_variance' in v.name
            or 'global_step' in v.name
        ]

        ckpt = tf.train.get_checkpoint_state(FLAGS.SNAPSHOT_DIR)
        if ckpt and ckpt.model_checkpoint_path:
            loader = tf.train.Saver(var_list=restore_var)
            load(loader, sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found.')

        summary_writer = tf.summary.FileWriter(FLAGS.SNAPSHOT_DIR,
                                               graph=sess.graph)

        # Iterate over training steps.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        try:
            for step in range(FLAGS.num_steps):
                start_time = time.time()
                feed_dict = {is_training: True}

                # `reduced_loss` is the loss tensor of the last tower built.
                if step % 50000 == 0 and step != 0:
                    loss_value, _ = sess.run([reduced_loss, train_op],
                                             feed_dict=feed_dict)
                    save(saver, sess, FLAGS.SNAPSHOT_DIR, step)
                elif step % 100 == 0:
                    summary_str, loss_value, _ = sess.run(
                        [summary_op, reduced_loss, train_op],
                        feed_dict=feed_dict)
                    # Taken right after the step so the reported time does
                    # not depend on how long the summary write takes.
                    duration = time.time() - start_time
                    summary_writer.add_summary(summary_str, step)
                    summary_writer.flush()
                    print('step {:d} \t loss = {:.3f}, ({:.3f} sec/step)'
                          .format(step, loss_value, duration))
                else:
                    loss_value, _ = sess.run([reduced_loss, train_op],
                                             feed_dict=feed_dict)
        finally:
            # Stop the input threads even when a training step raises, so
            # the process does not hang on live queue runners.
            coord.request_stop()
            coord.join(threads)
def main(argv=None):
    """Build a multi-GPU training graph and run the training loop.

    One loss tower is created per GPU (``N_GPU`` towers); their gradients are
    averaged and applied with plain gradient descent, and a moving average of
    the variables is maintained.  The loop runs ``TRAINING_STEPS`` steps,
    printing progress and writing summaries every 10 steps and saving a
    checkpoint every 1000 steps and at the final step.

    Args:
        argv: Not referenced in the body.
    """
    # Keep the simple operations on the CPU; only the neural-network training
    # itself is placed on the GPUs.
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Define the basic training pipeline.
        x, y_ = get_input()
        regularizer = tf.contrib.layers.l2_regularizer(REGULARAZTION_RATE)
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
        # NOTE(review): 60000 is presumably the number of training examples
        # (decay once per epoch) -- confirm against the input pipeline.
        learning_rate = tf.train.exponential_decay(
            LEARNING_RATE_BASE, global_step, 60000 / BATCH_SIZE, LEARNING_RATE_DECAY)
        opt = tf.train.GradientDescentOptimizer(learning_rate)
        tower_grads = []
        reuse_variables = False
        # Run the network optimization on the different GPUs.
        for i in range(N_GPU):
            # Pin this tower's optimization to a single GPU.
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('GPU_%d' % i) as scope:
                    cur_loss = get_loss(x, y_, regularizer, scope, reuse_variables)
                    # Every tower after the first is built with reuse enabled.
                    reuse_variables = True
                    grads = opt.compute_gradients(cur_loss)
                    tower_grads.append(grads)
        # Compute the average gradient of every variable.
        grads = average_gradients(tower_grads)
        for grad, var in grads:
            if grad is not None:
                tf.histogram_summary('gradients_on_average/%s' % var.op.name, grad)
        # Update the parameters with the averaged gradients.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        for var in tf.trainable_variables():
            tf.histogram_summary(var.op.name, var)
        # Compute the moving averages of the variables.
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
        variables_to_average = (tf.trainable_variables() +tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)
        # Every iteration updates both the variables and their moving averages.
        train_op = tf.group(apply_gradient_op, variables_averages_op)
        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()
        init = tf.initialize_all_variables()
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=True)) as sess:
            # Initialize all variables and start the input queues.
            init.run()
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.train.SummaryWriter(MODEL_SAVE_PATH, sess.graph)
            for step in range(TRAINING_STEPS):
                # Run one training step and record how long it takes.
                # `cur_loss` is the loss tensor of the last tower built above.
                start_time = time.time()
                _, loss_value = sess.run([train_op, cur_loss])
                duration = time.time() - start_time
                # Periodically report training progress and training speed.
                if step != 0 and step % 10 == 0:
                    # Number of training examples consumed by one step.
                    num_examples_per_step = BATCH_SIZE * N_GPU
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / N_GPU
                    # Print the training information.
                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                    print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))
                    # Write summaries so the run can be viewed in TensorBoard.
                    summary = sess.run(summary_op)
                    summary_writer.add_summary(summary, step)
                # Periodically save the current model.
                if step % 1000 == 0 or (step + 1) == TRAINING_STEPS:
                    checkpoint_path = os.path.join(MODEL_SAVE_PATH, MODEL_NAME)
                    saver.save(sess, checkpoint_path, global_step=step)
            coord.request_stop()
            coord.join(threads)


if __name__ == '__main__':
    tf.app.run()
def train():
    """Build the detector training graph and run the training loop.

    The GPUs named in ``cfg.TRAIN.GPU_LIST`` each get one model tower (a
    single tower when only one GPU is listed).  Tower gradients are averaged
    and applied with momentum SGD, together with the moving-average and
    batch-norm update ops.

    Weights are restored from ``cfg.BACKBONE.CHECKPOINT_PATH`` when the log
    directory did not exist before this call; otherwise a restore from the
    latest checkpoint in the log directory is attempted on a best-effort
    basis (a failure is reported and training continues).

    Summaries are written every ``cfg.TRAIN.SAVE_SUMMARY_STEPS`` steps and a
    checkpoint every 1000 steps.  Several ``all_*.txt`` files listing graph
    variables and ops are written to the current directory as debugging aids.
    """
    import multiprocessing as mp
    mp.set_start_method('spawn', force=True)

    def _dump_names(path, names):
        """Write one name per line to *path*."""
        with open(path, 'w') as fp:
            fp.writelines(name + '\n' for name in names)

    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.TRAIN.GPU_LIST
    gpus = list(range(len(cfg.TRAIN.GPU_LIST.split(','))))
    num_gpus = len(gpus)

    # A log directory that did not exist yet means a fresh run, which starts
    # from the original backbone checkpoint.
    checkpoint_path = cfg.TRAIN.LOG_DIR + COMMON_POSTFIX
    restore_from_original_checkpoint = not tf.io.gfile.exists(checkpoint_path)
    if restore_from_original_checkpoint:
        tf.io.gfile.makedirs(checkpoint_path)

    register_coco(os.path.expanduser(cfg.DATA.BASEDIR))

    data_iter = get_train_dataflow(
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * num_gpus)
    input_keys = ['images', 'gt_boxes', 'gt_labels', 'orig_gt_counts']

    def _sample_generator():
        """Yield each dataflow item as a tuple ordered like *input_keys*."""
        for sample in data_iter:
            yield tuple([sample[key] for key in input_keys])

    dataset = tf.data.Dataset.from_generator(
        _sample_generator,
        (tf.float32, tf.float32, tf.int64, tf.int32),
        (tf.TensorShape([None, None, None, 3]),
         tf.TensorShape([None, None, 4]),
         tf.TensorShape([None, None]),
         tf.TensorShape([None, ])))
    dataset = dataset.prefetch(buffer_size=128)
    iterator = dataset.make_one_shot_iterator()
    images, gt_boxes, gt_labels, orig_gt_counts = iterator.get_next()

    if cfg.BACKBONE.DATA_FORMAT == 'channels_first':
        images = tf.transpose(images, [0, 3, 1, 2])  # NHWC --> NCHW

    # build optimizers
    global_step = tf.train.get_or_create_global_step()
    learning_rate = warmup_lr_schedule(
        init_learning_rate=cfg.TRAIN.BASE_LR,
        global_step=global_step,
        warmup_step=cfg.TRAIN.WARMUP_STEP)
    opt = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)

    sess_config = tf.ConfigProto()
    sess_config.allow_soft_placement = True
    sess_config.log_device_placement = False
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)

    if num_gpus > 1:
        # Each input tensor is split along axis 0 into one piece per GPU.
        base_inputs_list = [
            tf.split(value, num_or_size_splits=num_gpus, axis=0)
            for value in [images, gt_boxes, gt_labels, orig_gt_counts]
        ]
        tower_grads = []
        total_loss_dict = {
            'cls_loss': tf.constant(0.),
            'reg_loss': tf.constant(0.),
            'centerness_loss': tf.constant(0.)
        }
        for i, gpu_id in enumerate(gpus):
            with tf.device('/gpu:%d' % gpu_id):
                with tf.name_scope('model_%d' % gpu_id) as scope:
                    net_inputs = [splits[i] for splits in base_inputs_list]
                    tower_loss_dict = tower_loss_func(net_inputs,
                                                      reuse=(gpu_id > 0))
                    # Rebound on each pass: after the loop this holds the
                    # update ops collected under the last tower's scope.
                    batch_norm_updates = tf.get_collection(
                        tf.GraphKeys.UPDATE_OPS, scope)

                    tower_loss = tf.add_n(list(tower_loss_dict.values()))
                    for k, v in tower_loss_dict.items():
                        total_loss_dict[k] += v

                    # Weight decay is added once, on the last tower only.
                    if i == num_gpus - 1:
                        wd_loss = regularize_cost(
                            '.*/kernel',
                            l2_regularizer(cfg.TRAIN.WEIGHT_DECAY),
                            name='wd_cost')
                        tower_loss = tower_loss + wd_loss

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                  scope)
                    if cfg.FCOS.VISUALIZATION:
                        with tf.device('/cpu:0'):
                            with tf.name_scope('loss-summaries'):
                                for k, v in tower_loss_dict.items():
                                    summaries.append(
                                        tf.summary.scalar(k, v))

                    grads = opt.compute_gradients(tower_loss)
                    tower_grads.append(grads)

        grads = average_gradients(tower_grads)
        # Report the per-loss mean over towers.
        for k, v in total_loss_dict.items():
            total_loss_dict[k] = v / tf.cast(num_gpus, tf.float32)
        average_total_loss = tf.add_n(
            list(total_loss_dict.values()) + [wd_loss])
    else:
        net_inputs = [images, gt_boxes, gt_labels, orig_gt_counts]
        tower_loss_dict = tower_loss_func(net_inputs)
        batch_norm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        wd_loss = regularize_cost('.*/kernel',
                                  l2_regularizer(cfg.TRAIN.WEIGHT_DECAY),
                                  name='wd_cost')
        average_total_loss = tf.add_n(
            list(tower_loss_dict.values()) + [wd_loss])
        grads = opt.compute_gradients(average_total_loss)
        total_loss_dict = tower_loss_dict

        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        if cfg.FCOS.VISUALIZATION:
            with tf.device('/cpu:0'):
                with tf.name_scope('loss-summaries'):
                    for k, v in tower_loss_dict.items():
                        summaries.append(tf.summary.scalar(k, v))

    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
    summaries.append(tf.summary.scalar('learning_rate', learning_rate))

    # add histograms for gradients
    for grad, var in grads:
        if grad is not None:
            summaries.append(
                tf.summary.histogram(var.op.name + '/gradients', grad))

    # add histograms for trainable variables
    for var in tf.trainable_variables():
        summaries.append(tf.summary.histogram(var.op.name, var))

    variable_averages = tf.train.ExponentialMovingAverage(
        cfg.TRAIN.MOVING_AVERAGE_DECAY, num_updates=global_step)
    variable_averages_op = variable_averages.apply(tf.trainable_variables())

    # Debugging aids: record the variable names known to the graph.
    _dump_names('all_global_vars.txt',
                [var.name for var in tf.global_variables()])
    _dump_names('all_trainable_vars.txt',
                [var.name for var in tf.trainable_variables()])
    _dump_names('all_moving_average_variables.txt',
                [var.name for var in tf.moving_average_variables()])

    # batch norm updates
    batch_norm_updates_op = tf.group(*batch_norm_updates)
    with tf.control_dependencies(
            [apply_gradient_op, variable_averages_op, batch_norm_updates_op]):
        train_op = tf.no_op(name='train_op')

    saver = tf.train.Saver(tf.global_variables())
    summary_op = tf.summary.merge(summaries)
    summary_writer = tf.summary.FileWriter(checkpoint_path,
                                           tf.get_default_graph())

    init_op = tf.group(
        [tf.global_variables_initializer(),
         tf.local_variables_initializer()])
    sess.run(init_op)

    _dump_names('all_vars.txt', [var.name for var in tf.global_variables()])
    if restore_from_original_checkpoint:
        # Restore from the original backbone checkpoint.  Variables whose
        # name contains any of these markers are not restored; the
        # 'resnet50/' prefix is stripped to form the checkpoint key.
        skip_markers = ('rpn', 'rcnn', 'fpn', 'fcos', 'global_step',
                        'Momentum', 'ExponentialMovingAverage')
        restore_var_dict = {}
        for var in tf.global_variables():
            if any(marker in var.name for marker in skip_markers):
                continue
            key = var.name.replace('resnet50/', '').replace(':0', '')
            restore_var_dict[key] = var
            print(var.name, var.shape)
        _dump_names('all_restore_vars.txt', restore_var_dict.keys())
        restorer = tf.train.Saver(var_list=restore_var_dict)
        restorer.restore(sess, cfg.BACKBONE.CHECKPOINT_PATH)
    else:
        # Restore from the local checkpoint.  This stays best-effort (a
        # failure must not abort training) but is reported, not hidden.
        restorer = tf.train.Saver(tf.global_variables())
        latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
        if latest_checkpoint is None:
            print('no checkpoint found in {}; keeping initial values'
                  .format(checkpoint_path))
        else:
            try:
                restorer.restore(sess, latest_checkpoint)
            except Exception as err:
                print('could not restore {}: {}; keeping initial values'
                      .format(latest_checkpoint, err))

    # record all ops
    _dump_names('all_ops.txt',
                [op.name for op in sess.graph.get_operations()])

    loss_names = ['cls_loss', 'reg_loss', 'centerness_loss']
    sess2run = [train_op, learning_rate, average_total_loss, wd_loss]
    sess2run.extend([total_loss_dict[k] for k in loss_names])

    print('begin training ...')
    step = sess.run(global_step)
    step0 = step
    start = time.time()
    for step in range(step, cfg.TRAIN.MAX_STEPS):
        if step % cfg.TRAIN.SAVE_SUMMARY_STEPS == 0:
            _, lr_, tl_, wd_loss_, \
                cls_loss_, reg_loss_, centerness_loss_, \
                summary_str = sess.run(sess2run + [summary_op])

            # Measure once so both figures describe the same interval.
            elapsed = time.time() - start
            avg_time_per_step = elapsed / cfg.TRAIN.SAVE_SUMMARY_STEPS
            avg_examples_per_second = (
                cfg.TRAIN.SAVE_SUMMARY_STEPS *
                cfg.TRAIN.BATCH_SIZE_PER_GPU * num_gpus) / elapsed
            start = time.time()
            print('Step {:06d}, LR: {:.6f} LOSS: {:.4f}, '
                  'CLS: {:.4f}, BOX: {:.4f}, CET: {:.4f}, wd: {:.4f}, '
                  '{:.2f} s/step, {:.2f} samples/s'.format(
                      step, lr_, tl_, cls_loss_, reg_loss_, centerness_loss_,
                      wd_loss_, avg_time_per_step, avg_examples_per_second))
            summary_writer.add_summary(summary_str, global_step=step)
        else:
            sess.run(train_op)

        if step % 1000 == 0:
            saver.save(sess, checkpoint_path + '/model.ckpt',
                       global_step=step)

        # Profile the graph execution for a few steps of this run.  Note
        # that each profiled iteration runs one additional training step.
        if 1510 <= (step - step0) <= 1520:
            from tensorflow.python.client import timeline
            options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
            sess.run(train_op, options=options, run_metadata=run_metadata)
            fetched_timeline = timeline.Timeline(run_metadata.step_stats)
            chrome_trace = fetched_timeline.generate_chrome_trace_format()
            with open('{}/timeline_step{}.json'.format(checkpoint_path, step),
                      'w') as fp:
                fp.write(chrome_trace)
def train(dataset):
    """Build a multi-tower training graph for `dataset` and run the training loop.

    The graph is built inside a fresh `tf.Graph` with `/cpu:0` as the default
    device.  One model tower is built per index in `range(FLAGS.num_gpus)`,
    each under `/gpu:<i>`; the per-tower gradients are combined by
    `_average_gradients` and applied with an RMSProp optimizer.  The loop then
    runs `FLAGS.max_steps` steps, printing throughput every 10 steps, writing
    summaries every 100 steps and saving a checkpoint every 5000 steps and on
    the final step.

    Args:
      dataset: object exposing `num_examples_per_epoch()` and `num_classes()`;
        it is also handed to `image_processing.distorted_inputs`.

    Raises:
      AssertionError: if `FLAGS.batch_size` is not divisible by
        `FLAGS.num_gpus`, if `FLAGS.pretrained_model_checkpoint_path` is set
        but does not exist, or if the fetched loss becomes NaN.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Step counter: it drives the learning-rate schedule and the moving
        # averages, and is passed to `apply_gradients` as `global_step`.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create the RMSProp optimizer shared by all towers.
        opt = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        # Get images and labels for ImageNet and split the batch across GPUs.
        assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
            'Batch size must be divisible by number of GPUs')
        # NOTE(review): `split_batch_size` is computed but never read in this
        # function.
        split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

        # Scale the number of preprocessing threads by the number of GPU
        # towers.
        num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
        images, labels = image_processing.distorted_inputs(
            dataset,
            num_preprocess_threads=num_preprocess_threads)

        # Snapshot the summaries present after building the input pipeline;
        # they are appended to the final tower's summaries further down.
        input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Number of classes in the Dataset label set plus 1.
        # Label 0 is reserved for an (unused) background class.
        num_classes = dataset.num_classes() + 1

        # Split the batch of images and labels into FLAGS.num_gpus pieces, one
        # per tower (legacy `tf.split(dim, num_split, value)` argument order).
        images_splits = tf.split(0, FLAGS.num_gpus, images)
        labels_splits = tf.split(0, FLAGS.num_gpus, labels)

        # Calculate the gradients for each model tower.
        tower_grads = []
        reuse_variables = None
        for i in xrange(FLAGS.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (inception.TOWER_NAME, i)) as scope:
                    # Force all Variables to reside on the CPU.
                    with slim.arg_scope([slim.variables.variable], device='/cpu:0'):
                        # Calculate the loss for one tower of the ImageNet model. This
                        # function constructs the entire ImageNet model but shares the
                        # variables across all towers.
                        loss = _tower_loss(images_splits[i], labels_splits[i], num_classes,
                                           scope, reuse_variables)

                    # Reuse variables for the next tower.
                    reuse_variables = True

                    # Retain the summaries from the final tower (the name is
                    # rebound on every iteration, so only the last survives).
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

                    # Retain the Batch Normalization updates operations only from the
                    # final tower. Ideally, we should grab the updates from all towers
                    # but these stats accumulate extremely fast so we can ignore the
                    # other stats from the other towers without significant detriment.
                    batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                                                          scope)

                    # Calculate the gradients for the batch of data on this ImageNet
                    # tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = _average_gradients(tower_grads)

        # Add the summaries captured from the input processing.
        summaries.extend(input_summaries)

        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.histogram_summary(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than need be but we employ
        # this for backward-compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            inception.MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Optionally overwrite the freshly initialised values with a
        # pre-trained checkpoint (only the VARIABLES_TO_RESTORE collection).
        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(
            FLAGS.train_dir,
            graph_def=sess.graph.as_graph_def(add_shapes=True))

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            # `loss` is the tensor left bound by the last iteration of the
            # tower loop, so the value reported here is the final tower's loss.
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, duration))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def inception_model_fn(features, labels, mode, params):
    """Inception v4 model function for the Estimator API.

    Args:
      features: input tensor; it is first passed through `tensor_transform_fn`
        together with `params['model_transpose_dims']`.
      labels: label tensor, one-hot encoded here with `FLAGS.num_classes`
        classes.  Not used in PREDICT mode.
      mode: one of the `tf.estimator.ModeKeys` values.
      params: dict; only `params['model_transpose_dims']` is read here.

    Returns:
      A `tf.estimator.EstimatorSpec` carrying the predictions in PREDICT mode,
      otherwise a `tpu_estimator.TPUEstimatorSpec` with the total loss, the
      train op (TRAIN mode only) and the eval metrics (EVAL mode only).

    Raises:
      ValueError: in TRAIN mode when `FLAGS.optimizer` is not one of
        'sgd', 'momentum' or 'RMS'.
    """
    num_classes = FLAGS.num_classes
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    features = tensor_transform_fn(features, params['model_transpose_dims'])

    # Both flag settings build the same network; the only difference is
    # whether `updates_collections=None` is passed to the arg scope.
    scope_kwargs = {
        'batch_norm_decay': BATCH_NORM_DECAY,
        'batch_norm_epsilon': BATCH_NORM_EPSILON,
    }
    if FLAGS.clear_update_collections:
        scope_kwargs['updates_collections'] = None
    with arg_scope(inception.inception_v4_arg_scope(**scope_kwargs)):
        logits, end_points = inception.inception_v4(
            features, num_classes, is_training=is_training)

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Optional debugging aid for off-TPU evaluation: print predictions and
    # labels whenever the labels tensor is evaluated.
    if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
            not FLAGS.use_tpu):
        with tf.control_dependencies([
                tf.Print(predictions['classes'], [predictions['classes']],
                         summarize=FLAGS.eval_batch_size,
                         message='prediction: ')
        ]):
            labels = tf.Print(labels, [labels],
                              summarize=FLAGS.eval_batch_size,
                              message='label: ')

    one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)

    # The cross-entropy terms register themselves in the losses collection;
    # `get_total_loss` below sums them with the regularization losses.
    if 'AuxLogits' in end_points:
        tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                        logits=end_points['AuxLogits'],
                                        weights=0.4,
                                        label_smoothing=0.1,
                                        scope='aux_loss')
    tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                    logits=logits,
                                    weights=1.0,
                                    label_smoothing=0.1)
    loss = tf.losses.get_total_loss(add_regularization_losses=True)

    initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
    # Adjust the initial learning rate for warmup
    initial_learning_rate /= (
        FLAGS.learning_rate_decay**((FLAGS.warmup_epochs + FLAGS.cold_epochs) /
                                    FLAGS.learning_rate_decay_epochs))
    final_learning_rate = 0.0001 * initial_learning_rate

    train_op = None
    if is_training:
        batches_per_epoch = _NUM_TRAIN_IMAGES // FLAGS.train_batch_size
        global_step = tf.train.get_or_create_global_step()
        cur_epoch = tf.cast(
            (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32)

        # Three-phase schedule selected by the current epoch:
        #   epoch <  cold_epochs                 -> constant cold rate `clr`
        #   epoch <  cold_epochs + warmup_epochs -> linear ramp `epoch * wlr`
        #   otherwise                            -> staircase exponential decay
        clr = FLAGS.cold_learning_rate
        wlr = initial_learning_rate / (FLAGS.warmup_epochs + FLAGS.cold_epochs)
        learning_rate = tf.where(
            tf.greater_equal(cur_epoch, FLAGS.cold_epochs),
            (tf.where(
                tf.greater_equal(cur_epoch,
                                 FLAGS.warmup_epochs + FLAGS.cold_epochs),
                tf.train.exponential_decay(
                    learning_rate=initial_learning_rate,
                    global_step=global_step,
                    decay_steps=(FLAGS.learning_rate_decay_epochs *
                                 batches_per_epoch),
                    decay_rate=FLAGS.learning_rate_decay,
                    staircase=True),
                tf.multiply(tf.cast(cur_epoch, tf.float32), wlr))),
            clr)

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(learning_rate, final_learning_rate,
                                   name='learning_rate')

        if FLAGS.optimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif FLAGS.optimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                                   momentum=0.9)
        elif FLAGS.optimizer == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  RMSPROP_DECAY,
                                                  momentum=RMSPROP_MOMENTUM,
                                                  epsilon=RMSPROP_EPSILON)
        else:
            # The message needs a `%s` placeholder for its argument; without
            # one the logging call cannot format the record.  Raise explicitly
            # as well, so that a logging call that returns does not let
            # execution continue with `optimizer` unbound.
            tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)
            raise ValueError('Unknown optimizer: %s' % FLAGS.optimizer)

        if FLAGS.use_tpu:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

        # Run the ops registered in UPDATE_OPS before each optimizer step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)
        if FLAGS.moving_average:
            ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY,
                                                    num_updates=global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            # The returned train op becomes the EMA update, which depends on
            # the optimizer step having run first.
            with tf.control_dependencies([train_op]), tf.name_scope(
                    'moving_average'):
                train_op = ema.apply(variables_to_average)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, predictions):
            accuracy = tf.metrics.accuracy(
                labels, tf.argmax(input=predictions, axis=1))
            return {'accuracy': accuracy}

        if FLAGS.use_logits:
            eval_predictions = logits
        else:
            eval_predictions = end_points['Predictions']

        eval_metrics = (metric_fn, [labels, eval_predictions])

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metrics=eval_metrics)
def build_graph(self, filenames, labels, subset, feed_hypes=None):
    """Build the input pipeline, the Inception model and (for training) the train op.

    Args:
      filenames: image file names, fed to `tf.train.slice_input_producer`
        together with `labels`.
      labels: labels matching `filenames`.
      subset: name of the data subset.  The value "train" enables input
        shuffling, random flips, `for_training=True` and the construction of
        the optimizer / train op; any other value sets `restore_logits=True`
        and builds only the forward pass.
      feed_hypes: optional iterable of hyper-parameter names.  Each named entry
        of a copy of `self.hypes` is replaced by a scalar float32 placeholder
        of the same name.

    Returns:
      A dict of tensors/ops with keys "loss", "filenames" and "logits"; for
      the "train" subset it also holds "reg_loss", "train_op" and
      "global_step".  "loss" is the value captured before the regularization
      loss is added.
    """
    hypes = self.hypes.copy()
    if feed_hypes:
        # `name_scope(None)` puts the placeholders at the graph root so they
        # can be addressed by their bare hyper-parameter name.
        with tf.name_scope(None):
            for i in feed_hypes:
                hypes[i] = tf.placeholder("float32", name=i)
                hypes[i].set_shape([])

    with tf.name_scope("inputs"):
        filenames, labels = tf.train.slice_input_producer(
            tensor_list=[filenames, labels],
            capacity=hypes["batch_size"] * 2,
            shuffle=(subset == "train")
        )
        filenames, labels = tf.train.batch(
            tensor_list=[filenames, labels],
            capacity=hypes["batch_size"] * 2,
            batch_size=hypes["batch_size"]
        )
        # Decode each file of the batch separately.
        images0 = [
            tf.image.decode_jpeg(tf.read_file(i[0]), channels=3)
            for i in tf.split(0, hypes["batch_size"], filenames)
        ]
        images0 = [skin.util.square_pad(i) for i in images0]
        if subset == "train":
            images0 = [tf.image.random_flip_left_right(i) for i in images0]
            images0 = [tf.image.random_flip_up_down(i) for i in images0]
        if hypes["spatial_transformer"]:
            images = skin.util.spatial_tranform(
                images0, hypes["batch_size"], subset,
                hypes["loc_net"], hypes["xform_reg"]
            )
        else:
            images = tf.pack(
                [tf.image.resize_images(i, 299, 299) for i in images0])
        with tf.name_scope(None):
            images = tf.identity(images, name="input")

    logits, logits_aux = inception_model.inference(
        images=(images - 128) / 128.0,
        num_classes=len(self.labels),
        for_training=(subset == "train"),
        restore_logits=(subset != "train"),
    )
    with tf.name_scope(None):
        logits = tf.identity(logits, name="logits")
    tf.histogram_summary("logits", logits)

    with tf.name_scope("loss"):
        batch_size, num_classes = logits.get_shape().as_list()
        # Turn the integer labels into a dense one-hot matrix.
        labels_sparse = tf.sparse_to_dense(
            sparse_indices=tf.transpose(
                tf.pack([tf.range(batch_size), labels])),
            output_shape=[batch_size, num_classes],
            sparse_values=np.ones(batch_size, dtype="float32"),
        )
        loss = tf.nn.softmax_cross_entropy_with_logits(logits, labels_sparse)
        loss = tf.reduce_mean(loss, name="loss")
        loss_aux = tf.nn.softmax_cross_entropy_with_logits(
            logits_aux, labels_sparse)
        loss_aux = tf.reduce_mean(loss_aux, name="loss_aux")
        loss = 0.7 * loss + 0.3 * loss_aux
        tf.scalar_summary("loss", loss)

    # Captured here, so fetches["loss"] excludes the regularization term that
    # the training branch adds to the local `loss` afterwards.
    fetches = {"loss": loss, "filenames": filenames, "logits": logits}

    def print_graph_ops():
        """Dump the type and name of every op in the default graph to /tmp."""
        with open("/tmp/graph_ops.txt", "w") as f:
            for op in tf.get_default_graph().get_operations():
                f.write(op.type.ljust(35) + "\t" + op.name + "\n")

    if subset == "train":
        reg_losses = tf.get_collection("regularization_losses")
        # Scale the regularizers whose name mentions "loc_net".
        for i, j in enumerate(reg_losses):
            if "loc_net" in j.name:
                reg_losses[i] *= hypes["loc_net_reg"]
        reg_loss = tf.add_n(reg_losses)
        tf.scalar_summary("reg_loss", reg_loss)
        with tf.variable_scope("reg_loss"):
            loss += reg_loss
        print_graph_ops()

        global_step = tf.Variable(0, name="global_step", trainable=False)
        # Direct attribute access replaces the former
        # eval("tf.train.{}Optimizer".format("Adam")) lookup of the same class.
        opt = tf.train.AdamOptimizer(
            learning_rate=hypes["learning_rate"],
            epsilon=hypes["epsilon"],
            beta1=hypes["beta1"],
            beta2=hypes["beta2"],
        )
        grads = opt.compute_gradients(loss)
        apply_grads = opt.apply_gradients(grads, global_step)

        variable_averages = tf.train.ExponentialMovingAverage(
            hypes["variable_averages_decay"], global_step)
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        batchnorm_updates_op = tf.group(*tf.get_collection("_update_ops_"))
        train_op = tf.group(apply_grads, variables_averages_op,
                            batchnorm_updates_op)

        for grad, var in grads:
            tf.histogram_summary(var.op.name, var)
            try:
                tf.histogram_summary(var.op.name + "/gradients", grad)
            except Exception:
                # Best effort: report the variable whose gradient could not
                # be summarised, but let KeyboardInterrupt/SystemExit through.
                print(var.op.name)

        fetches.update({"reg_loss": reg_loss, "train_op": train_op,
                        "global_step": global_step})
    else:
        print_graph_ops()

    return fetches
def train(scope=''):
    """Build the MDM training graph and run the training loop.

    Training images and landmark shapes are loaded from the ':'-separated
    directories in `FLAGS.datasets`, randomly augmented in a Python callback,
    batched, and fed to `mdm_model.model`.  The summed per-step normalized
    RMSE is minimised with Adam.  The loop runs `FLAGS.max_steps` steps,
    printing throughput every 10 steps, writing summaries every 20 steps and
    saving a checkpoint every 50 steps and on the final step.

    Args:
      scope: scope string used to filter the SUMMARIES and batch-norm
        update-op collections.

    Raises:
      AssertionError: if `FLAGS.pretrained_model_checkpoint_path` is set but
        does not exist, or if the fetched loss becomes NaN.
    """
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        # Step counter: it drives the learning-rate schedule and the moving
        # averages, and is passed to `apply_gradients` as `global_step`.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        train_dirs = FLAGS.datasets.split(':')

        # Calculate the learning rate schedule.
        decay_steps = 15000

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create the Adam optimizer.
        opt = tf.train.AdamOptimizer(lr)

        # Number of threads filling the batching queue below; taken from the
        # flag unchanged.
        num_preprocess_threads = FLAGS.num_preprocess_threads

        _images, _shapes, _reference_shape, pca_model = \
            data_provider.load_images(train_dirs)

        reference_shape = tf.constant(_reference_shape,
                                      dtype=tf.float32,
                                      name='reference_shape')

        # Static shapes of the first sample; applied to the py_func outputs
        # below, whose shapes TensorFlow cannot infer.
        image_shape = _images[0].shape
        lms_shape = _shapes[0].points.shape

        def get_random_sample(rotation_stddev=10):
            """Return a randomly chosen, randomly augmented (pixels, landmarks) pair.

            The image is mirrored with probability 0.5 and, independently,
            rotated with probability 0.5 by an angle drawn from a normal
            distribution with scale `rotation_stddev`.
            """
            idx = np.random.randint(low=0, high=len(_images))
            # Move the last axis first before wrapping in a menpo Image; the
            # inverse transpose is applied to the pixels on the way out.
            im = menpo.image.Image(_images[idx].transpose(2, 0, 1), copy=False)
            lms = _shapes[idx]
            im.landmarks['PTS'] = lms

            if np.random.rand() < .5:
                im = utils.mirror_image(im)

            if np.random.rand() < .5:
                theta = np.random.normal(scale=rotation_stddev)
                rot = menpo.transform.rotate_ccw_about_centre(lms, theta)
                im = im.warp_to_shape(im.shape, rot)

            pixels = im.pixels.transpose(1, 2, 0).astype('float32')
            shape = im.landmarks['PTS'].lms.points.astype('float32')
            return pixels, shape

        # stateful=True: the callback draws from NumPy's global RNG.
        image, shape = tf.py_func(get_random_sample, [],
                                  [tf.float32, tf.float32], stateful=True)

        initial_shape = data_provider.random_shape(shape, reference_shape,
                                                   pca_model)
        image.set_shape(image_shape)
        shape.set_shape(lms_shape)
        initial_shape.set_shape(lms_shape)

        image = data_provider.distort_color(image)

        images, lms, inits = tf.train.batch([image, shape, initial_shape],
                                            FLAGS.batch_size,
                                            dynamic_pad=False,
                                            capacity=5000,
                                            enqueue_many=False,
                                            num_threads=num_preprocess_threads,
                                            name='batch')
        print('Defining model...')
        with tf.device(FLAGS.train_device):
            # Start from the summaries already registered under `scope`.
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

            predictions, dxs, _ = mdm_model.model(
                images, inits,
                patch_shape=(FLAGS.patch_size, FLAGS.patch_size))

            total_loss = 0

            # One loss term per entry of `dxs`; the total is their sum.
            for i, dx in enumerate(dxs):
                norm_error = mdm_model.normalized_rmse(dx + inits, lms)
                # NOTE(review): this histogram is created but not appended to
                # `summaries`, unlike the scalar below.
                tf.summary.histogram('errors', norm_error)
                loss = tf.reduce_mean(norm_error)
                total_loss += loss
                summaries.append(tf.summary.scalar('losses/step_{}'.format(i),
                                                   loss))

            # Calculate the gradients for the batch of data
            grads = opt.compute_gradients(total_loss)

        summaries.append(tf.summary.scalar('losses/total', total_loss))
        # Draw predicted and ground-truth landmarks on the batch images and
        # concatenate the two renderings along axis 2 for the image summary.
        pred_images, = tf.py_func(utils.batch_draw_landmarks,
                                  [images, predictions], [tf.float32])
        gt_images, = tf.py_func(utils.batch_draw_landmarks,
                                [images, lms], [tf.float32])

        summary = tf.summary.image('images',
                                   tf.concat([gt_images, pred_images], 2),
                                   max_outputs=5)
        summaries.append(tf.summary.histogram('dx', predictions - inits))

        summaries.append(summary)

        batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                                              scope)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(tf.summary.histogram(var.op.name +
                                                      '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than need be but we employ
        # this for backward-compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (
            tf.trainable_variables() + tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        # NOTE: Currently we are not using batchnorm in MDM.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the collected summaries.
        summary_op = tf.summary.merge(summaries)

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()
        print('Initializing variables...')
        sess.run(init)
        print('Initialized variables.')

        # Optionally overwrite the freshly initialised values with a
        # pre-trained checkpoint (only the VARIABLES_TO_RESTORE collection).
        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir)

        print('Starting training...')
        for step in range(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, total_loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / float(duration)
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, duration))

            if step % 20 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 50 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)