def testParallelApplyGrad(self):
    """Apply ten gradients from ten threads and check the averaged result.

    Every gradient is applied with local_step 0 from its own checked thread;
    once all threads have joined, take_grad(1) is compared against the
    arithmetic mean of the applied values.
    """
    with self.test_session() as sess:
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1]))
        elems = [
            10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0
        ]
        apply_ops = []
        for value in elems:
            apply_ops.append(accumulator.apply_grad((value, ), local_step=0))
        take_op = accumulator.take_grad(1)

        def run_op(op):
            sess.run(op)

        workers = []
        for op in apply_ops:
            workers.append(self.checkedThread(target=run_op, args=(op, )))

        # Start everything before joining so the applies really overlap.
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        averaged = take_op.eval()
        expected_mean = sum(elems) / len(elems)
        self.assertEqual(averaged, expected_mean)
def testAccumulatorRepeatedTakeGrad(self):
    """Run two apply/take rounds on one accumulator and check each average.

    The first round applies its gradients at local_step 0 and the second at
    local_step 1; after each round take_grad(1) must equal the mean of the
    values applied in that round.
    """
    with self.test_session():
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1]))

        rounds = ((0, [10.0, 20.0]), (1, [20.0, 30.0]))
        for local_step, grads in rounds:
            expected_mean = sum(grads) / len(grads)
            apply_ops = [
                accumulator.apply_grad((g, ), local_step=local_step)
                for g in grads
            ]
            take_op = accumulator.take_grad(1)

            for op in apply_ops:
                op.run()

            averaged = take_op.eval()
            self.assertEqual(expected_mean, averaged)
def testParallelTakeGrad(self):
    """Ten concurrent take_grad(1) calls are fed by a slow applying thread.

    One thread applies the values 0..9 (local_step equal to the value),
    sleeping a second before each apply, while ten threads each run
    take_grad(1). The collected results must contain the same items as the
    applied values, in any order.
    """
    with self.test_session() as sess:
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1]))
        elems = list(range(10))
        apply_ops = []
        for e in elems:
            apply_ops.append(
                accumulator.apply_grad((np.float32(e), ), local_step=e))
        take_op = accumulator.take_grad(1)

        def feed_gradients():
            for op in apply_ops:
                time.sleep(1.0)
                sess.run(op)

        feeder = self.checkedThread(target=feed_gradients)

        results = []

        def collect_gradient():
            results.append(sess.run(take_op))

        takers = [
            self.checkedThread(target=collect_gradient) for _ in range(10)
        ]

        # Takers are started first so they are already waiting when the
        # feeder begins applying.
        for taker in takers:
            taker.start()
        feeder.start()

        for taker in takers:
            taker.join()
        feeder.join()

        self.assertItemsEqual(elems, results)
def __init__(self, towers, server, aggregation_frequency):
    """
    Args:
        towers (list[int]): list of GPU ids.
        server (tf.train.Server): the server with ps and workers.
            job_name must be 'worker'.
        aggregation_frequency (int): how often parameters are synchronized.
            Must be greater than zero.
    """
    DataParallelBuilder.__init__(self, towers)
    DistributedBuilderBase.__init__(self, server)

    self.is_chief = (self.task_index == 0)
    self.nr_gpu = len(self.towers)

    # All device strings of this worker share the same job/task prefix.
    worker_prefix = '/job:worker/task:%s' % self.task_index
    self.cpu_device = '%s/cpu:0' % worker_prefix
    raw_devices = []
    for gpu_id in towers:
        raw_devices.append('%s/gpu:%i' % (worker_prefix, gpu_id))
    self.raw_devices = raw_devices
    self.param_server_device = tf.train.replica_device_setter(
        worker_device=worker_prefix + '/cpu:0', cluster=self.cluster)

    # Device for queues for managing synchronization between servers
    sync_devices = []
    for ps_index in range(self.num_ps):
        sync_devices.append('/job:ps/task:%s/cpu:0' % ps_index)
    self.sync_queue_devices = sync_devices

    # How often are parameters synchronized
    self.aggregation_frequency = aggregation_frequency
    assert self.aggregation_frequency > 0

    # This is going to be K x N x 2 data structure holding the queues and
    # vars for aggregated tensors
    self.gpu_shadow_vars = []

    # Used by comm op to know when it can begin reading aggregated values
    self.counter = tf.ConditionalAccumulator(tf.float32)
def testAccumulatorApplyAndBlockingTake(self):
    """take_grad(3) run in one thread while another applies three gradients.

    The applying thread sleeps for a second before running its ops. The
    single value collected from take_grad must equal the mean of the three
    applied gradients.
    """
    with self.test_session() as sess:
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1]))
        elems = [10.0, 20.0, 30.0]
        expected_mean = sum(elems) / len(elems)
        apply_ops = []
        for value in elems:
            apply_ops.append(accumulator.apply_grad((value, ), local_step=0))
        take_op = accumulator.take_grad(3)

        def delayed_apply():
            time.sleep(1.0)
            for op in apply_ops:
                sess.run(op)

        collected = []

        def blocking_take():
            collected.append(sess.run(take_op))

        applier = self.checkedThread(target=delayed_apply)
        taker = self.checkedThread(target=blocking_take)
        for thread in (applier, taker):
            thread.start()
        for thread in (applier, taker):
            thread.join()

        self.assertEqual([expected_mean], collected)
def testAccumulatorApplyGradFloat32(self):
    """Smoke test: a single float32 apply_grad op runs without error."""
    with self.test_session():
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1]))
        accumulator.apply_grad((10.0, )).run()
def testAccumulatorSetGlobalStep(self):
    """Smoke test: a set_global_step op runs without error."""
    with self.test_session():
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1]))
        accumulator.set_global_step(1).run()
def _optimize(self, loss, acc_count, global_step):
    """Build the gradient-accumulation ops and the averaged update op.

    One ``tf.ConditionalAccumulator`` is created for every variable whose
    gradient with respect to ``loss`` is not None. Gradients are scaled by
    the per-variable factor from ``nn.param_lr()`` before being accumulated.

    :param loss: the network loss
    :param acc_count: passed to ``take_grad``; number of accumulated
        gradients required before the averaged gradient is taken
    :param global_step: used as ``local_step`` when applying gradients and
        passed to ``optimizer.apply_gradients``
    :return: ``(update_op, grad_accumulator_op)`` -- the op applying the
        averaged gradients and the list of per-variable accumulate ops
    """
    optimizer = tf.train.AdamOptimizer(self._init_lr)
    grads_vars = optimizer.compute_gradients(loss)

    # create grad accumulator for each variable-grad pair
    grad_accumulator = {}
    for idx, (grad, _) in enumerate(grads_vars):
        if grad is not None:
            grad_accumulator[idx] = tf.ConditionalAccumulator(grad.dtype)

    # apply gradient to each grad accumulator
    layer_lr = nn.param_lr()
    grad_accumulator_op = []
    # .items() rather than the Python-2-only .iteritems(), so the method
    # also works on Python 3.
    for var_idx, grad_acc in grad_accumulator.items():
        var_grad, var = grads_vars[var_idx]
        # Strip the ":<output index>" suffix to get the lookup key.
        var_name = str(var.name).split(':')[0]
        grad_accumulator_op.append(
            grad_acc.apply_grad(var_grad * layer_lr[var_name],
                                local_step=global_step))

    # take average gradients for each variable after accumulating count
    # reaches acc_count
    mean_grads_vars = []
    for var_idx, grad_acc in grad_accumulator.items():
        mean_grads_vars.append(
            (grad_acc.take_grad(acc_count), grads_vars[var_idx][1]))

    # apply average gradients to variables
    update_op = optimizer.apply_gradients(mean_grads_vars,
                                          global_step=global_step)
    return update_op, grad_accumulator_op
def add_optimizer(total_loss, iter_mean_grad, learning_rate, momentum,
                  global_step):
    """Create momentum-optimizer ops that average gradients before updating.

    Args:
        total_loss: loss whose gradients are computed.
        iter_mean_grad: passed to ``take_grad`` for every accumulator.
        learning_rate: learning rate for the Momentum optimizer (also logged
            as a scalar summary).
        momentum: momentum for the Momentum optimizer.
        global_step: used as ``local_step`` for ``apply_grad`` and passed to
            ``apply_gradients``.

    Returns:
        ``(grad_accumulator_ops, apply_gradient_op)``.
    """
    with tf.name_scope('optimization'):
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        grads_and_vars = optimizer.compute_gradients(total_loss)

        # One accumulator per variable that actually received a gradient,
        # keyed by the pair's position in grads_and_vars.
        with tf.name_scope('grad_accumulator'):
            grad_accumulator = {}
            for position, pair in enumerate(grads_and_vars):
                gradient = pair[0]
                if gradient is None:
                    continue
                grad_accumulator[position] = tf.ConditionalAccumulator(
                    gradient.dtype)

        with tf.name_scope('apply_gradient'):
            layer_lr = parameter_lr()
            grad_accumulator_ops = []
            for position, accumulator in grad_accumulator.items():
                gradient, variable = grads_and_vars[position]
                name_without_index = str(variable.name).split(':')[0]
                scaled_gradient = gradient * layer_lr[name_without_index]
                grad_accumulator_ops.append(
                    accumulator.apply_grad(scaled_gradient,
                                           local_step=global_step))

        with tf.name_scope('take_gradients'):
            mean_grads_and_vars = []
            for position, accumulator in grad_accumulator.items():
                averaged = accumulator.take_grad(iter_mean_grad)
                mean_grads_and_vars.append(
                    (averaged, grads_and_vars[position][1]))
            apply_gradient_op = optimizer.apply_gradients(
                mean_grads_and_vars, global_step=global_step)

    return grad_accumulator_ops, apply_gradient_op
def testAccumulatorSizeAfterApplyGradAndTakeGrad(self):
    """num_accumulated grows with each apply and is 0 after a take_grad."""
    with self.test_session():
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1]))
        apply_op = accumulator.apply_grad((10.0, ))
        take_t = accumulator.take_grad(2)

        # Applying gradient multiple times to increase size from 0 to 2.
        self.assertEqual(accumulator.num_accumulated().eval(), 0)
        for expected_size in (1, 2):
            apply_op.run()
            self.assertEqual(accumulator.num_accumulated().eval(),
                             expected_size)

        # Extract will reduce size to 0
        take_t.op.run()
        self.assertEqual(accumulator.num_accumulated().eval(), 0)

        # Take gradients always sets the size back to 0 if successful.
        apply_op = accumulator.apply_grad((10.0, ), local_step=1)
        for _ in range(4):
            apply_op.run()
        self.assertEqual(accumulator.num_accumulated().eval(), 4)
        take_t.op.run()
        self.assertEqual(accumulator.num_accumulated().eval(), 0)
def testAccumulatorApplyGradWithWrongShape(self):
    """apply_grad raises ValueError for gradients not shaped (3, 2)."""
    accumulator = tf.ConditionalAccumulator(
        tf.float32, name="Q", shape=(3, 2))

    mismatched_gradients = (
        [[1.0, 2.0], [3.0, 4.0]],  # 2 x 2
        [[1.0], [2.0], [3.0]],  # 3 x 1
    )
    for gradient in mismatched_gradients:
        with self.assertRaises(ValueError):
            accumulator.apply_grad(gradient)
def testAccumulatorSizeAfterApplyGrad(self):
    """num_accumulated starts at 0 and increases by one per apply_grad run."""
    with self.test_session():
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1]))
        apply_op = accumulator.apply_grad((10.0, ))

        self.assertEqual(accumulator.num_accumulated().eval(), 0)
        for expected_size in (1, 2):
            apply_op.run()
            self.assertEqual(accumulator.num_accumulated().eval(),
                             expected_size)
def testConstructor(self):
    """Check the NodeDef produced when no shape is given to the constructor."""
    # Same text as a single literal; split across adjacent literals only to
    # keep the lines short.
    expected_node_def = (
        " name:'Q' op:'ConditionalAccumulator' "
        "attr { key: 'dtype' value { type: DT_FLOAT } } "
        "attr { key: 'shape' value { shape { unknown_rank: true} } } "
        "attr { key: 'container' value { s: '' } } "
        "attr { key: 'shared_name' value { s: '' } } "
    )
    with tf.Graph().as_default():
        accumulator = tf.ConditionalAccumulator(tf.float32, name="Q")
    ref = accumulator.accumulator_ref
    self.assertTrue(isinstance(ref, tf.Tensor))
    self.assertProtoEquals(expected_node_def, ref.op.node_def)
def testAccumulatorMultipleAccumulators(self):
    """Four accumulators created with the same name keep separate state.

    Two float32 and two float16 accumulators each receive a distinct value;
    take_grad(1) on each must return the value that accumulator was given.
    """
    with self.test_session():

        def make_accumulator(dtype):
            return tf.ConditionalAccumulator(
                dtype, name="Q", shape=tf.TensorShape([1]))

        # Creation order (float32 pair first) is kept; only the list order
        # below puts the float16 pair first.
        float32_pair = [make_accumulator(tf.float32) for _ in range(2)]
        float16_pair = [make_accumulator(tf.float16) for _ in range(2)]
        accums = float16_pair + float32_pair

        for offset, accumulator in enumerate(accums):
            accumulator.apply_grad((offset + 10.0, )).run()

        for offset, accumulator in enumerate(accums):
            taken = accumulator.take_grad(1).eval()
            self.assertEqual(taken, offset + 10.0)
def testAccumulatorInvalidTakeGrad(self):
    """Evaluating take_grad(-1) raises InvalidArgumentError."""
    with self.test_session():
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1]))
        apply_ops = []
        for value in [10.0, 20.0]:
            apply_ops.append(accumulator.apply_grad((value, )))
        invalid_take = accumulator.take_grad(-1)

        for op in apply_ops:
            op.run()

        with self.assertRaises(tf.errors.InvalidArgumentError):
            invalid_take.eval()
def testDtypes(self):
    """Accumulate 0..9 in float16, float32 and float64 and check the mean."""
    with self.test_session() as sess:
        for dtype in (tf.float16, tf.float32, tf.float64):
            accumulator = tf.ConditionalAccumulator(
                dtype, shape=tf.TensorShape([1]))

            elems = np.arange(10).astype(dtype.as_numpy_dtype)
            for value in elems:
                accumulator.apply_grad((value, )).run()

            averaged = sess.run(accumulator.take_grad(1))
            expected_mean = sum(elems) / len(elems)
            self.assertEqual(expected_mean, averaged)
def testAccumulatorIncrementGlobalStep(self):
    """set_global_step can be driven by a tensor derived from a variable.

    The accumulator's global step is set from ``global_step + 1`` three
    times, incrementing the variable after each set.
    """
    with self.test_session():
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1]))

        step_var = tf.Variable(0, name="global_step")
        next_step = tf.add(step_var, 1)
        advance_step = tf.assign(step_var, next_step)
        sync_step = accumulator.set_global_step(next_step)

        tf.initialize_all_variables().run()
        iteration = 0
        while iteration < 3:
            sync_step.run()
            advance_step.eval()
            iteration += 1
def testAccumulatorCancel(self):
    """Closing the session cancels a take_grad that is still waiting."""
    with self.test_session() as sess:
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1]))
        pending_take = accumulator.take_grad(1)

        waiter = self.checkedThread(
            self._blocking_takeg, args=(sess, pending_take))
        waiter.start()

        # Give the thread a moment to enter the take before closing.
        time.sleep(1.0)

        sess.close()  # Will cancel blocked operation

        waiter.join()
def testAccumulatorWrongDynamicShape(self):
    """With shape=None, later applies must match the first applied shape."""
    with self.test_session() as sess:
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=None)
        grad_in = tf.placeholder(tf.float32)
        apply_op = accumulator.apply_grad(grad_in)

        # First successful apply_grad determines shape
        first_gradient = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
        sess.run(apply_op, feed_dict={grad_in: first_gradient})

        mismatched_gradients = (
            [[1.0, 2.0], [3.0, 4.0]],
            [[1.0], [2.0], [3.0]],
        )
        for gradient in mismatched_gradients:
            with self.assertRaises(tf.errors.InvalidArgumentError):
                sess.run(apply_op, feed_dict={grad_in: gradient})
def testConstructorWithShape(self):
    """Check the NodeDef produced when an explicit shape is given."""
    # Same text as a single literal; split across adjacent literals only to
    # keep the lines short.
    expected_node_def = (
        " name:'Q' op:'ConditionalAccumulator' "
        "attr { key: 'dtype' value { type: DT_FLOAT } } "
        "attr { key: 'shape' value { shape { dim {size: 1 } "
        "dim {size: 5 } dim {size: 2 } dim {size: 8 } } } } "
        "attr { key: 'container' value { s: '' } } "
        "attr { key: 'shared_name' value { s: '' } } "
    )
    with tf.Graph().as_default():
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1, 5, 2, 8]))
    ref = accumulator.accumulator_ref
    self.assertTrue(isinstance(ref, tf.Tensor))
    self.assertProtoEquals(expected_node_def, ref.op.node_def)
def testAccumulatorApplyAndTakeGradWithShape(self):
    """Two 3x2 gradients applied to a (3, 2) accumulator average elementwise."""
    with self.test_session():
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=(3, 2))
        elems = [[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
                 [[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]]]
        first, second = elems[0], elems[1]
        expected = [[(a + b) / len(elems) for a, b in zip(row_a, row_b)]
                    for row_a, row_b in zip(first, second)]

        apply_ops = [accumulator.apply_grad(gradient) for gradient in elems]
        take_op = accumulator.take_grad(1)

        for op in apply_ops:
            op.run()

        averaged = take_op.eval()
        # Compare every cell (no short-circuit), then assert once.
        cell_matches = [
            averaged[i][j] == expected[i][j]
            for i in range(len(averaged))
            for j in range(len(averaged[i]))
        ]
        self.assertTrue(all(cell_matches))
def testAccumulatorDynamicShape(self):
    """With shape=None, fed 3x2 gradients are still averaged elementwise."""
    with self.test_session() as sess:
        accumulator = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=None)
        grad_in = tf.placeholder(tf.float32)
        apply_op = accumulator.apply_grad(grad_in)

        elems = [[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
                 [[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]]]
        first, second = elems[0], elems[1]
        expected = [[(a + b) / len(elems) for a, b in zip(row_a, row_b)]
                    for row_a, row_b in zip(first, second)]

        take_op = accumulator.take_grad(1)

        for gradient in elems:
            sess.run(apply_op, feed_dict={grad_in: gradient})

        averaged = take_op.eval()
        # Compare every cell (no short-circuit), then assert once.
        cell_matches = [
            averaged[i][j] == expected[i][j]
            for i in range(len(averaged))
            for j in range(len(averaged[i]))
        ]
        self.assertTrue(all(cell_matches))
def testAccumulatorSetGlobalStepPreventsAccumulation(self):
    """Only gradients with local_step >= the global step should be averaged.

    For each global step in 1000..1004 every apply op (one per local step)
    is run, and take_grad(1) is expected to equal the mean of the values
    whose local step is not behind the global step just set.
    """
    with self.test_session():
        q = tf.ConditionalAccumulator(
            tf.float32, name="Q", shape=tf.TensorShape([1]))

        local_steps = range(1000, 1005)
        accum_ops = [
            q.apply_grad((0.0 + x, ), local_step=x) for x in local_steps
        ]

        for ls in local_steps:
            set_global_step_op = q.set_global_step(ls)
            set_global_step_op.run()

            for accum_op in accum_ops:
                accum_op.run()
            takeg_t = q.take_grad(1)

            val = takeg_t.eval()

            # Convert before dividing: the operands are ints, so without
            # true division the mean (e.g. 1002.5) would be truncated.
            accepted = [x for x in local_steps if x >= ls]
            expected_mean = float(sum(accepted)) / len(accepted)
            self.assertEqual(expected_mean, val)
def __init__(self, average=True, compression=None, aggregation_frequency=1):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
        compression: `hvd.Compression.fp16` or `hvd.Compression.none`
        aggregation_frequency (int): how often parameters are synchronized.
            Must be greater than zero.
    """
    if 'pyarrow' in sys.modules:
        logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs. "
                    "Uninstall pyarrow and use msgpack instead.")
    # lazy import
    import re
    import horovod.tensorflow as hvd
    import horovod

    # Use only the leading digits of the first three version components, so
    # suffixed versions such as "0.19.0rc1" or "0.28.1.post1" do not make
    # int() raise ValueError.
    version_parts = []
    for piece in horovod.__version__.split('.')[:3]:
        leading_digits = re.match(r'\d+', piece)
        if leading_digits is None:
            break
        version_parts.append(int(leading_digits.group()))
    hvd_version = tuple(version_parts)

    self.hvd = hvd
    hvd.init()
    self.is_chief = hvd.rank() == 0
    self._local_rank = hvd.local_rank()
    self._rank = hvd.rank()
    self._average = average
    self._compression = compression
    self._has_compression = hvd_version >= (0, 15, 0)

    # How often are parameters synchronized
    self._aggregation_frequency = aggregation_frequency
    assert self._aggregation_frequency > 0

    # This is going to be N x 2 data structure holding the per-GPU aggregated
    # updates and vars for parameter updates. N is the number of parameters,
    # and there are 2 entries per parameter because each entry contains the
    # gradient update and the original parameter.
    self.gpu_shadow_vars = []

    # Used by comm_op to know when it can begin reading aggregated values.
    self.counter = tf.ConditionalAccumulator(tf.float32)

    logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
    super(HorovodTrainer, self).__init__()
def _train(dataset, valid_dataset, num_classes, initial_ckpt, supervison,
           learning_rate, logs_path, max_training_iters, save_step,
           display_step, global_step, iter_mean_grad=1, batch_size=1,
           momentum=0.9, resume_training=False, config=None, finetune=1,
           test_image_path=None, ckpt_name="osvos"):
    """Train the classification variant of OSVOS with accumulated gradients.

    Args:
        dataset: Reference to a Dataset object instance used for training
        valid_dataset: Dataset object used for the periodic validation pass
        num_classes: Number of classes of the classification output
        initial_ckpt: Path to the checkpoint to initialize the network
        supervison: Level of the side outputs supervision (not referenced by
            this variant)
        learning_rate: Learning rate for the Momentum optimizer. It is also
            passed to ``sess.run`` when the validation log is printed.
        logs_path: Path to store the checkpoints
        max_training_iters: Number of training iterations
        save_step: A checkpoint will be created every save_steps
        display_step: Information of the training will be displayed every
            display_steps
        global_step: Reference to a Variable that keeps track of the training
            steps
        iter_mean_grad: Number of gradient computations that are averaged
            before updating the weights
        batch_size: Size of the training batch
        momentum: Value of the momentum parameter for the Momentum optimizer
        resume_training: Boolean to try to restore from a previous checkpoint
            (True) or not (False)
        config: Reference to a Configuration object used in the creation of a
            Session
        finetune: Not referenced by this variant
        test_image_path: If image path provided, every save_step the result
            of the network with this image is stored
        ckpt_name: Base name of the checkpoint files written to logs_path

    Returns:
        None.
    """
    model_name = os.path.join(logs_path, ckpt_name + ".ckpt")
    if config is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # config.log_device_placement = True
        config.allow_soft_placement = True

    tf.logging.set_verbosity(tf.logging.INFO)

    # Prepare the input data
    input_image = tf.placeholder(tf.float32, [batch_size, None, None, 3])
    # LIAO: image label for classification part
    image_label = tf.placeholder(tf.float32, [batch_size, num_classes])

    # Create the network
    with slim.arg_scope(osvos_arg_scope()):
        net, fc, fc7, end_points = osvos(input_image, num_classes)

    # Define loss
    with tf.name_scope('losses'):
        fc = tf.nn.softmax(fc)
        classification_loss = tf.reduce_sum(
            tf.pow(fc - image_label, 2)) / (2 * batch_size)
        correct_pred = tf.equal(tf.argmax(fc, 1), tf.argmax(image_label, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        tf.summary.scalar('classification_loss', classification_loss)
        tf.summary.scalar('accuracy', accuracy)

        # LIAO: classification loss and l1 loss
        l2_loss = tf.add_n(tf.losses.get_regularization_losses())
        alpha = 0.025
        l1_loss = tf.reduce_sum(
            tf.abs(
                tf.subtract(
                    tf.abs(fc7),
                    tf.ones([fc7.shape[0], 1, 1, fc7.shape[3]])))) / batch_size
        total_loss = classification_loss + l2_loss + alpha * l1_loss
        tf.summary.scalar('l1_loss', l1_loss)
        tf.summary.scalar('l2_loss', l2_loss)
        tf.summary.scalar('total_loss', total_loss)

    # Define optimization method
    with tf.name_scope('optimization'):
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        grads_and_vars = optimizer.compute_gradients(total_loss)
        with tf.name_scope('grad_accumulator'):
            grad_accumulator = {}
            for ind, (grad, _) in enumerate(grads_and_vars):
                if grad is not None:
                    grad_accumulator[ind] = tf.ConditionalAccumulator(
                        grad.dtype)
        with tf.name_scope('apply_gradient'):
            grad_accumulator_ops = []
            # .items() instead of the Python-2-only .iteritems().
            for var_ind, grad_acc in grad_accumulator.items():
                var_grad = grads_and_vars[var_ind][0]
                grad_accumulator_ops.append(
                    grad_acc.apply_grad(var_grad, local_step=global_step))
        with tf.name_scope('take_gradients'):
            mean_grads_and_vars = []
            for var_ind, grad_acc in grad_accumulator.items():
                mean_grads_and_vars.append(
                    (grad_acc.take_grad(iter_mean_grad),
                     grads_and_vars[var_ind][1]))
            apply_gradient_op = optimizer.apply_gradients(
                mean_grads_and_vars, global_step=global_step)

    # Log training info
    merged_summary_op = tf.summary.merge_all()

    # Log evolution of test image. Created after merge_all() so it is not
    # part of merged_summary_op. The checkpoint step below runs this summary;
    # previously the name was never defined, so passing test_image_path
    # ended in a NameError.
    # NOTE(review): assumes `net` is an image-like 4-D tensor -- confirm.
    img_summary = None
    if test_image_path is not None:
        probabilities = tf.nn.sigmoid(net)
        img_summary = tf.summary.image("Output probabilities", probabilities,
                                       max_outputs=1)

    # Initialize variables
    init = tf.global_variables_initializer()

    def _log(message):
        # Write one line to stderr; works on both Python 2 and Python 3.
        sys.stderr.write(message + "\n")

    def _prepare_images(images):
        # Replace the first batch_size entries, in place, by channel-reversed,
        # mean-subtracted arrays. Entries that are not arrays are opened with
        # PIL first.
        # NOTE(review): the channel reversal is presumably RGB -> BGR -- confirm.
        for i in range(batch_size):
            image = images[i]
            if not isinstance(image, np.ndarray):
                image = np.array(Image.open(image), dtype=np.uint8)
            image = image[:, :, ::-1]
            images[i] = np.subtract(
                image,
                np.array((104.00699, 116.66877, 122.67892), dtype=np.float32))
        return images

    # Create objects to record timing and memory of the graph execution
    # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)  # Option in the session options=run_options
    # run_metadata = tf.RunMetadata()  # Option in the session run_metadata=run_metadata
    # summary_writer.add_run_metadata(run_metadata, 'step%d' % i)
    with tf.Session(config=config) as sess:
        print('Init variable')
        sess.run(init)
        test_step = 100  # run a validation batch every test_step iterations

        # op to write logs to Tensorboard
        summary_writer = tf.summary.FileWriter(logs_path,
                                               graph=tf.get_default_graph())
        valid_writer = tf.summary.FileWriter(os.path.join(logs_path, 'valid'),
                                             graph=tf.get_default_graph())

        # Create saver to manage checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        last_ckpt_path = tf.train.latest_checkpoint(logs_path)
        if last_ckpt_path is not None and resume_training:
            # Load last checkpoint
            print('Initializing from previous checkpoint...')
            saver.restore(sess, last_ckpt_path)
            step = global_step.eval() + 1
        else:
            print('Initializing from specified pre-trained model...')
            # init_weights(sess)
            var_list = []
            for var in tf.global_variables():
                # LIAO: ignore lack of fc
                if var.name.find('fc') != -1:
                    continue
                var_type = var.name.split('/')[-1]
                if 'weights' in var_type or 'bias' in var_type:
                    var_list.append(var)
            saver_res = tf.train.Saver(var_list=var_list)
            saver_res.restore(sess, initial_ckpt)
            step = 1
        # sess.run(interp_surgery(tf.global_variables()))
        print('Weights initialized')

        print('Start training')
        while step < max_training_iters + 1:
            # Average the gradient
            for _ in range(0, iter_mean_grad):
                # LIAO: classification label one-hot encoding
                batch_image, _, batch_cls_label = dataset.next_batch(
                    batch_size, 'train')
                image = _prepare_images(batch_image)
                # NOTE(review): this builds a new one-hot op on every
                # iteration, so the graph keeps growing -- consider encoding
                # in numpy.
                cls_label = slim.one_hot_encoding(
                    batch_cls_label, num_classes).eval(session=sess)
                # LIAO: classification label
                run_res = sess.run([
                    total_loss, merged_summary_op, classification_loss,
                    accuracy, l1_loss, l2_loss
                ] + grad_accumulator_ops,
                                   feed_dict={
                                       input_image: image,
                                       image_label: cls_label
                                   })
                batch_loss = run_res[0]
                summary = run_res[1]
                cls_loss = run_res[2]
                acc = run_res[3]
                lloss = run_res[4]
                l2loss = run_res[5]

            # Apply the gradients
            sess.run(apply_gradient_op)  # Momentum updates here its statistics

            # Save summary reports
            summary_writer.add_summary(summary, step)

            # Display training status
            if step % display_step == 0:
                _log("{} Iter {}: Training Loss = {:.4f} l1 loss = {:.4f}, l2 loss = {:.4f}".format(
                    datetime.now(), step, batch_loss, lloss, l2loss))
                _log("\t\tClassification Loss = {:.6f}, accuracy = {:.6f}".format(
                    cls_loss, acc))

            # LIAO: validation
            if step % test_step == 0:
                valid_image, _, valid_cls_label = valid_dataset.next_batch(
                    batch_size, 'train')
                valid_image = _prepare_images(valid_image)
                valid_cls_label = slim.one_hot_encoding(
                    valid_cls_label, num_classes).eval(session=sess)
                valid_res = sess.run([
                    total_loss, merged_summary_op, classification_loss,
                    accuracy, l1_loss, l2_loss
                ],
                                     feed_dict={
                                         input_image: valid_image,
                                         image_label: valid_cls_label
                                     })
                valid_total_loss = valid_res[0]
                valid_summary = valid_res[1]
                valid_cls_loss = valid_res[2]
                valid_acc = valid_res[3]
                valid_l1loss = valid_res[4]
                valid_l2loss = valid_res[5]
                valid_writer.add_summary(valid_summary, step)
                _log("\n{} ***Test*** {}: Training Loss = {:.4f} l1 loss = {:.4f}, l2 loss = {:.4f} ".format(
                    datetime.now(), step, valid_total_loss, valid_l1loss,
                    valid_l2loss))
                _log("\t\tClassification Loss = {:.6f}, accuracy = {:.6f}".format(
                    valid_cls_loss, valid_acc))
                _log("\t\t===== learning rate: {:.10f} =====\n".format(
                    sess.run(learning_rate)))

            # Save a checkpoint
            if step % save_step == 0:
                if test_image_path is not None:
                    curr_output = sess.run(
                        img_summary,
                        feed_dict={
                            input_image: preprocess_img(test_image_path)
                        })
                    summary_writer.add_summary(curr_output, step)
                save_path = saver.save(sess, model_name,
                                       global_step=global_step)
                print("Model saved in file: %s" % save_path)

            step += 1

        # Save a final checkpoint unless the last iteration already wrote one.
        if (step - 1) % save_step != 0:
            save_path = saver.save(sess, model_name, global_step=global_step)
            print("Model saved in file: %s" % save_path)

        print('Finished training.')
def _train(dataset, initial_ckpt, supervison, learning_rate, logs_path,
           max_training_iters, save_step, display_step, global_step,
           iter_mean_grad=1, batch_size=1, momentum=0.9,
           resume_training=False, config=None, finetune=1,
           test_image_path=None, ckpt_name="osvos"):
    """Train OSVOS

    Args:
        dataset: Reference to a Dataset object instance
        initial_ckpt: Path to the checkpoint to initialize the network (May
            be parent network or pre-trained Imagenet)
        supervison: Level of the side outputs supervision: 1-Strong 2-Weak
            3-No supervision
        learning_rate: Value for the learning rate. It can be a number or an
            instance to a learning rate object.
        logs_path: Path to store the checkpoints
        max_training_iters: Number of training iterations
        save_step: A checkpoint will be created every save_steps
        display_step: Information of the training will be displayed every
            display_steps
        global_step: Reference to a Variable that keeps track of the training
            steps
        iter_mean_grad: Number of gradient computations that are average
            before updating the weights
        batch_size: Size of the training batch
        momentum: Value of the momentum parameter for the Momentum optimizer
        resume_training: Boolean to try to restore from a previous checkpoint
            (True) or not (False)
        config: Reference to a Configuration object used in the creation of a
            Session
        finetune: Use to select the type of training, 0 for the parent
            network and 1 for finetunning
        test_image_path: If image path provided, every save_step the result
            of the network with this image is stored
        ckpt_name: Base name of the checkpoint files written to logs_path

    Returns:
        None.
    """
    checkpoint_prefix = os.path.join(logs_path, ckpt_name + ".ckpt")

    tf.logging.set_verbosity(tf.logging.INFO)

    # Prepare the input data
    input_image = tf.placeholder(tf.float32, [batch_size, None, None, 3])
    input_label = tf.placeholder(tf.float32, [batch_size, None, None, 1])

    # Create the network
    with slim.arg_scope(osvos_arg_scope()):
        net, end_points = osvos(input_image)

    # Initialize weights from pre-trained model
    if finetune == 0:
        init_weights = load_vgg_imagenet(initial_ckpt)
    else:
        init_weights = None

    # Define loss
    with tf.name_scope('losses'):
        if supervison in (1, 2):
            dsn_2_loss = class_balanced_cross_entropy_loss(
                end_points['osvos/score-dsn_2-cr'], input_label)
            dsn_3_loss = class_balanced_cross_entropy_loss(
                end_points['osvos/score-dsn_3-cr'], input_label)
            dsn_4_loss = class_balanced_cross_entropy_loss(
                end_points['osvos/score-dsn_4-cr'], input_label)
            dsn_5_loss = class_balanced_cross_entropy_loss(
                end_points['osvos/score-dsn_5-cr'], input_label)
            tf.summary.scalar('dsn_2_loss', dsn_2_loss)
            tf.summary.scalar('dsn_3_loss', dsn_3_loss)
            tf.summary.scalar('dsn_4_loss', dsn_4_loss)
            tf.summary.scalar('dsn_5_loss', dsn_5_loss)

        main_loss = class_balanced_cross_entropy_loss(net, input_label)
        tf.summary.scalar('main_loss', main_loss)

        if supervison == 1:
            output_loss = (dsn_2_loss + dsn_3_loss + dsn_4_loss + dsn_5_loss
                           + main_loss)
        elif supervison == 2:
            output_loss = (0.5 * dsn_2_loss + 0.5 * dsn_3_loss
                           + 0.5 * dsn_4_loss + 0.5 * dsn_5_loss + main_loss)
        elif supervison == 3:
            output_loss = main_loss
        else:
            sys.exit(
                'Incorrect supervision id, select 1 for supervision of the side outputs, 2 for weak supervision '
                'of the side outputs and 3 for no supervision of the side outputs'
            )
        regularization = tf.add_n(tf.losses.get_regularization_losses())
        total_loss = output_loss + regularization
        tf.summary.scalar('total_loss', total_loss)

    # Define optimization method
    with tf.name_scope('optimization'):
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        grads_and_vars = optimizer.compute_gradients(total_loss)

        # One accumulator per variable that received a gradient, keyed by
        # the pair's position in grads_and_vars.
        with tf.name_scope('grad_accumulator'):
            grad_accumulator = {}
            for position, pair in enumerate(grads_and_vars):
                gradient = pair[0]
                if gradient is None:
                    continue
                grad_accumulator[position] = tf.ConditionalAccumulator(
                    gradient.dtype)

        with tf.name_scope('apply_gradient'):
            layer_lr = parameter_lr()
            grad_accumulator_ops = []
            for position, accumulator in grad_accumulator.items():
                gradient, variable = grads_and_vars[position]
                name_without_index = str(variable.name).split(':')[0]
                scaled_gradient = gradient * layer_lr[name_without_index]
                grad_accumulator_ops.append(
                    accumulator.apply_grad(scaled_gradient,
                                           local_step=global_step))

        with tf.name_scope('take_gradients'):
            mean_grads_and_vars = []
            for position, accumulator in grad_accumulator.items():
                averaged = accumulator.take_grad(iter_mean_grad)
                mean_grads_and_vars.append(
                    (averaged, grads_and_vars[position][1]))
            apply_gradient_op = optimizer.apply_gradients(
                mean_grads_and_vars, global_step=global_step)

    # Log training info
    merged_summary_op = tf.summary.merge_all()

    # Log evolution of test image (built after merge_all, so it is not part
    # of the merged summary).
    img_summary = None
    if test_image_path is not None:
        probabilities = tf.nn.sigmoid(net)
        img_summary = tf.summary.image("Output probabilities", probabilities,
                                       max_outputs=1)

    # Create objects to record timing and memory of the graph execution
    # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)  # Option in the session options=run_options
    # run_metadata = tf.RunMetadata()  # Option in the session run_metadata=run_metadata
    # summary_writer.add_run_metadata(run_metadata, 'step%d' % i)
    if config is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # config.log_device_placement = True
        config.allow_soft_placement = True

    with tf.Session(config=config) as sess:
        print('Init variable')
        sess.run(tf.global_variables_initializer())

        # op to write logs to Tensorboard
        summary_writer = tf.summary.FileWriter(logs_path,
                                               graph=tf.get_default_graph())

        # Create saver to manage checkpoints
        saver = tf.train.Saver(max_to_keep=None)

        last_ckpt_path = tf.train.latest_checkpoint(logs_path)
        if last_ckpt_path is not None and resume_training:
            # Load last checkpoint
            print('Initializing from previous checkpoint...')
            saver.restore(sess, last_ckpt_path)
            step = global_step.eval() + 1
        else:
            # Load pre-trained model
            if finetune == 0:
                print('Initializing from pre-trained imagenet model...')
                init_weights(sess)
            else:
                print('Initializing from specified pre-trained model...')
                # Restore only variables whose last name component mentions
                # weights or bias.
                restorable = []
                for variable in tf.global_variables():
                    leaf_name = variable.name.split('/')[-1]
                    if 'weights' in leaf_name or 'bias' in leaf_name:
                        restorable.append(variable)
                pretrained_saver = tf.train.Saver(var_list=restorable)
                pretrained_saver.restore(sess, initial_ckpt)
            step = 1
        sess.run(interp_surgery(tf.global_variables()))
        print('Weights initialized')

        print('Start training')
        while step < max_training_iters + 1:
            # Average the gradient
            batch_loss, summary = None, None
            for _ in range(0, iter_mean_grad):
                batch_image, batch_label = dataset.next_batch(
                    batch_size, 'train')
                # Only the first element of the batch is fed.
                feed = {
                    input_image: preprocess_img(batch_image[0]),
                    input_label: preprocess_labels(batch_label[0]),
                }
                fetches = [total_loss, merged_summary_op] + grad_accumulator_ops
                run_res = sess.run(fetches, feed_dict=feed)
                batch_loss = run_res[0]
                summary = run_res[1]

            # Apply the gradients
            sess.run(apply_gradient_op)  # Momentum updates here its statistics

            # Save summary reports
            summary_writer.add_summary(summary, step)

            # Display training status
            if step % display_step == 0:
                print("{} Iter {}: Training Loss = {:.4f}".format(
                    datetime.now(), step, batch_loss))

            # Save a checkpoint
            if step % save_step == 0:
                if test_image_path is not None:
                    curr_output = sess.run(
                        img_summary,
                        feed_dict={
                            input_image: preprocess_img(test_image_path)
                        })
                    summary_writer.add_summary(curr_output, step)
                save_path = saver.save(sess, checkpoint_prefix,
                                       global_step=global_step)
                print("Model saved in file: %s" % save_path)

            step += 1

        # Save a final checkpoint unless the last iteration already wrote one.
        if (step - 1) % save_step != 0:
            save_path = saver.save(sess, checkpoint_prefix,
                                   global_step=global_step)
            print("Model saved in file: %s" % save_path)

        print('Finished training.')
def _train(dataset, initial_ckpt, supervison, learning_rate, logs_path, max_training_iters, save_step, display_step,
           global_step, number_slices=1, volume=False, iter_mean_grad=1, batch_size=1, task_id=2, loss=1,
           momentum=0.9, resume_training=False, config=None, finetune=1):
    """Build the liver-segmentation training graph and run the training loop.

    Gradients are accumulated over ``iter_mean_grad`` forward/backward passes and their
    mean is applied once per training step with a Momentum optimizer.

    Args:
        dataset: Reference to a Dataset object instance. ``dataset.next_batch(batch_size, phase)``
            must return ``(images, labels, liver_labels)`` for ``phase`` in ``'train'`` / ``'val'``.
        initial_ckpt: Path to the checkpoint to initialize the network (May be parent network or
            pre-trained Imagenet)
        supervison: Level of the side outputs supervision: 1-Strong 2-Weak 3-No supervision
        learning_rate: Value for the learning rate. It can be number or an instance to a learning
            rate object.
        logs_path: Path to store the checkpoints. Train/validation summaries are written to the
            ``train`` and ``test`` sub-directories.
        max_training_iters: Number of training iterations
        save_step: A checkpoint will be created every save_steps
        display_step: Information of the training will be displayed every display_steps. The
            validation batch is only evaluated on those steps.
        global_step: Reference to a Variable that keeps track of the training steps
        number_slices: Number of label channels. The input has 3 channels unless
            ``number_slices`` is larger than 3, in which case it has ``number_slices`` channels.
        volume: Forwarded unchanged to ``seg_liver``.
        iter_mean_grad: Number of gradient computations that are average before updating the weights
        batch_size: Leading dimension of the image and label placeholders.
        task_id: When equal to 2, the third element returned by ``dataset.next_batch`` is used
            as the label instead of the second one.
        loss: Not used by this function; kept for interface compatibility.
        momentum: Value of the momentum parameter for the Momentum optimizer
        resume_training: Boolean to try to restore from a previous checkpoint (True) or not (False)
        config: Reference to a Configuration object used in the creation of a Session
        finetune: Use to select to select type of training, 0 for the parent network and 1 for
            finetunning

    Returns:
        None.

    Raises:
        SystemExit: If ``supervison`` is not 1, 2 or 3.
    """
    model_name = os.path.join(logs_path, "seg_liver.ckpt")
    if config is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # config.log_device_placement = True
        config.allow_soft_placement = True

    tf.logging.set_verbosity(tf.logging.INFO)

    # The input keeps 3 channels unless more slices than that are requested.
    input_depth = max(3, number_slices)

    # Prepare the input data
    input_image = tf.placeholder(tf.float32, [batch_size, None, None, input_depth])
    input_label = tf.placeholder(tf.float32, [batch_size, None, None, number_slices])

    # Create the network
    with slim.arg_scope(seg_liver_arg_scope()):
        net, end_points = seg_liver(input_image, number_slices, volume)

    # Initialize weights from pre-trained model
    if finetune == 0:
        init_weights = load_vgg_imagenet(initial_ckpt, number_slices)

    # Define loss
    with tf.name_scope('losses'):
        dsn_2_loss = class_balanced_cross_entropy_loss(end_points['seg_liver/score-dsn_2-cr'], input_label)
        tf.summary.scalar('losses/dsn_2_loss', dsn_2_loss)
        dsn_3_loss = class_balanced_cross_entropy_loss(end_points['seg_liver/score-dsn_3-cr'], input_label)
        tf.summary.scalar('losses/dsn_3_loss', dsn_3_loss)
        dsn_4_loss = class_balanced_cross_entropy_loss(end_points['seg_liver/score-dsn_4-cr'], input_label)
        tf.summary.scalar('losses/dsn_4_loss', dsn_4_loss)
        dsn_5_loss = class_balanced_cross_entropy_loss(end_points['seg_liver/score-dsn_5-cr'], input_label)
        tf.summary.scalar('losses/dsn_5_loss', dsn_5_loss)

        main_loss = class_balanced_cross_entropy_loss(net, input_label)
        tf.summary.scalar('losses/main_loss', main_loss)

        if supervison == 1:
            output_loss = dsn_2_loss + dsn_3_loss + dsn_4_loss + dsn_5_loss + main_loss
        elif supervison == 2:
            output_loss = 0.5 * dsn_2_loss + 0.5 * dsn_3_loss + 0.5 * dsn_4_loss + 0.5 * dsn_5_loss + main_loss
        elif supervison == 3:
            output_loss = main_loss
        else:
            sys.exit('Incorrect supervision id, select 1 for supervision of the side outputs, 2 for weak supervision '
                     'of the side outputs and 3 for no supervision of the side outputs')

        # NOTE(review): total_loss is defined twice. Only the second definition (regularization
        # scaled by 0.001) is optimized; the first one survives solely as a logged summary.
        # Both are kept so the existing summary tags do not change.
        total_loss = output_loss + tf.add_n(tf.losses.get_regularization_losses())
        tf.summary.scalar('losses/total_loss', total_loss)

        total_loss = output_loss + 0.001 * tf.add_n(tf.losses.get_regularization_losses())
        tf.summary.scalar('losses/total_loss', total_loss)

    # Define optimization method
    with tf.name_scope('optimization'):
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        grads_and_vars = optimizer.compute_gradients(total_loss)
        with tf.name_scope('grad_accumulator'):
            # Keyed by the position in grads_and_vars so that every accumulator stays paired
            # with its own gradient/variable even when some variables have no gradient.
            grad_accumulator = {}
            for ind, (grad, _) in enumerate(grads_and_vars):
                if grad is not None:
                    grad_accumulator[ind] = tf.ConditionalAccumulator(grad.dtype)
        with tf.name_scope('apply_gradient'):
            layer_lr = parameter_lr()
            grad_accumulator_ops = []
            # sorted() keeps op creation in variable order regardless of dict ordering.
            for ind, grad_acc in sorted(grad_accumulator.items()):
                var_grad, var = grads_and_vars[ind]
                var_name = str(var.name).split(':')[0]
                grad_accumulator_ops.append(
                    grad_acc.apply_grad(var_grad * layer_lr[var_name], local_step=global_step))
        with tf.name_scope('take_gradients'):
            mean_grads_and_vars = [
                (grad_acc.take_grad(iter_mean_grad), grads_and_vars[ind][1])
                for ind, grad_acc in sorted(grad_accumulator.items())
            ]
            apply_gradient_op = optimizer.apply_gradients(mean_grads_and_vars, global_step=global_step)

    # Log training info
    with tf.name_scope('metrics'):
        dice_coef_op = dice_coef_theoretical(net, input_label)
        tf.summary.scalar('metrics/dice_coeff', dice_coef_op)

    merged_summary_op = tf.summary.merge_all()

    # Initialize variables
    init = tf.global_variables_initializer()

    # Create objects to record timing and memory of the graph execution
    # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)  # Option in the session options=run_options
    # run_metadata = tf.RunMetadata()  # Option in the session run_metadata=run_metadata
    # summary_writer.add_run_metadata(run_metadata, 'step%d' % i)
    with tf.Session(config=config) as sess:
        print('Init variable')
        sess.run(init)

        # op to write logs to Tensorboard
        summary_writer = tf.summary.FileWriter(logs_path + '/train', graph=tf.get_default_graph())
        test_writer = tf.summary.FileWriter(logs_path + '/test')

        try:
            # Create saver to manage checkpoints
            saver = tf.train.Saver(max_to_keep=None)

            last_ckpt_path = tf.train.latest_checkpoint(logs_path)
            if last_ckpt_path is not None and resume_training:
                # Load last checkpoint
                print('Initializing from previous checkpoint...')
                saver.restore(sess, last_ckpt_path)
                step = global_step.eval() + 1
            else:
                # Load pre-trained model
                if finetune == 0:
                    print('Initializing from pre-trained imagenet model...')
                    init_weights(sess)
                else:
                    print('Initializing from pre-trained model...')
                    # Only restore variables whose last name component mentions weights or bias.
                    var_list = []
                    for var in tf.global_variables():
                        var_type = var.name.split('/')[-1]
                        if 'weights' in var_type or 'bias' in var_type:
                            var_list.append(var)
                    saver_res = tf.train.Saver(var_list=var_list)
                    saver_res.restore(sess, initial_ckpt)
                step = 1
            sess.run(interp_surgery(tf.global_variables()))
            print('Weights initialized')

            print('Start training')
            while step < max_training_iters + 1:
                show_status = step % display_step == 0

                # Average the gradient
                for _ in range(0, iter_mean_grad):
                    batch_image, batch_label, batch_label_liver = dataset.next_batch(batch_size, 'train')
                    batch_image_val, batch_label_val, batch_label_liver_val = dataset.next_batch(batch_size, 'val')
                    image = preprocess_img(batch_image, number_slices)
                    val_image = preprocess_img(batch_image_val, number_slices)
                    if task_id == 2:
                        batch_label = batch_label_liver
                        batch_label_val = batch_label_liver_val
                    label = preprocess_labels(batch_label, number_slices)
                    label_val = preprocess_labels(batch_label_val, number_slices)
                    run_res = sess.run([total_loss, merged_summary_op, dice_coef_op] + grad_accumulator_ops,
                                       feed_dict={input_image: image, input_label: label})
                    batch_loss = run_res[0]
                    summary = run_res[1]
                    train_dice_coef = run_res[2]
                    if show_status:
                        val_run_res = sess.run([total_loss, merged_summary_op, dice_coef_op],
                                               feed_dict={input_image: val_image, input_label: label_val})
                        val_batch_loss = val_run_res[0]
                        val_summary = val_run_res[1]
                        val_dice_coef = val_run_res[2]

                # Apply the gradients
                sess.run(apply_gradient_op)

                # Save summary reports
                summary_writer.add_summary(summary, step)
                if show_status:
                    test_writer.add_summary(val_summary, step)

                # Display training status
                if show_status:
                    # sys.stderr.write is valid on both Python 2 and Python 3.
                    sys.stderr.write(
                        "{} Iter {}: Training Loss = {:.4f}".format(datetime.now(), step, batch_loss) + "\n")
                    sys.stderr.write(
                        "{} Iter {}: Validation Loss = {:.4f}".format(datetime.now(), step, val_batch_loss) + "\n")
                    sys.stderr.write(
                        "{} Iter {}: Training Dice = {:.4f}".format(datetime.now(), step, train_dice_coef) + "\n")
                    sys.stderr.write(
                        "{} Iter {}: Validation Dice = {:.4f}".format(datetime.now(), step, val_dice_coef) + "\n")

                # Save a checkpoint
                if step % save_step == 0:
                    save_path = saver.save(sess, model_name, global_step=global_step)
                    print("Model saved in file: %s" % save_path)

                step += 1

            # Save a final checkpoint unless the last iteration already produced one.
            if (step - 1) % save_step != 0:
                save_path = saver.save(sess, model_name, global_step=global_step)
                print("Model saved in file: %s" % save_path)

            print('Finished training.')
        finally:
            # Flush pending events and release the event files, even if training failed.
            summary_writer.close()
            test_writer.close()
def train(dataset, initial_ckpt, learning_rate, logs_path, max_training_iters, save_step, display_step,
          global_step, iter_mean_grad=1, batch_size=1, momentum=0.9, resume_training=False, config=None,
          finetune=1):
    """Build the lesion-detection training graph and run the training loop.

    Gradients are accumulated over ``iter_mean_grad`` forward/backward passes and their
    mean is applied once per training step with a Momentum optimizer.

    Args:
        dataset: Reference to a Dataset object instance. ``dataset.next_batch(batch_size, phase, 0.5)``
            must return ``(images, labels, x_bb, y_bb, ids)`` for ``phase`` in ``'train'`` / ``'val'``.
        initial_ckpt: Path to the checkpoint to initialize the network (May be parent network or
            pre-trained Imagenet)
        learning_rate: Value for the learning rate. It can be number or an instance to a learning
            rate object.
        logs_path: Path to store the checkpoints. Train/validation summaries are written to the
            ``train`` and ``test`` sub-directories.
        max_training_iters: Number of training iterations
        save_step: A checkpoint will be created every save_steps
        display_step: Information of the training will be displayed every display_steps. The
            validation batch is only evaluated on those steps.
        global_step: Reference to a Variable that keeps track of the training steps
        iter_mean_grad: Number of gradient computations that are average before updating the weights
        batch_size: Leading dimension of the image (``[batch_size, 80, 80, 3]``) and label
            (``[batch_size]``) placeholders.
        momentum: Value of the momentum parameter for the Momentum optimizer
        resume_training: Boolean to try to restore from a previous checkpoint (True) or not (False)
        config: Reference to a Configuration object used in the creation of a Session
        finetune: Use to select to select type of training, 0 for the parent network and 1 for
            finetunning

    Returns:
        None.
    """
    model_name = os.path.join(logs_path, "det_lesion.ckpt")
    if config is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

    tf.logging.set_verbosity(tf.logging.INFO)

    # Prepare the input data
    input_image = tf.placeholder(tf.float32, [batch_size, 80, 80, 3])
    input_label = tf.placeholder(tf.float32, [batch_size])
    # Fed as True for the training run and False for the validation run below.
    is_training = tf.placeholder(tf.bool, shape=())

    tf.summary.histogram('input_label', input_label)

    # Create the network
    with slim.arg_scope(det_lesion_arg_scope()):
        net, end_points = det_lesion_resnet(input_image, is_training_option=is_training)

    # Initialize weights from pre-trained model
    if finetune == 0:
        init_weights = load_resnet_imagenet(initial_ckpt)

    # Define loss
    with tf.name_scope('losses'):
        loss, output, target = binary_cross_entropy(net, input_label)
        total_loss = loss + tf.add_n(tf.losses.get_regularization_losses())
        tf.summary.scalar('losses/total_loss', total_loss)
        tf.summary.histogram('losses/output', output)
        tf.summary.histogram('losses/target', target)

    # Define optimization method
    with tf.name_scope('optimization'):
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
        grads_and_vars = optimizer.compute_gradients(total_loss)
        with tf.name_scope('grad_accumulator'):
            # Keyed by the position in grads_and_vars so that every accumulator stays paired
            # with its own gradient/variable even when some variables have no gradient.
            grad_accumulator = {}
            for ind, (grad, _) in enumerate(grads_and_vars):
                if grad is not None:
                    grad_accumulator[ind] = tf.ConditionalAccumulator(grad.dtype)
        with tf.name_scope('apply_gradient'):
            grad_accumulator_ops = []
            # sorted() keeps op creation in variable order regardless of dict ordering.
            for ind, grad_acc in sorted(grad_accumulator.items()):
                var_grad, var = grads_and_vars[ind]
                var_name = str(var.name).split(':')[0]
                if "weights" in var_name:
                    aux_layer_lr = 1.0
                elif "biases" in var_name:
                    aux_layer_lr = 2.0
                else:
                    # Variables that are neither weights nor biases previously hit an unbound or
                    # stale multiplier; leave their gradient unscaled instead.
                    aux_layer_lr = 1.0
                grad_accumulator_ops.append(
                    grad_acc.apply_grad(var_grad * aux_layer_lr, local_step=global_step))
        with tf.name_scope('take_gradients'):
            mean_grads_and_vars = [
                (grad_acc.take_grad(iter_mean_grad), grads_and_vars[ind][1])
                for ind, grad_acc in sorted(grad_accumulator.items())
            ]
            apply_gradient_op = optimizer.apply_gradients(mean_grads_and_vars, global_step=global_step)

    with tf.name_scope('metrics'):
        acc_op = my_accuracy(net, input_label)
        tf.summary.scalar('metrics/accuracy', acc_op)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:
        tf.logging.info('Gathering update_ops')
        # Re-bind total_loss so that fetching it also runs the collected update ops.
        with tf.control_dependencies(tf.tuple(update_ops)):
            total_loss = tf.identity(total_loss)

    merged_summary_op = tf.summary.merge_all()

    # Initialize variables
    init = tf.global_variables_initializer()

    with tf.Session(config=config) as sess:
        print('Init variable')
        sess.run(init)

        # op to write logs to Tensorboard
        logs_path_train = os.path.join(logs_path, 'train')
        logs_path_test = os.path.join(logs_path, 'test')
        summary_writer = tf.summary.FileWriter(logs_path_train, graph=tf.get_default_graph())
        test_writer = tf.summary.FileWriter(logs_path_test)

        try:
            # Create saver to manage checkpoints
            saver = tf.train.Saver(max_to_keep=None)

            last_ckpt_path = tf.train.latest_checkpoint(logs_path)
            if last_ckpt_path is not None and resume_training:
                # Load last checkpoint
                print('Initializing from previous checkpoint...')
                saver.restore(sess, last_ckpt_path)
                step = global_step.eval() + 1
            else:
                # Load pre-trained model
                if finetune == 0:
                    print('Initializing from pre-trained imagenet model...')
                    init_weights(sess)
                else:
                    print('Initializing from pre-trained model...')
                    # Only restore variables whose last name component mentions weights or bias.
                    var_list = []
                    for var in tf.global_variables():
                        var_type = var.name.split('/')[-1]
                        if 'weights' in var_type or 'bias' in var_type:
                            var_list.append(var)
                    saver_res = tf.train.Saver(var_list=var_list)
                    saver_res.restore(sess, initial_ckpt)
                step = 1
            sess.run(interp_surgery(tf.global_variables()))
            print('Weights initialized')

            print('Start training')
            while step < max_training_iters + 1:
                show_status = step % display_step == 0

                # Average the gradient
                for _ in range(0, iter_mean_grad):
                    batch_image, batch_label, x_bb_train, y_bb_train, ids_train = dataset.next_batch(
                        batch_size, 'train', 0.5)
                    batch_image_val, batch_label_val, x_bb_val, y_bb_val, ids_val = dataset.next_batch(
                        batch_size, 'val', 0.5)

                    image = preprocess_img(batch_image, x_bb_train, y_bb_train, ids_train)
                    label = batch_label
                    # Validation images are preprocessed without the ids argument, as in the original.
                    val_image = preprocess_img(batch_image_val, x_bb_val, y_bb_val)
                    label_val = batch_label_val

                    run_res = sess.run([total_loss, merged_summary_op, acc_op] + grad_accumulator_ops,
                                       feed_dict={input_image: image, input_label: label, is_training: True})
                    batch_loss = run_res[0]
                    summary = run_res[1]
                    acc = run_res[2]
                    if show_status:
                        val_run_res = sess.run([total_loss, merged_summary_op, acc_op],
                                               feed_dict={input_image: val_image, input_label: label_val,
                                                          is_training: False})
                        val_batch_loss = val_run_res[0]
                        val_summary = val_run_res[1]
                        val_acc = val_run_res[2]

                # Apply the gradients
                sess.run(apply_gradient_op)

                # Save summary reports
                summary_writer.add_summary(summary, step)
                if show_status:
                    test_writer.add_summary(val_summary, step)

                # Display training status
                if show_status:
                    # The original passed file=sys.stderr to str.format(), where it was ignored,
                    # so these lines went to stdout. Write them to stderr as intended.
                    sys.stderr.write(
                        "{} Iter {}: Training Loss = {:.4f}".format(datetime.now(), step, batch_loss) + "\n")
                    sys.stderr.write(
                        "{} Iter {}: Validation Loss = {:.4f}".format(datetime.now(), step, val_batch_loss) + "\n")
                    sys.stderr.write(
                        "{} Iter {}: Training Accuracy = {:.4f}".format(datetime.now(), step, acc) + "\n")
                    sys.stderr.write(
                        "{} Iter {}: Validation Accuracy = {:.4f}".format(datetime.now(), step, val_acc) + "\n")

                # Save a checkpoint
                if step % save_step == 0:
                    save_path = saver.save(sess, model_name, global_step=global_step)
                    print("Model saved in file: %s" % (save_path))

                step += 1

            # Save a final checkpoint unless the last iteration already produced one.
            if (step - 1) % save_step != 0:
                save_path = saver.save(sess, model_name, global_step=global_step)
                print("Model saved in file: %s" % (save_path))

            print('Finished training.')
        finally:
            # Flush pending events and release the event files, even if training failed.
            summary_writer.close()
            test_writer.close()
def testAccumulatorSizeEmpty(self):
    """A newly constructed accumulator must report zero accumulated gradients."""
    with self.test_session():
        accumulator = tf.ConditionalAccumulator(tf.float32, name="Q")
        accumulated_count = accumulator.num_accumulated().eval()
        self.assertEqual(accumulated_count, 0)
def train_finetune(dataset, model_params, learning_rate, logs_path, max_training_iters, save_step, display_step,
                   global_step, iter_mean_grad=1, batch_size=1, resume_training=False, config=None,
                   use_image_summary=True, ckpt_name="osmn"):
    """Train OSMN

    Gradients are accumulated over ``iter_mean_grad`` forward/backward passes and their
    mean is applied once per training step with an Adam optimizer.

    Args:
        dataset: Reference to a Dataset object instance. ``dataset.next_batch(batch_size, 'train')``
            must return ``(guide_images, gb_images, images, labels)``.
        model_params: Model parameters. The attributes read here are ``base_model``,
            ``scale_value``, ``mean_value``, ``whole_model_path``, ``use_visual_modulator``,
            ``vis_mod_model_path`` and ``seg_model_path``. When ``whole_model_path`` is empty,
            the modulator and segmentation checkpoints are loaded separately (each only if
            enabled / non-empty); otherwise ``whole_model_path`` is loaded for the whole model.
        learning_rate: Value for the learning rate. It can be a number or an instance to a learning
            rate object.
        logs_path: Path to store the checkpoints
        max_training_iters: Number of training iterations
        save_step: A checkpoint will be created every save_steps
        display_step: Information of the training will be displayed every display_steps
        global_step: Reference to a Variable that keeps track of the training steps
        iter_mean_grad: Number of gradient computations that are average before updating the weights
        batch_size: Size of the training batch
        resume_training: Boolean to try to restore from a previous checkpoint (True) or not (False)
        config: Reference to a Configuration object used in the creation of a Session
        use_image_summary: Boolean to use image summary during training in tensorboard
        ckpt_name: checkpoint name for saving

    Returns:
        None.
    """
    model_name = os.path.join(logs_path, ckpt_name + ".ckpt")
    if config is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # config.log_device_placement = True
        config.allow_soft_placement = True

    tf.logging.set_verbosity(tf.logging.INFO)

    # Prepare the input data
    guide_image = tf.placeholder(tf.float32, [batch_size, 224, 224, 3])
    input_image = tf.placeholder(tf.float32, [batch_size, None, None, 3])
    gb_image = tf.placeholder(tf.float32, [batch_size, None, None, 1])
    input_label = tf.placeholder(tf.float32, [batch_size, None, None, 1])

    model_func = get_model_func(model_params.base_model)
    net, end_points = model_func([guide_image, gb_image, input_image], model_params, is_training=True)

    # Define loss
    with tf.name_scope('losses'):
        main_loss = class_balanced_cross_entropy_loss(net, input_label)
        tf.summary.scalar('main_loss', main_loss)
        total_loss = main_loss + tf.add_n(tf.losses.get_regularization_losses())
        tf.summary.scalar('total_loss', total_loss)

    # Define optimization method
    with tf.name_scope('optimization'):
        tf.summary.scalar('learning_rate', learning_rate)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        grads_and_vars = optimizer.compute_gradients(total_loss)
        with tf.name_scope('grad_accumulator'):
            # Keyed by the position in grads_and_vars; variables without a gradient get no entry.
            grad_accumulator = {}
            for ind in range(0, len(grads_and_vars)):
                if grads_and_vars[ind][0] is not None:
                    grad_accumulator[ind] = tf.ConditionalAccumulator(grads_and_vars[ind][0].dtype)
        with tf.name_scope('apply_gradient'):
            grad_accumulator_ops = []
            for var_ind, grad_acc in grad_accumulator.items():
                var_grad = grads_and_vars[var_ind][0]
                grad_accumulator_ops.append(grad_acc.apply_grad(var_grad, local_step=global_step))
        with tf.name_scope('take_gradients'):
            mean_grads_and_vars = []
            for var_ind, grad_acc in grad_accumulator.items():
                mean_grads_and_vars.append(
                    (grad_acc.take_grad(iter_mean_grad), grads_and_vars[var_ind][1]))
            apply_gradient_op = optimizer.apply_gradients(mean_grads_and_vars, global_step=global_step)

    # Log training info
    merged_summary_op = tf.summary.merge_all()

    # Log results on training images. These summaries are created after merge_all(), so they
    # are not part of merged_summary_op and are fetched separately on display steps.
    if use_image_summary:
        probabilities = tf.nn.sigmoid(net)
        input_image_orig = input_image / model_params.scale_value + model_params.mean_value
        guide_image_orig = guide_image / model_params.scale_value + model_params.mean_value
        img_summary = binary_seg_summary(input_image_orig, probabilities, gb_image, input_label)
        vg_summary = visual_guide_summary(guide_image_orig)

    # Initialize variables
    init = tf.global_variables_initializer()

    with tf.Session(config=config) as sess:
        print('Init variable')
        sess.run(init)

        # op to write logs to Tensorboard
        summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())

        try:
            # Create saver to manage checkpoints
            saver = tf.train.Saver(max_to_keep=40)

            last_ckpt_path = tf.train.latest_checkpoint(logs_path)
            if last_ckpt_path is not None and resume_training:
                # Load last checkpoint
                print('Initializing from previous checkpoint...')
                saver.restore(sess, last_ckpt_path)
                step = global_step.eval() + 1
            elif model_params.whole_model_path == '':
                print('Initializing from pre-trained imagenet model...')
                if model_params.use_visual_modulator:
                    load_model(model_params.vis_mod_model_path, 'osmn/modulator')(sess)
                if model_params.seg_model_path != '':
                    load_model(model_params.seg_model_path, 'osmn/seg')(sess)
                step = 1
            else:
                print('Initializing from pre-trained model...')
                load_model(model_params.whole_model_path, 'osmn')(sess)
                step = 1
            sess.run(interp_surgery(tf.global_variables()))
            print('Weights initialized')

            print('Start training')
            while step < max_training_iters + 1:
                # Average the gradient
                for _ in range(0, iter_mean_grad):
                    batch_g_image, batch_gb_image, batch_image, batch_label = dataset.next_batch(
                        batch_size, 'train')
                    run_res = sess.run([total_loss, merged_summary_op] + grad_accumulator_ops,
                                       feed_dict={guide_image: batch_g_image, gb_image: batch_gb_image,
                                                  input_image: batch_image, input_label: batch_label})
                    batch_loss = run_res[0]
                    summary = run_res[1]

                # Apply the averaged gradients (the optimizer updates its own statistics here).
                sess.run(apply_gradient_op)

                # Save summary reports
                summary_writer.add_summary(summary, step)

                # Display training status
                if step % display_step == 0:
                    if use_image_summary:
                        # Image summaries are rendered from the last training batch of this step.
                        curr_img_summary = sess.run([img_summary, vg_summary],
                                                    feed_dict={guide_image: batch_g_image,
                                                               gb_image: batch_gb_image,
                                                               input_image: batch_image,
                                                               input_label: batch_label})
                        for s in curr_img_summary:
                            summary_writer.add_summary(s, step)
                    print("{} Iter {}: Training Loss = {:.4f}".format(datetime.now(), step, batch_loss),
                          file=sys.stderr)

                # Save a checkpoint
                if step % save_step == 0:
                    save_path = saver.save(sess, model_name, global_step=global_step)
                    print("Model saved in file: %s" % save_path)

                step += 1

            # Save a final checkpoint unless the last iteration already produced one.
            if (step - 1) % save_step != 0:
                save_path = saver.save(sess, model_name, global_step=global_step)
                print("Model saved in file: %s" % save_path)

            print('Finished training.')
        finally:
            # Flush pending events and release the event file, even if training failed.
            summary_writer.close()