def _train_deeplab_model(iterator, num_of_classes, ignore_label):
    """Builds the multi-clone training graph for the deeplab model.

    A loss tower is built for each of the `FLAGS.num_clones` clones on its
    own `/gpu:<index>` device, gradients are computed per tower, and the
    result of `_average_gradients` is applied under the `/cpu:0` device.

    Args:
      iterator: An iterator of type tf.data.Iterator for images and labels.
      num_of_classes: Number of classes for the dataset.
      ignore_label: Ignore label for the dataset.

    Returns:
      train_tensor: A tensor to update the model variables.
      summary_op: An operation to log the summaries.

    Raises:
      ValueError: If `FLAGS.quantize_delay_step` is non-negative while more
        than one clone is requested.
    """
    step_counter = tf.train.get_or_create_global_step()

    lr = train_utils.get_model_learning_rate(
        FLAGS.learning_policy,
        FLAGS.base_learning_rate,
        FLAGS.learning_rate_decay_step,
        FLAGS.learning_rate_decay_factor,
        FLAGS.training_number_of_steps,
        FLAGS.learning_power,
        FLAGS.slow_start_step,
        FLAGS.slow_start_learning_rate)
    tf.summary.scalar('learning_rate', lr)

    momentum_optimizer = tf.train.MomentumOptimizer(lr, FLAGS.momentum)

    def _clone_scope_name(clone_index):
        # The first tower lives in the default (empty) name scope.
        if clone_index:
            return 'clone_%d' % clone_index
        return ''

    # Pass 1: build one loss tower per clone; only the first creates variables.
    per_clone_losses = []
    for clone_index in range(FLAGS.num_clones):
        with tf.device('/gpu:%d' % clone_index):
            with tf.name_scope(_clone_scope_name(clone_index)) as clone_scope:
                clone_loss = _tower_loss(
                    iterator=iterator,
                    num_of_classes=num_of_classes,
                    ignore_label=ignore_label,
                    scope=clone_scope,
                    reuse_variable=(clone_index != 0))
                per_clone_losses.append(clone_loss)

    # Quantization rewriting happens between loss and gradient construction.
    if FLAGS.quantize_delay_step >= 0:
        if FLAGS.num_clones > 1:
            raise ValueError('Quantization doesn\'t support multi-clone yet.')
        tf.contrib.quantize.create_training_graph(
            quant_delay=FLAGS.quantize_delay_step)

    # Pass 2: compute gradients for each tower on the same device and scope.
    per_clone_grads = []
    for clone_index, clone_loss in enumerate(per_clone_losses):
        with tf.device('/gpu:%d' % clone_index):
            with tf.name_scope(_clone_scope_name(clone_index)):
                per_clone_grads.append(
                    momentum_optimizer.compute_gradients(clone_loss))

    with tf.device('/cpu:0'):
        combined_grads = _average_gradients(per_clone_grads)

        # Rescale the gradients of biases and last-layer variables.
        last_layer_scopes = model.get_extra_layer_scopes(
            FLAGS.last_layers_contain_logits_only)
        multipliers = train_utils.get_model_gradient_multipliers(
            last_layer_scopes, FLAGS.last_layer_gradient_multiplier)
        if multipliers:
            combined_grads = tf.contrib.training.multiply_gradients(
                combined_grads, multipliers)

        # Op that applies the gradients and advances the global step.
        apply_op = momentum_optimizer.apply_gradients(
            combined_grads, global_step=step_counter)

        # UPDATE_OPS holds e.g. the batch_norm updates created by the model;
        # they are grouped with the gradient application into a single op.
        pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        pending_updates.append(apply_op)
        grouped_update = tf.group(*pending_updates)

        unlogged_loss = tf.losses.get_total_loss(
            add_regularization_losses=True)

        # Print the total loss to the terminal every FLAGS.log_steps steps.
        # This implementation is mirrored from tf.slim.summaries.
        steps_into_interval = math_ops.mod(step_counter, FLAGS.log_steps)
        log_this_step = math_ops.equal(steps_into_interval, 0)

        def _loss_with_print():
            return tf.Print(
                unlogged_loss, [unlogged_loss], 'Total loss is :')

        def _loss_without_print():
            return unlogged_loss

        total_loss = tf.cond(
            log_this_step, _loss_with_print, _loss_without_print)

        tf.summary.scalar('total_loss', total_loss)
        with tf.control_dependencies([grouped_update]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # The regex skips summaries created under any `clone_*` scope, i.e.
        # from towers other than the first one.
        summary_op = tf.summary.merge_all(scope='(?!clone_)')

    return train_tensor, summary_op
def main(unused_argv):
    """Assembles the `_build_unet` training graph and runs slim's train loop.

    The global batch is split evenly over the deployment clones, summaries
    are collected for variables, losses and (optionally) sample images, and
    training is delegated to `slim.learning.train`.

    Args:
      unused_argv: Ignored.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Deployment description (i.e., multi-GPUs and/or multi-replicas).
    config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.num_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # Every clone receives an equal share of the global batch.
    assert FLAGS.train_batch_size % config.num_clones == 0, (
        'Training batch size not divisble by number of clones (GPUs).')
    clone_batch_size = FLAGS.train_batch_size // config.num_clones

    # Dataset-dependent information.
    dataset = segmentation_dataset.get_dataset(
        FLAGS.dataset, FLAGS.train_split, dataset_dir=FLAGS.dataset_dir)

    tf.gfile.MakeDirs(FLAGS.train_logdir)
    tf.logging.info('Training on %s set', FLAGS.train_split)

    with tf.Graph().as_default() as graph:
        with tf.device(config.inputs_device()):
            samples = input_generator.get(
                dataset,
                FLAGS.train_crop_size,
                clone_batch_size,
                min_resize_value=FLAGS.min_resize_value,
                max_resize_value=FLAGS.max_resize_value,
                resize_factor=FLAGS.resize_factor,
                min_scale_factor=FLAGS.min_scale_factor,
                max_scale_factor=FLAGS.max_scale_factor,
                scale_factor_step_size=FLAGS.scale_factor_step_size,
                dataset_split=FLAGS.train_split,
                is_training=True,
                model_variant=FLAGS.model_variant)
            inputs_queue = prefetch_queue.prefetch_queue(
                samples, capacity=128 * config.num_clones)

        # The global step lives on the device that stores the variables.
        with tf.device(config.variables_device()):
            global_step = tf.train.get_or_create_global_step()

            # Define the model and create one clone per device.
            clone_args = (inputs_queue, dataset, dataset.ignore_label)
            clones = model_deploy.create_clones(
                config, _build_unet, args=clone_args)

            # Only the first clone's update ops are gathered. These contain,
            # for example, the batch_norm updates created by the model.
            first_clone_scope = config.clone_scope(0)
            pending_updates = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        def _first_clone_tensor(tensor_key):
            # strip('/') copes with an empty first-clone scope.
            qualified = '%s/%s:0' % (first_clone_scope, tensor_key)
            return graph.get_tensor_by_name(qualified.strip('/'))

        # Start from whatever summaries already exist in the graph.
        summary_ops = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Histogram for every model variable.
        for var in slim.get_model_variables():
            summary_ops.add(tf.summary.histogram(var.op.name, var))

        # Image summaries: input image, label map, predicted label map.
        if FLAGS.save_summaries_images:
            image_tensor = _first_clone_tensor(common.IMAGE)
            summary_ops.add(
                tf.summary.image('samples/%s' % common.IMAGE, image_tensor))

            label_tensor = _first_clone_tensor(common.LABEL)
            # Scale up pixel values so class ids are visible as grey levels.
            pixel_scaling = max(1, 255 // dataset.num_classes)
            scaled_label = tf.cast(label_tensor * pixel_scaling, tf.uint8)
            summary_ops.add(
                tf.summary.image('samples/%s' % common.LABEL, scaled_label))

            output_tensor = _first_clone_tensor(common.OUTPUT_TYPE)
            class_ids = tf.argmax(output_tensor, 3)
            predictions = tf.expand_dims(class_ids, -1)
            scaled_predictions = tf.cast(
                predictions * pixel_scaling, tf.uint8)
            summary_ops.add(
                tf.summary.image(
                    'samples/%s' % common.OUTPUT_TYPE, scaled_predictions))

        # Scalar summary for every loss of the first clone.
        for clone_loss in tf.get_collection(
                tf.GraphKeys.LOSSES, first_clone_scope):
            summary_ops.add(
                tf.summary.scalar(
                    'losses/%s' % clone_loss.op.name, clone_loss))

        # Build the optimizer on the device chosen by the deployment config.
        with tf.device(config.optimizer_device()):
            learning_rate = train_utils.get_model_learning_rate(
                FLAGS.learning_policy,
                FLAGS.base_learning_rate,
                FLAGS.learning_rate_decay_step,
                FLAGS.learning_rate_decay_factor,
                FLAGS.training_number_of_steps,
                FLAGS.learning_power,
                FLAGS.slow_start_step,
                FLAGS.slow_start_learning_rate)
            optimizer = tf.train.MomentumOptimizer(
                learning_rate, FLAGS.momentum)
            summary_ops.add(
                tf.summary.scalar('learning_rate', learning_rate))

        startup_delay_steps = FLAGS.task * FLAGS.startup_delay_steps

        # Second pass over the model variables; each one gets another
        # histogram op in addition to the one created above.
        for var in slim.get_model_variables():
            summary_ops.add(tf.summary.histogram(var.op.name, var))

        with tf.device(config.variables_device()):
            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones, optimizer)
            total_loss = tf.check_numerics(total_loss, 'Loss is inf or nan.')
            summary_ops.add(tf.summary.scalar('total_loss', total_loss))

            # Rescale the gradients of biases and last-layer variables.
            last_layers = model.get_extra_layer_scopes(
                FLAGS.last_layers_contain_logits_only)
            multipliers = train_utils.get_model_gradient_multipliers(
                last_layers, FLAGS.last_layer_gradient_multiplier)
            if multipliers:
                grads_and_vars = slim.learning.multiply_gradients(
                    grads_and_vars, multipliers)

            # Gradient application, grouped with the gathered update ops.
            apply_op = optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            pending_updates.append(apply_op)
            grouped_update = tf.group(*pending_updates)
            with tf.control_dependencies([grouped_update]):
                train_tensor = tf.identity(total_loss, name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summary_ops.update(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summary_ops))

        # Soft placement allows placing on CPU ops without GPU implementation.
        session_config = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False)

        init_fn = train_utils.get_model_init_fn(
            FLAGS.train_logdir,
            FLAGS.tf_initial_checkpoint,
            FLAGS.initialize_last_layer,
            last_layers,
            ignore_missing_vars=True)

        # Start the training.
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_logdir,
            log_every_n_steps=FLAGS.log_steps,
            master=FLAGS.master,
            number_of_steps=FLAGS.training_number_of_steps,
            is_chief=(FLAGS.task == 0),
            session_config=session_config,
            startup_delay_steps=startup_delay_steps,
            init_fn=init_fn,
            summary_op=summary_op,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs)
def main(unused_arg):
    """Assembles the `_build_model` training graph and runs slim's train loop.

    Input batches come from `get_dataset.get_dataset`, the model is cloned
    through `model_deploy`, optimisation uses Adam, and training is delegated
    to `slim.learning.train` with a saver that keeps up to 50 checkpoints.

    Args:
      unused_arg: Ignored.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Deployment description (i.e., multi-GPUs and/or multi-replicas).
    config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.num_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # Every clone receives an equal share of the global batch.
    assert FLAGS.train_batch_size % config.num_clones == 0, (
        'Training batch size not divisble by number of clones (GPUs).')
    clone_batch_size = FLAGS.train_batch_size // config.num_clones

    tf.gfile.MakeDirs(FLAGS.train_dir)

    with tf.Graph().as_default() as graph:
        with tf.device(config.inputs_device()):
            square_size = [FLAGS.image_size, FLAGS.image_size]
            samples, num_samples = get_dataset.get_dataset(
                FLAGS.dataset,
                FLAGS.dataset_dir,
                split_name=FLAGS.train_split,
                is_training=True,
                image_size=square_size,
                batch_size=clone_batch_size,
                channel=FLAGS.input_channel)
            tf.logging.info(
                'Training on %s set: %d', FLAGS.train_split, num_samples)
            inputs_queue = prefetch_queue.prefetch_queue(
                samples, capacity=128 * config.num_clones)

        # The global step lives on the device that stores the variables.
        with tf.device(config.variables_device()):
            global_step = tf.train.get_or_create_global_step()

            # Define the model and create one clone per device.
            clone_args = (inputs_queue, clone_batch_size)
            clones = model_deploy.create_clones(
                config, _build_model, args=clone_args)

            # Only the first clone's update ops are gathered. These contain,
            # for example, the batch_norm updates created by the model.
            first_clone_scope = config.clone_scope(0)
            pending_updates = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Start from whatever summaries already exist in the graph.
        summary_ops = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Optional histogram for every model variable.
        if FLAGS.save_summaries_variables:
            for var in slim.get_model_variables():
                summary_ops.add(tf.summary.histogram(var.op.name, var))

        # Scalar summary for every loss of the first clone.
        for clone_loss in tf.get_collection(
                tf.GraphKeys.LOSSES, first_clone_scope):
            summary_ops.add(
                tf.summary.scalar(
                    'losses/%s' % clone_loss.op.name, clone_loss))

        # Build the optimizer on the device chosen by the deployment config.
        with tf.device(config.optimizer_device()):
            learning_rate = train_utils.get_model_learning_rate(
                FLAGS.learning_policy,
                FLAGS.base_learning_rate,
                FLAGS.learning_rate_decay_step,
                FLAGS.learning_rate_decay_factor,
                FLAGS.number_of_steps,
                FLAGS.learning_power,
                FLAGS.slow_start_step,
                FLAGS.slow_start_learning_rate)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            summary_ops.add(
                tf.summary.scalar('learning_rate', learning_rate))

        startup_delay_steps = FLAGS.task * FLAGS.startup_delay_steps

        with tf.device(config.variables_device()):
            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones, optimizer)
            total_loss = tf.check_numerics(total_loss, 'Loss is inf or nan.')
            summary_ops.add(
                tf.summary.scalar('losses/total_loss', total_loss))

            # Scopes whose gradients are rescaled; the counts head is only
            # included for the 'protein' dataset when add_counts_logits is set.
            uses_counts_head = (
                (FLAGS.dataset == 'protein') and FLAGS.add_counts_logits)
            last_layers = (
                ['Logits', 'Counts_logits'] if uses_counts_head
                else ['Logits'])
            multipliers = train_utils.get_model_gradient_multipliers(
                last_layers, FLAGS.last_layer_gradient_multiplier)
            if multipliers:
                grads_and_vars = slim.learning.multiply_gradients(
                    grads_and_vars, multipliers)

            # Gradient application, grouped with the gathered update ops.
            apply_op = optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            pending_updates.append(apply_op)
            grouped_update = tf.group(*pending_updates)
            with tf.control_dependencies([grouped_update]):
                train_tensor = tf.identity(total_loss, name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summary_ops.update(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summary_ops))

        # Soft placement allows placing on CPU ops without GPU implementation.
        session_config = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False)
        gpu_options = session_config.gpu_options
        gpu_options.allow_growth = True
        gpu_options.per_process_gpu_memory_fraction = 0.9

        # Built in the same order as the original keyword arguments were
        # evaluated: the init function first, then the checkpoint saver.
        init_fn = train_utils.get_model_init_fn(
            FLAGS.train_dir,
            FLAGS.fine_tune_checkpoint,
            FLAGS.initialize_last_layer,
            last_layers,
            ignore_missing_vars=True)
        checkpoint_saver = tf.train.Saver(max_to_keep=50)

        # Start the training.
        slim.learning.train(
            train_tensor,
            FLAGS.train_dir,
            is_chief=(FLAGS.task == 0),
            master=FLAGS.master,
            graph=graph,
            log_every_n_steps=FLAGS.log_every_n_steps,
            session_config=session_config,
            startup_delay_steps=startup_delay_steps,
            number_of_steps=FLAGS.number_of_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            init_fn=init_fn,
            summary_op=summary_op,
            saver=checkpoint_saver)
def main(unused_argv):
    """Assembles the `_build_deeplab` training graph and runs slim's train loop.

    The global batch is split evenly over the deployment clones, summaries
    are collected for model variables and losses, and training is delegated
    to `slim.learning.train`.

    Args:
      unused_argv: Ignored.

    Raises:
      AssertionError: If `FLAGS.train_batch_size` is not a multiple of the
        number of clones.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Set up deployment (i.e., multi-GPUs and/or multi-replicas).
    config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.num_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # Split the batch across GPUs.
    assert FLAGS.train_batch_size % config.num_clones == 0, (
        'Training batch size not divisble by number of clones (GPUs).')
    # Floor division keeps the per-clone batch size an int; `/` would give a
    # float on Python 3 even though the assert guarantees an exact split.
    clone_batch_size = FLAGS.train_batch_size // config.num_clones

    # Get dataset-dependent information.
    dataset = segmentation_dataset.get_dataset(
        FLAGS.dataset, FLAGS.train_split, dataset_dir=FLAGS.dataset_dir)

    tf.gfile.MakeDirs(FLAGS.train_logdir)
    tf.logging.info('Training on %s set', FLAGS.train_split)

    with tf.Graph().as_default():
        with tf.device(config.inputs_device()):
            samples = input_generator.get(
                dataset,
                FLAGS.train_crop_size,
                clone_batch_size,
                min_resize_value=FLAGS.min_resize_value,
                max_resize_value=FLAGS.max_resize_value,
                resize_factor=FLAGS.resize_factor,
                min_scale_factor=FLAGS.min_scale_factor,
                max_scale_factor=FLAGS.max_scale_factor,
                scale_factor_step_size=FLAGS.scale_factor_step_size,
                dataset_split=FLAGS.train_split,
                is_training=True,
                model_variant=FLAGS.model_variant)
            inputs_queue = prefetch_queue.prefetch_queue(
                samples, capacity=128 * config.num_clones)

        # Create the global step on the device storing the variables.
        with tf.device(config.variables_device()):
            global_step = tf.train.get_or_create_global_step()

            # Define the model and create clones.
            model_fn = _build_deeplab
            model_args = (
                inputs_queue,
                {common.OUTPUT_TYPE: dataset.num_classes},
                dataset.ignore_label)
            clones = model_deploy.create_clones(
                config, model_fn, args=model_args)

            # Gather update_ops from the first clone. These contain, for
            # example, the updates for the batch_norm variables created by
            # model_fn.
            first_clone_scope = config.clone_scope(0)
            update_ops = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Add summaries for model variables.
        for model_var in slim.get_model_variables():
            summaries.add(tf.summary.histogram(model_var.op.name, model_var))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        # Build the optimizer based on the device specification.
        with tf.device(config.optimizer_device()):
            learning_rate = train_utils.get_model_learning_rate(
                FLAGS.learning_policy,
                FLAGS.base_learning_rate,
                FLAGS.learning_rate_decay_step,
                FLAGS.learning_rate_decay_factor,
                FLAGS.training_number_of_steps,
                FLAGS.learning_power,
                FLAGS.slow_start_step,
                FLAGS.slow_start_learning_rate)
            optimizer = tf.train.MomentumOptimizer(
                learning_rate, FLAGS.momentum)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        startup_delay_steps = FLAGS.task * FLAGS.startup_delay_steps

        # NOTE(review): this repeats the model-variable histogram loop above,
        # so each variable gets a second histogram op. Kept as-is to leave the
        # emitted summaries unchanged; confirm whether it is intentional.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        with tf.device(config.variables_device()):
            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones, optimizer)
            total_loss = tf.check_numerics(total_loss, 'Loss is inf or nan.')
            summaries.add(tf.summary.scalar('total_loss', total_loss))

            # Modify the gradients for biases and last layer variables.
            last_layers = model.get_extra_layer_scopes(
                FLAGS.last_layers_contain_logits_only)
            grad_mult = train_utils.get_model_gradient_multipliers(
                last_layers, FLAGS.last_layer_gradient_multiplier)
            if grad_mult:
                grads_and_vars = slim.learning.multiply_gradients(
                    grads_and_vars, grad_mult)

            # Create gradient update op and group it with the update ops so
            # that evaluating `train_tensor` runs all of them.
            grad_updates = optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            update_ops.append(grad_updates)
            update_op = tf.group(*update_ops)
            with tf.control_dependencies([update_op]):
                train_tensor = tf.identity(total_loss, name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries))

        # Soft placement allows placing on CPU ops without GPU implementation.
        session_config = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False)

        # Start the training.
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_logdir,
            log_every_n_steps=FLAGS.log_steps,
            master=FLAGS.master,
            number_of_steps=FLAGS.training_number_of_steps,
            is_chief=(FLAGS.task == 0),
            session_config=session_config,
            startup_delay_steps=startup_delay_steps,
            init_fn=train_utils.get_model_init_fn(
                FLAGS.train_logdir,
                FLAGS.tf_initial_checkpoint,
                FLAGS.initialize_last_layer,
                last_layers,
                ignore_missing_vars=True),
            summary_op=summary_op,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs)
def main(unused_argv):
    """Builds the GVCNN training graph and runs an epoch-based training loop.

    The graph is fed through placeholders. For every batch the discrimination
    scores are computed first, turned into a grouping scheme and grouping
    weights, and then fed to the training step together with the images and
    labels. A checkpoint is written after every epoch.

    Args:
      unused_argv: Ignored.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    SCOPE = "googlenet"

    dataset = data.Data(FLAGS.dataset_dir, FLAGS.height, FLAGS.weight)

    tf.gfile.MakeDirs(FLAGS.train_logdir)
    tf.logging.info('Creating train logdir: %s', FLAGS.train_logdir)

    with tf.Graph().as_default() as graph:
        global_step = tf.train.get_or_create_global_step()

        # Define the model inputs.
        X = tf.placeholder(
            tf.float32,
            [None, FLAGS.num_views, FLAGS.height, FLAGS.weight, 3],
            name='input')
        ground_truth = tf.placeholder(tf.int64, [None], name='ground_truth')
        is_training = tf.placeholder(tf.bool)
        dropout_keep_prob = tf.placeholder(tf.float32)
        grouping_scheme = tf.placeholder(
            tf.bool, [NUM_GROUP, FLAGS.num_views])
        grouping_weight = tf.placeholder(tf.float32, [NUM_GROUP, 1])
        # The learning rate is fed on every step rather than scheduled in
        # the graph.
        learning_rate = tf.placeholder(tf.float32, [], name="lr")

        # Grouping module.
        d_scores = gvcnn.discrimination_score(X)

        # GVCNN
        logits = gvcnn.gvcnn(X,
                             grouping_scheme,
                             grouping_weight,
                             FLAGS.num_classes,
                             is_training,
                             scope=SCOPE,
                             dropout_keep_prob=dropout_keep_prob)

        # Make a trainable variable not trainable.
        train_utils.edit_trainable_variables('fcn')

        # Define loss
        tf.losses.sparse_softmax_cross_entropy(
            labels=ground_truth, logits=logits, scope=SCOPE)

        # Gather update_ops. These contain, for example,
        # the updates for the batch_norm variables created by model.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, SCOPE)

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        prediction = tf.argmax(logits, 1, name='prediction')
        correct_prediction = tf.equal(prediction, ground_truth)
        # Not fetched by the loop below; kept so the op stays in the graph.
        confusion_matrix = tf.confusion_matrix(
            ground_truth, prediction, num_classes=FLAGS.num_classes)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        summaries.add(tf.summary.scalar('accuracy', accuracy))

        # Add summaries for model variables.
        for model_var in slim.get_model_variables():
            summaries.add(tf.summary.histogram(model_var.op.name, model_var))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, SCOPE):
            summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        optimizer = tf.train.MomentumOptimizer(learning_rate, FLAGS.momentum)
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        total_loss, grads_and_vars = train_utils.optimize(
            optimizer, scope=SCOPE)
        total_loss = tf.check_numerics(total_loss, 'Loss is inf or nan.')
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Modify the gradients for biases and last layer variables.
        last_layers = train_utils.get_extra_layer_scopes(
            FLAGS.last_layers_contain_logits_only)
        grad_mult = train_utils.get_model_gradient_multipliers(
            last_layers, FLAGS.last_layer_gradient_multiplier)
        if grad_mult:
            grads_and_vars = slim.learning.multiply_gradients(
                grads_and_vars, grad_mult)

        # Create gradient update op and group it with the update ops so that
        # evaluating `train_op` runs all of them.
        grad_updates = optimizer.apply_gradients(
            grads_and_vars, global_step=global_step)
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_op = tf.identity(total_loss, name='train_op')

        # Add the summaries. These contain the summaries
        # created by model and either optimize() or _gather_loss().
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES, SCOPE))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries))
        train_writer = tf.summary.FileWriter(FLAGS.summaries_dir, graph)

        try:
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())

                # Create a saver object which will save all the variables.
                # TODO: make the checkpoint retention policy configurable.
                saver = tf.train.Saver()
                if FLAGS.tf_initial_checkpoint:
                    saver.restore(sess, FLAGS.tf_initial_checkpoint)

                start_epoch = 0
                # Number of training steps per epoch, rounded up so that a
                # trailing partial batch still gets its own step.
                num_examples = dataset.size()
                t_batches = (
                    (num_examples + FLAGS.batch_size - 1) // FLAGS.batch_size)

                ############################
                # Training loop.
                ############################
                for training_epoch in range(start_epoch,
                                            FLAGS.how_many_training_epochs):
                    print("------------------------")
                    print(" Epoch {} ".format(training_epoch + 1))
                    print("------------------------")

                    dataset.shuffle_all()
                    for step in range(t_batches):
                        # Pull the image batch we'll use for training.
                        train_batch_xs, train_batch_ys = dataset.next_batch(
                            FLAGS.batch_size)

                        # Evaluate the image batch once and feed the same
                        # array to both runs, so the grouping scores and the
                        # training step see identical inputs.
                        batch_images = train_batch_xs.eval()

                        scores = sess.run(
                            d_scores, feed_dict={X: batch_images})
                        schemes = gvcnn.grouping_scheme(
                            scores, NUM_GROUP, FLAGS.num_views)
                        weights = gvcnn.grouping_weight(scores, schemes)

                        # Run the graph with this batch of training data.
                        lr, train_summary, train_accuracy, train_loss, _ = \
                            sess.run(
                                [learning_rate, summary_op, accuracy,
                                 total_loss, train_op],
                                feed_dict={
                                    X: batch_images,
                                    learning_rate: FLAGS.learning_rate,
                                    ground_truth: train_batch_ys,
                                    grouping_scheme: schemes,
                                    grouping_weight: weights,
                                    is_training: True,
                                    dropout_keep_prob: 0.5})

                        # Tag each summary with the global step so events
                        # are spread along the step axis.
                        train_writer.add_summary(
                            train_summary,
                            tf.train.global_step(sess, global_step))
                        tf.logging.info(
                            'Epoch #%d, Step #%d, rate %.10f, '
                            'accuracy %.1f%%, loss %f',
                            training_epoch, step, lr,
                            train_accuracy * 100, train_loss)

                    ###################################################
                    # TODO: Validate the model on the validation set
                    ###################################################

                    # Save a checkpoint after every epoch. The file suffix
                    # is the global step, so that is the value logged.
                    checkpoint_path = os.path.join(
                        FLAGS.train_logdir, 'GVCNN.ckpt')
                    step_value = tf.train.global_step(sess, global_step)
                    tf.logging.info(
                        'Saving to "%s-%d"', checkpoint_path, step_value)
                    saver.save(
                        sess, checkpoint_path, global_step=step_value)
        finally:
            # Flush and release the event file even if training fails.
            train_writer.close()