def eval_one_epoch(args, sess, dataset, image_paths_placeholder,
                   labels_placeholder, is_training_placeholder, enqueue_op,
                   clones):
    batch_size = args.batch_size * args.num_gpus
    image_paths, num_per_class = all_val_entities(args, dataset)
    print('eval image paths', len(image_paths))
    nrof_origin_samples = len(image_paths)
    assert (sum(num_per_class) == nrof_origin_samples)
    #print(num_per_class)
    #print(image_paths[0:10])
    assert (args.batch_size % 3 == 0)
    triplet_size = args.batch_size // 3
    # Pad the image list with the first path so it divides evenly into whole
    # batches; float() guards against Python 2 integer division, which would
    # make the ceil a no-op and break the divisibility assert below.
    _a = int(math.ceil(len(image_paths) / float(batch_size)))
    nrof_samples = _a * batch_size
    while nrof_samples > len(image_paths):
        image_paths.append(image_paths[0])
    #for p in image_paths:
    #    print(p)
    #print('Running forward pass on sampled images: ', end='')
    start_time = time.time()
    nrof_examples = len(image_paths)
    assert (nrof_examples % batch_size == 0)
    labels_array = np.reshape(np.arange(nrof_examples), (-1, 3))
    image_paths_array = np.reshape(np.expand_dims(np.array(image_paths), 1),
                                   (-1, 3))
    print(image_paths_array.shape)
    print(labels_array.shape)
    sess.run(
        enqueue_op, {
            image_paths_placeholder: image_paths_array,
            labels_placeholder: labels_array
        })
    emb_array = np.zeros((nrof_examples, args.embedding_size),
                         dtype=np.float32)
    nrof_batches = int(np.ceil(nrof_examples / batch_size))
    print('eval batches', nrof_batches)
    for i in xrange(nrof_batches):
        if i % 10 == 0:
            prt('running eval batch %d' % i)
        # Fetch embeddings and labels from every clone so that each sess.run
        # consumes one batch per GPU.
        ops = []
        for clone in clones:
            with tf.device(clone.device):
                embeddings, labels, _ = clone.outputs
                ops += [embeddings, labels]
        ops_value = sess.run(ops, feed_dict={is_training_placeholder: False})
        for k in xrange(args.num_gpus):
            emb = ops_value[k * 2]
            #prt(emb.shape)
            lab = ops_value[k * 2 + 1]
            #prt(lab.shape)
            # Scatter embeddings back into position by their queue labels.
            emb_array[lab, :] = emb
    sys.stdout.flush()
    print('%.3f' % (time.time() - start_time))
    # Drop the padded entries before scoring.
    emb_array = emb_array[0:nrof_origin_samples, :]
    score = top1_recall(emb_array, num_per_class)
    print('top1 recall: %f' % score)
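# --- Illustrative sketch (not part of the original file) ---
# top1_recall above is defined elsewhere in this repo; the helper below is a
# hedged guess at the metric it computes, assuming num_per_class lists
# contiguous per-identity group sizes over emb_array: the fraction of samples
# whose nearest neighbor (excluding itself) shares their identity.
def _top1_recall_sketch(emb_array, num_per_class):
    labels = np.repeat(np.arange(len(num_per_class)), num_per_class)
    # Pairwise squared Euclidean distances, with the diagonal masked so a
    # sample cannot match itself.
    d = np.sum(np.square(emb_array[:, None, :] - emb_array[None, :, :]),
               axis=2)
    np.fill_diagonal(d, np.inf)
    nearest = np.argmin(d, axis=1)
    return np.mean(labels[nearest] == labels)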
def do_training(train_op, init_fn=None, summary_op=None, lr=None):
    global savers
    graph = ops.get_default_graph()
    with graph.as_default():
        global_step = variables.get_or_create_global_step()
        saver = tf_saver.Saver(max_to_keep=0)
        with ops.name_scope('init_ops'):
            init_op = tf_variables.global_variables_initializer()
            ready_op = tf_variables.report_uninitialized_variables()
            local_init_op = control_flow_ops.group(
                tf_variables.local_variables_initializer(),
                data_flow_ops.tables_initializer())
        summary_writer = supervisor.Supervisor.USE_DEFAULT
        with ops.name_scope('train_step'):
            train_step_kwargs = {}
            if FLAGS.max_number_of_steps is not None:
                should_stop_op = math_ops.greater_equal(
                    global_step, FLAGS.max_number_of_steps)
            else:
                should_stop_op = constant_op.constant(False)
            train_step_kwargs['should_stop'] = should_stop_op
            if FLAGS.log_every_n_steps > 0:
                train_step_kwargs['should_log'] = math_ops.equal(
                    math_ops.mod(global_step, FLAGS.log_every_n_steps), 0)

        # Map the localization network's variables back to their InceptionV2
        # checkpoint names so each sub-network can be restored separately.
        prefix = "loc/net"
        lp = len(prefix)
        vdic = {
            "InceptionV2" + v.op.name[lp:]: v
            for v in tf.trainable_variables()
            if v.name.startswith(prefix) and v.name.find("Logits/") < 0
        }
        _saver = tf_saver.Saver(vdic)
        savers.append(_saver)
        for i in xrange(NUM_STN):
            prefix = "stn%d/net" % i
            lp = len(prefix)
            vdic = {
                "InceptionV2" + v.op.name[lp:]: v
                for v in tf.trainable_variables()
                if v.name.startswith(prefix) and v.name.find("Logits/") < 0
            }
            # saver = tf.train.Saver(vdic)
            _saver = tf_saver.Saver(vdic)
            savers.append(_saver)
        prt("savers %d" % len(savers))

        is_chief = True
        logdir = FLAGS.train_dir
        sv = supervisor.Supervisor(graph=graph,
                                   is_chief=is_chief,
                                   logdir=logdir,
                                   init_op=init_op,
                                   init_feed_dict=None,
                                   local_init_op=local_init_op,
                                   ready_for_local_init_op=None,
                                   ready_op=ready_op,
                                   summary_op=summary_op,
                                   summary_writer=summary_writer,
                                   global_step=global_step,
                                   saver=saver,
                                   save_summaries_secs=FLAGS.save_summaries_secs,
                                   save_model_secs=FLAGS.save_interval_secs,
                                   init_fn=init_fn)
        if summary_writer is not None:
            train_step_kwargs['summary_writer'] = sv.summary_writer

        with sv.managed_session('', start_standard_services=False,
                                config=None) as sess:
            logging.info('Starting Session.')
            if is_chief:
                if logdir:
                    sv.start_standard_services(sess)
            elif startup_delay_steps > 0:
                # Carried over from slim's learning.train; unreachable here
                # since is_chief is hard-coded to True above.
                _wait_for_step(
                    sess, global_step,
                    min(startup_delay_steps, number_of_steps or sys.maxint))
            sv.start_queue_runners(sess)
            logging.info('Starting Queues.')
            try:
                while not sv.should_stop():
                    total_loss, global_step_value, should_stop = train_step(
                        sess, train_op, global_step, lr, train_step_kwargs)
                    current_epoch = int(
                        math.ceil(float(global_step_value) /
                                  FLAGS.steps_in_epoch))
                    if (global_step_value > 0 and
                            global_step_value % FLAGS.save_every_n_steps == 0):
                        sv.saver.save(sess, sv.save_path,
                                      global_step=sv.global_step)
                    if should_stop:
                        logging.info('Stopping Training.')
                        break
            except errors.OutOfRangeError:
                # OutOfRangeError is thrown when epoch limit per
                # tf.train.limit_epochs is reached.
                logging.info('Caught OutOfRangeError. Stopping Training.')
            if logdir and sv.is_chief:
                logging.info('Finished training! Saving model to disk.')
                sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
def train_step(sess, train_op, global_step, lr, train_step_kwargs):
    """Runs a single gradient step and decides whether to stop.

    Args:
        sess: The current session.
        train_op: An `Operation` that evaluates the gradients and returns the
            total loss.
        global_step: A `Tensor` representing the global training step.
        lr: A `Tensor` holding the current learning rate, fetched for logging.
        train_step_kwargs: A dictionary of keyword arguments.

    Returns:
        The total loss, the global step value, and a boolean indicating
        whether or not to stop training.

    Raises:
        ValueError: if 'should_trace' is in `train_step_kwargs` but `logdir`
            is not.
    """
    start_time = time.time()

    trace_run_options = None
    run_metadata = None
    if 'should_trace' in train_step_kwargs:
        if 'logdir' not in train_step_kwargs:
            raise ValueError('logdir must be present in train_step_kwargs '
                             'when should_trace is present')
        if sess.run(train_step_kwargs['should_trace']):
            trace_run_options = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.FULL_TRACE)
            run_metadata = config_pb2.RunMetadata()

    total_loss, lr_value, np_global_step = sess.run(
        [train_op, lr, global_step],
        options=trace_run_options,
        run_metadata=run_metadata)
    time_elapsed = time.time() - start_time

    if run_metadata is not None:
        tl = timeline.Timeline(run_metadata.step_stats)
        trace = tl.generate_chrome_trace_format()
        trace_filename = os.path.join(train_step_kwargs['logdir'],
                                      'tf_trace-%d.json' % np_global_step)
        logging.info('Writing trace to %s', trace_filename)
        file_io.write_string_to_file(trace_filename, trace)
        if 'summary_writer' in train_step_kwargs:
            train_step_kwargs['summary_writer'].add_run_metadata(
                run_metadata, 'run_metadata-%d' % np_global_step)

    if 'should_log' in train_step_kwargs:
        if sess.run(train_step_kwargs['should_log']):
            logging.info('global step %d: loss = %.4f (%.3f sec/step)',
                         np_global_step, total_loss, time_elapsed)
            prt('global step %d with lr %.4f: loss = %.4f (%.3f sec/step)' %
                (np_global_step, lr_value, total_loss, time_elapsed))

    # TODO(nsilberman): figure out why we can't put this into sess.run. The
    # issue right now is that the stop check depends on the global step. The
    # increment of global step often happens via the train op, which is
    # created using optimizer.apply_gradients.
    #
    # Since running `train_op` causes the global step to be incremented, one
    # would expect that using a control dependency would allow the
    # should_stop check to be run in the same session.run call:
    #
    #     with ops.control_dependencies([train_op]):
    #         should_stop_op = ...
    #
    # However, this actually seems not to work on certain platforms.
    if 'should_stop' in train_step_kwargs:
        should_stop = sess.run(train_step_kwargs['should_stop'])
    else:
        should_stop = False

    return total_loss, np_global_step, should_stop
# os, sys and tf are used below; imported here in case they are not already
# imported earlier in this file.
import os
import sys

import tensorflow as tf
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variables as tf_variables
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.summary import summary
from tensorflow.python.training import optimizer as tf_optimizer
from tensorflow.python.training import saver as tf_saver
from tensorflow.python.training import supervisor
from tensorflow.python.training import sync_replicas_optimizer
from tensorflow.python.training import training_util

slim = tf.contrib.slim

cuda_devices = os.environ['CUDA_VISIBLE_DEVICES']
NUM_GPUS = len(cuda_devices.split(','))
prt("NUM_GPUS %d" % NUM_GPUS)

NUM_CLASSES = 120
NUM_ATTRIBS = 10654
BATCH_PER_GPU = 16
# assert BATCH_SIZE%NUM_GPUS==0
SAVE_EVERY_N_EPOCH = 2
DEFAULT_IMAGE_SIZE = 448
IMAGE_SIZE = DEFAULT_IMAGE_SIZE
if len(sys.argv) > 1:
    IMAGE_SIZE = int(sys.argv[1])
STN_OUT_SIZE = 224
prt("IMAGE_SIZE %d" % IMAGE_SIZE)
INIT_LR = 0.01
LOC_LR = 0.00001
def train_one_epoch(args, sess, dataset, image_paths_placeholder,
                    labels_placeholder, is_training_placeholder, enqueue_op,
                    input_queue, clones, loss, train_op, summary_op,
                    summary_writer):
    global_step = variables.get_or_create_global_step()
    step = sess.run(global_step, feed_dict=None)
    epoch = step // args.epoch_size
    batch_number = 0
    lr = args.learning_rate
    batch_size = args.batch_size * args.num_gpus
    while batch_number < args.epoch_size:
        # Sample people randomly from the dataset
        prt('start to sample entities')
        image_paths, num_per_class = sample_entities(args, dataset)
        #print(num_per_class[0:5])
        #prt(len(image_paths))
        #print(num_per_class)
        #print(image_paths[0:10])
        #print('Running forward pass on sampled images: ', end='')
        start_time = time.time()
        nrof_examples = len(image_paths)
        assert (nrof_examples % batch_size == 0)
        labels_array = np.reshape(np.arange(nrof_examples), (-1, 3))
        image_paths_array = np.reshape(
            np.expand_dims(np.array(image_paths), 1), (-1, 3))
        #print(image_paths_array.shape)
        #print(labels_array.shape)
        sess.run(
            enqueue_op, {
                image_paths_placeholder: image_paths_array,
                labels_placeholder: labels_array
            })
        emb_array = np.zeros((nrof_examples, args.embedding_size))
        nrof_batches = int(np.ceil(nrof_examples / args.batch_size))
        embeddings = clones[0].outputs[0]
        label_batch = clones[0].outputs[1]
        #print(nrof_batches)
        for i in xrange(nrof_batches):
            emb, lab = sess.run([embeddings, label_batch],
                                feed_dict={is_training_placeholder: True})
            emb_array[lab, :] = emb
        print('time for fetching all embedding %.3f' %
              (time.time() - start_time))
        #print(emb_array[0:5,0:5])

        # Select triplets based on the embeddings
        print('Selecting suitable triplets for training')
        triplets, triplets_info = select_triplets(args, emb_array,
                                                  num_per_class, image_paths)
        selection_time = time.time() - start_time
        print(
            '(nrof_random_negs, nrof_triplets) = (%d, %d): time=%.3f seconds'
            % (0, len(triplets), selection_time))
        assert len(triplets) > 0

        #post-processing
        assert (args.batch_size % 3 == 0)
        triplet_size = batch_size // 3
        _a = len(triplets) // triplet_size
        nrof_triplets = _a * triplet_size
        triplets = triplets[0:nrof_triplets]
        #post-processing finish

        # Perform training on the selected triplets
        triplet_paths = list(itertools.chain(*triplets))
        nrof_batches = int(np.ceil(nrof_triplets * 3 / batch_size))
        labels_array = np.reshape(np.arange(len(triplet_paths)), (-1, 3))
        triplet_paths_array = np.reshape(
            np.expand_dims(np.array(triplet_paths), 1), (-1, 3))
        sess.run(
            enqueue_op, {
                image_paths_placeholder: triplet_paths_array,
                labels_placeholder: labels_array
            })
        nrof_examples = len(triplet_paths)
        train_time = 0
        i = 0
        emb_array = np.zeros((nrof_examples, args.embedding_size))
        #loss_array = np.zeros((nrof_triplets,))
        prt('nrof_batches: %d' % nrof_batches)
        while i < nrof_batches:
            start_time = time.time()
            #err, _, step, emb, lab = sess.run([loss, train_op, global_step, embeddings, labels_batch], feed_dict={is_training_placeholder:True})
            #emb_array[lab,:] = emb
            #loss_array[i] = err
            err, _, step = sess.run([loss, train_op, global_step],
                                    feed_dict={is_training_placeholder: True})
            duration = time.time() - start_time
            prt('Epoch: [%d][%d@%d/%d]\tTime %.3f\tLoss %2.3f' %
                (epoch, i, batch_number + 1, args.epoch_size, duration, err))
            batch_number += 1
            i += 1
            train_time += duration
        prt('one sample finish')

        # Add validation loss and accuracy to summary
        summary = tf.Summary()  #pylint: disable=maybe-no-member
        summary.value.add(tag='time/selection', simple_value=selection_time)
        summary_writer.add_summary(summary, step)
    return step
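# --- Illustrative sketch (not part of the original file) ---
# select_triplets used above is defined elsewhere in this repo. The helper
# below sketches the FaceNet-style semi-hard selection such a function
# typically performs, assuming num_per_class gives contiguous group sizes
# over image_paths; the second return value (triplets_info) is left as None
# because its real contents are unknown here.
def _select_triplets_sketch(args, emb_array, num_per_class, image_paths):
    triplets = []
    start = 0
    for n in num_per_class:
        for a in range(start, start + n):
            neg_d = np.sum(np.square(emb_array[a] - emb_array), axis=1)
            neg_d[start:start + n] = np.inf  # never pick same-class negatives
            for p in range(a + 1, start + n):
                pos_d = np.sum(np.square(emb_array[a] - emb_array[p]))
                # Semi-hard condition: negatives within the alpha margin.
                candidates = np.where(neg_d - pos_d < args.alpha)[0]
                if len(candidates) > 0:
                    neg = candidates[np.random.randint(len(candidates))]
                    triplets.append((image_paths[a], image_paths[p],
                                     image_paths[neg]))
        start += n
    np.random.shuffle(triplets)
    return triplets, None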
def main():
    print(args)
    prt('')
    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    if not os.path.isdir(log_dir):
        # Create the log directory if it doesn't exist
        os.makedirs(log_dir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)
    if not os.path.isdir(model_dir):
        # Create the model directory if it doesn't exist
        os.makedirs(model_dir)

    # Store some git revision info in a text file in the log directory
    src_path, _ = os.path.split(os.path.realpath(__file__))

    np.random.seed(seed=args.seed)

    print('Model directory: %s' % model_dir)
    print('Log directory: %s' % log_dir)
    if args.pretrained_model:
        print('Pre-trained model: %s' %
              os.path.expanduser(args.pretrained_model))

    with tf.Graph().as_default():
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=args.num_gpus, clone_on_cpu=False)
        tf.set_random_seed(args.seed)
        #global_step = tf.Variable(0, trainable=False)
        global_step = variables.get_or_create_global_step()

        # Placeholder for the learning rate
        #learning_rate_placeholder = tf.placeholder(tf.float32, name='learning_rate')
        #batch_size_placeholder = tf.placeholder(tf.int32, name='batch_size')
        with tf.device('/cpu:0'):
            is_training_placeholder = tf.placeholder(tf.bool,
                                                     name='is_training')
            image_paths_placeholder = tf.placeholder(tf.string,
                                                     shape=(None, 3),
                                                     name='image_paths')
            labels_placeholder = tf.placeholder(tf.int64,
                                                shape=(None, 3),
                                                name='labels')

            # Each queue element is one triplet: three paths, three labels.
            input_queue = data_flow_ops.FIFOQueue(
                capacity=100000,
                dtypes=[tf.string, tf.int64],
                shapes=[(3, ), (3, )],
                shared_name=None,
                name=None)
            enqueue_op = input_queue.enqueue_many(
                [image_paths_placeholder, labels_placeholder])

            nrof_preprocess_threads = 8
            images_and_labels = []
            for _ in range(nrof_preprocess_threads):
                filenames, label = input_queue.dequeue()
                #filenames = tf.Print(filenames, [tf.shape(filenames)], 'filenames shape:')
                images = []
                for filename in tf.unstack(filenames):
                    #filename = tf.Print(filename, [filename], 'filename = ')
                    file_contents = tf.read_file(filename)
                    image = tf.image.decode_jpeg(file_contents)
                    #image = tf.Print(image, [tf.shape(image)], 'data count = ')
                    if image.dtype != tf.float32:
                        image = tf.image.convert_image_dtype(
                            image, dtype=tf.float32)
                    if args.random_crop:
                        #image = tf.random_crop(image, [args.image_size, args.image_size, 3])
                        bbox = tf.constant([0.0, 0.0, 1.0, 1.0],
                                           dtype=tf.float32,
                                           shape=[1, 1, 4])
                        sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
                            tf.shape(image),
                            bounding_boxes=bbox,
                            area_range=(0.7, 1.0),
                            use_image_if_no_bounding_boxes=True)
                        bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box
                        image = tf.slice(image, bbox_begin, bbox_size)
                    #else:
                    #    image = tf.image.resize_image_with_crop_or_pad(image, args.image_size, args.image_size)
                    image = tf.expand_dims(image, 0)
                    image = tf.image.resize_bilinear(
                        image, [args.image_size, args.image_size],
                        align_corners=False)
                    image = tf.squeeze(image, [0])
                    if args.random_flip:
                        image = tf.image.random_flip_left_right(image)
                    image.set_shape((args.image_size, args.image_size, 3))
                    ##pylint: disable=no-member
                    # Scale pixel values from [0, 1] to [-1, 1].
                    image = tf.subtract(image, 0.5)
                    image = tf.multiply(image, 2.0)
                    #image = tf.Print(image, [tf.shape(image)], 'data count = ')
                    images.append(image)
                    #images.append(tf.image.per_image_standardization(image))
                images_and_labels.append([images, label])

            image_batch, label_batch = tf.train.batch_join(
                images_and_labels,
                batch_size=args.batch_size,
                shapes=[(args.image_size, args.image_size, 3), ()],
                enqueue_many=True,
                capacity=4 * nrof_preprocess_threads * args.batch_size,
                allow_smaller_final_batch=False)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [image_batch, label_batch], capacity=9000)

        def clone_fn(_batch_queue):
            _image_batch, _label_batch = _batch_queue.dequeue()
            embeddings = image_to_embedding(_image_batch,
                                            is_training_placeholder, args)
            # Split embeddings into anchor, positive and negative and
            # calculate triplet loss
            anchor, positive, negative = tf.unstack(
                tf.reshape(embeddings, [-1, 3, args.embedding_size]), 3, 1)
            triplet_loss = triplet_loss_fn(anchor, positive, negative,
                                           args.alpha)
            tf.losses.add_loss(triplet_loss)
            #tf.summary.scalar('learning_rate', learning_rate)
            return embeddings, _label_batch, triplet_loss

        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        first_clone = clones[0]
        triplet_loss = first_clone.outputs[2]
        embeddings = first_clone.outputs[0]
        _label_batch = first_clone.outputs[1]
        #embedding_clones = model_deploy.create_clones(deploy_config, embedding_fn, [batch_queue])
        #first_clone_scope = deploy_config.clone_scope(0)
        #update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)
        update_ops = []

        with tf.device(deploy_config.optimizer_device()):
            learning_rate = get_learning_rate(args)
            opt = get_optimizer(args, learning_rate)

        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, opt, var_list=tf.trainable_variables())
        grad_updates = opt.apply_gradients(clones_gradients,
                                           global_step=global_step)
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops)
        train_op = control_flow_ops.with_dependencies([update_op],
                                                      total_loss,
                                                      name='train_op')

        vdic = [
            v for v in tf.trainable_variables()
            if v.name.find("Logits/") < 0
        ]
        pretrained_saver = tf.train.Saver(vdic)
        saver = tf.train.Saver(max_to_keep=3)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Start running operations on the Graph.
        #gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        #sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        sess = tf.Session()

        # Initialize variables
        sess.run(tf.global_variables_initializer(),
                 feed_dict={is_training_placeholder: True})
        sess.run(tf.local_variables_initializer(),
                 feed_dict={is_training_placeholder: True})

        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)

        with sess.as_default():
            if args.pretrained_model:
                print('Restoring pretrained model: %s' %
                      args.pretrained_model)
                pretrained_saver.restore(
                    sess, os.path.expanduser(args.pretrained_model))

            # Training and validation loop
            epoch = 0
            while epoch < args.max_nrof_epochs:
                eval_one_epoch(args, sess, dataset, image_paths_placeholder,
                               labels_placeholder, is_training_placeholder,
                               enqueue_op, clones)
                # Train for one epoch
                train_one_epoch(args, sess, dataset, image_paths_placeholder,
                                labels_placeholder, is_training_placeholder,
                                enqueue_op, input_queue, clones, total_loss,
                                train_op, summary_op, summary_writer)
                # Save variables and the metagraph if it doesn't exist already
                global_step = variables.get_or_create_global_step()
                step = sess.run(global_step, feed_dict=None)
                # Advance the epoch counter from the global step, mirroring
                # the computation in train_one_epoch; without this the loop
                # never terminates.
                epoch = step // args.epoch_size
                print('one epoch finish', step)
                save_variables_and_metagraph(sess, saver, summary_writer,
                                             model_dir, subdir, step)
                print('saver finish')

    sess.close()
    return model_dir
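# --- Illustrative sketch (not part of the original file) ---
# triplet_loss_fn used inside clone_fn is defined elsewhere in this repo; the
# helper below is a sketch of the standard FaceNet-style triplet loss such a
# function typically computes, assuming anchor/positive/negative are
# [batch, embedding_size] tensors:
#     mean(max(||a - p||^2 - ||a - n||^2 + alpha, 0))
def _triplet_loss_sketch(anchor, positive, negative, alpha):
    pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, positive)), 1)
    neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, negative)), 1)
    basic_loss = tf.add(tf.subtract(pos_dist, neg_dist), alpha)
    return tf.reduce_mean(tf.maximum(basic_loss, 0.0), 0)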
        self.val_key_list = self.key_list[train_count:]

    #def train_key_list():
    #    return self.train_key_list

    #def val_key_list():
    #    return self.val_key_list

    def get_contents(self, car_id):
        return self.data_map[car_id]


NUM_GPUS = 0

dataset = AutoDataset()
dataset.split_train_val(0.7, SEED)
prt("dataset loaded %d %d" %
    (len(dataset.train_key_list), len(dataset.val_key_list)))

BASE_EPOCH_SIZE = 10000


def parse_arguments(argv):
    parser = argparse.ArgumentParser()

    parser.add_argument('--logs_base_dir',
                        type=str,
                        help='Directory where to write event logs.',
                        default='./logs')
    parser.add_argument(
        '--models_base_dir',
        type=str,
        help='Directory where to write trained models and checkpoints.',
        default='./models')