def imagenet_input(is_training):
    """Builds the batched ImageNet input tensors.

    The 'train' split is read (and shuffled) when `is_training` is true,
    otherwise the 'validation' split is read in order. Every image goes
    through the 'mobilenet_v1' preprocessing function before batching.

    Args:
      is_training: bool specifying if train or validation dataset is needed.

    Returns:
      A tuple `(images, labels)` holding one batch of `FLAGS.batch_size`
      preprocessed images and their labels.
    """
    split = 'train' if is_training else 'validation'
    dataset = dataset_factory.get_dataset('imagenet', split, FLAGS.dataset_dir)

    # The provider's shared queue holds between one and two batches.
    data_provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=is_training,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    raw_image, raw_label = data_provider.get(['image', 'label'])

    preprocess = preprocessing_factory.get_preprocessing(
        'mobilenet_v1', is_training=is_training)
    processed_image = preprocess(raw_image, FLAGS.image_size, FLAGS.image_size)

    images, labels = tf.train.batch(
        tensors=[processed_image, raw_label],
        batch_size=FLAGS.batch_size,
        num_threads=4,
        capacity=5 * FLAGS.batch_size)
    return images, labels
def provide_data(split_name, batch_size, dataset_dir, dataset_name='imagenet',
                 num_readers=1, num_threads=1, patch_size=128):
    """Provides batches of fixed-size image patches for compression.

    Args:
      split_name: Either 'train' or 'validation'. Only 'train' enables
        shuffling (both in the reader and in the batching queue).
      batch_size: The number of images in each batch.
      dataset_dir: The directory where the data can be found. If `None`, use
        default.
      dataset_name: Name of the dataset.
      num_readers: Number of dataset readers.
      num_threads: Number of prefetching threads. Only used for the 'train'
        split; other splits always batch with a single thread.
      patch_size: Size of the patch to extract from the image.

    Returns:
      images: A `Tensor` of size [batch_size, patch_size, patch_size, channels]
    """
    is_train = split_name == 'train'
    queue_capacity = 5 * batch_size

    dataset = datasets.get_dataset(
        dataset_name, split_name, dataset_dir=dataset_dir)
    data_provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        num_readers=num_readers,
        common_queue_capacity=queue_capacity,
        common_queue_min=batch_size,
        shuffle=is_train)
    [raw_image] = data_provider.get(['image'])

    # Crop or pad every image to a square patch of side `patch_size`.
    cropped = tf.image.resize_image_with_crop_or_pad(
        raw_image, patch_size, patch_size)
    cropped.shape.assert_is_compatible_with([patch_size, patch_size, 3])

    # Dividing by 142 (rather than 128) keeps the values strictly inside
    # [-1, 1], so that network outputs aren't forced to the extreme ranges.
    scaled = (tf.to_float(cropped) - 128.0) / 142.0

    if not is_train:
        # A single batching thread keeps the evaluation order deterministic.
        return tf.train.batch(
            [scaled],
            batch_size=batch_size,
            num_threads=1,
            capacity=queue_capacity)

    return tf.train.shuffle_batch(
        [scaled],
        batch_size=batch_size,
        num_threads=num_threads,
        capacity=queue_capacity,
        min_after_dequeue=batch_size)
def provide_data(batch_size, dataset_dir, dataset_name='cifar10',
                 split_name='train', one_hot=True):
    """Provides batches of CIFAR images together with their labels.

    Args:
      batch_size: The number of images in each batch.
      dataset_dir: The directory where the CIFAR10 data can be found. If
        `None`, use default.
      dataset_name: Name of the dataset.
      split_name: Should be either 'train' or 'test'. Reading is shuffled only
        for 'train'.
      one_hot: Output one hot vector instead of int32 label.

    Returns:
      images: A `Tensor` of size [batch_size, 32, 32, 3]. Pixel values are
        shifted by 128 and divided by 128 (about [-1, 1] for 8-bit input).
      labels: Either (1) one_hot_labels if `one_hot` is `True`
              A `Tensor` of size [batch_size, num_classes], where each row has
              a single element set to one and the rest set to zeros.
              Or (2) labels if `one_hot` is `False`
              A `Tensor` of size [batch_size], holding the labels as integers.
      num_samples: The number of total samples in the dataset.
      num_classes: The number of classes in the dataset.

    Raises:
      ValueError: if the split_name is not either 'train' or 'test'. Not raised
        in this function itself; presumably by `datasets.get_dataset` (confirm).
    """
    dataset = datasets.get_dataset(
        dataset_name, split_name, dataset_dir=dataset_dir)
    shuffle_examples = split_name == 'train'
    data_provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        common_queue_capacity=5 * batch_size,
        common_queue_min=batch_size,
        shuffle=shuffle_examples)
    raw_image, raw_label = data_provider.get(['image', 'label'])

    # Centre and scale the pixel values.
    scaled_image = (tf.to_float(raw_image) - 128.0) / 128.0

    # tf.train.batch registers the QueueRunner that performs the pre-fetching.
    images, batched_labels = tf.train.batch(
        [scaled_image, raw_label],
        batch_size=batch_size,
        num_threads=32,
        capacity=5 * batch_size)

    # Flatten to rank 1 before the optional one-hot expansion.
    labels = tf.reshape(batched_labels, [-1])
    if one_hot:
        labels = tf.one_hot(labels, dataset.num_classes)

    return images, labels, dataset.num_samples, dataset.num_classes
def provide_data(split_name, batch_size, dataset_dir, dataset_name='imagenet',
                 num_readers=1, num_threads=1, patch_size=128):
    """Provides batches of fixed-size image patches for compression.

    Args:
      split_name: Either 'train' or 'validation'. Only 'train' enables
        shuffling (both in the reader and in the batching queue).
      batch_size: The number of images in each batch.
      dataset_dir: The directory where the data can be found. If `None`, use
        default.
      dataset_name: Name of the dataset.
      num_readers: Number of dataset readers.
      num_threads: Number of prefetching threads. Only used for the 'train'
        split; other splits always batch with a single thread.
      patch_size: Size of the patch to extract from the image.

    Returns:
      images: A `Tensor` of size [batch_size, patch_size, patch_size, channels]
    """
    shuffle_examples = split_name == 'train'
    capacity = 5 * batch_size

    dataset = datasets.get_dataset(
        dataset_name, split_name, dataset_dir=dataset_dir)
    reader = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        num_readers=num_readers,
        common_queue_capacity=capacity,
        common_queue_min=batch_size,
        shuffle=shuffle_examples)
    [full_image] = reader.get(['image'])

    # Crop or pad every image to a square patch of side `patch_size`.
    patch = tf.image.resize_image_with_crop_or_pad(
        full_image, patch_size, patch_size)
    patch.shape.assert_is_compatible_with([patch_size, patch_size, 3])

    # Dividing by 142 (rather than 128) keeps the values strictly inside
    # [-1, 1], so that network outputs aren't forced to the extreme ranges.
    normalized_patch = (tf.to_float(patch) - 128.0) / 142.0

    if shuffle_examples:
        batch = tf.train.shuffle_batch(
            [normalized_patch],
            batch_size=batch_size,
            num_threads=num_threads,
            capacity=capacity,
            min_after_dequeue=batch_size)
    else:
        # A single batching thread keeps the evaluation order deterministic.
        batch = tf.train.batch(
            [normalized_patch],
            batch_size=batch_size,
            num_threads=1,
            capacity=capacity)
    return batch
def provide_data(batch_size, dataset_dir, dataset_name='cifar10',
                 split_name='train', one_hot=True):
    """Provides batches of CIFAR images together with their labels.

    Args:
      batch_size: The number of images in each batch.
      dataset_dir: The directory where the CIFAR10 data can be found. If
        `None`, use default.
      dataset_name: Name of the dataset.
      split_name: Should be either 'train' or 'test'. Reading is shuffled only
        for 'train'.
      one_hot: Output one hot vector instead of int32 label.

    Returns:
      images: A `Tensor` of size [batch_size, 32, 32, 3]. Pixel values are
        shifted by 128 and divided by 128 (about [-1, 1] for 8-bit input).
      labels: Either (1) one_hot_labels if `one_hot` is `True`
              A `Tensor` of size [batch_size, num_classes], where each row has
              a single element set to one and the rest set to zeros.
              Or (2) labels if `one_hot` is `False`
              A `Tensor` of size [batch_size], holding the labels as integers.
      num_samples: The number of total samples in the dataset.
      num_classes: The number of classes in the dataset.

    Raises:
      ValueError: if the split_name is not either 'train' or 'test'. Not raised
        in this function itself; presumably by `datasets.get_dataset` (confirm).
    """
    cifar = datasets.get_dataset(
        dataset_name, split_name, dataset_dir=dataset_dir)
    reader = slim.dataset_data_provider.DatasetDataProvider(
        cifar,
        common_queue_capacity=5 * batch_size,
        common_queue_min=batch_size,
        shuffle=(split_name == 'train'))
    single_image, single_label = reader.get(['image', 'label'])

    # Centre and scale the pixel values.
    single_image = (tf.to_float(single_image) - 128.0) / 128.0

    # tf.train.batch registers the QueueRunner that performs the pre-fetching.
    images, labels = tf.train.batch(
        [single_image, single_label],
        batch_size=batch_size,
        num_threads=32,
        capacity=5 * batch_size)

    # Flatten to rank 1 before the optional one-hot expansion.
    flat_labels = tf.reshape(labels, [-1])
    if not one_hot:
        return images, flat_labels, cifar.num_samples, cifar.num_classes

    one_hot_labels = tf.one_hot(flat_labels, cifar.num_classes)
    return images, one_hot_labels, cifar.num_samples, cifar.num_classes
def test_dataset_factory(self):
    """Checks the 'bot' datasets built by the dataset factory.

    Two protobuf directories are exercised: the fixture in
    `BOT_PROTOBUF_DIR`, whose class and sample counts are hard-coded here,
    and the 'bmw_models' bot, whose expected counts are looked up through
    `utils`. For each split the dataset must be truthy, be exactly a
    `tf_slim.dataset.Dataset`, and report the expected `num_classes` and
    `num_samples`.
    """
    dataset_cls = tf_slim.dataset.Dataset

    # Fixture dataset with known sizes.
    train_set = factory.get_dataset('bot', 'train', BOT_PROTOBUF_DIR)
    validation_set = factory.get_dataset('bot', 'validation', BOT_PROTOBUF_DIR)
    for split_set, exp_classes, exp_samples in (
            (train_set, 5, 3320),
            (validation_set, 5, 350)):
        self.assertTrue(split_set)
        # assertIs reports the actual type on failure, unlike
        # assertTrue(type(...) is ...), which only says "False is not true".
        self.assertIs(type(split_set), dataset_cls)
        self.assertEqual(split_set.num_classes, exp_classes)
        self.assertEqual(split_set.num_samples, exp_samples)

    # Dataset whose expected sizes are derived from the stored data.
    bmw_models_bot_id = 'bmw_models'
    bmw_model_protobuf = dirs.get_protobuf_dir(bmw_models_bot_id)
    train_set = factory.get_dataset('bot', 'train', bmw_model_protobuf)
    validation_set = factory.get_dataset('bot', 'validation', bmw_model_protobuf)
    exp_num_classes = utils.get_number_of_classes_by_labels(
        bmw_model_protobuf)
    exp_train_set_size = utils.get_split_size(bmw_models_bot_id, 'train')
    exp_val_set_size = utils.get_split_size(bmw_models_bot_id, 'validation')
    for split_set, exp_samples in (
            (train_set, exp_train_set_size),
            (validation_set, exp_val_set_size)):
        self.assertTrue(split_set)
        self.assertIs(type(split_set), dataset_cls)
        self.assertEqual(split_set.num_classes, exp_num_classes)
        self.assertEqual(split_set.num_samples, exp_samples)
def provide_data(split_name, batch_size, dataset_dir, num_readers=1,
                 num_threads=1):
    """Provides batches of MNIST digits with pixel values divided by 255.

    Args:
      split_name: Either 'train' or 'test'. Reading is shuffled only for
        'train'.
      batch_size: The number of images in each batch.
      dataset_dir: The directory where the MNIST data can be found.
      num_readers: Number of dataset readers.
      num_threads: Number of prefetching threads.

    Returns:
      images: A `Tensor` of size [batch_size, 28, 28, 1]
      one_hot_labels: A `Tensor` of size [batch_size, dataset.num_classes],
        where each row has a single element set to one and the rest set to
        zeros.
      num_samples: The number of total samples in the dataset.

    Raises:
      ValueError: If `split_name` is not either 'train' or 'test'. Not raised
        in this function itself; presumably by `datasets.get_dataset` (confirm).
    """
    mnist = datasets.get_dataset('mnist', split_name, dataset_dir=dataset_dir)
    shuffle_examples = split_name == 'train'
    reader = slim.dataset_data_provider.DatasetDataProvider(
        mnist,
        num_readers=num_readers,
        common_queue_capacity=2 * batch_size,
        common_queue_min=batch_size,
        shuffle=shuffle_examples)
    raw_image, raw_label = reader.get(['image', 'label'])

    # Scale the pixel values by 1/255.
    scaled_image = tf.to_float(raw_image) / 255.0

    # tf.train.batch registers the QueueRunner that performs the pre-fetching.
    images, labels = tf.train.batch(
        [scaled_image, raw_label],
        batch_size=batch_size,
        num_threads=num_threads,
        capacity=5 * batch_size)

    return images, tf.one_hot(labels, mnist.num_classes), mnist.num_samples
def provide_data(dataset_name='cifar10',
                 split_name='train',
                 dataset_dir=None,
                 batch_size=32,
                 shuffle=True,
                 num_threads=1,
                 patch_height=32,
                 patch_width=32,
                 colors=3):
    """Provides a batch of image data from predefined dataset.

    `dataset_dir` defaults to `None`: Python rejects a parameter without a
    default after parameters that have one, so the previous signature was a
    SyntaxError. Positional order is unchanged.

    Args:
      dataset_name: A string of dataset name. Defaults to 'cifar10'.
      split_name: Either 'train' or 'validation'. Defaults to 'train'.
      dataset_dir: The directory where the data can be found. If `None`, use
        default.
      batch_size: The number of images in each minibatch. Defaults to 32.
      shuffle: Whether to shuffle the read images. Defaults to True.
      num_threads: Number of prefetching threads. Defaults to 1.
      patch_height: A Python integer. The read images height. Defaults to 32.
      patch_width: A Python integer. The read images width. Defaults to 32.
      colors: Number of channels. Defaults to 3.

    Returns:
      The result of `batch_images`; per the original documentation, a float
      `Tensor` with shape [batch_size, patch_height, patch_width, colors]
      representing a batch of images.
    """
    dataset = datasets.get_dataset(
        dataset_name, split_name, dataset_dir=dataset_dir)
    provider = tf.contrib.slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        num_readers=1,
        common_queue_capacity=5 * batch_size,
        common_queue_min=batch_size,
        shuffle=shuffle)
    # Only the 'image' item is requested; it is normalized before batching.
    return batch_images(
        image=normalize_image(provider.get(['image'])[0]),
        patch_height=patch_height,
        patch_width=patch_width,
        colors=colors,
        batch_size=batch_size,
        shuffle=shuffle,
        num_threads=num_threads)
def provide_data(dataset_name='cifar10',
                 split_name='train',
                 dataset_dir=None,
                 batch_size=32,
                 shuffle=True,
                 num_threads=1,
                 patch_height=32,
                 patch_width=32,
                 colors=3):
    """Provides a batch of image data from predefined dataset.

    `dataset_dir` defaults to `None`: Python rejects a parameter without a
    default after parameters that have one, so the previous signature was a
    SyntaxError. Positional order is unchanged.

    Args:
      dataset_name: A string of dataset name. Defaults to 'cifar10'.
      split_name: Either 'train' or 'validation'. Defaults to 'train'.
      dataset_dir: The directory where the data can be found. If `None`, use
        default.
      batch_size: The number of images in each minibatch. Defaults to 32.
      shuffle: Whether to shuffle the read images. Defaults to True.
      num_threads: Number of prefetching threads. Defaults to 1.
      patch_height: A Python integer. The read images height. Defaults to 32.
      patch_width: A Python integer. The read images width. Defaults to 32.
      colors: Number of channels. Defaults to 3.

    Returns:
      The result of `batch_images`; per the original documentation, a float
      `Tensor` with shape [batch_size, patch_height, patch_width, colors]
      representing a batch of images.
    """
    dataset = datasets.get_dataset(
        dataset_name, split_name, dataset_dir=dataset_dir)
    provider = tf.contrib.slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        num_readers=1,
        common_queue_capacity=5 * batch_size,
        common_queue_min=batch_size,
        shuffle=shuffle)
    # Only the 'image' item is requested; it is normalized before batching.
    return batch_images(
        image=normalize_image(provider.get(['image'])[0]),
        patch_height=patch_height,
        patch_width=patch_width,
        colors=colors,
        batch_size=batch_size,
        shuffle=shuffle,
        num_threads=num_threads)
def provide_data(split_name, batch_size, dataset_dir, num_readers=1,
                 num_threads=1):
    """Provides batches of MNIST digits, centred at 128 and scaled by 1/128.

    Args:
      split_name: Either 'train' or 'test'. Reading is shuffled only for
        'train'.
      batch_size: The number of images in each batch.
      dataset_dir: The directory where the MNIST data can be found.
      num_readers: Number of dataset readers.
      num_threads: Number of prefetching threads.

    Returns:
      images: A `Tensor` of size [batch_size, 28, 28, 1]
      one_hot_labels: A `Tensor` of size [batch_size, dataset.num_classes],
        where each row has a single element set to one and the rest set to
        zeros.
      num_samples: The number of total samples in the dataset.

    Raises:
      ValueError: If `split_name` is not either 'train' or 'test'. Not raised
        in this function itself; presumably by `datasets.get_dataset` (confirm).
    """
    mnist = datasets.get_dataset('mnist', split_name, dataset_dir=dataset_dir)
    shuffle_examples = split_name == 'train'
    reader = slim.dataset_data_provider.DatasetDataProvider(
        mnist,
        num_readers=num_readers,
        common_queue_capacity=2 * batch_size,
        common_queue_min=batch_size,
        shuffle=shuffle_examples)
    raw_image, raw_label = reader.get(['image', 'label'])

    # Centre and scale the pixel values.
    centred_image = (tf.to_float(raw_image) - 128.0) / 128.0

    # tf.train.batch registers the QueueRunner that performs the pre-fetching.
    images, labels = tf.train.batch(
        [centred_image, raw_label],
        batch_size=batch_size,
        num_threads=num_threads,
        capacity=5 * batch_size)

    return images, tf.one_hot(labels, mnist.num_classes), mnist.num_samples
def main(_):
    """Builds the selected network on a placeholder and saves its GraphDef.

    The 'train' split of the dataset is loaded only to obtain the number of
    classes. The network is applied to a float32 placeholder named 'input'
    of shape [batch_size, image_size, image_size, 3], and the resulting
    graph definition is written in binary form to `FLAGS.output_file`.

    Raises:
      ValueError: if `--output_file` was not supplied.
    """
    output_path = FLAGS.output_file
    if not output_path:
        raise ValueError(
            'You must supply the path to save to with --output_file')

    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as export_graph:
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
        num_classes = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=num_classes,
            is_training=FLAGS.is_training)

        # Fall back to the network's own default when no size is given.
        side = FLAGS.image_size or network_fn.default_image_size
        input_shape = [FLAGS.batch_size, side, side, 3]
        inputs = tf.placeholder(
            name='input', dtype=tf.float32, shape=input_shape)

        # Called for its side effect of adding the model ops to the graph.
        network_fn(inputs)

        graph_def = export_graph.as_graph_def()
        with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
            out_file.write(graph_def.SerializeToString())
def eval_model(candidate, N, F, save_dir, model_name):
    """Evaluates the latest checkpoint on the 'test' split while logging power.

    The network is built from `FLAGS.model_name` together with `candidate`,
    `N` and `F`, restored from `FLAGS.train_dir`, and evaluated once. The
    power logger `pl` runs around the evaluation; its GPU power trace is
    pickled into `save_dir` and integrated into a time and an energy figure.

    Side effects on the global flags: `FLAGS.batch_size` is overwritten with
    100 and `FLAGS.checkpoint_path` with `FLAGS.train_dir`.

    Args:
      candidate: Forwarded unchanged to `nets_factory.get_network_fn`.
      N: Forwarded to `nets_factory.get_network_fn`; also part of the trace
        file name.
      F: Forwarded to `nets_factory.get_network_fn`; also part of the trace
        file name.
      save_dir: Directory in which the pickled power trace is written.
      model_name: Prefix of the trace file name. The network itself is chosen
        by `FLAGS.model_name`, not by this argument.

    Returns:
      A tuple `(integration_time, integration_energy)` computed over the
      `[start, end]` window that `get_start_end` selects in the power trace.
    """
    print("eval model")
    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = slim.get_or_create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'test', FLAGS.dataset_dir)

        ####################
        # Select the model #
        ####################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name, candidate, N, F,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            is_training=False)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        [image, label] = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        eval_image_size = network_fn.default_image_size
        image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

        # NOTE: this overrides the global flag for everything that follows.
        # The provider queue above was sized with the previous value.
        FLAGS.batch_size = 100
        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        ####################
        # Define the model #
        ####################
        logits, _ = network_fn(images)

        if FLAGS.moving_average_decay:
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        predictions = tf.argmax(logits, 1)
        labels = tf.squeeze(labels)

        # Define the metrics:
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
            'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        })

        # Print the summaries to screen.
        for name, value in names_to_values.items():
            summary_name = 'eval/%s' % name
            op = tf.summary.scalar(summary_name, value, collections=[])
            op = tf.Print(op, [value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

        # TODO(sguada) use num_epochs=1
        if FLAGS.max_num_batches:
            num_batches = FLAGS.max_num_batches
        else:
            # This ensures that we make a single pass over all of the data.
            num_batches = math.ceil(
                dataset.num_samples / float(FLAGS.batch_size))

        # Always evaluate what training wrote: the flag is redirected to the
        # training directory before the checkpoint is resolved.
        FLAGS.checkpoint_path = FLAGS.train_dir
        if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        else:
            checkpoint_path = FLAGS.checkpoint_path

        tf.logging.info('Evaluating %s' % checkpoint_path)

        final_op = [names_to_values['Accuracy']]  # top1 accuracy to return

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        # Power logging brackets the evaluation; the logger is stopped even
        # if the evaluation raises, so it is never left running.
        pl.start()
        try:
            slim.evaluation.evaluate_once(
                master=FLAGS.master,
                checkpoint_path=checkpoint_path,
                logdir=FLAGS.eval_dir,
                session_config=config,
                num_evals=num_batches,
                eval_op=list(names_to_updates.values()),
                final_op=final_op,
                variables_to_restore=variables_to_restore)
        finally:
            pl.stop()

        data_list = pl.getDataTrace(nodeName='module/gpu', valType='power')

        # Persist the raw trace; the context manager closes the file handle
        # that the previous bare open() call leaked.
        trace_path = os.path.join(
            save_dir, model_name + '_data_list_final_{}_{}.pkl'.format(N, F))
        with open(trace_path, 'wb') as trace_file:
            pickle.dump(data_list, trace_file)

        # data_list[0] holds the time values, data_list[1] the power values.
        power_list = data_list[1]
        time_list = data_list[0]
        start, end = get_start_end(power_list)
        integration_time = time_list[end] - time_list[start]
        integration_energy = integrate_power(power_list, time_list, start, end)
        return integration_time, integration_energy
def train_model(candidate, N, F):
    """Builds the training graph for one candidate network and trains it.

    All configuration other than the three arguments is read from `FLAGS`.
    The function constructs the input pipeline, one or more model clones
    through `model_deploy`, the optimizer and the summaries, and then hands
    control to `slim.learning.train`. Nothing is returned.

    Args:
      candidate: Forwarded unchanged to `nets_factory.get_network_fn`.
      N: Forwarded unchanged to `nets_factory.get_network_fn`.
      F: Forwarded unchanged to `nets_factory.get_network_fn`.

    Raises:
      ValueError: if `FLAGS.dataset_dir` is not set.
    """
    print("train model")
    print(FLAGS.dataset_name)
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        #######################
        # Config model_deploy #
        #######################
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        # Create global_step
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        ######################
        # Select the network #
        ######################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name, candidate, N, F,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            weight_decay=FLAGS.weight_decay,
            is_training=True)

        #####################################
        # Select the preprocessing function #
        #####################################
        # The preprocessing defaults to the one registered under the model name.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name,
            is_training=True)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)
            [image, label] = provider.get(['image', 'label'])
            label -= FLAGS.labels_offset

            train_image_size = FLAGS.train_image_size or network_fn.default_image_size

            image = image_preprocessing_fn(image, train_image_size, train_image_size)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            labels = slim.one_hot_encoding(
                labels, dataset.num_classes - FLAGS.labels_offset)
            # The prefetch queue holds two batches per clone.
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)

        ####################
        # Define the model #
        ####################
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            images, labels = batch_queue.dequeue()
            logits, end_points = network_fn(images)

            #############################
            # Specify the loss function #
            #############################
            # The auxiliary head, when the network has one, is weighted 0.4.
            if 'AuxLogits' in end_points:
                slim.losses.softmax_cross_entropy(
                    end_points['AuxLogits'], labels,
                    label_smoothing=FLAGS.label_smoothing, weights=0.4, scope='aux_loss')
            slim.losses.softmax_cross_entropy(
                logits, labels, label_smoothing=FLAGS.label_smoothing, weights=1.0)
            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                            tf.nn.zero_fraction(x)))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        #################################
        # Configure the moving averages #
        #################################
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.sync_replicas:
            # If sync_replicas is enabled, the averaging will be done in the chief
            # queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                total_num_replicas=FLAGS.worker_replicas,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables)
        elif FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = _get_variables_to_train()

        # Compute the total loss and the gradients across all clones.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones,
            optimizer,
            var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        # Evaluating train_tensor forces every update op to run first.
        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                           first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        ###########################
        # Kicks off the training. #
        ###########################
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master=FLAGS.master,
            is_chief=(FLAGS.task == 0),
            init_fn=_get_init_fn(),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=optimizer if FLAGS.sync_replicas else None)
def main(_):
    """Evaluates a checkpoint of the selected model once over the dataset.

    The dataset split, model, preprocessing and checkpoint are all chosen
    through `FLAGS`. Streaming 'Accuracy' and 'Recall_5' metrics are
    accumulated over the evaluation batches and printed as 'eval/...'
    summaries.

    Raises:
      ValueError: if `--dataset_dir` was not supplied.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        global_step = slim.get_or_create_global_step()

        # Dataset and model.
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            is_training=False)

        # Unshuffled reader over the dataset.
        data_provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        raw_image, label = data_provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        # The preprocessing defaults to the one registered under the model
        # name, and the image size to the network's own default.
        preprocess = preprocessing_factory.get_preprocessing(
            FLAGS.preprocessing_name or FLAGS.model_name,
            is_training=False)
        side = FLAGS.eval_image_size or network_fn.default_image_size
        image = preprocess(raw_image, side, side)

        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        # Model outputs.
        logits, _ = network_fn(images)

        if not FLAGS.moving_average_decay:
            variables_to_restore = slim.get_variables_to_restore()
        else:
            # Restore the averaged weights, plus the global step itself.
            averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
            variables_to_restore = averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[global_step.op.name] = global_step

        predictions = tf.argmax(logits, 1)
        labels = tf.squeeze(labels)

        # Streaming metrics: value tensors and their update ops.
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
            'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
            'Recall_5': slim.metrics.streaming_recall_at_k(logits, labels, 5),
        })

        # Each metric becomes a scalar summary that is also printed.
        for metric_name, metric_value in names_to_values.items():
            summary_name = 'eval/%s' % metric_name
            summary = tf.summary.scalar(
                summary_name, metric_value, collections=[])
            summary = tf.Print(summary, [metric_value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, summary)

        # TODO(sguada) use num_epochs=1
        # Without an explicit limit, make a single pass over all of the data.
        num_batches = FLAGS.max_num_batches or math.ceil(
            dataset.num_samples / float(FLAGS.batch_size))

        # A directory resolves to its most recent checkpoint.
        checkpoint_path = (
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)
            if tf.gfile.IsDirectory(FLAGS.checkpoint_path)
            else FLAGS.checkpoint_path)

        tf.logging.info('Evaluating %s' % checkpoint_path)

        slim.evaluation.evaluate_once(
            master=FLAGS.master,
            checkpoint_path=checkpoint_path,
            logdir=FLAGS.eval_dir,
            num_evals=num_batches,
            eval_op=list(names_to_updates.values()),
            variables_to_restore=variables_to_restore)
def run_transfer_learning(root_model_dir, bot_model_dir, protobuf_dir, model_name='inception_v4',
                          dataset_split_name='train', dataset_name='bot', checkpoint_exclude_scopes=None,
                          trainable_scopes=None, max_train_time_sec=None, max_number_of_steps=None,
                          log_every_n_steps=None, save_summaries_secs=None, optimization_params=None):
    """
    Starts the transfer learning of a model in a tensorflow session

    Falsy arguments fall back to the module-level defaults.

    :param root_model_dir: Directory containing the root model's pretrained checkpoint files
    :param bot_model_dir: Directory where the transfer learned model's checkpoint files are written to
    :param protobuf_dir: Directory for the dataset factory to load the bot's training data from
    :param model_name: name of the network model for the net factory to provide the correct network and
        preprocessing fn
    :param dataset_split_name: 'train' or 'validation'
    :param dataset_name: triggers the dataset factory to load a bot dataset
    :param checkpoint_exclude_scopes: Layers to exclude when restoring the model's variables
    :param trainable_scopes: Layers to train from the restored model
    :param max_train_time_sec: time boundary to stop training after in seconds
    :param max_number_of_steps: maximum number of steps to run. NOTE: currently not forwarded to
        slim.learning.train (the `number_of_steps` argument is commented out below).
    :param log_every_n_steps: write a log after every nth optimization step
    :param save_summaries_secs: save summaries to disk every n seconds
    :param optimization_params: mapping of optimization parameters. Entries given here override the
        corresponding entries of OPTIMIZATION_PARAMS; missing entries keep their default. The keys read in
        this function are 'weight_decay', 'dropout_keep_prob' and 'moving_average_decay'.
        NOTE(review): _configure_learning_rate and _configure_optimizer do not receive this mapping --
        confirm whether they should.
    :return: None
    """
    # Layer the caller's overrides on top of the defaults. Previously the
    # argument was accepted but the module-level OPTIMIZATION_PARAMS was read
    # everywhere below, so overrides were silently ignored.
    params = dict(OPTIMIZATION_PARAMS)
    if optimization_params:
        params.update(optimization_params)

    if not max_number_of_steps:
        max_number_of_steps = _MAX_NUMBER_OF_STEPS
    if not checkpoint_exclude_scopes:
        checkpoint_exclude_scopes = _CHECKPOINT_EXCLUDE_SCOPES
    if not trainable_scopes:
        trainable_scopes = _TRAINABLE_SCOPES
    if not max_train_time_sec:
        max_train_time_sec = _MAX_TRAIN_TIME_SECONDS
    if not log_every_n_steps:
        log_every_n_steps = _LOG_EVERY_N_STEPS
    if not save_summaries_secs:
        save_summaries_secs = _SAVE_SUMMARRIES_SECS

    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default():
        #######################
        # Config model_deploy #
        #######################
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=_NUM_CLONES,
            clone_on_cpu=_CLONE_ON_CPU,
            replica_id=_TASK,
            num_replicas=_WORKER_REPLICAS,
            num_ps_tasks=_NUM_PS_TASKS)

        # Create global_step
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(
            dataset_name, dataset_split_name, protobuf_dir)

        ######################
        # Select the network #
        ######################
        network_fn = nets_factory.get_network_fn(
            model_name,
            num_classes=(dataset.num_classes - _LABELS_OFFSET),
            weight_decay=params['weight_decay'],
            is_training=True,
            dropout_keep_prob=params['dropout_keep_prob'])

        #####################################
        # Select the preprocessing function #
        #####################################
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            model_name,
            is_training=True)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=_NUM_READERS,
                common_queue_capacity=20 * _BATCH_SIZE,
                common_queue_min=10 * _BATCH_SIZE)
            [image, label] = provider.get(['image', 'label'])
            label -= _LABELS_OFFSET

            train_image_size = network_fn.default_image_size

            image = image_preprocessing_fn(image, train_image_size, train_image_size)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=_BATCH_SIZE,
                num_threads=_NUM_PREPROCESSING_THREADS,
                capacity=5 * _BATCH_SIZE)
            labels = slim.one_hot_encoding(
                labels, dataset.num_classes - _LABELS_OFFSET)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)

        ####################
        # Define the model #
        ####################
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            images, labels = batch_queue.dequeue()
            logits, end_points = network_fn(images)

            #############################
            # Specify the loss function #
            #############################
            # The auxiliary head, when the network exposes one, contributes
            # with a weight of 0.4.
            if 'AuxLogits' in end_points:
                tf.losses.softmax_cross_entropy(
                    logits=end_points['AuxLogits'], onehot_labels=labels,
                    label_smoothing=_LABEL_SMOOTHING, weights=0.4, scope='aux_loss')
            tf.losses.softmax_cross_entropy(
                logits=logits, onehot_labels=labels,
                label_smoothing=_LABEL_SMOOTHING, weights=1.0)
            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                            tf.nn.zero_fraction(x)))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        #################################
        # Configure the moving averages #
        #################################
        moving_average_decay = params['moving_average_decay']
        if moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if _SYNC_REPLICAS:
            # If sync_replicas is enabled, the averaging will be done in the chief
            # queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=_REPLICAS_TO_AGGREGATE,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables,
                replica_id=tf.constant(_TASK, tf.int32, shape=()),
                total_num_replicas=_WORKER_REPLICAS)
        elif moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = _get_variables_to_train(trainable_scopes)

        # and returns a train_tensor and summary_op
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones,
            optimizer,
            var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        # Evaluating train_tensor runs every update op and yields the loss.
        update_op = tf.group(*update_ops)
        train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                          name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                           first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        ###########################
        # Kicks off the training. #
        ###########################
        slim.learning.train(
            train_tensor,
            logdir=bot_model_dir,
            train_step_fn=train_step,  # Manually added a custom train step to stop after max_time
            train_step_kwargs=_train_step_kwargs(logdir=bot_model_dir,
                                                 max_train_time_seconds=max_train_time_sec),
            master=_MASTER,
            is_chief=(_TASK == 0),
            init_fn=_get_init_fn(root_model_dir, bot_model_dir, checkpoint_exclude_scopes),
            summary_op=summary_op,
            # number_of_steps=max_number_of_steps,
            log_every_n_steps=log_every_n_steps,
            save_summaries_secs=save_summaries_secs,
            save_interval_secs=_SAVE_INTERNAL_SECS,
            sync_optimizer=optimizer if _SYNC_REPLICAS else None)
def eval(bot_id, bot_suffix='', setting_id=None, validation_setting=2, dataset_split='validation',
         dataset_name='bot', model_name='inception_v4', preprocessing=None, moving_average_decay=None,
         tf_master=''):
    """
    Evaluates a bot's model checkpoint once and writes the results to the bot's performance data directory

    :param bot_id: id of the bot; used to look up the protobuf and performance data directories
    :param bot_suffix: appended to bot_id to form the id used to look up the model directory
    :param setting_id: if truthy, the transfer-learning protobuf and model directories are used instead of
        the bot's regular ones
    :param validation_setting: passed to dirs.get_transfer_proto_dir when setting_id is given
    :param dataset_split: dataset split handed to the dataset factory
    :param dataset_name: dataset name handed to the dataset factory
    :param model_name: name of the network model handed to the net factory
    :param preprocessing: name of the preprocessing function; falls back to model_name when falsy
    :param moving_average_decay: if truthy, the moving-average values of the model variables are restored
    :param tf_master: passed as `master` to slim.evaluation.evaluate_once
    :return: None
    """
    full_id = bot_id + bot_suffix

    # Pick the directory pair to read from.
    if not setting_id:
        protobuf_dir = dirs.get_protobuf_dir(bot_id)
        model_dir = dirs.get_model_data_dir(full_id)
    else:
        protobuf_dir = dirs.get_transfer_proto_dir(bot_id, validation_setting)
        model_dir = dirs.get_transfer_model_dir(full_id, setting_id)

    for required_dir in (protobuf_dir, model_dir):
        _check_dir(required_dir)

    print("READIND FROM %s AND %s" % (protobuf_dir, model_dir))

    performance_data_dir = dirs.get_performance_data_dir(bot_id)
    # if os.listdir(performance_data_dir):
    #     raise ValueError('%s is not empty' % performance_data_dir)

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        global_step = slim.get_or_create_global_step()

        # Dataset to evaluate on.
        dataset = dataset_factory.get_dataset(dataset_name, dataset_split, protobuf_dir)

        # Network in inference mode; the class count is shifted by the
        # label offset.
        num_classes = dataset.num_classes - LABELS_OFFSET
        network_fn = nets_factory.get_network_fn(
            model_name,
            num_classes=num_classes,
            is_training=False)

        # Unshuffled provider reading single examples from the dataset.
        data_provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * BATCH_SIZE,
            common_queue_min=BATCH_SIZE)
        image, label = data_provider.get(['image', 'label'])
        label -= LABELS_OFFSET

        # Preprocessing defaults to the one registered under the model name.
        preprocess = preprocessing_factory.get_preprocessing(
            preprocessing or model_name,
            is_training=False)

        # Fall back to the network's default input size when no size is
        # configured.
        side = EVAL_IMAGE_SIZE or network_fn.default_image_size
        image = preprocess(image, side, side)

        images, labels = tf.train.batch(
            [image, label],
            batch_size=BATCH_SIZE,
            num_threads=NUM_THREADS,
            capacity=5 * BATCH_SIZE)

        # Forward pass; only the logits are needed here.
        logits, _ = network_fn(images)

        if not moving_average_decay:
            variables_to_restore = slim.get_variables_to_restore()
        else:
            # Restore the moving-average values of the model variables and
            # add the global step to the restore map explicitly.
            averages = tf.train.ExponentialMovingAverage(
                moving_average_decay, global_step)
            variables_to_restore = averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[global_step.op.name] = global_step

        predictions = tf.argmax(logits, 1)
        labels = tf.squeeze(labels)

        # Streaming metrics and their update ops.
        metric_map = {
            'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
            'Recall_5': slim.metrics.streaming_recall_at_k(logits, labels, 5),
        }
        names_to_values, names_to_updates = (
            slim.metrics.aggregate_metric_map(metric_map))

        # Wrap each metric summary in a Print op and register it in the
        # SUMMARIES collection.
        for metric_name, metric_value in names_to_values.items():
            summary_name = 'eval/%s' % metric_name
            summary = tf.summary.scalar(
                summary_name, metric_value, collections=[])
            summary = tf.Print(summary, [metric_value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, summary)

        # TODO(sguada) use num_epochs=1
        # Without an explicit limit, use enough batches for a single pass
        # over all of the data.
        num_batches = MAX_NUM_BATCHES or math.ceil(
            dataset.num_samples / float(BATCH_SIZE))

        # A directory is resolved to its latest checkpoint.
        checkpoint_path = model_dir
        if tf.gfile.IsDirectory(model_dir):
            checkpoint_path = tf.train.latest_checkpoint(model_dir)

        tf.logging.info('Evaluating %s' % checkpoint_path)

        slim.evaluation.evaluate_once(
            master=tf_master,
            checkpoint_path=checkpoint_path,
            logdir=performance_data_dir,
            num_evals=num_batches,
            eval_op=list(names_to_updates.values()),
            variables_to_restore=variables_to_restore)