def _evaluate(self, session, step):
  var_name_to_value = session.run(self._var_name_to_train_var)
  logging.info('Building placeholders.')
  placeholder_to_value = {
      self._var_name_to_placeholder[v_name]: var_name_to_value[v_name]
      for v_name in var_name_to_value
  }

  def feed_variables(scaffold, session):
    del scaffold
    session.run(self._var_feed_op, feed_dict=placeholder_to_value)

  logging.info('Building scaffold.')
  scaffold = training.Scaffold(
      init_fn=feed_variables, copy_from_scaffold=self._scaffold)

  with self._graph.as_default():
    mlperf_log.resnet_print(key=mlperf_log.EVAL_START)
    eval_results = self._estimator._evaluate_run(
        checkpoint_path=None,
        scaffold=scaffold,
        update_op=self._update_op,
        eval_dict=self._eval_dict,
        all_hooks=self._all_hooks,
        output_dir=self._eval_dir)
    logging.info('Eval done.')

  self._timer.update_last_triggered_step(step)
  return eval_results
def process_record_dataset(dataset, is_training, batch_size, shuffle_buffer,
                           parse_record_fn, num_epochs=1, num_gpus=None,
                           examples_per_epoch=None, dtype=tf.float32):
  """Given a Dataset with raw records, return an iterator over the records.

  Args:
    dataset: A Dataset representing raw records.
    is_training: A boolean denoting whether the input is for training.
    batch_size: The number of samples per batch.
    shuffle_buffer: The buffer size to use when shuffling records. A larger
      value results in better randomness, but smaller values reduce startup
      time and use less memory.
    parse_record_fn: A function that takes a raw record and returns the
      corresponding (image, label) pair.
    num_epochs: The number of epochs to repeat the dataset.
    num_gpus: The number of gpus used for training.
    examples_per_epoch: The number of examples in an epoch.
    dtype: Data type to use for images/features.

  Returns:
    Dataset of (image, label) pairs ready for iteration.
  """
  # We prefetch a batch at a time. This can help smooth out the time taken to
  # load input files as we go through shuffling and processing.
  dataset = dataset.prefetch(buffer_size=batch_size)
  if is_training:
    # Shuffle the records. Note that we shuffle before repeating to ensure
    # that the shuffling respects epoch boundaries.
    mlperf_log.resnet_print(key=mlperf_log.INPUT_ORDER)
    dataset = dataset.shuffle(buffer_size=shuffle_buffer)

  # If we are training over multiple epochs before evaluating, repeat the
  # dataset for the appropriate number of epochs.
  dataset = dataset.repeat(num_epochs)

  # Parse the raw records into images and labels. Testing has shown that
  # setting num_parallel_batches > 1 produces no improvement in throughput,
  # since batch_size is almost always much greater than the number of CPU
  # cores.
  dataset = dataset.apply(
      tf.contrib.data.map_and_batch(
          lambda value: parse_record_fn(value, is_training, dtype),
          batch_size=batch_size,
          num_parallel_batches=1))

  # Operations between the final prefetch and the get_next call to the
  # iterator will happen synchronously during run time. We prefetch here
  # again to background all of the above processing work and keep it out of
  # the critical training path. Setting buffer_size to
  # tf.contrib.data.AUTOTUNE allows DistributionStrategies to adjust how many
  # batches to fetch based on how many devices are present.
  dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

  return dataset
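# Hedged usage sketch (not from the original source): wiring
# process_record_dataset into a minimal TF 1.x input pipeline. The file
# pattern, parse function, and buffer sizes below are illustrative
# assumptions, not values from this codebase; `tf` is assumed to be this
# module's `import tensorflow as tf`.
def _toy_parse_record(raw_record, is_training, dtype):
  # Assumed TFRecord layout: a serialized Example with a JPEG bytestring and
  # an int64 label; real parse_record_fn implementations differ.
  features = tf.parse_single_example(
      raw_record,
      features={
          'image': tf.FixedLenFeature([], tf.string),
          'label': tf.FixedLenFeature([], tf.int64),
      })
  image = tf.image.decode_jpeg(features['image'], channels=3)
  image = tf.image.resize_images(image, [224, 224])
  return tf.cast(image, dtype), features['label']

def _toy_input_fn():
  filenames = tf.data.Dataset.list_files('/tmp/train-*', shuffle=False)
  dataset = filenames.flat_map(tf.data.TFRecordDataset)
  return process_record_dataset(
      dataset=dataset,
      is_training=True,
      batch_size=32,
      shuffle_buffer=1000,
      parse_record_fn=_toy_parse_record,
      num_epochs=1)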
def block_group(inputs, filters, block_fn, blocks, strides, is_training, name,
                data_format='channels_first'):
  """Creates one group of blocks for the ResNet model.

  Args:
    inputs: `Tensor` of size `[batch, channels, height, width]`.
    filters: `int` number of filters for the first convolution of the layer.
    block_fn: `function` for the block to use within the model.
    blocks: `int` number of blocks contained in the layer.
    strides: `int` stride to use for the first convolution of the layer. If
      greater than 1, this layer will downsample the input.
    is_training: `bool` for whether the model is training.
    name: `str` name for the Tensor output of the block layer.
    data_format: `str` either "channels_first" for `[batch, channels, height,
      width]` or "channels_last" for `[batch, height, width, channels]`.

  Returns:
    The output `Tensor` of the block layer.
  """
  # Drop batch size from shape logging.
  if is_training and FLAGS.mlperf_logging:
    mlperf_log.resnet_print(
        key=mlperf_log.MODEL_HP_INITIAL_SHAPE,
        value=inputs.shape.as_list()[1:])

  # Only the first block per block_group uses projection shortcut and strides.
  inputs = block_fn(inputs, filters, is_training, strides,
                    use_projection=True, data_format=data_format)

  for _ in range(1, blocks):
    inputs = block_fn(inputs, filters, is_training, 1,
                      data_format=data_format)

  return tf.identity(inputs, name)
def learning_rate_schedule(current_epoch):
  """Handles linear scaling rule, gradual warmup, and LR decay.

  The learning rate starts at 0, then it increases linearly per step. After 5
  epochs we reach the base learning rate (scaled to account for batch size).
  After 30, 60 and 80 epochs the learning rate is divided by 10. After 90
  epochs training stops and the LR is set to 0. This ensures that we train
  for exactly 90 epochs for reproducibility.

  Args:
    current_epoch: `Tensor` for current epoch.

  Returns:
    A scaled `Tensor` for current learning rate.
  """
  mlperf_log.resnet_print(key=mlperf_log.OPT_LR, deferred=True)
  scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0)

  decay_rate = (scaled_lr * LR_SCHEDULE[0][0] *
                current_epoch / LR_SCHEDULE[0][1])
  for mult, start_epoch in LR_SCHEDULE:
    decay_rate = tf.where(current_epoch < start_epoch,
                          decay_rate, scaled_lr * mult)
  return decay_rate
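# Hedged worked example (not from the original source): a pure-Python mirror
# of learning_rate_schedule, assuming the conventional MLPerf ResNet schedule
# constant below; the real LR_SCHEDULE in this codebase may differ.
_ASSUMED_LR_SCHEDULE = [  # (multiplier, epoch at which it takes effect)
    (1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)]

def _lr_at_epoch(current_epoch, base_lr=0.1, batch_size=1024):
  scaled_lr = base_lr * (batch_size / 256.0)
  # Linear warmup toward scaled_lr over the first LR_SCHEDULE[0][1] epochs.
  lr = (scaled_lr * _ASSUMED_LR_SCHEDULE[0][0] *
        current_epoch / _ASSUMED_LR_SCHEDULE[0][1])
  for mult, start_epoch in _ASSUMED_LR_SCHEDULE:
    if current_epoch >= start_epoch:
      lr = scaled_lr * mult
  return lr

# _lr_at_epoch(2.5) == 0.2   (halfway through warmup toward 0.4)
# _lr_at_epoch(10)  == 0.4   (full scaled LR)
# _lr_at_epoch(35)  == 0.04  (first 10x decay)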
def log_batch_norm(input_tensor, output_tensor, momentum, epsilon, center,
                   scale, training):
  assert _get_shape(input_tensor) == _get_shape(output_tensor)
  mlperf_log.resnet_print(
      key=mlperf_log.MODEL_HP_BATCH_NORM,
      value={"shape": _get_shape(input_tensor), "momentum": momentum,
             "epsilon": epsilon, "center": center, "scale": scale,
             "training": training},
      stack_offset=_STACK_OFFSET)
def _mean_image_subtraction(image, means, num_channels):
  """Subtracts the given means from each image channel.

  For example:
    means = [123.68, 116.779, 103.939]
    image = _mean_image_subtraction(image, means)

  Note that the rank of `image` must be known.

  Args:
    image: a tensor of size [height, width, C].
    means: a C-vector of values to subtract from each channel.
    num_channels: number of color channels in the image that will be
      distorted.

  Returns:
    the centered image.

  Raises:
    ValueError: If the rank of `image` is unknown, if `image` has a rank
      other than three or if the number of channels in `image` doesn't match
      the number of values in `means`.
  """
  if image.get_shape().ndims != 3:
    raise ValueError('Input must be of size [height, width, C>0]')

  if len(means) != num_channels:
    raise ValueError('len(means) must match the number of channels')

  mlperf_log.resnet_print(key=mlperf_log.INPUT_MEAN_SUBTRACTION, value=means)

  # We have a 1-D tensor of means; convert to 3-D.
  means = tf.expand_dims(tf.expand_dims(means, 0), 0)

  return image - means
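# Hedged usage sketch (not from the original source): centering a decoded
# image with the standard ImageNet RGB channel means. The image tensor here
# is a stand-in; _CHANNEL_MEANS elsewhere in this file is assumed to hold
# the same three values.
_image = tf.zeros([224, 224, 3])  # stand-in for a decoded [H, W, 3] image
_centered = _mean_image_subtraction(_image, [123.68, 116.779, 103.939], 3)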
def log_begin_block(input_tensor, block_type):
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_BEGIN_BLOCK,
                          value={"block_type": block_type},
                          stack_offset=_STACK_OFFSET)
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RESNET_TOPOLOGY,
                          value=" Block Input: {}".format(
                              _get_shape(input_tensor)),
                          stack_offset=_STACK_OFFSET)
def model(inputs, is_training):
  """Creation of the model graph."""
  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=64, kernel_size=7, strides=2,
      is_training=is_training, data_format=data_format)
  inputs = tf.identity(inputs, 'initial_conv')
  inputs = batch_norm_relu(inputs, is_training, data_format=data_format)

  pooled_inputs = tf.layers.max_pooling2d(
      inputs=inputs, pool_size=3, strides=2, padding='SAME',
      data_format=data_format)
  if is_training and FLAGS.mlperf_logging:
    resnet_log_helper.log_max_pool(input_tensor=inputs,
                                   output_tensor=pooled_inputs)
  inputs = tf.identity(pooled_inputs, 'initial_max_pool')

  inputs = block_group(
      inputs=inputs, filters=64, block_fn=block_fn, blocks=layers[0],
      strides=1, is_training=is_training, name='block_group1',
      data_format=data_format)
  inputs = block_group(
      inputs=inputs, filters=128, block_fn=block_fn, blocks=layers[1],
      strides=2, is_training=is_training, name='block_group2',
      data_format=data_format)
  inputs = block_group(
      inputs=inputs, filters=256, block_fn=block_fn, blocks=layers[2],
      strides=2, is_training=is_training, name='block_group3',
      data_format=data_format)
  inputs = block_group(
      inputs=inputs, filters=512, block_fn=block_fn, blocks=layers[3],
      strides=2, is_training=is_training, name='block_group4',
      data_format=data_format)

  # The activation is 7x7 so this is a global average pool.
  # TODO(huangyp): reduce_mean will be faster.
  pool_size = (inputs.shape[1], inputs.shape[2])
  inputs = tf.layers.average_pooling2d(
      inputs=inputs, pool_size=pool_size, strides=1, padding='VALID',
      data_format=data_format)
  inputs = tf.identity(inputs, 'final_avg_pool')
  inputs = tf.reshape(
      inputs, [-1, 2048 if block_fn is bottleneck_block else 512])
  if is_training and FLAGS.mlperf_logging:
    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_DENSE, value=num_classes)
  inputs = tf.layers.dense(
      inputs=inputs,
      units=num_classes,
      kernel_initializer=tf.random_normal_initializer(stddev=.01))
  inputs = tf.identity(inputs, 'final_dense')
  if is_training and FLAGS.mlperf_logging:
    mlperf_log.resnet_print(
        key=mlperf_log.MODEL_HP_FINAL_SHAPE,
        value=inputs.shape.as_list()[1:])
  return inputs
def log_conv2d(input_tensor, output_tensor, stride, filters, initializer,
               use_bias):
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_CONV2D_FIXED_PADDING,
                          value=_in_out_shape(input_tensor, output_tensor),
                          stack_offset=_STACK_OFFSET)
  mlperf_log.resnet_print(
      key=mlperf_log.MODEL_HP_CONV2D_FIXED_PADDING,
      value={"stride": stride, "filters": filters,
             "initializer": initializer, "use_bias": use_bias},
      stack_offset=_STACK_OFFSET)
def batch_norm_relu(inputs, is_training, relu=True, init_zero=False,
                    data_format='channels_first'):
  """Performs a batch normalization followed by a ReLU.

  Args:
    inputs: `Tensor` of shape `[batch, channels, ...]`.
    is_training: `bool` for whether the model is training.
    relu: `bool` if False, omits the ReLU operation.
    init_zero: `bool` if True, initializes scale parameter of batch
      normalization with 0 instead of 1 (default).
    data_format: `str` either "channels_first" for `[batch, channels, height,
      width]` or "channels_last" for `[batch, height, width, channels]`.

  Returns:
    A normalized `Tensor` with the same `data_format`.
  """
  if init_zero:
    gamma_initializer = tf.zeros_initializer()
  else:
    gamma_initializer = tf.ones_initializer()

  if data_format == 'channels_first':
    axis = 1
  else:
    axis = 3

  outputs = tf.layers.batch_normalization(
      inputs=inputs,
      axis=axis,
      momentum=BATCH_NORM_DECAY,
      epsilon=BATCH_NORM_EPSILON,
      center=True,
      scale=True,
      training=is_training,
      fused=True,
      gamma_initializer=gamma_initializer)
  if is_training:
    resnet_log_helper.log_batch_norm(
        input_tensor=inputs, output_tensor=outputs,
        momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
        center=True, scale=True, training=is_training)

  if relu:
    if is_training:
      mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
    outputs = tf.nn.relu(outputs)
  return outputs
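# Hedged usage note (not from the original source): init_zero=True is the
# zero-gamma trick -- the final BN of a residual branch starts with scale 0,
# so each block initially behaves like an identity mapping, e.g.:
#   x = batch_norm_relu(x, is_training, relu=False, init_zero=True,
#                       data_format='channels_first')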
def log_train_epochs(self, num_epochs):
  """Logs all the TRAIN_EPOCHs log lines."""
  num_epochs_int = int(num_epochs)
  for i in range(num_epochs_int):
    # MLPerf allows us to print all the train epochs at once instead of
    # printing them as we do them.
    mlperf_log.resnet_print(key=mlperf_log.TRAIN_EPOCH, value=i)
  if num_epochs_int != num_epochs:
    value = (
        str(num_epochs_int) +
        ', but this epoch only has {}% of the examples of a normal epoch'
        .format(100 * (num_epochs - num_epochs_int)))
    mlperf_log.resnet_print(key=mlperf_log.TRAIN_EPOCH, value=value)
def after_run(self, run_context, run_values):  # pylint: disable=unused-argument
  """Runs evaluator."""
  step = np.asscalar(run_context.session.run(self._global_step_tensor))
  if self._timer.should_trigger_for_step(step):
    logging.info('Starting eval.')
    eval_results = self._evaluate(run_context.session, step)
    mlperf_log.resnet_print(key=mlperf_log.EVAL_STOP)
    mlperf_log.resnet_print(
        key=mlperf_log.EVAL_ACCURACY,
        value={
            'epoch': max(step // self._steps_per_epoch - 1, 0),
            'value': float(eval_results[_EVAL_METRIC])
        })
    # The ImageNet eval size is hard coded.
    mlperf_log.resnet_print(key=mlperf_log.EVAL_SIZE, value=50000)
    if eval_results[_EVAL_METRIC] >= self._stop_threshold:
      self._run_success = True
      mlperf_log.resnet_print(
          key=mlperf_log.RUN_STOP, value={'success': 'true'})
      run_context.request_stop()

  if step // self._steps_per_epoch == self._eval_every_epoch_from:
    self._timer = training.SecondOrStepTimer(
        every_steps=self._steps_per_epoch)
    self._timer.reset()
def input_fn(is_training, data_dir, batch_size, num_epochs=1, num_gpus=None,
             dtype=tf.float32, mix_up=False, oss_load=False):
  """Input function which provides batches for train or eval.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
    num_gpus: The number of gpus used for training.
    dtype: Data type to use for images/features.
    mix_up: A boolean denoting whether to apply mix-up augmentation.
    oss_load: A boolean denoting whether to read the input files from OSS.

  Returns:
    A dataset that can be used for iteration.
  """
  mlperf_log.resnet_print(key=mlperf_log.INPUT_ORDER)
  if not oss_load:
    filenames = get_filenames(is_training, data_dir)
  else:
    filenames = get_filenames_oss(is_training)
  dataset = tf.data.Dataset.from_tensor_slices(filenames)

  if is_training:
    # Shuffle the input files.
    dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

  # Convert to individual records.
  dataset = dataset.flat_map(tf.data.TFRecordDataset)

  return resnet_run_loop.process_record_dataset(
      dataset=dataset,
      is_training=is_training,
      batch_size=batch_size,
      shuffle_buffer=_SHUFFLE_BUFFER,
      parse_record_fn=parse_record,
      num_epochs=num_epochs,
      num_gpus=num_gpus,
      examples_per_epoch=_NUM_IMAGES['train'] if is_training else None,
      dtype=dtype,
      mix_up=mix_up)
def block_m3(inputs, filters, training, strides, data_format):
  resnet_log_helper.log_begin_block(input_tensor=inputs,
                                    block_type=mlperf_log.BOTTLENECK_BLOCK)
  avg_pool = tf.layers.average_pooling2d(inputs=inputs, pool_size=strides,
                                         strides=strides, padding='SAME',
                                         data_format=data_format)
  shortcut = conv2d_fixed_padding(inputs=avg_pool, filters=1664,
                                  kernel_size=1, strides=1,
                                  data_format=data_format)
  resnet_log_helper.log_projection(input_tensor=inputs,
                                   output_tensor=shortcut)
  shortcut = batch_norm(inputs=shortcut, training=training,
                        data_format=data_format)

  inputs = conv2d_fixed_padding(inputs=inputs, filters=256, kernel_size=1,
                                strides=1, data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
  inputs = tf.nn.relu(inputs)

  inputs = conv2d_fixed_padding(inputs=inputs, filters=256, kernel_size=3,
                                strides=strides, data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
  inputs = tf.nn.relu(inputs)

  inputs = conv2d_fixed_padding(inputs=inputs, filters=1664, kernel_size=1,
                                strides=1, data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)

  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_SHORTCUT_ADD)
  inputs += shortcut
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
  inputs = tf.nn.relu(inputs)
  resnet_log_helper.log_end_block(output_tensor=inputs)
  return inputs
def make_source_dataset(self, index, num_hosts):
  """See base class."""
  if not self.data_dir:
    tf.logging.info('Undefined data_dir implies null input')
    return tf.data.Dataset.range(1).repeat().map(self._get_null_input)

  if FLAGS.mlperf_logging:
    mlperf_log.resnet_print(key=mlperf_log.INPUT_ORDER)
  # Shuffle the filenames to ensure better randomization.
  file_pattern = os.path.join(
      self.data_dir, 'train-*' if self.is_training else 'validation-*')

  # For multi-host training, we want each host to always process the same
  # subset of files. Each host only sees a subset of the entire dataset,
  # allowing us to cache larger datasets in memory.
  dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False)
  dataset = dataset.shard(num_hosts, index)

  if self.is_training and not self.cache:
    dataset = dataset.repeat()

  def fetch_dataset(filename):
    buffer_size = 8 * 1024 * 1024  # 8 MiB per file
    dataset = tf.data.TFRecordDataset(filename, buffer_size=buffer_size)
    return dataset

  # Read the data from disk in parallel.
  dataset = dataset.apply(
      tf.contrib.data.parallel_interleave(
          fetch_dataset, cycle_length=self.num_parallel_calls, sloppy=True))

  if FLAGS.mlperf_logging:
    mlperf_log.resnet_print(key=mlperf_log.INPUT_ORDER)
  if self.cache:
    dataset = dataset.cache()
  if self.is_training:
    # We shuffle only during training, and during training we must produce
    # an infinite dataset, so apply the fused shuffle_and_repeat optimized
    # dataset transformation.
    dataset = dataset.apply(
        tf.contrib.data.shuffle_and_repeat(1024 * 16))
  return dataset
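# Hedged illustration (not from the original source): Dataset.shard(n, i)
# keeps every n-th element starting at index i, which is what pins each host
# to a fixed, disjoint subset of the input files above.
_files = tf.data.Dataset.range(8)  # stand-in for 8 input files
_host1_files = _files.shard(num_shards=4, index=1)  # yields 1, 5 every run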
def preprocess_image(image_buffer, bbox, output_height, output_width,
                     num_channels, is_training=False):
  """Preprocesses the given image.

  Preprocessing includes decoding, cropping, and resizing for both training
  and eval images. Training preprocessing, however, introduces some random
  distortion of the image to improve accuracy.

  Args:
    image_buffer: scalar string Tensor representing the raw JPEG image
      buffer.
    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
      where each coordinate is [0, 1) and the coordinates are arranged as
      [ymin, xmin, ymax, xmax].
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.
    num_channels: Integer depth of the image buffer for decoding.
    is_training: `True` if we're preprocessing the image for training and
      `False` otherwise.

  Returns:
    A preprocessed image.
  """
  if is_training:
    # For training, we want to randomize some of the distortions.
    image = _decode_crop_and_flip(image_buffer, bbox, num_channels)

    mlperf_log.resnet_print(key=mlperf_log.INPUT_RESIZE,
                            value=[output_height, output_width])
    image = _resize_image(image, output_height, output_width)
  else:
    # For validation, we want to decode, resize, then just crop the middle.
    image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
    image = _aspect_preserving_resize(image, _RESIZE_MIN)

    mlperf_log.resnet_print(key=mlperf_log.INPUT_RESIZE,
                            value=[output_height, output_width])
    image = _central_crop(image, output_height, output_width)

  image.set_shape([output_height, output_width, num_channels])

  return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)
def mx_resnet_print(key, val=None, sync=False, uniq=True, stack_offset=1,
                    deferred=False):
  rank = mpiwrapper.rank()
  if sync:
    mpiwrapper.barrier()
  if uniq and (rank != 0):
    return

  mlperf_log.resnet_print(key=key, value=val,
                          stack_offset=stack_offset, deferred=deferred)
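# Hedged usage sketch (not from the original source): emitting a
# rank-0-only log line with a barrier so all workers reach the same point
# before the value is recorded. The key names below are assumed to exist in
# this repo's mlperf_log module and may differ.
mx_resnet_print(key=mlperf_log.RUN_START, sync=True)
mx_resnet_print(key=mlperf_log.INPUT_BATCH_SIZE, val=256)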
def _aspect_preserving_resize(image, resize_min):
  """Resize images preserving the original aspect ratio.

  Args:
    image: A 3-D image `Tensor`.
    resize_min: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    resized_image: A 3-D tensor containing the resized image.
  """
  mlperf_log.resnet_print(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING,
                          value={"min": resize_min})
  shape = tf.shape(image)
  height, width = shape[0], shape[1]

  new_height, new_width = _smallest_size_at_least(height, width, resize_min)

  return _resize_image(image, new_height, new_width)
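# Hedged sketch (not from the original source) of what the
# _smallest_size_at_least helper used above is assumed to compute: scale
# both sides so the smaller side equals resize_min, preserving aspect ratio.
# The real helper may round differently.
def _smallest_size_at_least_sketch(height, width, resize_min):
  resize_min = tf.cast(resize_min, tf.float32)
  height_f = tf.cast(height, tf.float32)
  width_f = tf.cast(width, tf.float32)
  smaller_dim = tf.minimum(height_f, width_f)
  scale_ratio = resize_min / smaller_dim
  new_height = tf.cast(height_f * scale_ratio, tf.int32)
  new_width = tf.cast(width_f * scale_ratio, tf.int32)
  return new_height, new_width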
def main(argv):
  parser = resnet_run_loop.ResnetArgParser(
      resnet_size_choices=[18, 34, 50, 101, 152, 200])

  parser.set_defaults(
      train_epochs=90,
      version=1
  )

  flags = parser.parse_args(args=argv[2:])

  seed = int(argv[1])
  print('Setting random seed = ', seed)
  print('special seeding')
  mlperf_log.resnet_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value=seed)
  random.seed(seed)
  tf.compat.v1.set_random_seed(seed)
  numpy.random.seed(seed)

  mlperf_log.resnet_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
                          value=_NUM_IMAGES['train'])
  mlperf_log.resnet_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
                          value=_NUM_IMAGES['validation'])
  input_function = flags.use_synthetic_data and get_synth_input_fn() or input_fn

  resnet_run_loop.resnet_main(
      seed, flags, imagenet_model_fn, input_function,
      shape=[_DEFAULT_IMAGE_SIZE, _DEFAULT_IMAGE_SIZE, _NUM_CHANNELS])
def main(argv):
  parser = resnet_run_loop.ResnetArgParser(
      resnet_size_choices=[18, 26, 34, 50, 101, 152, 200])
  parser.set_defaults(train_epochs=90, version=1)
  flags = parser.parse_args(args=argv[2:])

  if flags.oss_load:
    auth = oss2.Auth(_ACCESS_ID, _ACCESS_KEY)
    bucket = oss2.Bucket(auth, _HOST, _BUCKET)

  seed = int(argv[1])
  print('Setting random seed = ', seed)
  print('special seeding')
  mlperf_log.resnet_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value=seed)
  random.seed(seed)
  tf.set_random_seed(seed)
  np.random.seed(seed)

  mlperf_log.resnet_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
                          value=_NUM_IMAGES['train'])
  mlperf_log.resnet_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
                          value=_NUM_IMAGES['validation'])
  input_function = input_fn

  resnet_run_loop.resnet_main(
      seed, flags, imagenet_model_fn, input_function,
      shape=[_DEFAULT_IMAGE_SIZE, _DEFAULT_IMAGE_SIZE, _NUM_CHANNELS])
def preprocess_image(image_bytes, is_training=False, use_bfloat16=False,
                     image_size=IMAGE_SIZE):
  """Preprocesses the given image.

  Args:
    image_bytes: `Tensor` representing an image binary of arbitrary size.
    is_training: `bool` for whether the preprocessing is for training.
    use_bfloat16: `bool` for whether to use bfloat16.
    image_size: image size.

  Returns:
    A preprocessed image `Tensor` with value range of [0, 255].
  """
  if is_training:
    mlperf_log.resnet_print(key=mlperf_log.INPUT_RESIZE,
                            value=[IMAGE_SIZE, IMAGE_SIZE])
    return preprocess_for_train(image_bytes, use_bfloat16, image_size)
  else:
    mlperf_log.resnet_print(key=mlperf_log.INPUT_RESIZE,
                            value=[IMAGE_SIZE, IMAGE_SIZE])
    return preprocess_for_eval(image_bytes, use_bfloat16, image_size)
def _central_crop(image, crop_height, crop_width):
  """Performs central crops of the given image list.

  Args:
    image: a 3-D image tensor.
    crop_height: the height of the image following the crop.
    crop_width: the width of the image following the crop.

  Returns:
    3-D tensor with cropped image.
  """
  shape = tf.shape(image)
  height, width = shape[0], shape[1]

  mlperf_log.resnet_print(key=mlperf_log.INPUT_CENTRAL_CROP,
                          value=[crop_height, crop_width])
  amount_to_be_cropped_h = (height - crop_height)
  crop_top = amount_to_be_cropped_h // 2
  amount_to_be_cropped_w = (width - crop_width)
  crop_left = amount_to_be_cropped_w // 2
  return tf.slice(image, [crop_top, crop_left, 0],
                  [crop_height, crop_width, -1])
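# Hedged worked example (not from the original source): for a 256x341 image
# cropped to 224x224, the offsets computed above are
#   crop_top  = (256 - 224) // 2 = 16
#   crop_left = (341 - 224) // 2 = 58
# so tf.slice keeps rows 16..239 and columns 58..281.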
def block_m4(inputs, filters, training, strides, data_format):
  resnet_log_helper.log_begin_block(input_tensor=inputs,
                                    block_type=mlperf_log.BOTTLENECK_BLOCK)
  shortcut = inputs

  inputs = conv2d_fixed_padding(inputs=inputs, filters=384, kernel_size=1,
                                strides=1, data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
  inputs = tf.nn.relu(inputs)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=384, kernel_size=3, strides=strides,
      data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
  inputs = tf.nn.relu(inputs)

  inputs = conv2d_fixed_padding(inputs=inputs, filters=1664, kernel_size=1,
                                strides=1, data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)

  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_SHORTCUT_ADD)
  inputs += shortcut
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
  inputs = tf.nn.relu(inputs)
  resnet_log_helper.log_end_block(output_tensor=inputs)
  return inputs
def end(self, session):  # pylint: disable=unused-argument
  """Runs evaluator for final model."""
  # Only runs eval at the end if highest accuracy so far is less than
  # self._stop_threshold.
  if not self._run_success:
    step = np.asscalar(session.run(self._global_step_tensor))
    logging.info('Starting eval.')
    eval_results = self._evaluate(session, step)
    mlperf_log.resnet_print(key=mlperf_log.EVAL_STOP)
    mlperf_log.resnet_print(
        key=mlperf_log.EVAL_ACCURACY,
        value={
            'epoch': max(step // self._steps_per_epoch - 1, 0),
            'value': float(eval_results[_EVAL_METRIC])
        })
    if eval_results[_EVAL_METRIC] >= self._stop_threshold:
      mlperf_log.resnet_print(
          key=mlperf_log.RUN_STOP, value={'success': 'true'})
    else:
      mlperf_log.resnet_print(
          key=mlperf_log.RUN_STOP, value={'success': 'false'})
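# Hedged usage sketch (not from the original source): hooks like this one
# are typically attached to Estimator training, e.g.
#
#   estimator.train(input_fn=train_input_fn,
#                   hooks=[eval_hook],  # instance of the class above
#                   max_steps=train_steps)
#
# after_run() then evaluates on the configured schedule, and end() runs a
# final eval only if the stop threshold was never reached.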
def get_train_op(loss, params):
  """Generate training operation that updates variables based on loss."""
  with tf.variable_scope("get_train_op"):
    mlperf_log.transformer_print(key=mlperf_log.OPT_LR_WARMUP_STEPS,
                                 value=params.learning_rate_warmup_steps)
    learning_rate = get_learning_rate(params.learning_rate,
                                      params.hidden_size,
                                      params.learning_rate_warmup_steps)
    log_id = mlperf_log.resnet_print(key=mlperf_log.OPT_LR, deferred=True)
    learning_rate = tf_mlperf_log.log_deferred(op=learning_rate,
                                               log_id=log_id,
                                               every_n=100)

    # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is
    # faster than the TF core Adam optimizer.
    mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                 value=mlperf_log.LAZY_ADAM)
    mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                                 value=params.optimizer_adam_beta1)
    mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                                 value=params.optimizer_adam_beta2)
    mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                                 value=params.optimizer_adam_epsilon)
    optimizer = tf.contrib.opt.LazyAdamOptimizer(
        learning_rate,
        beta1=params.optimizer_adam_beta1,
        beta2=params.optimizer_adam_beta2,
        epsilon=params.optimizer_adam_epsilon)

    # Calculate and apply gradients using LazyAdamOptimizer.
    global_step = tf.train.get_global_step()
    tvars = tf.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    train_op = optimizer.apply_gradients(gradients,
                                         global_step=global_step,
                                         name="train")

    # Save gradient norm to Tensorboard.
    tf.summary.scalar("global_norm/gradient_norm",
                      tf.global_norm(list(zip(*gradients))[0]))

    return train_op
def __call__(self, inputs, training):
  """Add operations to classify a batch of input images.

  Args:
    inputs: A Tensor representing a batch of input images.
    training: A boolean. Set to True to add operations required only when
      training the classifier.

  Returns:
    A logits Tensor with shape [<batch_size>, self.num_classes], plus a list
    of intermediate feature Tensors (populated when self.enable_at is set).
  """
  # Drop batch size from shape logging.
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_INITIAL_SHAPE,
                          value=inputs.shape.as_list()[1:])

  with self._model_variable_scope():
    if self.data_format == 'channels_first':
      # Convert the inputs from channels_last (NHWC) to channels_first
      # (NCHW). This provides a large performance boost on GPU. See
      # https://www.tensorflow.org/performance/performance_guide#data_formats
      inputs = tf.transpose(inputs, [0, 3, 1, 2])

    if self.resnet_version in (1, 2):
      inputs = conv2d_fixed_padding(
          inputs=inputs, filters=self.num_filters,
          kernel_size=self.kernel_size, strides=self.conv_stride,
          data_format=self.data_format)
    elif self.resnet_version == 14:
      # v1d architecture.
      inputs = conv2d_fixed_padding(
          inputs=inputs, filters=self.num_filters // 2, kernel_size=3,
          strides=self.conv_stride, data_format=self.data_format)
      inputs = batch_norm(inputs, training, self.data_format)
      inputs = conv2d_fixed_padding(
          inputs=inputs, filters=self.num_filters // 2, kernel_size=3,
          strides=1, data_format=self.data_format)
      inputs = batch_norm(inputs, training, self.data_format)
      inputs = conv2d_fixed_padding(
          inputs=inputs, filters=self.num_filters, kernel_size=3,
          strides=1, data_format=self.data_format)
    elif self.resnet_version == 24:
      # v1-simple architecture.
      inputs = conv2d_fixed_padding(
          inputs=inputs, filters=self.num_filters,
          kernel_size=self.kernel_size, strides=self.conv_stride,
          data_format=self.data_format)
    elif self.resnet_version == 34:
      # v1cs architecture.
      inputs = conv2d_fixed_padding(
          inputs=inputs, filters=self.num_filters,
          kernel_size=self.kernel_size, strides=self.conv_stride,
          data_format=self.data_format)

    inputs = tf.identity(inputs, 'initial_conv')

    # We do not include batch normalization or activation functions in V2
    # for the initial conv1 because the first ResNet unit will perform these
    # for both the shortcut and non-shortcut paths as part of the first
    # block's projection. Cf. Appendix of [2].
    if self.resnet_version in (1, 14, 24, 34):
      inputs = batch_norm(inputs, training, self.data_format)
      mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
      inputs = tf.nn.relu(inputs)

    if self.first_pool_size:
      pooled_inputs = tf.layers.max_pooling2d(
          inputs=inputs, pool_size=self.first_pool_size,
          strides=self.first_pool_stride, padding='SAME',
          data_format=self.data_format)
      resnet_log_helper.log_max_pool(input_tensor=inputs,
                                     output_tensor=pooled_inputs)
      inputs = tf.identity(pooled_inputs, 'initial_max_pool')

    feat_s = list()
    if self.resnet_version == 24:
      # mlperf open
      import nets.manual_blocks_t as manual_blocks

      # group1
      for i, num_blocks in enumerate(self.block_sizes[0:1]):
        num_filters = self.num_filters * (2**i)
        inputs = block_layer(
            inputs=inputs, filters=num_filters, bottleneck=self.bottleneck,
            block_fn=self.block_fn, blocks=num_blocks,
            strides=self.block_strides[i], training=training,
            name='block_layer{}'.format(i + 1),
            data_format=self.data_format, version=self.resnet_version)
      # group2
      inputs = manual_blocks.block_m(inputs=inputs, filters=128,
                                     training=training, strides=2,
                                     data_format=self.data_format)
      inputs = manual_blocks.block_m0(inputs=inputs, filters=128,
                                      training=training, strides=1,
                                      data_format=self.data_format)
      # group3
      inputs = manual_blocks.block_m1(inputs=inputs, filters=256,
                                      training=training, strides=2,
                                      data_format=self.data_format)
      inputs = manual_blocks.block_m2(inputs=inputs, filters=256,
                                      training=training, strides=1,
                                      data_format=self.data_format)
      if self.enable_at:
        feat_s.append(inputs)
      # group4
      inputs = manual_blocks.block_m3(inputs=inputs, filters=512,
                                      training=training, strides=2,
                                      data_format=self.data_format)
      inputs = manual_blocks.block_m4(inputs=inputs, filters=512,
                                      training=training, strides=1,
                                      data_format=self.data_format)
      if self.enable_at:
        feat_s.append(inputs)
    elif self.resnet_version == 34:
      # dawnbench
      import nets.manual_blocks as manual_blocks

      # group1~2
      for i, num_blocks in enumerate(self.block_sizes[0:2]):
        num_filters = self.num_filters * (2**i)
        inputs = block_layer(
            inputs=inputs, filters=num_filters, bottleneck=self.bottleneck,
            block_fn=self.block_fn, blocks=num_blocks,
            strides=self.block_strides[i], training=training,
            name='block_layer{}'.format(i + 1),
            data_format=self.data_format, version=self.resnet_version)
      # group3
      inputs = manual_blocks.block_m1(inputs=inputs, filters=256,
                                      training=training, strides=2,
                                      data_format=self.data_format)
      inputs = manual_blocks.block_m2(inputs=inputs, filters=256,
                                      training=training, strides=1,
                                      data_format=self.data_format)
      if self.enable_at:
        feat_s.append(inputs)
      # group4
      inputs = manual_blocks.block_m3(inputs=inputs, filters=512,
                                      training=training, strides=2,
                                      data_format=self.data_format)
      inputs = manual_blocks.block_m4(inputs=inputs, filters=512,
                                      training=training, strides=1,
                                      data_format=self.data_format)
      if self.enable_at:
        feat_s.append(inputs)
    else:
      # standard v1, v1d, v2
      for i, num_blocks in enumerate(self.block_sizes):
        num_filters = self.num_filters * (2**i)
        inputs = block_layer(
            inputs=inputs, filters=num_filters, bottleneck=self.bottleneck,
            block_fn=self.block_fn, blocks=num_blocks,
            strides=self.block_strides[i], training=training,
            name='block_layer{}'.format(i + 1),
            data_format=self.data_format, version=self.resnet_version)
        if (i > 1) and self.enable_at:
          feat_s.append(inputs)

    # Only apply the BN and ReLU for models that do pre_activation in each
    # building/bottleneck block, e.g. ResNet v2.
    if self.pre_activation:
      inputs = batch_norm(inputs, training, self.data_format)
      mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
      inputs = tf.nn.relu(inputs)

    # The current top layer has shape
    # `batch_size x pool_size x pool_size x final_size`.
    # ResNet does an Average Pooling layer over pool_size, but that is the
    # same as doing a reduce_mean. We do a reduce_mean here because it
    # performs better than AveragePooling2D.
    axes = [2, 3] if self.data_format == 'channels_first' else [1, 2]
    inputs = tf.reduce_mean(inputs, axes, keepdims=True)
    inputs = tf.identity(inputs, 'final_reduce_mean')

    inputs = tf.reshape(inputs, [-1, inputs.get_shape().as_list()[-1]])
    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_DENSE,
                            value=self.num_classes)
    inputs = tf.layers.dense(
        inputs=inputs,
        units=self.num_classes,
        kernel_initializer=tf.random_normal_initializer(stddev=.01))
    inputs = tf.identity(inputs, 'final_dense')

    # Drop batch size from shape logging.
    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_FINAL_SHAPE,
                            value=inputs.shape.as_list()[1:])
    return inputs, feat_s
def _bottleneck_block_v1(inputs, filters, training, projection_shortcut,
                         strides, data_format):
  """A single block for ResNet v1, with a bottleneck.

  Similar to _building_block_v1(), except using the "bottleneck" blocks
  described in:
    Convolution then batch normalization then ReLU as described by:
      Deep Residual Learning for Image Recognition
      https://arxiv.org/pdf/1512.03385.pdf
      by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the convolutions.
    training: A Boolean for whether the model is in training or inference
      mode. Needed for batch normalization.
    projection_shortcut: The function to use for projection shortcuts
      (typically a 1x1 convolution when downsampling the input).
    strides: The block's stride. If greater than 1, this block will
      ultimately downsample the input.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the block; shape should match inputs.
  """
  resnet_log_helper.log_begin_block(
      input_tensor=inputs, block_type=mlperf_log.BOTTLENECK_BLOCK)

  shortcut = inputs
  if projection_shortcut is not None:
    shortcut = projection_shortcut(inputs)
    resnet_log_helper.log_projection(input_tensor=inputs,
                                     output_tensor=shortcut)
    shortcut = batch_norm(inputs=shortcut, training=training,
                          data_format=data_format)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=1, strides=1,
      data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
  inputs = tf.nn.relu(inputs)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
      data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
  inputs = tf.nn.relu(inputs)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=4 * filters, kernel_size=1, strides=1,
      data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_SHORTCUT_ADD)
  inputs += shortcut
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
  inputs = tf.nn.relu(inputs)

  resnet_log_helper.log_end_block(output_tensor=inputs)
  return inputs
def get_train_op(loss, params):
  """Generate training operation that updates variables based on loss."""
  with tf.compat.v1.variable_scope("get_train_op"):
    mlperf_log.transformer_print(key=mlperf_log.OPT_LR_WARMUP_STEPS,
                                 value=params.learning_rate_warmup_steps)
    learning_rate = get_learning_rate(params.learning_rate,
                                      params.hidden_size,
                                      params.learning_rate_warmup_steps)
    log_id = mlperf_log.resnet_print(key=mlperf_log.OPT_LR, deferred=True)
    learning_rate = tf_mlperf_log.log_deferred(op=learning_rate,
                                               log_id=log_id,
                                               every_n=100)

    # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is
    # faster than the TF core Adam optimizer.
    mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                 value=mlperf_log.LAZY_ADAM)
    mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                                 value=params.optimizer_adam_beta1)
    mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                                 value=params.optimizer_adam_beta2)
    mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                                 value=params.optimizer_adam_epsilon)

    # Using optimizer v1 (from tensorflow.python.training). The optimizer v2
    # version of the code is below. Optimizer v1 does not have a LazyAdam
    # optimizer (it was in contrib, now deprecated).
    optimizer = adam.AdamOptimizer(learning_rate,
                                   beta1=params.optimizer_adam_beta1,
                                   beta2=params.optimizer_adam_beta2,
                                   epsilon=params.optimizer_adam_epsilon)

    # Calculate and apply gradients.
    global_step = tf.compat.v1.train.get_global_step()
    tvars = tf.compat.v1.trainable_variables()
    grads_and_vars = optimizer.compute_gradients(loss, tvars)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step,
                                         name="train")

    # Save gradient norm to Tensorboard.
    tf.compat.v1.summary.scalar(
        "global_norm/gradient_norm",
        tf.linalg.global_norm(list(zip(*grads_and_vars))[0]))

    # Using the tfa (tensorflow_addons) optimizer, which in turn uses
    # optimizer_v2 (from tf.python.keras.optimizer_v2), produces warnings
    # about the global step not being updated, since global_step is not
    # accepted in the apply_gradients() function of the optimizer_v2
    # version. Thus the global step is updated and grouped with the training
    # op. To activate LazyAdam from tensorflow-addons, enable the following
    # code and remove the optimizer v1 code above. Currently both optimizer
    # v1 and v2 take about the same time.
    '''
    optimizer = tfa.optimizers.LazyAdam(
        learning_rate,
        beta_1=params.optimizer_adam_beta1,
        beta_2=params.optimizer_adam_beta2,
        epsilon=params.optimizer_adam_epsilon)

    # Calculate and apply gradients using LazyAdam.
    global_step = tf.compat.v1.train.get_global_step()
    tvars = tf.compat.v1.trainable_variables()
    tvars = tvars[0:len(tvars) - 1]
    gradients = optimizer.get_gradients(loss, tvars)
    grads_and_vars = zip(gradients, tvars)
    train_op = optimizer.apply_gradients(grads_and_vars)

    # Save gradient norm to Tensorboard.
    tf.compat.v1.summary.scalar("global_norm/gradient_norm",
                                tf.compat.v1.linalg.global_norm(
                                    list(gradients)))
    update_global_step = tf.compat.v1.assign(
        global_step, global_step + 1, name="update_global_step")
    train_op = tf.compat.v1.group(train_op, [update_global_step])
    '''

    return train_op
def _decode_crop_and_flip(image_buffer, bbox, num_channels):
  """Crops the given image to a random part of the image, and randomly flips.

  We use the fused decode_and_crop op, which performs better than the two ops
  used separately in series, but note that this requires that the image be
  passed in as an un-decoded string Tensor.

  Args:
    image_buffer: scalar string Tensor representing the raw JPEG image
      buffer.
    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
      where each coordinate is [0, 1) and the coordinates are arranged as
      [ymin, xmin, ymax, xmax].
    num_channels: Integer depth of the image buffer for decoding.

  Returns:
    3-D tensor with cropped image.
  """
  # A large fraction of image datasets contain a human-annotated bounding
  # box delineating the region of the image containing the object of
  # interest. We choose to create a new bounding box for the object which is
  # a randomly distorted version of the human-annotated bounding box that
  # obeys an allowed range of aspect ratios, sizes and overlap with the
  # human-annotated bounding box. If no box is supplied, then we assume the
  # bounding box is the entire image.
  min_object_covered = 0.1
  aspect_ratio_range = [0.75, 1.33]
  area_range = [0.05, 1.0]
  max_attempts = 100

  mlperf_log.resnet_print(key=mlperf_log.INPUT_DISTORTED_CROP_MIN_OBJ_COV,
                          value=min_object_covered)
  mlperf_log.resnet_print(key=mlperf_log.INPUT_DISTORTED_CROP_RATIO_RANGE,
                          value=aspect_ratio_range)
  mlperf_log.resnet_print(key=mlperf_log.INPUT_DISTORTED_CROP_AREA_RANGE,
                          value=area_range)
  mlperf_log.resnet_print(key=mlperf_log.INPUT_DISTORTED_CROP_MAX_ATTEMPTS,
                          value=max_attempts)

  sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
      tf.image.extract_jpeg_shape(image_buffer),
      bounding_boxes=bbox,
      min_object_covered=min_object_covered,
      aspect_ratio_range=aspect_ratio_range,
      area_range=area_range,
      max_attempts=max_attempts,
      use_image_if_no_bounding_boxes=True)
  bbox_begin, bbox_size, _ = sample_distorted_bounding_box

  # Reassemble the bounding box in the format the crop op requires.
  offset_y, offset_x, _ = tf.unstack(bbox_begin)
  target_height, target_width, _ = tf.unstack(bbox_size)
  crop_window = tf.stack([offset_y, offset_x, target_height, target_width])

  # Use the fused decode and crop op here, which is faster than each in
  # series.
  cropped = tf.image.decode_and_crop_jpeg(
      image_buffer, crop_window, channels=num_channels)

  # Flip to add a little more random distortion in.
  mlperf_log.resnet_print(key=mlperf_log.INPUT_RANDOM_FLIP)
  cropped = tf.image.random_flip_left_right(cropped)
  return cropped
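# Hedged worked example (not from the original source): if
# sample_distorted_bounding_box returns bbox_begin = [30, 40, 0] and
# bbox_size = [180, 180, -1], the crop window assembled above is
# [offset_y=30, offset_x=40, target_height=180, target_width=180], i.e.
# decode_and_crop_jpeg decodes only the 180x180 region starting at (30, 40).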
def bottleneck_block(inputs, filters, is_training, strides,
                     use_projection=False, data_format='channels_first'):
  """Bottleneck block variant for residual networks with BN after convolutions.

  Args:
    inputs: `Tensor` of size `[batch, channels, height, width]`.
    filters: `int` number of filters for the first two convolutions. Note
      that the third and final convolution will use 4 times as many filters.
    is_training: `bool` for whether the model is in training.
    strides: `int` block stride. If greater than 1, this block will
      ultimately downsample the input.
    use_projection: `bool` for whether this block should use a projection
      shortcut (versus the default identity shortcut). This is usually
      `True` for the first block of a block group, which may change the
      number of filters and the resolution.
    data_format: `str` either "channels_first" for `[batch, channels, height,
      width]` or "channels_last" for `[batch, height, width, channels]`.

  Returns:
    The output `Tensor` of the block.
  """
  if is_training and FLAGS.mlperf_logging:
    mlperf_log.resnet_print(
        key=mlperf_log.MODEL_HP_BLOCK_TYPE,
        value=mlperf_log.BOTTLENECK_BLOCK)
    resnet_log_helper.log_begin_block(
        input_tensor=inputs, block_type=mlperf_log.BOTTLENECK_BLOCK)
  shortcut = inputs
  if use_projection:
    # Projection shortcut only in first block within a group. Bottleneck
    # blocks end with 4 times the number of filters.
    filters_out = 4 * filters
    shortcut = conv2d_fixed_padding(
        inputs=inputs, filters=filters_out, kernel_size=1, strides=strides,
        is_training=is_training, data_format=data_format)
    shortcut = batch_norm_relu(shortcut, is_training, relu=False,
                               data_format=data_format)
    if is_training and FLAGS.mlperf_logging:
      resnet_log_helper.log_projection(
          input_tensor=inputs, output_tensor=shortcut)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=1, strides=1,
      is_training=is_training, data_format=data_format)
  inputs = batch_norm_relu(inputs, is_training, data_format=data_format)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
      is_training=is_training, data_format=data_format)
  inputs = batch_norm_relu(inputs, is_training, data_format=data_format)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=4 * filters, kernel_size=1, strides=1,
      is_training=is_training, data_format=data_format)
  inputs = batch_norm_relu(inputs, is_training, relu=False, init_zero=True,
                           data_format=data_format)

  output = tf.nn.relu(inputs + shortcut)
  if is_training and FLAGS.mlperf_logging:
    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_SHORTCUT_ADD)
    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
    resnet_log_helper.log_end_block(output_tensor=output)
  return output