def add_variable(self, name, shape=None, **kwargs):
    """Create a weight/mask variable pair for `name` and return their product.

    Both variables are created through the parent class's `add_variable`.
    The weight is named `lottery.weight_name_of_base_name(name)` and receives
    the caller's `shape` and `**kwargs`. The mask is named
    `lottery.mask_name_of_base_name(name)`, has the same `shape`, is
    non-trainable and uses a ones initializer.

    Args:
      name: Base name from which the weight and mask names are derived.
      shape: Shape forwarded to both variables.
      **kwargs: Extra arguments forwarded to the weight variable only; the
        mask variable does not receive them.

    Returns:
      The result of `tf.math.multiply(weight, mask)`.
    """
    weight_name = lottery.weight_name_of_base_name(name)
    weight = super().add_variable(weight_name, shape, **kwargs)

    mask_name = lottery.mask_name_of_base_name(name)
    mask_options = {
        'trainable': False,
        'initializer': tf.initializers.ones(),
    }
    weight_mask = super().add_variable(mask_name, shape, **mask_options)

    return tf.math.multiply(weight, weight_mask)
def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
    """Strided 2-D convolution with explicit padding and a masked kernel.

    The padding is consistent and depends only on `kernel_size`, not on the
    dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
    The kernel variable is multiplied elementwise by a non-trainable mask
    variable (initialized to ones) before being used in the convolution.

    Variable base names come from the module-level counters `_layer_idx` and
    `_proj_idx`: `conv_<layer_idx>` when `kernel_size > 1`, otherwise
    `proj_<layer_idx>_<proj_idx>`. `_proj_idx` is incremented on every call;
    `_layer_idx` only when `kernel_size > 1`.

    Args:
      inputs: Input tensor, laid out according to `data_format`.
      filters: Number of output channels of the convolution.
      kernel_size: Side length of the square kernel.
      strides: Stride applied to both spatial dimensions.
      data_format: 'channels_last' selects NHWC; any other value is treated
        as NCHW.

    Returns:
      The output of `tf.nn.conv2d` applied with the masked kernel.
    """
    global _layer_idx, _proj_idx

    if strides > 1:
        inputs = fixed_padding(inputs, kernel_size, data_format)

    channels_last = data_format == 'channels_last'
    input_dim = inputs.shape[-1 if channels_last else 1]
    channel_format = 'NHWC' if channels_last else 'NCHW'
    if channels_last:
        strides_tuple = [1, strides, strides, 1]
    else:
        strides_tuple = [1, 1, strides, strides]

    kernel_shape = (kernel_size, kernel_size, input_dim, filters)

    # 1x1 kernels are named as projections, everything else as a conv layer.
    if kernel_size > 1:
        name_template = 'conv_{layer_idx}'
    else:
        name_template = 'proj_{layer_idx}_{proj_idx}'
    base_name = name_template.format(layer_idx=_layer_idx, proj_idx=_proj_idx)

    kernel = tf.get_variable(
        name=lottery.weight_name_of_base_name(base_name),
        shape=kernel_shape,
        dtype=inputs.dtype,
        initializer=tf.compat.v1.variance_scaling_initializer(),
        trainable=True)

    # The mask's shape is taken from its tensor initializer, so no `shape`
    # argument is passed here.
    initial_mask = tf.ones(kernel_shape, dtype=inputs.dtype)
    mask = tf.get_variable(
        name=lottery.mask_name_of_base_name(base_name),
        dtype=inputs.dtype,
        initializer=initial_mask,
        trainable=False)

    masked_kernel = tf.math.multiply(kernel, mask)
    padding = 'SAME' if strides == 1 else 'VALID'
    outputs = tf.nn.conv2d(
        inputs,
        masked_kernel,
        strides_tuple,
        padding=padding,
        data_format=channel_format)

    _proj_idx += 1
    if kernel_size > 1:
        _layer_idx += 1

    return outputs
def build_graph(self, hparams, scope=None):
    """Create the masked output projection and build the model subgraph.

    First creates the output-projection weight and its non-trainable ones
    mask under `<scope or "build_network">/decoder/output_projection` and
    stores their elementwise product in `self.output_layer`. Then calls
    `self._build_model(hparams)` under the `<scope or "dynamic_seq2seq">`
    variable scope. When `hparams.activation_dtype` is "bfloat16", a custom
    variable getter (`utils.bfloat16_var_getter`) is installed on that scope
    before the model is built.

    Args:
      hparams: Hyperparameter configurations.
      scope: VariableScope for the created subgraph; when falsy,
        "build_network" is used for the projection and "dynamic_seq2seq" for
        the model.

    Returns:
      A tuple `(logits_or_loss, predicted_ids)` taken from the first and
      third values returned by `self._build_model`. Per the original
      documentation this is (logits, predicted_ids) for inference and
      (loss, None) for training, where:
        logits: float32 Tensor [batch_size x num_decoder_symbols]
        loss: float32 scalar
        predicted_ids: predicted ids from beam search.

    Raises:
      ValueError: per the original documentation, if encoder_type differs
        from mono and bi, or attention_option is not (luong | scaled_luong |
        bahdanau | normed_bahdanau). Nothing in this method raises it
        directly; presumably it comes from `_build_model`.
    """
    utils.print_out("# Creating %s graph ..." % self.mode)

    # Projection
    with tf.variable_scope(scope or "build_network"):
        with tf.variable_scope("decoder/output_projection"):
            projection_weight = tf.get_variable(
                lottery.weight_name_of_base_name("output_projection"),
                [self.num_units, self.tgt_vocab_size])
            projection_mask = tf.get_variable(
                lottery.mask_name_of_base_name("output_projection"),
                [self.num_units, self.tgt_vocab_size],
                trainable=False,
                initializer=tf.initializers.ones())
            self.output_layer = tf.math.multiply(
                projection_weight, projection_mask)

    with tf.variable_scope(scope or "dynamic_seq2seq", dtype=self.dtype):
        use_bfloat16 = hparams.activation_dtype == "bfloat16"
        if use_bfloat16:
            tf.get_variable_scope().set_custom_getter(
                utils.bfloat16_var_getter)

        model_outputs = self._build_model(hparams)
        logits_or_loss, decoder_cell_outputs, predicted_ids = model_outputs

        # In the bfloat16 path the decoder cell outputs are cast to float32;
        # the cast result is not used further inside this method.
        if use_bfloat16 and decoder_cell_outputs is not None:
            decoder_cell_outputs = tf.cast(decoder_cell_outputs, tf.float32)

    return logits_or_loss, predicted_ids
def _create_or_load_embed(embed_name, vocab_file, embed_file, vocab_size,
                          embed_size, dtype):
    """Create a new or load an existing embedding matrix.

    When both `vocab_file` and `embed_file` are truthy, the embedding comes
    from `_create_pretrained_emb_from_txt` and is returned as-is (no mask is
    applied on that path). Otherwise a `[vocab_size, embed_size]` weight
    variable and a matching non-trainable ones mask are created, and their
    elementwise product is returned.

    Args:
      embed_name: Base name used to derive the weight and mask variable names.
      vocab_file: Vocabulary file for the pretrained path, or a falsy value.
      embed_file: Pretrained embedding file, or a falsy value.
      vocab_size: Number of rows of a newly created embedding.
      embed_size: Number of columns of a newly created embedding.
      dtype: dtype passed to both newly created variables.

    Returns:
      The pretrained embedding, or the masked newly created embedding.
    """
    if vocab_file and embed_file:
        return _create_pretrained_emb_from_txt(vocab_file, embed_file)

    weights = tf.get_variable(
        lottery.weight_name_of_base_name(embed_name),
        [vocab_size, embed_size],
        dtype)
    weights_mask = tf.get_variable(
        lottery.mask_name_of_base_name(embed_name),
        [vocab_size, embed_size],
        dtype,
        trainable=False,
        initializer=tf.initializers.ones())
    return tf.math.multiply(weights, weights_mask)
def __call__(self, inputs, training):
    """Add operations to classify a batch of input images.

    Args:
      inputs: A Tensor representing a batch of input images. It is transposed
        from NHWC to NCHW when `self.data_format` is 'channels_first'.
      training: A boolean. Set to True to add operations required only when
        training the classifier.

    Returns:
      A logits Tensor with shape [<batch_size>, self.num_classes].
    """
    global _layer_idx, _proj_idx
    # Restart the module-level naming counters at 1 for every model build.
    _layer_idx = _proj_idx = 1

    channels_first = self.data_format == 'channels_first'

    with self._model_variable_scope():
        net = inputs
        if channels_first:
            # Convert the inputs from channels_last (NHWC) to channels_first
            # (NCHW). This provides a large performance boost on GPU. See
            # https://www.tensorflow.org/performance/performance_guide#data_formats
            net = tf.transpose(net, [0, 3, 1, 2])

        net = conv2d_fixed_padding(
            inputs=net,
            filters=self.num_filters,
            kernel_size=self.kernel_size,
            strides=self.conv_stride,
            data_format=self.data_format)
        net = tf.identity(net, 'initial_conv')

        # We do not include batch normalization or activation functions in V2
        # for the initial conv1 because the first ResNet unit will perform
        # these for both the shortcut and non-shortcut paths as part of the
        # first block's projection. Cf. Appendix of [2].
        if self.resnet_version == 1:
            net = batch_norm(net, training, self.data_format)
            net = tf.nn.relu(net)

        if self.first_pool_size:
            net = tf.layers.max_pooling2d(
                inputs=net,
                pool_size=self.first_pool_size,
                strides=self.first_pool_stride,
                padding='SAME',
                data_format=self.data_format)
            net = tf.identity(net, 'initial_max_pool')

        # The filter count doubles with each successive block layer.
        for layer_number, num_blocks in enumerate(self.block_sizes, start=1):
            position = layer_number - 1
            net = block_layer(
                inputs=net,
                filters=self.num_filters * (2 ** position),
                bottleneck=self.bottleneck,
                block_fn=self.block_fn,
                blocks=num_blocks,
                strides=self.block_strides[position],
                training=training,
                name='block_layer{}'.format(layer_number),
                data_format=self.data_format)

        # Only apply the BN and ReLU for model that does pre_activation in
        # each building/bottleneck block, eg resnet V2.
        if self.pre_activation:
            net = batch_norm(net, training, self.data_format)
            net = tf.nn.relu(net)

        # The current top layer has shape
        # `batch_size x pool_size x pool_size x final_size`.
        # ResNet does an Average Pooling layer over pool_size,
        # but that is the same as doing a reduce_mean. We do a reduce_mean
        # here because it performs better than AveragePooling2D.
        spatial_axes = [2, 3] if channels_first else [1, 2]
        net = tf.reduce_mean(net, spatial_axes, keepdims=True)
        net = tf.identity(net, 'final_reduce_mean')
        net = tf.squeeze(net, spatial_axes)

        # Final dense layer built by hand so its weight can be masked.
        dense_w = tf.get_variable(
            name=lottery.weight_name_of_base_name('dense'),
            shape=(net.shape[-1], self.num_classes),
            dtype=net.dtype,
            initializer=tf.random_normal_initializer(stddev=.01),
            trainable=True)
        dense_b = tf.get_variable(
            name=lottery.bias_name_of_base_name('dense'),
            shape=(self.num_classes, ),
            dtype=net.dtype,
            initializer=tf.zeros_initializer(),
            trainable=True)
        # The mask's shape comes from its tensor initializer.
        initial_mask = tf.ones(dense_w.shape, dtype=net.dtype)
        dense_mask = tf.get_variable(
            name=lottery.mask_name_of_base_name('dense'),
            dtype=net.dtype,
            initializer=initial_mask,
            trainable=False)

        masked_w = dense_w * dense_mask
        net = net @ masked_w + dense_b
        net = tf.identity(net, 'final_dense')

    return net
def model(inputs, is_training):
    """Creation of the model graph.

    Builds, in order: an initial 7x7 stride-2 convolution, batch-norm + ReLU,
    a 3x3 stride-2 max pool, four block groups (64/128/256/512 filters), a
    global average pool and a final dense layer whose weight is multiplied by
    a non-trainable ones mask.

    `data_format`, `block_fn`, `layers`, `dropblock_keep_probs`,
    `dropblock_size` and `num_classes` are not parameters; they are resolved
    from the enclosing scope, which is not visible in this block.

    Args:
      inputs: Input image batch, laid out according to `data_format`.
      is_training: Forwarded to `batch_norm_relu` and `block_group`.

    Returns:
      The output of the final dense layer, reshaped upstream to
      `[-1, final_width]` and projected to `num_classes` columns.
    """
    inputs = conv2d_fixed_padding(
        inputs=inputs, filters=64, kernel_size=7, strides=2,
        data_format=data_format)
    inputs = tf.identity(inputs, 'initial_conv')
    inputs = batch_norm_relu(inputs, is_training, data_format=data_format)

    inputs = tf.layers.max_pooling2d(
        inputs=inputs, pool_size=3, strides=2, padding='SAME',
        data_format=data_format)
    inputs = tf.identity(inputs, 'initial_max_pool')

    # (filters, strides) for block_group1 .. block_group4, in build order.
    group_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
    for group_idx, (group_filters, group_strides) in enumerate(group_specs):
        inputs = block_group(
            inputs=inputs,
            filters=group_filters,
            block_fn=block_fn,
            blocks=layers[group_idx],
            strides=group_strides,
            is_training=is_training,
            name='block_group{}'.format(group_idx + 1),
            data_format=data_format,
            dropblock_keep_prob=dropblock_keep_probs[group_idx],
            dropblock_size=dropblock_size)

    # The pooling window spans the whole spatial extent of the activation, so
    # this is a global average pool. The spatial axes are (1, 2) for
    # channels_last but (2, 3) for channels_first; always using (1, 2) would
    # pick (channels, height) in the channels_first layout.
    # TODO(huangyp): reduce_mean will be faster.
    if data_format == 'channels_last':
        pool_size = (inputs.shape[1], inputs.shape[2])
    else:
        pool_size = (inputs.shape[2], inputs.shape[3])
    inputs = tf.layers.average_pooling2d(
        inputs=inputs, pool_size=pool_size, strides=1, padding='VALID',
        data_format=data_format)
    inputs = tf.identity(inputs, 'final_avg_pool')

    final_width = 2048 if block_fn is bottleneck_block else 512
    inputs = tf.reshape(inputs, [-1, final_width])

    # Final dense layer built by hand so its weight can be masked.
    dense_w = tf.get_variable(
        initializer=tf.random_normal_initializer(stddev=.01),
        trainable=True,
        shape=(final_width, num_classes),
        dtype=inputs.dtype,
        name=lottery.weight_name_of_base_name('dense'))
    dense_b = tf.get_variable(
        initializer=tf.zeros_initializer(),
        trainable=True,
        shape=(num_classes, ),
        dtype=inputs.dtype,
        name=lottery.bias_name_of_base_name('dense'))
    # A zero-argument callable initializer is used with no `shape` argument;
    # the mask's shape comes from the tensor the callable returns.
    dense_mask = tf.get_variable(
        initializer=lambda: tf.ones(dense_w.shape, dtype=inputs.dtype),
        trainable=False,
        dtype=inputs.dtype,
        name=lottery.mask_name_of_base_name('dense'))

    inputs = inputs @ (dense_w * dense_mask) + dense_b
    inputs = tf.identity(inputs, 'final_dense')
    return inputs
def conv2d_fixed_padding(inputs, filters, kernel_size, strides,
                         data_format='channels_first', projection=False):
    """Strided 2-D convolution with explicit padding and a masked kernel.

    The padding is consistent and is based only on `kernel_size`, not on the
    dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
    The kernel variable is multiplied elementwise by a non-trainable mask
    variable (initialized to ones) before the convolution.

    Variable base names are derived from the module-level counters
    `_layer_idx` and `_proj_idx`. `_proj_idx` is incremented on every call;
    `_layer_idx` only when `projection` is false.

    Args:
      inputs: `Tensor` laid out according to `data_format`.
      filters: `int` number of filters in the convolution.
      kernel_size: `int` size of the kernel to be used in the convolution.
      strides: `int` strides of the convolution.
      data_format: `str`; "channels_last" selects `[batch, height, width,
        channels]` (NHWC), any other value is treated as `[batch, channels,
        height, width]` (NCHW).
      projection: `bool`; when true the variables are named
        `proj_<layer_idx>_<proj_idx>` instead of `conv_<layer_idx>`.

    Returns:
      The output `Tensor` of the convolution, in the same layout as `inputs`.
    """
    global _layer_idx, _proj_idx

    if strides > 1:
        inputs = fixed_padding(inputs, kernel_size, data_format)

    channels_last = data_format == 'channels_last'
    input_dim = inputs.shape[-1 if channels_last else 1]
    channel_format = 'NHWC' if channels_last else 'NCHW'
    if channels_last:
        strides_tuple = [1, strides, strides, 1]
    else:
        strides_tuple = [1, 1, strides, strides]

    kernel_shape = (kernel_size, kernel_size, input_dim, filters)

    if projection:
        name_template = 'proj_{layer_idx}_{proj_idx}'
    else:
        name_template = 'conv_{layer_idx}'
    base_name = name_template.format(layer_idx=_layer_idx, proj_idx=_proj_idx)

    kernel = tf.get_variable(
        name=lottery.weight_name_of_base_name(base_name),
        shape=kernel_shape,
        dtype=inputs.dtype,
        initializer=tf.compat.v1.variance_scaling_initializer(),
        trainable=True)

    def _initial_mask():
        # Zero-argument initializer: the mask's shape comes from the tensor
        # returned here, so no `shape` argument is passed to get_variable.
        return tf.ones(kernel_shape, dtype=inputs.dtype)

    mask = tf.get_variable(
        name=lottery.mask_name_of_base_name(base_name),
        dtype=inputs.dtype,
        initializer=_initial_mask,
        trainable=False)

    masked_kernel = tf.math.multiply(kernel, mask)
    padding = 'SAME' if strides == 1 else 'VALID'
    outputs = tf.compat.v1.nn.conv2d(
        inputs,
        masked_kernel,
        strides_tuple,
        padding=padding,
        data_format=channel_format)

    _proj_idx += 1
    if not projection:
        _layer_idx += 1

    return outputs