def res_block(x, in_features, out_features, stride, activate_before_res=False):
    """Apply a two-convolution residual block to `x`.

    The residual branch is BN -> ReLU -> conv (strided) -> BN -> ReLU -> conv.
    The shortcut is passed through `ops.avg_pool` and `ops.zero_pad` only when
    `in_features != out_features`.

    Args:
        x: Input tensor.
        in_features: Number of features `x` has.
        out_features: Number of features both convolutions produce.
        stride: Stride of the first convolution and of the shortcut pooling.
        activate_before_res: If True, the initial BN -> ReLU is applied to `x`
            itself, so the shortcut also carries the activated tensor. If
            False, only the residual branch is activated.

    Returns:
        The sum of the shortcut tensor and the residual branch.
    """
    if activate_before_res:
        # The activation is shared: shortcut and branch both start from it.
        with tf.variable_scope('shared_activation'):
            x = ops.batch_norm(x, scope='init_bn')
            x = tf.nn.relu(x)
        shortcut = x
        branch = x
    else:
        # The shortcut keeps the raw input; only the branch is activated.
        shortcut = x
        with tf.variable_scope('res_only_activation'):
            branch = ops.batch_norm(x, scope='init_bn')
            branch = tf.nn.relu(branch)

    with tf.variable_scope('sub1'):
        branch = ops.conv2d(branch, out_features, 3, stride=stride,
                            scope='conv1')

    with tf.variable_scope('sub2'):
        branch = ops.batch_norm(branch, scope='bn2')
        branch = tf.nn.relu(branch)
        branch = ops.conv2d(branch, out_features, 3, stride=1, scope='conv2')

    with tf.variable_scope('sub_add'):
        feature_mismatch = in_features != out_features
        if feature_mismatch:
            shortcut = ops.avg_pool(shortcut, stride, stride)
            shortcut = ops.zero_pad(shortcut, in_features, out_features)
    return shortcut + branch
def residual_block(x, in_filter, out_filter, stride,
                   activate_before_residual=False):
    """Adds a residual connection around two BN->ReLU->3x3 Conv sequences.

    Args:
        x: Tensor that is the output of the previous layer in the model.
        in_filter: Number of filters `x` has.
        out_filter: Number of filters the output of this layer will have.
        stride: Integer stride applied by the first convolution (and by the
            shortcut pooling when the filter counts differ).
        activate_before_residual: Boolean. When True the first BN->ReLU is
            applied to `x` before the shortcut is taken, so the shortcut is
            the activated tensor; when False only the residual branch is
            activated.

    Returns:
        A Tensor: the residual branch added to the (possibly pooled and
        zero-padded) shortcut.
    """
    if activate_before_residual:
        # Pass the BN and ReLU activation up so the shortcut shares it.
        with tf.variable_scope('shared_activation'):
            activated = ops.batch_norm(x, scope='init_bn')
            activated = tf.nn.relu(activated)
        skip = activated
        residual = activated
    else:
        skip = x
        with tf.variable_scope('residual_only_activation'):
            residual = ops.batch_norm(x, scope='init_bn')
            residual = tf.nn.relu(residual)

    with tf.variable_scope('sub1'):
        residual = ops.conv2d(
            residual, out_filter, 3, stride=stride, scope='conv1')

    with tf.variable_scope('sub2'):
        residual = ops.batch_norm(residual, scope='bn2')
        residual = tf.nn.relu(residual)
        residual = ops.conv2d(
            residual, out_filter, 3, stride=1, scope='conv2')

    with tf.variable_scope('sub_add'):
        # If the numbers of filters do not agree, pool and zero pad the skip.
        if in_filter != out_filter:
            skip = ops.avg_pool(skip, stride, stride)
            skip = ops.zero_pad(skip, in_filter, out_filter)
    return skip + residual
def _shake_shake_branch(x, output_filters, stride, rand_forward, rand_backward,
                        is_training):
    """Build one branch of a shake-shake block.

    The branch is ReLU -> conv -> BN -> ReLU -> conv -> BN. Its output is then
    scaled: at training time the forward value equals `branch * rand_forward`
    while the gradient flows only through `branch * rand_backward` (the
    difference is wrapped in `tf.stop_gradient`); otherwise it is halved.

    Args:
        x: Input tensor.
        output_filters: Number of filters for both convolutions.
        stride: Stride of the first convolution.
        rand_forward: Scaling factor used for the forward value in training.
        rand_backward: Scaling factor the gradient sees in training.
        is_training: Python boolean selecting the training or eval scaling.

    Returns:
        The scaled branch output.
    """
    branch = tf.nn.relu(x)
    branch = ops.conv2d(branch, output_filters, 3, stride=stride,
                        scope='conv1')
    branch = ops.batch_norm(branch, scope='bn1')
    branch = tf.nn.relu(branch)
    branch = ops.conv2d(branch, output_filters, 3, scope='conv2')
    branch = ops.batch_norm(branch, scope='bn2')

    if not is_training:
        # Evaluation: fixed scaling of one half.
        return branch * (1.0 / 2)

    # Forward pass evaluates to branch * rand_forward; backward pass only
    # differentiates the first term, i.e. branch * rand_backward.
    return branch * rand_backward + tf.stop_gradient(
        branch * rand_forward - branch * rand_backward)
def residual_block(
        x, in_filter, out_filter, stride, activate_before_residual=False):
    """Applies BN->ReLU->3x3 Conv twice and adds the result back onto `x`.

    Args:
        x: Tensor that is the output of the previous layer in the model.
        in_filter: Number of filters `x` has.
        out_filter: Number of filters that the output of this layer will have.
        stride: Integer stride used by the first convolution; also used to
            pool the shortcut when `in_filter != out_filter`.
        activate_before_residual: Boolean on whether the initial BN->ReLU is
            applied to `x` before the shortcut is taken (True) or only to the
            residual branch (False).

    Returns:
        A Tensor that is the sum of the shortcut and the residual branch.
    """
    # Scope name for the first BN->ReLU depends on whether it is shared.
    if activate_before_residual:
        first_scope = 'shared_activation'
    else:
        first_scope = 'residual_only_activation'

    with tf.variable_scope(first_scope):
        pre_activated = ops.batch_norm(x, scope='init_bn')
        pre_activated = tf.nn.relu(pre_activated)

    # When the activation is shared the shortcut starts from the activated
    # tensor; otherwise it starts from the untouched input.
    identity = pre_activated if activate_before_residual else x
    out = pre_activated

    with tf.variable_scope('sub1'):
        out = ops.conv2d(out, out_filter, 3, stride=stride, scope='conv1')

    with tf.variable_scope('sub2'):
        out = ops.batch_norm(out, scope='bn2')
        out = tf.nn.relu(out)
        out = ops.conv2d(out, out_filter, 3, stride=1, scope='conv2')

    with tf.variable_scope('sub_add'):
        # Filter counts disagree: pool the shortcut, then zero pad it.
        if in_filter != out_filter:
            identity = ops.avg_pool(identity, stride, stride)
            identity = ops.zero_pad(identity, in_filter, out_filter)
    return identity + out
def build_shake_shake_model(images, num_classes, hparams, is_training):
    """Builds the Shake-Shake model.

    Build the Shake-Shake model from https://arxiv.org/abs/1705.07485.

    Args:
        images: Tensor of images that will be fed into the model.
        num_classes: Number of classes that the model needs to predict.
        hparams: tf.HParams object; only `shake_shake_widen_factor` is read
            here, and it multiplies the filter count of every stage.
        is_training: Is the model training or not.

    Returns:
        The logits of the Shake-Shake model.
    """
    depth = 26
    widen_factor = hparams.shake_shake_widen_factor
    blocks_per_stage = int((depth - 2) / 6)

    net = ops.conv2d(images, 16, 3, scope='init_conv')
    net = ops.batch_norm(net, scope='init_bn')

    # (variable scope, base filter count, stride) for each of the three stages.
    stages = (('L1', 16, 1), ('L2', 32, 2), ('L3', 64, 2))
    for scope_name, base_filters, stage_stride in stages:
        with tf.variable_scope(scope_name):
            net = _shake_shake_layer(net, base_filters * widen_factor,
                                     blocks_per_stage, stage_stride,
                                     is_training)

    net = tf.nn.relu(net)
    net = ops.global_avg_pool(net)

    # Fully connected
    return ops.fc(net, num_classes)
def _shake_shake_skip_connection(x, output_filters, stride):
    """Build the skip connection for a shake-shake block.

    If `x` already has `output_filters` channels (axis 3) it is returned
    unchanged. Otherwise two paths are built, each with
    `int(output_filters / 2)` filters, concatenated along axis 3 and batch
    normalised.

    Args:
        x: Input tensor; axis 3 is read as the channel axis.
        output_filters: Desired number of output channels.
        stride: Stride used for the size-1 average pooling on both paths.

    Returns:
        `x` itself, or the batch-normalised concatenation of the two paths.
    """
    if int(x.shape[3]) == output_filters:
        return x

    pool_strides = ops.stride_arr(stride, stride)
    unit_window = [1, 1, 1, 1]
    half_filters = int(output_filters / 2)

    # Skip path 1: strided size-1 pooling followed by a 1x1 convolution.
    path1 = tf.nn.avg_pool(x, unit_window, pool_strides, 'VALID',
                           data_format='NHWC')
    path1 = ops.conv2d(path1, half_filters, 1, scope='path1_conv')

    # Skip path 2: pad with zeros at the end of axes 1 and 2, then crop the
    # first row/column, so the pooling samples positions shifted by one.
    shifted = tf.pad(x, [[0, 0], [0, 1], [0, 1], [0, 0]])[:, 1:, 1:, :]
    path2 = tf.nn.avg_pool(shifted, unit_window, pool_strides, 'VALID',
                           data_format='NHWC')
    path2 = ops.conv2d(path2, half_filters, 1, scope='path2_conv')

    # Concat and apply BN
    merged = tf.concat(values=[path1, path2], axis=3)
    return ops.batch_norm(merged, scope='final_path_bn')
def _shake_shake_skip_connection(x, output_filters, stride):
    """Adds the shake-shake skip connection for `x`.

    Returns `x` untouched when its size along axis 3 already equals
    `output_filters`. Otherwise it builds two half-width paths (one from `x`,
    one from a copy of `x` shifted by one position along axes 1 and 2),
    concatenates them on axis 3 and applies batch norm.

    Args:
        x: Input tensor whose axis 3 holds the current filter count.
        output_filters: Number of filters the skip connection must produce.
        stride: Stride for the pooling step on each path.

    Returns:
        The skip-connection tensor.
    """
    curr_filters = int(x.shape[3])
    needs_projection = curr_filters != output_filters
    if not needs_projection:
        return x

    stride_spec = ops.stride_arr(stride, stride)

    def _pool_then_project(inputs, conv_scope):
        # Size-1 average pooling with the given stride, then a 1x1 conv that
        # produces half of the requested filters.
        pooled = tf.nn.avg_pool(
            inputs, [1, 1, 1, 1], stride_spec, 'VALID', data_format='NHWC')
        return ops.conv2d(
            pooled, int(output_filters / 2), 1, scope=conv_scope)

    # Skip path 1
    first_path = _pool_then_project(x, 'path1_conv')

    # Skip path 2: first pad with 0's then crop
    padded = tf.pad(x, [[0, 0], [0, 1], [0, 1], [0, 0]])
    second_path = _pool_then_project(padded[:, 1:, 1:, :], 'path2_conv')

    # Concat and apply BN
    joined = tf.concat(values=[first_path, second_path], axis=3)
    joined = ops.batch_norm(joined, scope='final_path_bn')
    return joined
def bottleneck_layer(x, n, stride, prob, is_training, alpha, beta):
    """Bottleneck layer for the shake-drop model.

    Runs BN -> 1x1 conv -> BN -> ReLU -> 3x3 conv (strided) -> BN -> ReLU ->
    1x1 conv (to `n * 4` filters) -> BN, applies the shake-drop scaling and
    adds the result to `shortcut(input, n * 4, stride)`.

    Args:
        x: Input tensor.
        n: Filter count of the first two convolutions; the last one uses
            `n * 4`.
        stride: Stride of the 3x3 convolution and of the shortcut.
        prob: Probability that the per-example Bernoulli gate is 1. It is also
            formatted into the variable scope name.
        is_training: Python boolean; selects random or expected scaling.
        alpha: `[low, high]` range of the forward-pass scaling; `high` must be
            greater than `low`.
        beta: `[low, high]` range of the backward-pass scaling; `high` must be
            greater than `low`.

    Returns:
        The scaled bottleneck output plus the shortcut.
    """
    assert alpha[1] > alpha[0]
    assert beta[1] > beta[0]
    with tf.variable_scope('bottleneck_{}'.format(prob)):
        block_input = x
        out = ops.batch_norm(x, scope='bn_1_pre')
        out = ops.conv2d(out, n, 1, scope='1x1_conv_contract')
        out = ops.batch_norm(out, scope='bn_1_post')
        out = tf.nn.relu(out)
        out = ops.conv2d(out, n, 3, stride=stride, scope='3x3')
        out = ops.batch_norm(out, scope='bn_2')
        out = tf.nn.relu(out)
        out = ops.conv2d(out, n * 4, 1, scope='1x1_conv_expand')
        out = ops.batch_norm(out, scope='bn_3')

        # Apply regularization here
        if not is_training:
            mean_alpha = (alpha[1] + alpha[0]) / 2
            # prob is the expectation of the bernoulli variable
            out = (prob + mean_alpha - prob * mean_alpha) * out
        else:
            # Sample one bernoulli gate per example: floor(prob + U[0, 1)).
            batch_size = tf.shape(out)[0]
            gate = tf.floor(
                prob + tf.random_uniform([batch_size, 1, 1, 1],
                                         dtype=tf.float32))
            alpha_values = tf.random_uniform([batch_size, 1, 1, 1],
                                             minval=alpha[0],
                                             maxval=alpha[1],
                                             dtype=tf.float32)
            beta_values = tf.random_uniform([batch_size, 1, 1, 1],
                                            minval=beta[0],
                                            maxval=beta[1],
                                            dtype=tf.float32)
            # Each factor is 1 where the gate is 1, else the sampled value.
            rand_forward = gate + alpha_values - gate * alpha_values
            rand_backward = gate + beta_values - gate * beta_values
            # Forward value uses rand_forward; gradient sees rand_backward.
            out = out * rand_backward + tf.stop_gradient(
                out * rand_forward - out * rand_backward)

        res = shortcut(block_input, n * 4, stride)
        return out + res
def build_wrn_model(images, num_classes, wrn_size):
    """Builds the WRN model.

    Build the Wide ResNet model from https://arxiv.org/abs/1605.07146.

    Args:
        images: Tensor of images that will be fed into the Wide ResNet Model.
        num_classes: Number of classes that the model needs to predict.
        wrn_size: Parameter that scales the number of filters in the Wide
            ResNet model.

    Returns:
        The logits of the Wide ResNet model.
    """
    conv_size = 3
    blocks_per_group = 4
    filters = [min(wrn_size, 16), wrn_size, wrn_size * 2, wrn_size * 4]
    strides = [1, 2, 2]  # stride for each group of residual blocks

    # Run the first conv
    with tf.variable_scope('init'):
        x = ops.conv2d(images, filters[0], conv_size, scope='init_conv')

    first_x = x  # Res from the beginning
    orig_x = x  # Res from previous block

    for block_num in range(1, 4):
        in_filter = filters[block_num - 1]
        out_filter = filters[block_num]
        group_stride = strides[block_num - 1]

        # First block of the group changes width/stride; only the very first
        # group shares its activation with the shortcut.
        with tf.variable_scope('unit_{}_0'.format(block_num)):
            x = residual_block(
                x, in_filter, out_filter, group_stride,
                activate_before_residual=(block_num == 1))

        for i in range(1, blocks_per_group):
            with tf.variable_scope('unit_{}_{}'.format(block_num, i)):
                x = residual_block(
                    x, out_filter, out_filter, 1,
                    activate_before_residual=False)

        # Extra residual connection around the whole group.
        x, orig_x = _res_add(in_filter, out_filter, group_stride, x, orig_x)

    # Residual connection from the first conv to the end of the last group.
    final_stride_val = np.prod(strides)
    x, _ = _res_add(filters[0], filters[3], final_stride_val, x, first_x)

    with tf.variable_scope('unit_last'):
        x = ops.batch_norm(x, scope='final_bn')
        x = tf.nn.relu(x)
        x = ops.global_avg_pool(x)
        logits = ops.fc(x, num_classes)
    return logits
def build_wrn_model(images, num_classes, wrn_size):
    """Builds the Wide ResNet (WRN) model graph.

    Build the Wide ResNet model from https://arxiv.org/abs/1605.07146.

    Args:
        images: Tensor of images that will be fed into the Wide ResNet Model.
        num_classes: Number of classes that the model needs to predict.
        wrn_size: Parameter that scales the number of filters in the Wide
            ResNet model; the three groups use 1x, 2x and 4x this value.

    Returns:
        The logits of the Wide ResNet model.
    """
    num_blocks_per_resnet = 4
    filters = [min(wrn_size, 16), wrn_size, wrn_size * 2, wrn_size * 4]
    strides = [1, 2, 2]  # stride for each resblock group

    # Run the first conv
    with tf.variable_scope('init'):
        net = ops.conv2d(images, filters[0], 3, scope='init_conv')
        stem = net  # Res from the beginning
        prev_group = net  # Res from previous block

    # Walk consecutive (input width, output width, stride) triples.
    group_specs = zip(filters[:-1], filters[1:], strides)
    for block_num, (fan_in, fan_out, stride) in enumerate(group_specs,
                                                          start=1):
        with tf.variable_scope('unit_{}_0'.format(block_num)):
            share_activation = block_num == 1
            net = residual_block(
                net, fan_in, fan_out, stride,
                activate_before_residual=share_activation)
        for i in range(1, num_blocks_per_resnet):
            with tf.variable_scope('unit_{}_{}'.format(block_num, i)):
                net = residual_block(net, fan_out, fan_out, 1,
                                     activate_before_residual=False)
        net, prev_group = _res_add(fan_in, fan_out, stride, net, prev_group)

    final_stride_val = np.prod(strides)
    net, _ = _res_add(filters[0], filters[3], final_stride_val, net, stem)

    with tf.variable_scope('unit_last'):
        net = ops.batch_norm(net, scope='final_bn')
        net = tf.nn.relu(net)
        net = ops.global_avg_pool(net)
        logits = ops.fc(net, num_classes)
    return logits
def bottleneck_layer(x, n, stride, prob, is_training, alpha, beta):
    """Shake-drop bottleneck: contract, 3x3, expand, scale, add shortcut.

    Args:
        x: Input tensor.
        n: Number of filters for the contracting 1x1 and the 3x3 convolution;
            the expanding 1x1 convolution produces `n * 4` filters.
        stride: Stride for the 3x3 convolution; also passed to `shortcut`.
        prob: Probability used for the Bernoulli gate (and embedded in the
            variable scope name).
        is_training: Python boolean. True samples random scaling factors,
            False multiplies by their expectation.
        alpha: Two-element range for the forward scaling; must be increasing.
        beta: Two-element range for the backward scaling; must be increasing.

    Returns:
        Scaled bottleneck output added to `shortcut(x, n * 4, stride)`.
    """
    assert alpha[1] > alpha[0]
    assert beta[1] > beta[0]

    def _gated(gate, sampled):
        # Equals 1 where gate == 1 and `sampled` where gate == 0.
        return gate + sampled - gate * sampled

    with tf.variable_scope('bottleneck_{}'.format(prob)):
        input_layer = x
        h = ops.batch_norm(input_layer, scope='bn_1_pre')
        h = ops.conv2d(h, n, 1, scope='1x1_conv_contract')
        h = tf.nn.relu(ops.batch_norm(h, scope='bn_1_post'))
        h = ops.conv2d(h, n, 3, stride=stride, scope='3x3')
        h = tf.nn.relu(ops.batch_norm(h, scope='bn_2'))
        h = ops.conv2d(h, n * 4, 1, scope='1x1_conv_expand')
        h = ops.batch_norm(h, scope='bn_3')

        # Apply regularization here
        if is_training:
            # Sample bernoulli with prob, one value per example.
            per_example = [tf.shape(h)[0], 1, 1, 1]
            uniform = tf.random_uniform(per_example, dtype=tf.float32)
            binary_tensor = tf.floor(prob + uniform)
            alpha_values = tf.random_uniform(
                per_example, minval=alpha[0], maxval=alpha[1],
                dtype=tf.float32)
            beta_values = tf.random_uniform(
                per_example, minval=beta[0], maxval=beta[1],
                dtype=tf.float32)
            rand_forward = _gated(binary_tensor, alpha_values)
            rand_backward = _gated(binary_tensor, beta_values)
            # stop_gradient hides the forward factor from backprop, so the
            # gradient is scaled by rand_backward only.
            h = h * rand_backward + tf.stop_gradient(
                h * rand_forward - h * rand_backward)
        else:
            expected_alpha = (alpha[1] + alpha[0]) / 2
            # prob is the expectation of the bernoulli variable
            h = (prob + expected_alpha - prob * expected_alpha) * h

        res = shortcut(input_layer, n * 4, stride)
        return h + res
def no_relu_residual_block(x, in_filter, out_filter, stride, weight_decay=0.0):
    """Residual block: conv -> BN -> ReLU -> conv -> BN, add shortcut, ReLU.

    Unlike a pre-activation block, no BN/ReLU is applied to `x` before the
    first convolution. The shortcut goes through `ops.avg_pool` and
    `ops.zero_pad` whenever the filter counts differ or `stride != 1`.

    Args:
        x: Input tensor.
        in_filter: Number of filters `x` has.
        out_filter: Number of filters produced by both convolutions.
        stride: Stride of the first convolution and of the shortcut pooling.
        weight_decay: Forwarded to both `ops.conv2d` calls.

    Returns:
        ReLU of the shortcut plus the residual branch.
    """
    shortcut_x = x

    with tf.variable_scope('sub1'):
        branch = ops.conv2d(x, out_filter, 3, stride=stride, scope='conv1',
                            weight_decay=weight_decay)
        branch = ops.batch_norm(branch, 0.9, scope='bn1')

    with tf.variable_scope('sub2'):
        branch = tf.nn.relu(branch)
        branch = ops.conv2d(branch, out_filter, 3, stride=1, scope='conv2',
                            weight_decay=weight_decay)
        branch = ops.batch_norm(branch, 0.9, scope='bn2')

    with tf.variable_scope('sub_add'):
        # Shortcut already matches only when width and resolution are kept.
        shapes_match = in_filter == out_filter and stride == 1
        if not shapes_match:
            shortcut_x = ops.avg_pool(shortcut_x, 1, stride)
            shortcut_x = ops.zero_pad(shortcut_x, in_filter, out_filter)

    return tf.nn.relu(shortcut_x + branch)
def build_architecture(self):
    """Build the ResNet graph and store its logits in `self.orig_logits`.

    Reads `self.model_name`, `self.X`, `self.weight_decay` and
    `self.num_classes`. The network is an initial conv -> BN -> ReLU, three
    groups of `no_relu_residual_block`s with filters 16/32/64 and strides
    1/2/2, then global average pooling and a fully connected layer.

    Raises:
        ValueError: If `self.model_name` is not one of 'resnet_8',
            'resnet_14', 'resnet_20' or 'resnet_26'. The check runs before
            any graph op is created.
    """
    # Number of residual blocks in each of the three groups, per model.
    blocks_by_model = {
        'resnet_8': [1, 1, 1],
        'resnet_14': [2, 2, 2],
        'resnet_20': [3, 3, 3],
        'resnet_26': [4, 4, 4],
    }
    num_blocks_per_resnet = blocks_by_model.get(self.model_name)
    if num_blocks_per_resnet is None:
        # Previously an unknown name left the block counts unbound and failed
        # later with UnboundLocalError, after part of the graph was built.
        raise ValueError(
            'Unsupported model_name {!r}; expected one of {}'.format(
                self.model_name, sorted(blocks_by_model)))

    strides = [1, 2, 2]
    filters = [16, 16, 32, 64]

    # First Convolutional Network
    with tf.variable_scope('init'):
        x = self.X
        output_filters = filters[0]
        x = ops.conv2d(x, output_filters, 3, 1, scope='init_conv',
                       weight_decay=self.weight_decay)
        x = ops.batch_norm(x, 0.9, scope='init_bn')
        x = tf.nn.relu(x)

    for block_num in range(1, 4):
        # The first block of each group changes the width and/or stride.
        with tf.variable_scope('unit_{}_0'.format(block_num)):
            x = no_relu_residual_block(x, filters[block_num - 1],
                                       filters[block_num],
                                       strides[block_num - 1],
                                       self.weight_decay)
        for i in range(1, num_blocks_per_resnet[block_num - 1]):
            with tf.variable_scope('unit_{}_{}'.format(block_num, i)):
                x = no_relu_residual_block(x, filters[block_num],
                                           filters[block_num], 1,
                                           self.weight_decay)
    print("Building.....")

    with tf.variable_scope('unit_last'):
        x = ops.global_avg_pool(x)
        self.orig_logits = ops.fc(x, self.num_classes, self.weight_decay)
def build_wrn_model(input_data, num_classes, wrn_size):
    """Build a Wide ResNet graph and return its logits.

    The graph is an initial convolution, three residual layers of four
    `res_block`s each (two convolutions per block), an extra `res_add` around
    every layer and around the whole stack, then BN -> ReLU -> global average
    pooling -> fully connected layer.

    Args:
        input_data: Input tensor fed to the first convolution.
        num_classes: Size of the final fully connected layer.
        wrn_size: Base feature count; the three residual layers use
            `wrn_size`, `wrn_size * 2` and `wrn_size * 4` features.

    Returns:
        The logits tensor produced by `ops.fc`.
    """
    conv_kernel = 3
    widths = [min(wrn_size, 16), wrn_size, wrn_size * 2, wrn_size * 4]
    layer_strides = [1, 2, 2]  # stride for each residual layer

    # First convolutional layer.
    with tf.variable_scope('init'):
        net = ops.conv2d(input_data, widths[0], conv_kernel,
                         scope='init_conv')
    stem = net
    layer_input = net

    # Residual layers 1..3, four blocks per layer.
    for layer_idx in range(1, 4):
        width_in = widths[layer_idx - 1]
        width_out = widths[layer_idx]
        layer_stride = layer_strides[layer_idx - 1]

        with tf.variable_scope('unit_{}_0'.format(layer_idx)):
            # The first block of a layer starts from the layer input tensor.
            net = res_block(layer_input, width_in, width_out,
                            stride=layer_stride,
                            activate_before_res=(layer_idx == 1))

        for block_idx in range(1, 4):
            with tf.variable_scope('unit_{}_{}'.format(layer_idx,
                                                       block_idx)):
                net = res_block(net, width_out, width_out, 1,
                                activate_before_res=False)

        net, layer_input = res_add(net, layer_input, width_in, width_out,
                                   layer_stride)

    # Residual connection from the stem across all three layers.
    overall_stride = np.prod(layer_strides)
    net, _ = res_add(net, stem, widths[0], widths[3], overall_stride)

    with tf.variable_scope('unit_last'):
        net = ops.batch_norm(net, scope='final_bn')
        net = tf.nn.relu(net)
        net = ops.global_avg_pool(net)
        logits = ops.fc(net, num_classes)
    return logits
def build_shake_drop_model(images, num_classes, is_training):
    """Builds the PyramidNet Shake-Drop model.

    Build the PyramidNet Shake-Drop model from
    https://arxiv.org/abs/1802.02375.

    Args:
        images: Tensor of images that will be fed into the model.
        num_classes: Number of classes that the model needs to predict.
        is_training: Is the model training or not.

    Returns:
        The logits of the PyramidNet Shake-Drop model.
    """
    # ShakeDrop Hparams
    p_l = 0.5
    alpha_shake = [-1, 1]
    beta_shake = [0, 1]

    # PyramidNet Hparams
    alpha = 200
    depth = 272
    # This is for the bottleneck architecture specifically
    n = int((depth - 2) / 9)
    start_channel = 16
    add_channel = alpha / (3 * n)
    total_layers = n * 3

    # Building the models
    x = ops.conv2d(images, 16, 3, scope='init_conv')
    x = ops.batch_norm(x, scope='init_bn')

    layer_num = 1
    # Three stages of n bottleneck layers. The first layer of each stage uses
    # the stage stride (1, 2, 2); the remaining n - 1 layers use stride 1.
    for stage_stride in (1, 2, 2):
        for layer_stride in [stage_stride] + [1] * (n - 1):
            # The channel count grows by the same amount at every layer.
            start_channel += add_channel
            prob = calc_prob(layer_num, total_layers, p_l)
            x = bottleneck_layer(
                x, round_int(start_channel), layer_stride, prob, is_training,
                alpha_shake, beta_shake)
            layer_num += 1

    assert layer_num - 1 == total_layers
    x = ops.batch_norm(x, scope='final_bn')
    x = tf.nn.relu(x)
    x = ops.global_avg_pool(x)

    # Fully connected
    return ops.fc(x, num_classes)
def inference(sess, feats, seq_lens, params):
    """Build the deepSpeech model.

    The graph is two VALID convolutions (each followed by batch norm and a
    ReLU capped at 20), a stacked bidirectional RNN, and a final linear layer.

    Args:
        sess: Not used by this function.
        feats: MFCC features returned from distorted_inputs() or inputs();
            laid out as N, T, F according to the layout notes in this code.
        seq_lens: Input sequence length per utterance.
        params: Parameters of the model. This function reads `num_filters`,
            `use_fp16`, `batch_size`, `num_hidden` and `num_rnn_layers`.

    Returns:
        Logits, reshaped to [-1, params.batch_size, NUM_CLASSES].
    """
    num_filters = params.num_filters
    use_fp16 = params.use_fp16

    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1'):
        # N, T, F -> N, T, F, 1
        conv_in = tf.expand_dims(feats, axis=3)
        conv1_kernel = _variable_with_weight_decay(
            'weights',
            shape=[11, 41, 1, num_filters],
            wd_value=None,
            use_fp16=use_fp16)
        conv1 = tf.nn.conv2d(conv_in, conv1_kernel, [1, 2, 2, 1],
                             padding='VALID')
        # batch normalization, then clipped ReLU
        conv1 = custom_ops.batch_norm(conv1)
        conv1 = custom_ops.relux(conv1, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2'):
        conv2_kernel = _variable_with_weight_decay(
            'weights',
            shape=[11, 21, num_filters, num_filters],
            wd_value=None,
            use_fp16=use_fp16)
        conv2 = tf.nn.conv2d(conv1, conv2_kernel, [1, 1, 2, 1],
                             padding='VALID')
        # batch normalization, then clipped ReLU
        conv2 = custom_ops.batch_norm(conv2)
        conv2 = custom_ops.relux(conv2, capping=20)
        _activation_summary(conv2)

    ######################
    #  recurrent layers
    ######################
    # Reshape conv output to fit rnn input: N, T, F * C
    conv_dims = conv2.get_shape().dims
    feat_dim = conv_dims[2].value * conv_dims[3].value
    rnn_input = tf.reshape(conv2, [params.batch_size, -1, feat_dim])
    # Permute into time major order for rnn: T, N, F * C
    rnn_input = tf.transpose(rnn_input, perm=[1, 0, 2])

    # The same cell object is handed over for both directions.
    rnn_cell = custom_ops.CustomRNNCell2(params.num_hidden)
    conved_seq_lens = get_rnn_seqlen(seq_lens)
    rnn_outputs = custom_ops.stacked_brnn(
        rnn_cell, rnn_cell, params.num_hidden, params.num_rnn_layers,
        rnn_input, params.batch_size, conved_seq_lens)
    _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        out_width = params.num_hidden * 2
        weights = _variable_with_weight_decay(
            'weights', [NUM_CLASSES, out_width],
            wd_value=None,
            use_fp16=use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0),
                                  use_fp16)
        flat_outputs = tf.reshape(rnn_outputs, [-1, params.num_hidden * 2])
        projected = tf.matmul(flat_outputs, weights,
                              transpose_a=False, transpose_b=True)
        logits = tf.add(projected, biases, name=scope.name)
        logits = tf.reshape(logits, [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits
def inference(session, feats, seq_lens, params):
    """Build the deepSpeech model.

    The graph is two VALID convolutions (each with bias, batch norm and a ReLU
    capped at 20), a multi-layer uni- or bi-directional dynamic RNN, and a
    final linear layer.

    All variables are created through helper functions rather than
    tf.Variable() so they can be shared across multiple GPU training runs.

    Args:
        session: Not used by this function.
        feats: MFCC features returned from distorted_inputs() or inputs();
            laid out as N, T, F according to the layout notes in this code.
        seq_lens: Input sequence length per utterance.
        params: Parameters of the model. This function reads `use_fp16`,
            `num_filters`, `batch_size`, `num_hidden`, `num_rnn_layers` and
            `rnn_type`.

    Returns:
        Logits, reshaped to [-1, params.batch_size, NUM_CLASSES].
    """
    dtype = tf.float16 if params.use_fp16 else tf.float32

    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1'):
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[20, 5, 1, params.num_filters],
            wd_value=None, use_fp16=params.use_fp16)

        # N, T, F -> N, T, F, 1. `axis` replaces the deprecated `dim` keyword.
        feats = tf.expand_dims(feats, axis=-1)
        conv = tf.nn.conv2d(feats, kernel, [1, 2, 2, 1], padding='VALID')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases)

        # batch normalization
        bn = custom_ops.batch_norm(bias)

        # clipped ReLU
        conv1 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2'):
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[10, 5, params.num_filters, params.num_filters],
            wd_value=None, use_fp16=params.use_fp16)

        conv = tf.nn.conv2d(conv1, kernel, [1, 2, 1, 1], padding='VALID')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases)

        # batch normalization
        bn = custom_ops.batch_norm(bias)

        # clipped ReLU
        conv2 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv2)

    ######################
    #  recurrent layers
    ######################
    # Width of one RNN input frame is F * C of the conv output. Read it from
    # the static shape so feature sizes other than the original one work.
    # Fall back to the previously hard-coded 75 * num_filters when the static
    # shape is unknown.
    # NOTE(review): 75 presumably corresponds to 161 input features after the
    # two VALID convolutions - confirm against the input pipeline.
    conv_shape = conv2.get_shape()
    freq_bins = channels = None
    if conv_shape.ndims == 4:
        _, _, freq_bins, channels = conv_shape.as_list()
    if freq_bins is not None and channels is not None:
        feat_dim = freq_bins * channels
    else:
        feat_dim = 75 * params.num_filters

    # Reshape conv output to fit rnn input: N, T, F * C
    rnn_input = tf.reshape(conv2, [params.batch_size, -1, feat_dim])
    # Permute into time major order for rnn: T, N, F * C
    rnn_input = tf.transpose(rnn_input, perm=[1, 0, 2])

    # Make one instance of cell on a fixed device,
    # and use copies of the weights on other devices.
    # NOTE(review): the same cell object is repeated for every layer and the
    # same multi_cell is used for both directions; kept as in the original.
    cell = custom_ops.CustomRNNCell2(params.num_hidden,
                                     use_fp16=params.use_fp16)
    multi_cell = tf.contrib.rnn.MultiRNNCell([cell] * params.num_rnn_layers)

    rnn_seq_lens = get_rnn_seqlen(seq_lens)
    if params.rnn_type == 'uni-dir':
        rnn_outputs, _ = tf.nn.dynamic_rnn(multi_cell, rnn_input,
                                           sequence_length=rnn_seq_lens,
                                           dtype=dtype, time_major=True,
                                           swap_memory=True)
    else:
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            multi_cell, multi_cell, rnn_input,
            sequence_length=rnn_seq_lens, dtype=dtype,
            time_major=True,
            swap_memory=False)
        # Forward and backward outputs are summed, not concatenated.
        outputs_fw, outputs_bw = outputs
        rnn_outputs = outputs_fw + outputs_bw
    _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay(
            'weights', [NUM_CLASSES, params.num_hidden],
            wd_value=None,
            use_fp16=params.use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0),
                                  params.use_fp16)
        logit_inputs = tf.reshape(rnn_outputs, [-1, cell.output_size])
        logits = tf.add(tf.matmul(logit_inputs, weights,
                                  transpose_a=False, transpose_b=True),
                        biases, name=scope.name)
        logits = tf.reshape(logits, [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits