def residual_layer(name, l, out_filters, strides, data_format):
    ch_out = out_filters
    data_format = get_data_format(data_format, keras_mode=False)
    ch_dim = 3 if data_format == 'NHWC' else 1
    ch_in = _get_dim(l, ch_dim)

    l_in = l
    with tf.variable_scope('{}.0'.format(name)):
        l = BNReLU(l)
        l = SeparableConv2D('conv1', l, ch_out, 3, strides=strides, activation=BNReLU)
        l = SeparableConv2D('conv2', l, ch_out, 3)
        # The second conv needs a BN before the addition.
        l = BatchNorm('bn2', l)

        shortcut = l_in
        if strides > 1:
            shortcut = AvgPooling('pool', shortcut, 2)
        if ch_in < ch_out:
            # Zero-pad the channel dimension up to ch_out.
            pad_paddings = [[0, 0], [0, 0], [0, 0], [0, 0]]
            pad_width = ch_out - ch_in
            pad_paddings[ch_dim] = [0, pad_width]
            shortcut = tf.pad(shortcut, pad_paddings)
        elif ch_in > ch_out:
            # Keep the first ch_out channels; project the rest with a 1x1 conv.
            if data_format == 'NHWC':
                shortcut1 = shortcut[:, :, :, :ch_out]
                shortcut2 = shortcut[:, :, :, ch_out:]
            else:
                shortcut1 = shortcut[:, :ch_out, :, :]
                shortcut2 = shortcut[:, ch_out:, :, :]
            # strides=1 here: the shortcut was already downsampled by the pooling
            # above, so striding again would break the addition with shortcut1.
            shortcut2 = Conv2D('conv_short', shortcut2, ch_out, 1, strides=1)
            shortcut2 = BatchNorm('bn_short', shortcut2)
            shortcut = shortcut1 + shortcut2
        l += shortcut
    return l
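# Usage sketch for residual_layer (added for illustration, not from the original
# source). Assumes tensorpack + TF 1.x and the surrounding repo's helpers
# (get_data_format, _get_dim); tensorpack's BatchNorm needs an enclosing
# TowerContext. The function name is hypothetical.
def _demo_residual_layer():
    import tensorflow as tf
    from tensorpack.tfutils.tower import TowerContext

    x = tf.placeholder(tf.float32, [None, 32, 32, 16])
    with TowerContext('', is_training=True):
        # strides=2 halves the spatial dims; zero-padding widens 16 -> 32 channels.
        y = residual_layer('res1', x, out_filters=32, strides=2,
                           data_format='channels_last')
    assert y.get_shape().as_list() == [None, 16, 16, 32]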
def get_bn(zero_init=False):
    """
    Zero init gamma is good for resnet. See https://arxiv.org/abs/1706.02677.
    """
    if zero_init:
        return lambda x, name=None: BatchNorm('bn', x, gamma_initializer=tf.zeros_initializer())
    else:
        return lambda x, name=None: BatchNorm('bn', x)
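# Typical use of get_bn (illustrative, mirroring tensorpack's ImageNet ResNet
# example): zero-init gamma on the last BN of a residual branch makes each
# block start out as an identity mapping. `l` and `ch_out` are hypothetical.
#
#   l = Conv2D('conv3', l, ch_out * 4, 1, activation=get_bn(zero_init=True))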
def BNNoReLU(x, name=None):
    """ A shorthand of BatchNormalization. """
    if name is None:
        x = BatchNorm('bn', x)
    else:
        x = BatchNorm(name, x)
    return x
@layer_register(log_shape=True)  # assumed: call sites below pass a scope name first (tensorpack layer)
def AccuracyBoost(x):
    ''' Accuracy boost block for bottleneck layers. '''
    nch = x.get_shape().as_list()[-1]
    g = GlobalAvgPooling('gpool', x)
    g = tf.reshape(g, [-1, 1, 1, nch])
    # Two frozen BNs turn the pooled vector into positive/negative channel gates in (0, 1).
    wp = tf.nn.sigmoid(BatchNorm('p/bn', g, training=False))
    wn = tf.nn.sigmoid(BatchNorm('n/bn', -g, training=False))
    return tf.multiply(x, wp + wn, name='res')
def get_bn(zero_init=False):
    """
    Zero init gamma is good for resnet. See https://arxiv.org/abs/1706.02677.
    Besides, the moving average deserves more careful control.
    """
    if zero_init:
        return lambda x, name: BatchNorm('bn', x, gamma_init=tf.zeros_initializer())
    else:
        return lambda x, name: BatchNorm('bn', x)
def batch_norm(scope, name, layer, decay, layer_num, norm_pattern, training):
    with tf.variable_scope(scope):
        # Only normalize layers whose index is not a multiple of norm_pattern.
        if layer_num % norm_pattern == 0:
            return layer
        if training:
            layer = BatchNorm(name, layer)
        elif decay is not None:
            layer = BatchNorm(name, layer, decay=decay, use_local_stat=False)
        else:
            layer = BatchNorm(name, layer, use_local_stat=False)
    return layer
def conv_block(x, growth_rate, name, freeze):
    # BN -> ReLU -> 1x1 conv -> BN -> ReLU -> 3x3 conv, then concat with the input
    # (the DenseNet unit). The branch stays in x1; x is kept for the concat.
    x1 = BatchNorm(name + '_0_bn', x, epsilon=1e-5)
    x1 = tf.nn.relu(x1, name=name + '_0_relu')
    x1 = Conv2D(name + '_1_conv', x1, 4 * growth_rate, 1, strides=1,
                activation=tf.identity, use_bias=False)
    x1 = BatchNorm(name + '_1_bn', x1, epsilon=1e-5)
    x1 = tf.nn.relu(x1, name=name + '_1_relu')
    x1 = Conv2D(name + '_2_conv', x1, growth_rate, 3, padding='same', use_bias=False)
    x1 = tf.stop_gradient(x1) if freeze else x1
    x = tf.concat([x, x1], 1, name=name + '_concat')
    return x
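# Sketch (not in the original source): conv_block is the DenseNet unit; a dense
# block just chains it, growing the channel axis (axis 1, channels_first) by
# growth_rate each iteration. The function name is hypothetical.
def _demo_dense_block(x, num_layers, growth_rate, name, freeze=False):
    for i in range(num_layers):
        x = conv_block(x, growth_rate, name='{}_block{}'.format(name, i + 1),
                       freeze=freeze)
    return x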
def shufflenet_unit_v2(x, out_channels, downsample):
    in_channels = int(x.shape[1])
    mid_channels = out_channels // 2
    in_channels2 = in_channels // 2
    assert in_channels % 2 == 0

    if downsample:
        # Both branches see the full input; the shortcut branch downsamples too.
        y1, x2 = x, x
        y1 = depthwise_conv('shortcut_dconv', y1, channels=in_channels,
                            kernel_size=3, strides=2)
        y1 = BatchNorm('shortcut_dconv_bn', y1)
        y1 = Conv2D('shortcut_conv', y1, filters=in_channels, kernel_size=1,
                    activation=BNReLU)
    else:
        # Channel split: half passes through as identity, half through the convs.
        y1, x2 = tf.split(x, 2, axis=1)

    y2_in_channels = in_channels if downsample else in_channels2
    y2 = Conv2D('conv1', x2, filters=mid_channels, kernel_size=1, activation=BNReLU)
    y2 = depthwise_conv('dconv', y2, channels=mid_channels, kernel_size=3,
                        strides=(2 if downsample else 1))
    y2 = BatchNorm('dconv_bn', y2)
    y2_out_channels = out_channels - y2_in_channels
    y2 = Conv2D('conv2', y2, filters=y2_out_channels, kernel_size=1, activation=BNReLU)

    output = tf.concat([y1, y2], axis=1)
    output = channel_shuffle(output, 2)
    return output
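# Shape sketch for shufflenet_unit_v2 (illustrative; assumes channels_first data
# and that Conv2D/BatchNorm/depthwise_conv are configured for it, e.g. via a
# tensorpack argscope). A downsample unit halves spatial dims and can widen
# channels; a regular unit keeps both:
#
#   x: (?, 116, 28, 28) -> shufflenet_unit_v2(x, 232, downsample=True)  -> (?, 232, 14, 14)
#   x: (?, 232, 14, 14) -> shufflenet_unit_v2(x, 232, downsample=False) -> (?, 232, 14, 14)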
def BNLeakyReLU(x, name=None):
    """ A shorthand of BatchNormalization + LeakyReLU. """
    x = BatchNorm('bn', x)
    x = tf.nn.leaky_relu(x, name=name)
    return x
def _factorized_reduction(scope_name, x, out_filters, data_format):
    """Reduces the shape of x without information loss due to striding.
    Copied from https://github.com/melodyguan/enas/blob/master/src/cifar10/general_child.py
    """
    assert out_filters % 2 == 0, (
        "Need even number of filters when using this factorized reduction.")
    with tf.variable_scope(scope_name):
        # Path 1: stride-2 subsampling (pool_size 1), then a 1x1 conv.
        path1 = AvgPooling('path1', x, pool_size=1, strides=2, padding='valid')
        path1 = Conv2D('path1_conv', path1, out_filters // 2, 1, padding='same')

        # Path 2: first pad with 0s on the right and bottom, then shift by one
        # pixel so this subsampling grid is offset from path 1.
        data_format = get_data_format(data_format, keras_mode=False)
        if data_format == "NHWC":
            pad_arr = [[0, 0], [0, 1], [0, 1], [0, 0]]
            path2 = tf.pad(x, pad_arr)[:, 1:, 1:, :]
            ch_dim = 3
        else:
            pad_arr = [[0, 0], [0, 0], [0, 1], [0, 1]]
            path2 = tf.pad(x, pad_arr)[:, :, 1:, 1:]
            ch_dim = 1
        # Pool/conv the shifted path2 (the flattened source mistakenly fed x and
        # path1 here, discarding the shifted tensor).
        path2 = AvgPooling('path2', path2, pool_size=1, strides=2, padding='valid')
        path2 = Conv2D('path2_conv', path2, out_filters // 2, 1, padding='same')

        final_path = tf.concat(values=[path1, path2], axis=ch_dim)
        final_path = BatchNorm('bn', final_path)
        return final_path
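# Shape sketch (added for illustration): the two offset stride-2 paths each
# produce out_filters // 2 channels, so spatial dims halve while every input
# pixel contributes to one path. The function name is hypothetical.
def _demo_factorized_reduction():
    import tensorflow as tf
    from tensorpack.tfutils.tower import TowerContext

    x = tf.placeholder(tf.float32, [None, 32, 32, 16])
    with TowerContext('', is_training=True):
        y = _factorized_reduction('reduce0', x, out_filters=32,
                                  data_format='channels_last')
    assert y.get_shape().as_list() == [None, 16, 16, 32]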
def get_logits(image, num_classes=1000):
    # dropblock: keep_prob is a scheduled (non-trainable) variable during training.
    if get_current_tower_context().is_training:
        dropblock_keep_prob = tf.get_variable('dropblock_keep_prob', (),
                                              dtype=tf.float32, trainable=False)
    else:
        dropblock_keep_prob = None

    l = image
    # conv1: concatenating [l, -l] doubles the channels before BNReLU (CReLU-style).
    l = Conv2D('conv1', l, 16, 4, strides=2, activation=None, padding='SAME')
    with tf.variable_scope('conv1'):
        l = BNReLU(tf.concat([l, -l], axis=-1))
    l = MaxPooling('pool1', l, 2)

    # conv2
    l = LinearBottleneck('conv2', l, 48, 24, 5, t=1, use_ab=True)
    l = l + LinearBottleneck('conv3', l, 24, 24, 5, t=2, use_ab=True)

    ch_all = [48, 72, 96]
    iters = [2, 4, 4]
    mults = [3, 4, 6]
    bsize = [3, 3, 3]

    for ii, (ch, it, mu, bs) in enumerate(zip(ch_all, iters, mults, bsize)):
        use_ab = (ii < 2)
        for jj in range(it):
            name = 'inc{}/{}'.format(ii, jj)
            stride = 2 if jj == 0 else 1
            swap_block = (jj % 2 == 1)
            l = inception(name, l, ch, stride, t=mu, swap_block=swap_block,
                          use_ab=use_ab)
        l = DropBlock('inc{}/drop'.format(ii), l,
                      keep_prob=dropblock_keep_prob, block_size=bs)

    l = Conv2D('convf', l, 96 * 6, 1, activation=None)
    l = BatchNorm('convf/bn', l)
    l = tf.nn.relu(l)
    l = GlobalAvgPooling('poolf', l)

    fc = FullyConnected('fc', l, 1280, activation=BNReLU)
    fc = Dropout(fc, keep_prob=0.9)
    logits = FullyConnected('linear', fc, num_classes, use_bias=True)
    return logits
def transition_block(x, reduction, name):
    x = BatchNorm(name + '_bn', x, epsilon=1e-5)
    x = tf.nn.relu(x, name=name + '_relu')
    x = Conv2D(name + '_conv', x, int(x.shape[1].value * reduction), 1, use_bias=False)
    x = AvgPooling(name + '_pool', x, 2, strides=2, padding='SAME')
    return x
@layer_register(log_shape=True)  # assumed: call sites pass a scope name first (tensorpack layer)
def AccuracyBoost(x):
    ''' Accuracy boost block for bottleneck layers. '''
    nch = x.get_shape().as_list()[-1]
    g = GlobalAvgPooling('gpool', x)
    # Learnable per-channel scale on the pooled vector, then BN + sigmoid gate.
    W = tf.get_variable('W', shape=(nch,),
                        initializer=tf.variance_scaling_initializer(2.0))
    g = BatchNorm('bn', tf.multiply(g, W))
    ab = tf.reshape(tf.nn.sigmoid(g), (-1, 1, 1, nch))
    return tf.multiply(x, ab, name='res')
def Norm(x, type, gamma_initializer=tf.constant_initializer(1.)):
    """
    A norm layer (which depends on 'type').

    Args:
        type (str): one of "BN" or "GN"
    """
    assert type in ["BN", "GN"]
    if type == "BN":
        return BatchNorm('bn', x, gamma_initializer=gamma_initializer)
    else:
        return GroupNorm('gn', x, gamma_initializer=gamma_initializer)
def projection_layer(name, layer, out_filters, ch_dim, id_mask_slice=None):
    with tf.variable_scope(name):
        n_dim = len(layer.get_shape().as_list())
        if n_dim == 4:
            layer = tf.nn.relu(layer)
            layer = Conv2D('conv1x1_proj', layer, out_filters, 1, strides=1,
                           activation=tf.identity)
            layer = BatchNorm('bn_proj', layer)
        elif n_dim == 2:
            layer = tf.nn.relu(layer)
            layer = FullyConnected('fc_proj', layer, out_filters,
                                   activation=tf.identity)
        else:
            raise ValueError("Projection cannot handle tensor of dim {}".format(n_dim))
    return layer
def residual_bottleneck_layer(name, l, out_filters, strides, data_format):
    data_format = get_data_format(data_format, keras_mode=False)
    ch_dim = 3 if data_format == 'NHWC' else 1
    ch_in = _get_dim(l, ch_dim)
    ch_base = out_filters
    ch_last = ch_base * 4

    l_in = l
    with tf.variable_scope('{}.0'.format(name)):
        l = BatchNorm('bn0', l)
        l = tf.nn.relu(l)
        l = (LinearWrap(l)
             .Conv2D('conv1x1_0', ch_base, 1, activation=BNReLU)
             .Conv2D('conv3x3_1', ch_base, 3, strides=strides, activation=BNReLU)
             .Conv2D('conv1x1_2', ch_last, 1)())
        l = BatchNorm('bn_3', l)

        shortcut = l_in
        if ch_in != ch_last:
            shortcut = Conv2D('conv_short', shortcut, ch_last, 1, strides=strides)
            shortcut = BatchNorm('bn_short', shortcut)
        l = l + shortcut
    return l
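# Usage sketch (illustrative, not from the original source): pre-activation
# bottlenecks stack directly; the first block's shortcut is projected by the
# 1x1 conv because ch_in (64) != ch_last (256). The function name is hypothetical.
def _demo_bottleneck_group(l, num_blocks=3):
    for i in range(num_blocks):
        l = residual_bottleneck_layer('res2.{}'.format(i), l, out_filters=64,
                                      strides=1, data_format='channels_last')
    return l  # channels become 64 * 4 = 256 after the first block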
def BNActivation(x, activation_name='relu', name=None):
    """
    A shorthand of BatchNormalization + activation.

    Args:
        x (tf.Tensor): the input
        name: deprecated, don't use.
    """
    if name is not None:
        log_deprecated("BNActivation(name=...)",
                       "The output tensor will be named `output`.")
    x = BatchNorm('bn', x)
    activation_fn = activation_list[activation_name]
    x = activation_fn(x, name=name)
    return x
def initial_convolution(image, init_channel, s_type='basic', name='init_conv0'):
    with tf.variable_scope(name):
        if s_type == 'basic':
            l = Conv2D('conv0', image, init_channel, 3)
        elif s_type == 'imagenet':
            l = (LinearWrap(image)
                 .Conv2D('conv0', init_channel, 7, strides=2, activation=tf.identity)
                 .MaxPooling('pool0', 3, strides=2, padding='same')())
        elif s_type == 'conv7':
            l = Conv2D('conv0_7x7', image, init_channel, 7, strides=2)
        elif s_type == 'conv3':
            l = Conv2D('conv0_3x3', image, init_channel, 3, strides=2)
        else:
            raise Exception("Unknown starting type (s_type): {}".format(s_type))
        l = BatchNorm('init_bn', l)
    return l
def RegionNorm(x, h_group_num, w_group_num, gamma_initializer=tf.constant_initializer(1.)):
    # 1. resize so that h % h_group_num == 0 and w % w_group_num == 0
    orig_shape = x.get_shape().as_list()
    h, w = orig_shape[1], orig_shape[2]
    new_h = get_pad_num(h, h_group_num)
    new_w = get_pad_num(w, w_group_num)
    x_resized = tf.image.resize_images(x, [new_h, new_w], align_corners=False)

    # 2. split into a grid of sub-regions and stack them on a new axis
    assert new_h % h_group_num == 0
    sub_h = new_h // h_group_num
    assert new_w % w_group_num == 0
    sub_w = new_w // w_group_num
    sub_grids = []
    for i in range(0, new_h, sub_h):
        for j in range(0, new_w, sub_w):
            x_sub_grid = x_resized[:, i:i + sub_h, j:j + sub_w, :, None]
            sub_grids.append(x_sub_grid)
    sub_grids = tf.concat(sub_grids, axis=4)
    sub_grids_shape = sub_grids.get_shape().as_list()
    feed2bn = tf.reshape(sub_grids, [-1, sub_grids_shape[1],
                                     sub_grids_shape[2] * sub_grids_shape[3],
                                     sub_grids_shape[4]])

    # 3. normalization: each region index acts as a "channel" of the BN
    bn_output = BatchNorm('bn', feed2bn, axis=3, gamma_initializer=gamma_initializer,
                          internal_update=True, sync_statistics='nccl')

    # 4. go back to the original grid layout
    new_sub_grids = tf.reshape(bn_output, [-1, sub_grids_shape[1], sub_grids_shape[2],
                                           sub_grids_shape[3], sub_grids_shape[4]])
    counter = 0
    new_rows = []
    for i in range(0, new_h, sub_h):
        new_row = []
        for j in range(0, new_w, sub_w):
            new_row.append(new_sub_grids[:, :, :, :, counter])
            counter += 1
        new_row = tf.concat(new_row, axis=2)
        new_rows.append(new_row)
    new_x_resized = tf.concat(new_rows, axis=1)

    # 5. resize back to the input resolution
    new_x = tf.image.resize_images(new_x_resized, [h, w], align_corners=False)
    return new_x
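# Usage sketch for RegionNorm (illustrative): normalize an NHWC feature map over
# a 2x2 grid of spatial regions; the output keeps the input shape. Requires the
# surrounding repo's get_pad_num, and sync_statistics='nccl' is intended for
# multi-GPU data-parallel training.
#
#   y = RegionNorm(x, h_group_num=2, w_group_num=2)   # same shape as x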
@layer_register(log_shape=True)  # assumed: call sites (e.g. get_logits) pass a scope name first (tensorpack layer)
def LinearBottleneck(x, ich, och, kernel, padding='SAME', stride=1,
                     active=None, t=3, use_ab=False, w_init=None):
    ''' MobileNetV2 linear bottleneck: 1x1 expand -> depthwise -> 1x1 project (no activation). '''
    if active is None:
        active = kernel > 3
    out = Conv2D('conv_e', x, int(ich * t), 1, activation=BNReLU)
    if use_ab:
        out = AccuracyBoost('ab', out)
    out = DWConv('conv_d', out, kernel, padding, stride, w_init, active)
    out = Conv2D('conv_p', out, och, 1, activation=None)
    with tf.variable_scope('conv_p'):
        out = BatchNorm('bn', out)
    return out
@layer_register(log_shape=True)  # assumed: call sites pass a scope name first (tensorpack layer)
def DWConv(x, kernel, padding='SAME', stride=1, w_init=None, active=True,
           data_format='NHWC'):
    ''' Depthwise conv + BN + (optional) ReLU. The channel multiplier is fixed to 1. '''
    assert data_format in ('NHWC', 'channels_last')
    channel = x.get_shape().as_list()[-1]
    if not isinstance(kernel, (list, tuple)):
        kernel = [kernel, kernel]
    filter_shape = [kernel[0], kernel[1], channel, 1]

    if w_init is None:
        w_init = tf.variance_scaling_initializer(2.0)
    W = tf.get_variable('W', filter_shape, initializer=w_init)
    # tf.nn.depthwise_conv2d only accepts 'NHWC'/'NCHW', not the keras-style
    # 'channels_last' alias that the assert above also allows.
    out = tf.nn.depthwise_conv2d(x, W, [1, stride, stride, 1],
                                 padding=padding, data_format='NHWC')
    if active is None:
        return out
    out = BNReLU(out) if active else BatchNorm('bn', out)
    return out
@layer_register(log_shape=True)  # assumed by symmetry with LinearBottleneck (tensorpack layer)
def DownsampleBottleneck(x, ich, och, kernel, padding='SAME', stride=2,
                         active=None, t=3, use_ab=False, w_init=None):
    ''' Downsampling linear bottleneck: two parallel depthwise branches are concatenated. '''
    if active is None:
        active = kernel > 3
    out_e = Conv2D('conv_e', x, ich * t, 1, activation=BNReLU)
    if use_ab:
        out_e = AccuracyBoost('ab', out_e)
    out_d = DWConv('conv_d', out_e, kernel, padding, stride, w_init, active)
    out_m = DWConv('conv_m', out_e, kernel, padding, stride, w_init, active)
    out = tf.concat([out_d, out_m], axis=-1)
    out = Conv2D('conv_p', out, och, 1, activation=None)
    with tf.variable_scope('conv_p'):
        out = BatchNorm('bn', out)
    return out
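# Usage sketch (illustrative, mirroring get_logits above): a stride-1 bottleneck
# with matching channels supports a residual add; the downsample variant halves
# the spatial dims. The function name is hypothetical.
def _demo_bottlenecks():
    import tensorflow as tf
    from tensorpack.tfutils.tower import TowerContext

    x = tf.placeholder(tf.float32, [None, 56, 56, 24])
    with TowerContext('', is_training=True):
        y = x + LinearBottleneck('bneck1', x, 24, 24, 5, t=2)      # (?, 56, 56, 24)
        y = DownsampleBottleneck('down1', y, 24, 48, 3, stride=2)  # (?, 28, 28, 48)
    return y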
def feature_to_prediction_and_loss(scope_name, l, label, num_classes,
                                   prediction_feature, ch_dim, label_smoothing=0,
                                   dense_dropout_keep_prob=1.0, is_last=True):
    """
    Given the feature l at scope_name, compute a classifier.
    """
    with tf.variable_scope(scope_name):
        n_dim = len(l.get_shape().as_list())
        if n_dim == 4 and not is_last:
            with tf.variable_scope('aux_preprocess'):
                l = tf.nn.relu(l)
                l = AvgPooling('pool', l, pool_size=5, strides=3, padding='valid')
                l = Conv2D('conv_proj', l, 128, 1, strides=1, activation=BNReLU)
                shape = l.get_shape().as_list()
                if ch_dim != 1:
                    shape = shape[1:3]
                else:
                    shape = shape[2:4]
                l = Conv2D('conv_flat', l, 768, shape, strides=1, padding='valid',
                           activation=BNReLU)
                l = tf.layers.flatten(l)
        else:
            l = BNReLU('bnrelu_pred', l)
            ch_in = _get_dim(l, ch_dim)
            if prediction_feature == '1x1':
                ch_out = ch_in
                if n_dim == 4:
                    l = Conv2D('conv1x1', l, ch_out, 1)
                else:
                    assert n_dim == 2, n_dim
                    l = FullyConnected('fc1x1', l, ch_out, activation=tf.identity)
                l = BNReLU('bnrelu1x1', l)
            elif prediction_feature == 'msdense':
                assert n_dim == 2, n_dim
                ch_inter = ch_in
                l = Conv2D('conv1x1_0', l, ch_inter, 3, strides=2)
                l = BNReLU('bnrelu1x1_0', l)
                l = Conv2D('conv1x1_1', l, ch_inter, 3, strides=2)
                l = BNReLU('bnrelu1x1_1', l)
            elif prediction_feature == 'bn':
                l = BatchNorm('bn', l)
            else:
                # Do nothing to the input feature.
                pass
            if n_dim > 2:
                l = GlobalAvgPooling('gap', l)

        variables = []
        if num_classes > 0:
            if is_last:
                l = Dropout('drop_pre_fc', l, keep_prob=dense_dropout_keep_prob)
            logits = FullyConnected('linear', l, num_classes, activation=tf.identity)
            variables.append(logits.variables.W)
            variables.append(logits.variables.b)
            tf.nn.softmax(logits, name='preds')

            # local cost / error rate
            if label_smoothing > 0:
                one_hot_labels = tf.one_hot(label, num_classes)
                cost = tf.losses.softmax_cross_entropy(
                    onehot_labels=one_hot_labels, logits=logits,
                    label_smoothing=label_smoothing)
            else:
                cost = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=label)
            cost = tf.reduce_mean(cost, name='cross_entropy_loss')
            add_moving_summary(cost)

            def prediction_incorrect(logits, label, topk=1, name='incorrect_vector'):
                return tf.cast(tf.logical_not(tf.nn.in_top_k(logits, label, topk)),
                               tf.float32, name=name)

            wrong = prediction_incorrect(logits, label, 1, name='wrong-top1')
            add_moving_summary(tf.reduce_mean(wrong, name='train_error'))
            wrong5 = prediction_incorrect(logits, label, 5, name='wrong-top5')
            add_moving_summary(tf.reduce_mean(wrong5, name='train-error-top5'))
        else:
            # for regression:
            pred = FullyConnected('linear', l, 1, activation=tf.identity)
            variables.append(pred.variables.W)
            variables.append(pred.variables.b)
            pred = tf.nn.relu(pred)
            tf.identity(pred, name='preds')
            cost = tf.reduce_mean(0.5 * (pred - label) ** 2, name='mean_square_error')
            add_moving_summary(cost)
        return cost, variables
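# Numeric sketch of the label_smoothing branch above (added for illustration):
# tf.losses.softmax_cross_entropy rescales the one-hot target to
# onehot * (1 - eps) + eps / num_classes. The function name is hypothetical.
def _demo_label_smoothing():
    import numpy as np
    eps, num_classes = 0.1, 4
    onehot = np.eye(num_classes)[1]                    # [0, 1, 0, 0]
    smoothed = onehot * (1 - eps) + eps / num_classes  # [0.025, 0.925, 0.025, 0.025]
    return smoothed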
def apply_operation(layer, operation, out_filters, strides, data_format):
    """
    Apply a primitive operation to a layer tensor.

    Args:
        layer (tensor) : tensor of the layer to apply the op to.
        operation (int from LayerTypes) : operation to apply
        out_filters : number of output filters
        strides : strides for the operation, if applicable
        data_format : data format of the input tensor

    Returns:
        a Tensor that is the result of the operation.
    """
    ch_dim = _data_format_to_ch_dim(data_format)
    if operation == LayerTypes.NOT_EXIST:
        return None

    elif operation == LayerTypes.IDENTITY:
        if strides == 1:
            return tf.identity(layer, name='id')
        return _factorized_reduction('id_reduction', layer, out_filters, data_format)

    elif operation == LayerTypes.RESIDUAL_LAYER:
        return residual_layer('res', layer, out_filters, strides, data_format)

    elif operation == LayerTypes.RESIDUAL_BOTTLENECK_LAYER:
        return residual_bottleneck_layer('res_btl', layer, out_filters, strides,
                                         data_format)

    elif operation == LayerTypes.CONV_1:
        layer = tf.nn.relu(layer)
        layer = Conv2D('conv1x1', layer, out_filters, 1, strides=strides)
        layer = BatchNorm('bn', layer)
        return layer

    elif operation == LayerTypes.CONV_3:
        layer = tf.nn.relu(layer)
        layer = Conv2D('conv3x3', layer, out_filters, 3, strides=strides)
        layer = BatchNorm('bn', layer)
        return layer

    elif operation == LayerTypes.SEPARABLE_CONV_3:
        layer = tf.nn.relu(layer)
        layer = Conv2D('conv1x1', layer, out_filters, 1, strides=1, activation=BNReLU)
        layer = SeparableConv2D('sep_conv3x3_1', layer, out_filters, 3, strides=strides)
        layer = BatchNorm('bn', layer)
        return layer

    elif operation == LayerTypes.SEPARABLE_CONV_5:
        layer = tf.nn.relu(layer)
        layer = Conv2D('conv1x1', layer, out_filters, 1, strides=1, activation=BNReLU)
        layer = SeparableConv2D('sep_conv5x5_1', layer, out_filters, 5, strides=strides)
        layer = BatchNorm('bn', layer)
        return layer

    elif operation == LayerTypes.SEPARABLE_CONV_3_2:
        layer = tf.nn.relu(layer)
        layer = SeparableConv2D('sep_conv3x3_1', layer, out_filters, 3,
                                strides=strides, activation=BNReLU)
        layer = SeparableConv2D('sep_conv3x3_2', layer, out_filters, 3, strides=1)
        layer = BatchNorm('bn', layer)
        return layer

    elif operation == LayerTypes.SEPARABLE_CONV_5_2:
        layer = tf.nn.relu(layer)
        layer = SeparableConv2D('sep_conv5x5_1', layer, out_filters, 5,
                                strides=strides, activation=BNReLU)
        layer = SeparableConv2D('sep_conv5x5_2', layer, out_filters, 5, strides=1)
        layer = BatchNorm('bn', layer)
        return layer

    elif operation == LayerTypes.SEPARABLE_CONV_7_2:
        layer = tf.nn.relu(layer)
        layer = SeparableConv2D('sep_conv7x7_1', layer, out_filters, 7,
                                strides=strides, activation=BNReLU)
        layer = SeparableConv2D('sep_conv7x7_2', layer, out_filters, 7, strides=1)
        layer = BatchNorm('bn', layer)
        return layer

    elif operation == LayerTypes.DILATED_CONV_3:
        if strides > 1:
            # Dilated convs do not stride; downsample first.
            layer = tf.nn.relu(layer)
            layer = _factorized_reduction('dil_reduction', layer, out_filters,
                                          data_format)
        layer = tf.nn.relu(layer)
        layer = SeparableConv2D('dil_conv3x3', layer, out_filters, 3,
                                strides=1, dilation_rate=(2, 2))
        layer = BatchNorm('bn', layer)
        return layer

    elif operation == LayerTypes.DILATED_CONV_5:
        if strides > 1:
            layer = tf.nn.relu(layer)
            layer = _factorized_reduction('dil_reduction', layer, out_filters,
                                          data_format)
        layer = tf.nn.relu(layer)
        layer = SeparableConv2D('dil_conv5x5', layer, out_filters, 5,
                                strides=1, dilation_rate=(2, 2))
        layer = BatchNorm('bn', layer)
        return layer

    elif operation == LayerTypes.MAXPOOL_3x3:
        layer = MaxPooling('maxpool', layer, pool_size=3, strides=strides, padding='same')
        ch_in = _get_dim(layer, ch_dim)
        if ch_in != out_filters:
            # Assign the projection back to `layer`; its result was silently
            # dropped in the flattened source.
            layer = projection_layer('proj_maxpool', layer, out_filters, ch_dim)
        return layer

    elif operation == LayerTypes.AVGPOOL_3x3:
        layer = AvgPooling('avgpool', layer, pool_size=3, strides=strides, padding='same')
        ch_in = _get_dim(layer, ch_dim)
        if ch_in != out_filters:
            layer = projection_layer('proj_avgpool', layer, out_filters, ch_dim)
        return layer

    elif operation in [LayerTypes.GATED_LAYER, LayerTypes.ANTI_GATED_LAYER,
                       LayerTypes.NO_FORWARD_LAYER]:
        if strides > 1:
            layer = _factorized_reduction('gate_reduction', layer, out_filters,
                                          data_format)
        # This is important for computing hallucination statistics.
        pre_gate_layer = layer
        if operation == LayerTypes.GATED_LAYER:
            layer = finalized_gated_layer('gated_layer', layer)
        elif operation == LayerTypes.NO_FORWARD_LAYER:
            layer = candidate_gated_layer('gated_layer', layer)
        else:
            layer = finalized_gated_layer('anti_gated_layer', layer, init_val=1.0)
        layer.pre_gate_layer = pre_gate_layer
        return layer

    elif operation == LayerTypes.FullyConnected:
        layer = FullyConnected('fully_connect', layer, out_filters)
        layer = tf.nn.relu(layer, 'relu')
        return layer

    elif operation in [LayerTypes.FC_TANH_MUL_GATE, LayerTypes.FC_SGMD_MUL_GATE,
                       LayerTypes.FC_RELU_MUL_GATE, LayerTypes.FC_IDEN_MUL_GATE]:
        # Highway-style gate: h is the candidate, t is the transform gate.
        ht = FullyConnected('fully_connect', layer, 2 * out_filters,
                            activation=tf.identity, use_bias=False)
        ch_dim = 1
        h, t = tf.split(ht, 2, axis=ch_dim)
        t = tf.sigmoid(t)
        if operation == LayerTypes.FC_TANH_MUL_GATE:
            h = tf.tanh(h)
        elif operation == LayerTypes.FC_SGMD_MUL_GATE:
            h = tf.sigmoid(h)
        elif operation == LayerTypes.FC_RELU_MUL_GATE:
            h = tf.nn.relu(h)
        elif operation == LayerTypes.FC_IDEN_MUL_GATE:
            h = tf.identity(h)
        # add residual
        if _get_dim(layer, ch_dim) != out_filters:
            layer = FullyConnected('proj_prev', layer, out_filters,
                                   activation=tf.identity)
        layer = layer + t * (h - layer)
        return layer

    elif operation == LayerTypes.MLP_RESIDUAL_LAYER:
        raise NotImplementedError("MLP residual layer is not yet implemented")

    else:
        raise NotImplementedError("Have not implemented operation {}".format(operation))
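# Dispatch sketch (illustrative; LayerTypes and the tensors come from the
# surrounding repo): apply_operation is the single entry point used by
# construct_layer below, e.g.
#
#   y = apply_operation(x, LayerTypes.SEPARABLE_CONV_3, out_filters=64,
#                       strides=2, data_format='channels_last')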
def build_graph(self, input_w, input_c, label, x_len, max_len):
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(EMBED_LOC, binary=False)
    word_embed_init = word_embed(self.word_vocab, w2v_model)
    char_embed_init = tf.eye(CHAR_EMBED_DIM, dtype=tf.float32)

    _word_embeddings = tf.get_variable('word_embeddings', initializer=word_embed_init,
                                       trainable=True, regularizer=self.regularizer)
    _char_embeddings = tf.get_variable('char_embeddings', initializer=char_embed_init,
                                       trainable=True, regularizer=self.regularizer)
    n_word = tf.get_variable('negation_word_embedding', initializer=self.n_w,
                             trainable=True, regularizer=self.regularizer)
    i_word = tf.get_variable('intensifier_word_embedding', initializer=self.i_w,
                             trainable=True, regularizer=self.regularizer)
    s_word = tf.get_variable('sentiment_word_embedding', initializer=self.s_w,
                             trainable=True, regularizer=self.regularizer)

    # OOV pad: index 0 maps to an all-zero embedding.
    word_zero_pad = tf.zeros([1, WORD_EMBED_DIM])
    char_zero_pad = tf.zeros([1, CHAR_EMBED_DIM])
    word_embeddings = tf.concat([word_zero_pad, _word_embeddings], axis=0)
    char_embeddings = tf.concat([char_zero_pad, _char_embeddings], axis=0)

    word_embeded = tf.nn.embedding_lookup(word_embeddings, input_w)
    _char_embeded = tf.nn.embedding_lookup(char_embeddings, input_c)
    _char_embeded = tf.reshape(_char_embeded,
                               [BATCH_SIZE, max_len, MAX_WORD_LEN, CHAR_EMBED_DIM])

    negation_c = tf.get_variable('negation_char', initializer=self.n_c,
                                 trainable=False, dtype=tf.int32)
    intensifier_c = tf.get_variable('intensifier_char', initializer=self.i_c,
                                    trainable=False, dtype=tf.int32)
    sentiment_c = tf.get_variable('sentiment_char', initializer=self.s_c,
                                  trainable=False, dtype=tf.int32)
    n_c_embeded = tf.nn.embedding_lookup(char_embeddings, negation_c)
    i_c_embeded = tf.nn.embedding_lookup(char_embeddings, intensifier_c)
    s_c_embeded = tf.nn.embedding_lookup(char_embeddings, sentiment_c)
    n_c_embeded = tf.expand_dims(n_c_embeded, axis=0)
    i_c_embeded = tf.expand_dims(i_c_embeded, axis=0)
    s_c_embeded = tf.expand_dims(s_c_embeded, axis=0)

    # Shared char-level convolutions, reused across the four word groups.
    @auto_reuse_variable_scope
    def conv1(x):
        return Conv2D('conv1x1', x, 1, (1, 1), padding='same')

    @auto_reuse_variable_scope
    def conv2(x):
        return Conv2D('conv2gram', x, 150, (2, 20), strides=(1, 20), padding='same')

    @auto_reuse_variable_scope
    def conv3(x):
        return Conv2D('conv3gram', x, 150, (3, 20), strides=(1, 20), padding='same')

    _char = conv1(_char_embeded)
    _char_2 = conv2(_char)
    _char_3 = conv3(_char)
    char_embeded = tf.concat([_char_2, _char_3], axis=-1)
    char_embeded = tf.squeeze(char_embeded, [2])
    context_word_repre = tf.concat([word_embeded, char_embeded], axis=-1)

    n_c_embeded = conv1(n_c_embeded)
    n_char_2 = conv2(n_c_embeded)
    n_char_3 = conv3(n_c_embeded)
    n_char_embeded = tf.concat([n_char_2, n_char_3], axis=-1)
    n_char_embeded = tf.squeeze(n_char_embeded, [2])
    negation_word_repre = tf.concat([tf.expand_dims(n_word, 0), n_char_embeded], axis=-1)

    i_c_embeded = conv1(i_c_embeded)
    i_char_2 = conv2(i_c_embeded)
    i_char_3 = conv3(i_c_embeded)
    i_char_embeded = tf.concat([i_char_2, i_char_3], axis=-1)
    i_char_embeded = tf.squeeze(i_char_embeded, [2])
    intensifier_word_repre = tf.concat([tf.expand_dims(i_word, 0), i_char_embeded], axis=-1)

    s_c_embeded = conv1(s_c_embeded)
    s_char_2 = conv2(s_c_embeded)
    s_char_3 = conv3(s_c_embeded)
    s_char_embeded = tf.concat([s_char_2, s_char_3], axis=-1)
    s_char_embeded = tf.squeeze(s_char_embeded, [2])
    sentiment_word_repre = tf.concat([tf.expand_dims(s_word, 0), s_char_embeded], axis=-1)

    # BatchNorm over the feature axis; a trailing singleton dim is added for the
    # norm and removed afterwards.
    context_word_repre = tf.expand_dims(context_word_repre, -1)
    negation_word_repre = tf.expand_dims(negation_word_repre, -1)
    intensifier_word_repre = tf.expand_dims(intensifier_word_repre, -1)
    sentiment_word_repre = tf.expand_dims(sentiment_word_repre, -1)
    context_word_repre = BatchNorm('norm_c', context_word_repre, axis=3)
    negation_word_repre = BatchNorm('norm_n', negation_word_repre, axis=3)
    intensifier_word_repre = BatchNorm('norm_i', intensifier_word_repre, axis=3)
    sentiment_word_repre = BatchNorm('norm_s', sentiment_word_repre, axis=3)
    negation_word_repre = tf.squeeze(
        tf.tile(negation_word_repre, [BATCH_SIZE, 1, 1, 1]), [3])
    intensifier_word_repre = tf.squeeze(
        tf.tile(intensifier_word_repre, [BATCH_SIZE, 1, 1, 1]), [3])
    sentiment_word_repre = tf.squeeze(
        tf.tile(sentiment_word_repre, [BATCH_SIZE, 1, 1, 1]), [3])
    context_word_repre = tf.squeeze(context_word_repre, [3])

    # Attention matrices between the context and each lexicon group.
    m_s = tf.matmul(context_word_repre, sentiment_word_repre, transpose_b=True)
    m_i = tf.matmul(context_word_repre, intensifier_word_repre, transpose_b=True)
    m_n = tf.matmul(context_word_repre, negation_word_repre, transpose_b=True)
    x_s = tf.matmul(context_word_repre, m_s, transpose_a=True)
    x_i = tf.matmul(context_word_repre, m_i, transpose_a=True)
    x_n = tf.matmul(context_word_repre, m_n, transpose_a=True)
    x_c_s = tf.matmul(sentiment_word_repre, m_s, transpose_a=True, transpose_b=True)
    x_c_i = tf.matmul(intensifier_word_repre, m_i, transpose_a=True, transpose_b=True)
    x_c_n = tf.matmul(negation_word_repre, m_n, transpose_a=True, transpose_b=True)
    x_c = x_c_s + x_c_i + x_c_n

    x_c = tf.reshape(x_c, [BATCH_SIZE, -1, 600])
    x_s = tf.reshape(x_s, [BATCH_SIZE, -1, 600])
    x_i = tf.reshape(x_i, [BATCH_SIZE, -1, 600])
    x_n = tf.reshape(x_n, [BATCH_SIZE, -1, 600])

    c_cell = tf.contrib.rnn.DropoutWrapper(
        tf.nn.rnn_cell.GRUCell(RNN_DIM, name='c_GRU'), output_keep_prob=0.5)
    s_cell = tf.contrib.rnn.DropoutWrapper(
        tf.nn.rnn_cell.GRUCell(RNN_DIM, name='s_GRU'), output_keep_prob=0.5)
    i_cell = tf.contrib.rnn.DropoutWrapper(
        tf.nn.rnn_cell.GRUCell(RNN_DIM, name='i_GRU'), output_keep_prob=0.5)
    n_cell = tf.contrib.rnn.DropoutWrapper(
        tf.nn.rnn_cell.GRUCell(RNN_DIM, name='n_GRU'), output_keep_prob=0.5)
    h_c, _ = tf.nn.dynamic_rnn(c_cell, x_c, sequence_length=x_len, dtype=tf.float32)
    h_s, _ = tf.nn.dynamic_rnn(s_cell, x_s, sequence_length=x_len, dtype=tf.float32)
    h_i, _ = tf.nn.dynamic_rnn(i_cell, x_i, sequence_length=x_len, dtype=tf.float32)
    h_n, _ = tf.nn.dynamic_rnn(n_cell, x_n, sequence_length=x_len, dtype=tf.float32)

    q_s = tf.reduce_mean(h_s, 1, keepdims=True)
    q_i = tf.reduce_mean(h_i, 1, keepdims=True)
    q_n = tf.reduce_mean(h_n, 1, keepdims=True)

    sentiment_att_weights = tf.nn.softmax(tf.reshape(
        tf.matmul(tf.tanh(h_c), tf.reshape(q_s, [BATCH_SIZE, RNN_DIM, 1])),
        [BATCH_SIZE, max_len]))
    o_1 = tf.reshape(
        tf.matmul(tf.reshape(sentiment_att_weights, [BATCH_SIZE, 1, max_len]), h_c),
        [BATCH_SIZE, RNN_DIM])
    intensifier_att_weights = tf.nn.softmax(tf.reshape(
        tf.matmul(tf.tanh(h_c), tf.reshape(q_i, [BATCH_SIZE, RNN_DIM, 1])),
        [BATCH_SIZE, max_len]))
    o_2 = tf.reshape(
        tf.matmul(tf.reshape(intensifier_att_weights, [BATCH_SIZE, 1, max_len]), h_c),
        [BATCH_SIZE, RNN_DIM])
    negation_att_weights = tf.nn.softmax(tf.reshape(
        tf.matmul(tf.tanh(h_c), tf.reshape(q_n, [BATCH_SIZE, RNN_DIM, 1])),
        [BATCH_SIZE, max_len]))
    o_3 = tf.reshape(
        tf.matmul(tf.reshape(negation_att_weights, [BATCH_SIZE, 1, max_len]), h_c),
        [BATCH_SIZE, RNN_DIM])

    output = tf.concat([o_1, o_2, o_3], axis=-1)
    w = tf.get_variable('w', [3 * RNN_DIM, 2],
                        initializer=tf.contrib.layers.xavier_initializer())
    b = tf.get_variable('b', initializer=np.zeros([2]).astype(np.float32))
    p_pred = tf.nn.xw_plus_b(output, w, b)
    p_pred = Dropout(p_pred, keep_prob=0.5)

    # softmax_cross_entropy_with_logits_v2 expects float labels.
    labels = tf.one_hot(label, 2, axis=-1, dtype=tf.float32, name='onehot_label')
    supervised_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=p_pred, labels=labels))
    l2_loss = tf.contrib.layers.apply_regularization(
        self.regularizer, tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    # psi and mu are module-level hyper-parameters.
    diversity_loss = tf.norm(
        tf.matmul(output, output, transpose_b=True) - psi * tf.eye(BATCH_SIZE),
        ord='fro', axis=[-2, -1])
    loss = supervised_loss + l2_loss + mu * diversity_loss
    loss = tf.identity(loss, "total_loss")

    y_pred = tf.argmax(tf.nn.softmax(p_pred), axis=1, output_type=tf.int32)
    label_ = tf.reshape(label, [BATCH_SIZE])
    accuracy_ = tf.cast(tf.equal(y_pred, label_), tf.float32, name='accu')
    mean_accuracy = tf.reduce_mean(accuracy_)
    summary.add_moving_summary(loss, mean_accuracy)
    return loss
def construct_layer(
        name, layer_dict, layer_info, out_filters, strides, data_format,
        stop_gradient=None, op_to_cell=None, drop_path_func=None,
        non_input_layer_idx=None, hid_to_fs_params=None, l_hallu_costs=None,
        bn_after_merge=False):
    """
    Args:
        name (str) : name of this layer to construct
        layer_dict (dict) : a map from layer id to layer tensors.
        layer_info (LayerInfo) : the layer that we are to construct
        out_filters : output number of filters.
        strides : whether to take strides in the operations
        data_format : 'channels_first' or 'channels_last'
        stop_gradient : whether to stop gradient on inputs
        op_to_cell : a dict from op name (cell name) to the cell object.
        drop_path_func : a function object for computing drop path
        non_input_layer_idx : index for computing drop path in cell-based search
        hid_to_fs_params : a map from hallu id to tuples of params for feature selection
        l_hallu_costs : a list to update, in order to contain the costs incurred by hallu.
        bn_after_merge (bool) : whether to batch normalize after the merge op.
            Usually cnn uses False, and rnn uses True.

    Side effects on inputs:
        1. Update l_hallu_costs, if the constructed layer triggers hallu costs.
        2. Other inputs are unaffected.

    Returns:
        a tensor that represents the layer
    """
    if op_to_cell is not None:
        # A cell network has merge_op set to the cell name (str).
        cell = op_to_cell.get(layer_info.merge_op, None)
        if cell:
            inputs = [layer_dict[in_id] for in_id in layer_info.inputs]
            layer = cell(inputs, out_filters, strides, non_input_layer_idx, name)
            return layer

    with tf.variable_scope(name):
        ch_dim = _data_format_to_ch_dim(data_format)
        n_inputs = len(layer_info.inputs)
        new_layer = []

        # strides may be one value per input.
        if isinstance(strides, list):
            if len(strides) != n_inputs:
                raise ValueError("Confusing strides at info {}".format(layer_info))
            l_strides = strides
        else:
            l_strides = [strides] * n_inputs

        # stop_gradient may be one value per input.
        if not isinstance(stop_gradient, list):
            l_to_stop = [stop_gradient] * n_inputs
        else:
            l_to_stop = stop_gradient

        # operation names for children ops
        ops_ids = None
        if layer_info.extra_dict is not None:
            ops_ids = layer_info.extra_dict.get('ops_ids', None)
        ops_ids = ops_ids if ops_ids else list(range(n_inputs))

        for input_i, layer_id, operation, strides, to_stop in zip(
                ops_ids, layer_info.inputs, layer_info.input_ops, l_strides, l_to_stop):
            layer = layer_dict[layer_id]
            scope = "op_{}".format(input_i)
            with tf.variable_scope(scope):
                if to_stop == LayerTypes.STOP_GRADIENT_HARD:
                    layer = tf.stop_gradient(layer)
                elif to_stop == LayerTypes.STOP_GRADIENT_SOFT:
                    layer = finalized_gated_layer('soft_stop', layer)
                elif to_stop != LayerTypes.STOP_GRADIENT_NONE:
                    raise ValueError(
                        "Unknown stop_gradient value from info {}".format(layer_info))
                layer = apply_operation(
                    layer, operation, out_filters, strides, data_format)
                if LayerTypes.do_drop_path(operation) and drop_path_func:
                    layer = drop_path_func(layer, non_input_layer_idx)
            new_layer.append(layer)

        if len(new_layer) == 1:
            layer = new_layer[0]
        elif len(new_layer) > 1:
            LT = LayerTypes
            merge_op = layer_info.merge_op
            if merge_op in [LT.MERGE_WITH_CAT_PROJ, LT.MERGE_WITH_CAT]:
                layer = tf.concat(new_layer, axis=ch_dim, name='cat_feats')
                if bn_after_merge:
                    layer = BatchNorm('bn_after_merge', layer)
                if merge_op == LT.MERGE_WITH_CAT_PROJ:
                    layer = projection_layer('post_cat', layer, out_filters, ch_dim)
            else:
                # The merge op is not concat-like, so first make sure all inputs
                # have the same number of channels.
                ch_ref = 0
                for layer in new_layer:
                    ch_ref = max(_get_dim(layer, ch_dim), ch_ref)
                # The following block projects the new layers to the same channel
                # count (ch_ref) so they can be added/multiplied together.
                # However, if a new layer is a gated layer, we project the input
                # of the gate instead: the sum tensor and its gated inputs must
                # have the same shape for gradient/tensor comparison, and the
                # gate must sit right before the sum for that comparison to be
                # non-trivial.
                for li, layer in enumerate(new_layer):
                    if _get_dim(layer, ch_dim) != ch_ref:
                        if hasattr(layer, 'pre_gate_layer'):
                            pre_gate_layer = layer.pre_gate_layer
                            layer = projection_layer(
                                'pre_sum_{}'.format(li), pre_gate_layer, ch_ref, ch_dim)
                            with tf.variable_scope('pre_sum_gate_{}'.format(li)):
                                new_layer[li] = apply_operation(
                                    layer, layer_info.input_ops[li], ch_ref, 1,
                                    data_format)
                                # Since this apply_operation has no parameters,
                                # it does not require drop path.
                        else:
                            new_layer[li] = projection_layer(
                                'pre_sum_{}'.format(li), layer, ch_ref, ch_dim)

                if merge_op in [LT.MERGE_WITH_SUM, LT.MERGE_WITH_AVG]:
                    layer = tf.add_n(new_layer, name='sum_feats')
                    if merge_op == LT.MERGE_WITH_AVG:
                        layer = tf.div(layer, np.float32(len(new_layer)),
                                       name='avg_feats')
                elif merge_op == LT.MERGE_WITH_WEIGHTED_SUM:
                    if layer_info.is_candidate:
                        omega, l1_lambda = hid_to_fs_params[layer_info.is_candidate]
                        layer = feature_selection_layer(
                            'feature_select', new_layer, omega, l1_lambda,
                            l_hallu_costs)
                    else:
                        if layer_info.extra_dict is None:
                            fs_omega = None
                        else:
                            fs_omega = layer_info.extra_dict.get('fs_omega', None)
                        layer = weighted_sum_layer(
                            'weighted_sum', new_layer, fs_omega=fs_omega)
                elif merge_op == LT.MERGE_WITH_MUL:
                    layer = new_layer[0]
                    for idx, layer2 in enumerate(new_layer[1:]):
                        layer = tf.multiply(layer, layer2, name='mul_{}'.format(idx))
                elif merge_op == LT.MERGE_WITH_SOFTMAX:
                    # Each input votes for itself with a scalar logit; the merged
                    # output is the softmax-weighted sum of the inputs.
                    logits = []
                    for li, layer in enumerate(new_layer):
                        with tf.variable_scope('path_choice_{}'.format(li)):
                            n_dim = len(layer.get_shape().as_list())
                            if n_dim > 2:
                                layer = GlobalAvgPooling('softmax_gap', layer)  # batch x ch_ref
                            logit = FullyConnected(
                                'softmax_linear', layer, 1, activation=tf.identity)
                            logit = tf.reshape(logit, [-1])  # batch
                            logits.append(logit)
                    logits = tf.stack(logits, axis=1)  # batch x len(new_layer)
                    probs = tf.nn.softmax(logits, axis=1)
                    for li, layer in enumerate(new_layer):
                        new_layer[li] = probs[:, li] * layer
                    layer = tf.add_n(new_layer, name='sum_feats')
                else:
                    raise ValueError(
                        "Unknown merge operation in info {}".format(layer_info))
                # batch normalization for all non-concat-based merges.
                if bn_after_merge:
                    layer = BatchNorm('bn_after_merge', layer)
            # end else for concat vs non-concat merges.
        else:
            raise ValueError("Layer {} has empty input edges. The info: {}".format(
                name, layer_info))
        return layer
def parse(l, ch_out, stride, channel_wise=True):
    # Conv is assumed to be a local alias (e.g. of Conv2D) in the source repo.
    shape = l.get_shape().as_list()
    l_ori = l
    # Iteratively peel off soft-attention slices of the feature map.
    parse1 = tf.nn.sigmoid(Conv('parse1', l, 1, 1, strides=1))
    l1_left = l * parse1
    l = l - l1_left
    parse2 = tf.nn.sigmoid(Conv('parse2', l, 1, 1, strides=1))
    l2_left = l * parse2
    l = l - l2_left
    parse3 = tf.nn.sigmoid(Conv('parse3', l, 1, 1, strides=1))
    l3_left = l * parse3
    l3_right = l - l3_left

    # Each slice is pooled at a different scale, convolved, then resized back.
    l1_left = tf.keras.backend.resize_images(
        Conv2D('l1_left',
               AvgPooling('pool', l1_left, pool_size=8, strides=8, padding='VALID'),
               1 * ch_out // 4, 3 if shape[1] // 8 > 2 else 1, strides=1),
        8 // stride, 8 // stride, 'channels_last')
    l2_left = tf.keras.backend.resize_images(
        Conv2D('l2_left',
               AvgPooling('pool', l2_left, pool_size=4, strides=4, padding='VALID'),
               1 * ch_out // 4, 3 if shape[1] // 4 > 2 else 1, strides=1),
        4 // stride, 4 // stride, 'channels_last')
    l3_left = tf.keras.backend.resize_images(
        Conv2D('l3_left',
               AvgPooling('pool', l3_left, pool_size=2, strides=2, padding='VALID'),
               1 * ch_out // 4, 3 if shape[1] // 2 > 2 else 1, strides=1),
        2 // stride, 2 // stride, 'channels_last')
    l3_right = Conv2D('l3_right', l3_right, 1 * ch_out // 4,
                      3 if shape[1] > 2 else 1, strides=stride)
    l_ori = Conv2D('l_ori', l_ori, ch_out // 4, 3, strides=stride, activation=BNReLU)

    l = tf.concat([
        tf.nn.sigmoid(BatchNorm('bn1', l1_left)) * l_ori,
        tf.nn.sigmoid(BatchNorm('bn2', l2_left)) * l_ori,
        tf.nn.sigmoid(BatchNorm('bn3', l3_left)) * l_ori,
        tf.nn.sigmoid(BatchNorm('bn4', l3_right)) * l_ori,
    ], -1)
    return l
def BNOnly(x):
    return BatchNorm('bn', x)
def norm_func(self, x, name, gamma_initializer=tf.constant_initializer(1.)):
    return BatchNorm(name + '_bn', x, gamma_initializer=gamma_initializer)