Example #1
    def train_mode():
        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(x,
                                                           gamma,
                                                           beta,
                                                           epsilon=eps,
                                                           is_training=True)
        moving_sigma = tf.sqrt(moving_var, 'sigma')
        r = tf.stop_gradient(
            tf.clip_by_value(tf.sqrt(batch_var / moving_var), 1.0 / rmax,
                             rmax))
        d = tf.stop_gradient(
            tf.clip_by_value((batch_mean - moving_mean) / moving_sigma, -dmax,
                             dmax))
        xn = xn * r + d

        #update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay)
        update_op1 = moving_averages.assign_moving_average(moving_mean,
                                                           batch_mean,
                                                           decay,
                                                           zero_debias=False,
                                                           name='mean_ema_op')
        update_op2 = moving_averages.assign_moving_average(moving_var,
                                                           batch_var,
                                                           decay,
                                                           zero_debias=False,
                                                           name='var_ema_op')
        add_model_variable(moving_mean)
        add_model_variable(moving_var)
        with tf.control_dependencies([update_op1, update_op2]):
            return tf.identity(xn, name='bn')
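
For reference, the correction applied above is the Batch Renormalization rule: r = clip(sqrt(batch_var / moving_var), 1/rmax, rmax) and d = clip((batch_mean - moving_mean) / moving_sigma, -dmax, dmax), both treated as constants via stop_gradient. A small NumPy sketch of the same computation (all values are made up):

import numpy as np

rmax, dmax = 3.0, 5.0
batch_mean, batch_var = 0.2, 1.5       # statistics of the current batch
moving_mean, moving_var = 0.0, 1.0     # EMA statistics
moving_sigma = np.sqrt(moving_var)
r = np.clip(np.sqrt(batch_var / moving_var), 1.0 / rmax, rmax)
d = np.clip((batch_mean - moving_mean) / moving_sigma, -dmax, dmax)
# the normalized output would then be rescaled as xn * r + d, with r and d held constant
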
Example #2
def freeze_affine_getter(getter, *args, **kwargs):
    # custom getter to freeze affine params inside bn
    name = args[0] if len(args) else kwargs.get('name')
    if name.endswith('/gamma') or name.endswith('/beta'):
        kwargs['trainable'] = False
        ret = getter(*args, **kwargs)
        add_model_variable(ret)
    else:
        ret = getter(*args, **kwargs)
    return ret
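
A custom getter like this is usually attached through tf.variable_scope, which propagates it to every variable created inside. A minimal sketch, assuming TF1 graph mode and that freeze_affine_getter above is in scope (the scope names and shape are illustrative):

import tensorflow as tf

with tf.variable_scope('frozen_block', custom_getter=freeze_affine_getter):
    with tf.variable_scope('bn'):
        # both names end with '/gamma' or '/beta', so the getter creates them with
        # trainable=False and registers them as model variables
        gamma = tf.get_variable('gamma', [64], initializer=tf.ones_initializer())
        beta = tf.get_variable('beta', [64], initializer=tf.zeros_initializer())
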
Example #3
def custom_getter(getter, *args, **kwargs):
    trainable = kwargs.get('trainable', True)
    name = args[0] if len(args) else kwargs.get('name')
    if skip_collection:
        kwargs['trainable'] = False
    v = getter(*args, **kwargs)
    if skip_collection:
        add_model_variable(v)
    if trainable and stop_gradient:
        v = tf.stop_gradient(v, name='freezed_' + name)
    return v
Example #4
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay):
    # TODO is there a way to use zero_debias in multi-GPU?
    update_op1 = moving_averages.assign_moving_average(
        moving_mean, batch_mean, decay, zero_debias=False,
        name='mean_ema_op')
    update_op2 = moving_averages.assign_moving_average(
        moving_var, batch_var, decay, zero_debias=False,
        name='var_ema_op')
    add_model_variable(moving_mean)
    add_model_variable(moving_var)
    with tf.control_dependencies([update_op1, update_op2]):
        return tf.identity(xn, name='output')
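
With zero_debias=False, assign_moving_average reduces to the plain exponential update moving = decay * moving + (1 - decay) * batch. A one-line NumPy equivalent (names are illustrative):

import numpy as np

decay = 0.9
moving_mean, batch_mean = np.zeros(64), np.random.randn(64)
moving_mean = decay * moving_mean + (1.0 - decay) * batch_mean  # same update as the op above
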
Example #5
        def update_bn_ema(output, batch_mean, batch_var, moving_mean, moving_var, decay):
            from tensorflow.contrib.framework import add_model_variable
            from tensorflow.python.training import moving_averages

            update_op1 = moving_averages.assign_moving_average(moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op')
            update_op2 = moving_averages.assign_moving_average(moving_var, batch_var, decay, zero_debias=False, name='var_ema_op')
            add_model_variable(moving_mean)
            add_model_variable(moving_var)

            # seems faster than delayed update, but might behave otherwise in distributed settings.
            tf.compat.v1.add_to_collections(tf.compat.v1.GraphKeys.UPDATE_OPS, update_op1)
            tf.compat.v1.add_to_collections(tf.compat.v1.GraphKeys.UPDATE_OPS, update_op2)
            return tf.identity(output)
Example #6
        def _update_bn_ema(_xn, _batch_mean, _batch_var, _moving_mean, _moving_var, _decay):

            _update_op1 = moving_averages.assign_moving_average(
                _moving_mean, _batch_mean, _decay, zero_debias=False,
                name='mean_ema_op')
            _update_op2 = moving_averages.assign_moving_average(
                _moving_var, _batch_var, _decay, zero_debias=False,
                name='var_ema_op')
            add_model_variable(_moving_mean)
            add_model_variable(_moving_var)

            # seems faster than delayed update, but might behave otherwise in distributed settings.
            with tf.control_dependencies([_update_op1, _update_op2]):
                return tf.identity(_xn, name='output')
Example #7
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay):
    # TODO is there a way to use zero_debias in multi-GPU?
    update_op1 = moving_averages.assign_moving_average(
        moving_mean, batch_mean, decay, zero_debias=False,
        name='mean_ema_op')
    update_op2 = moving_averages.assign_moving_average(
        moving_var, batch_var, decay, zero_debias=False,
        name='var_ema_op')
    add_model_variable(moving_mean)
    add_model_variable(moving_var)

    # seems faster than delayed update, but might behave otherwise in distributed settings.
    with tf.control_dependencies([update_op1, update_op2]):
        return tf.identity(xn, name='output')
Example #8
    def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay):
        from tensorflow.contrib.framework import add_model_variable
        # TODO is there a way to use zero_debias in multi-GPU?
        update_op1 = moving_averages.assign_moving_average(
            moving_mean, batch_mean, decay, zero_debias=False,
            name='mean_ema_op')
        update_op2 = moving_averages.assign_moving_average(
            moving_var, batch_var, decay, zero_debias=False,
            name='var_ema_op')
        add_model_variable(moving_mean)
        add_model_variable(moving_var)

        # seems faster than delayed update, but might behave otherwise in distributed settings.
        with tf.control_dependencies([update_op1, update_op2]):
            return tf.identity(xn, name='output')
Example #9
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay):
    # TODO is there a way to use zero_debias in multi-GPU?
    update_op1 = moving_averages.assign_moving_average(
        moving_mean, batch_mean, decay, zero_debias=False,
        name='mean_ema_op')
    update_op2 = moving_averages.assign_moving_average(
        moving_var, batch_var, decay, zero_debias=False,
        name='var_ema_op')
    # Only add to model var when we update them
    add_model_variable(moving_mean)
    add_model_variable(moving_var)

    # TODO add an option, and maybe enable it for replica mode?
    # with tf.control_dependencies([update_op1, update_op2]):
    # return tf.identity(xn, name='output')
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2)
    return xn
Example #10
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay):
    # TODO is there a way to use zero_debias in multi-GPU?
    update_op1 = moving_averages.assign_moving_average(
        moving_mean, batch_mean, decay, zero_debias=False,
        name='mean_ema_op')
    update_op2 = moving_averages.assign_moving_average(
        moving_var, batch_var, decay, zero_debias=False,
        name='var_ema_op')
    add_model_variable(moving_mean)
    add_model_variable(moving_var)

    # seems faster than delayed update, but might behave otherwise in distributed settings.
    # TODO add an option, and maybe enable it for replica mode?
    # with tf.control_dependencies([update_op1, update_op2]):
    # return tf.identity(xn, name='output')
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2)
    return xn
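
When the EMA ops are only added to tf.GraphKeys.UPDATE_OPS as above, the training step must run them explicitly. A minimal sketch of the usual TF1 pattern; the placeholder model and loss are purely illustrative:

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 10])
w = tf.get_variable('w', [10, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))        # stand-in loss

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # includes the EMA ops collected above
with tf.control_dependencies(update_ops):
    train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
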
Example #11
    def update_bn_ema(self, xn, batch_mean, batch_var, moving_mean, moving_var,
                      decay):
        update_op1 = moving_averages.assign_moving_average(moving_mean,
                                                           batch_mean,
                                                           decay,
                                                           zero_debias=False,
                                                           name='mean_ema_op')
        update_op2 = moving_averages.assign_moving_average(moving_var,
                                                           batch_var,
                                                           decay,
                                                           zero_debias=False,
                                                           name='var_ema_op')
        # Only add to model var when we update them
        add_model_variable(moving_mean)
        add_model_variable(moving_var)

        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2)
        return xn
Example #12
def update_ema(xn, moving_max, moving_min, decay):
    batch_max = tf.reduce_max(xn, axis=[0, 1, 2])
    batch_min = tf.reduce_min(xn, axis=[0, 1, 2])
    update_op1 = moving_averages.assign_moving_average(moving_max,
                                                       batch_max,
                                                       decay,
                                                       zero_debias=False,
                                                       name='max_ema_op')
    update_op2 = moving_averages.assign_moving_average(moving_min,
                                                       batch_min,
                                                       decay,
                                                       zero_debias=False,
                                                       name='min_ema_op')
    # Only add to model var when we update them
    add_model_variable(moving_min)
    add_model_variable(moving_max)

    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2)

    return xn
Example #13
def QuantizedWeight(name, x, n, nbit=2):
    """
    Quantize weight.
    Args:
        x (tf.Tensor): a 4D tensor.
            Must have known number of channels, but can have other unknown dimensions.
        name (str): operator's name.
        n (int or float): variance of weight initialization.
        nbit (int): number of bits of quantized weight. Defaults to 2.
    Returns:
        tf.Tensor with attribute `variables`.
    Variable Names:
    * ``basis``: basis of quantized weight.
    Note:
        About multi-GPU training: moving averages across GPUs are not aggregated.
        Batch statistics are computed by main training tower. This is consistent with most frameworks.
    """
    num_filters = x.get_shape().as_list()[-1]

    init_basis = []
    base = NORM_PPF_0_75 * ((2. / n)**0.5) / (2**(nbit - 1))
    for j in range(nbit):
        init_basis.append([(2**j) * base for i in range(num_filters)])
    init_basis = tf.constant_initializer(init_basis)

    bit_dims = [nbit, num_filters]
    num_levels = 2**nbit
    delta = EPS

    # initialize level multiplier
    init_level_multiplier = []
    for i in range(num_levels):
        level_multiplier_i = [0. for j in range(nbit)]
        level_number = i
        for j in range(nbit):
            binary_code = level_number % 2
            if binary_code == 0:
                binary_code = -1
            level_multiplier_i[j] = float(binary_code)
            level_number = level_number // 2
        init_level_multiplier.append(level_multiplier_i)

    # initialize threshold multiplier
    init_thrs_multiplier = []
    for i in range(1, num_levels):
        thrs_multiplier_i = [0. for j in range(num_levels)]
        thrs_multiplier_i[i - 1] = 0.5
        thrs_multiplier_i[i] = 0.5
        init_thrs_multiplier.append(thrs_multiplier_i)

    with tf.variable_scope(name):
        basis = tf.get_variable('basis',
                                bit_dims,
                                tf.float32,
                                initializer=init_basis,
                                trainable=False)
        level_codes = tf.constant(init_level_multiplier)
        thrs_multiplier = tf.constant(
            init_thrs_multiplier
        )  # ValueError: Cannot create a tensor proto whose content is larger than 2GB.
        sum_multiplier = tf.constant(
            1., shape=[1, tf.reshape(x, [-1, num_filters]).get_shape()[0]])
        sum_multiplier_basis = tf.constant(1., shape=[1, nbit])

        # calculate levels and sort
        levels = tf.matmul(level_codes, basis)
        levels, sort_id = tf.nn.top_k(tf.transpose(levels, [1, 0]), num_levels)
        levels = tf.reverse(levels, [-1])
        sort_id = tf.reverse(sort_id, [-1])
        levels = tf.transpose(levels, [1, 0])
        sort_id = tf.transpose(sort_id, [1, 0])

        # calculate threshold
        thrs = tf.matmul(thrs_multiplier, levels)

        # calculate level codes per channel
        reshape_x = tf.reshape(x, [-1, num_filters])
        level_codes_channelwise_dims = tf.stack(
            [num_levels * num_filters, nbit])
        level_codes_channelwise = tf.fill(level_codes_channelwise_dims, 0.)
        for i in range(num_levels):
            eq = tf.equal(sort_id, i)
            level_codes_channelwise = tf.where(
                tf.reshape(eq, [-1]), level_codes_channelwise + level_codes[i],
                level_codes_channelwise)
        level_codes_channelwise = tf.reshape(level_codes_channelwise,
                                             [num_levels, num_filters, nbit])

        # calculate output y and its binary code
        y = tf.zeros_like(x) + levels[0]  # output
        zero_dims = tf.stack([tf.shape(reshape_x)[0] * num_filters, nbit])
        bits_y = tf.fill(zero_dims, -1.)
        zero_y = tf.zeros_like(x)
        zero_bits_y = tf.fill(zero_dims, 0.)
        zero_bits_y = tf.reshape(zero_bits_y, [-1, num_filters, nbit])
        for i in range(num_levels - 1):
            g = tf.greater(x, thrs[i])
            y = tf.where(g, zero_y + levels[i + 1], y)
            bits_y = tf.where(
                tf.reshape(g, [-1]),
                tf.reshape(zero_bits_y + level_codes_channelwise[i + 1],
                           [-1, nbit]), bits_y)
        bits_y = tf.reshape(bits_y, [-1, num_filters, nbit])

        ctx = get_current_tower_context()  # current tower context
        # training
        if ctx.is_main_training_tower:
            BT = tf.transpose(bits_y, [2, 0, 1])
            # calculate BTxB
            BTxB = []
            for i in range(nbit):
                for j in range(nbit):
                    BTxBij = tf.multiply(BT[i], BT[j])
                    BTxBij = tf.matmul(sum_multiplier, BTxBij)
                    if i == j:
                        mat_one = tf.ones([1, num_filters])
                        BTxBij = BTxBij + (delta * mat_one)  # + E
                    BTxB.append(BTxBij)
            BTxB = tf.reshape(tf.stack(values=BTxB), [nbit, nbit, num_filters])

            # calculate inverse of BTxB
            if nbit > 2:
                BTxB_transpose = tf.transpose(BTxB, [2, 0, 1])
                # 1) naive
                # BTxB_inv = tf.matrix_inverse(BTxB_transpose)
                # 2) try, except
                try:
                    BTxB_inv = tf.matrix_inverse(BTxB_transpose,
                                                 adjoint=None,
                                                 name=None)
                except Exception:
                    # add a small diagonal damping term so the matrix is invertible
                    BTxB_ttt = BTxB_transpose + 1e-6 * tf.eye(
                        nbit, batch_shape=[num_filters])
                    BTxB_inv = tf.matrix_inverse(BTxB_ttt,
                                                 adjoint=None,
                                                 name=None)
                BTxB_inv = tf.transpose(BTxB_inv, [1, 2, 0])
            elif nbit == 2:
                det = tf.multiply(BTxB[0][0], BTxB[1][1]) - tf.multiply(
                    BTxB[0][1], BTxB[1][0])
                inv = []
                inv.append(BTxB[1][1] / det)
                inv.append(-BTxB[0][1] / det)
                inv.append(-BTxB[1][0] / det)
                inv.append(BTxB[0][0] / det)
                BTxB_inv = tf.reshape(tf.stack(values=inv),
                                      [nbit, nbit, num_filters])
            elif nbit == 1:
                BTxB_inv = tf.reciprocal(BTxB)

            # calculate BTxX
            BTxX = []
            for i in range(nbit):
                BTxXi0 = tf.multiply(BT[i], reshape_x)
                BTxXi0 = tf.matmul(sum_multiplier, BTxXi0)
                BTxX.append(BTxXi0)
            BTxX = tf.reshape(tf.stack(values=BTxX), [nbit, num_filters])
            BTxX = BTxX + (delta * basis)  # + basis

            # calculate new basis
            new_basis = []
            for i in range(nbit):
                new_basis_i = tf.multiply(BTxB_inv[i], BTxX)
                new_basis_i = tf.matmul(sum_multiplier_basis, new_basis_i)
                add_moving_summary(
                    tf.reduce_mean(new_basis_i, name='new_basis_bit' + str(i)))
                new_basis.append(new_basis_i)
            new_basis = tf.reshape(tf.stack(values=new_basis),
                                   [nbit, num_filters])

            # create moving averages op
            update_moving_basis = moving_averages.assign_moving_average(
                basis, new_basis, MOVING_AVERAGES_FACTOR)
            add_model_variable(basis)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_moving_basis)

            # add_moving_summary(tf.identity(basis, name='basis'), tf.identity(new_basis, name='basis_new'))
            # add_moving_summary(tf.identity(basis, name='basis'))

        y = x + tf.stop_gradient(-x) + tf.stop_gradient(y)  # gradient: y=x
        y.variables = VariableHolder(basis=basis)
        return y
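
The main-tower branch above solves a damped per-channel least-squares problem for the basis: new_basis = (B^T B + delta*I)^-1 (B^T x + delta*basis). A small NumPy sketch of the same update for a single output channel (shapes and names are illustrative):

import numpy as np

nbit, n = 2, 128                                   # bits per weight, weights in one channel
x = np.random.randn(n)                             # full-precision weights of that channel
B = np.random.choice([-1.0, 1.0], size=(n, nbit))  # current +/-1 bit codes
delta = 1e-4                                       # damping term, playing the role of EPS above
old_basis = np.array([0.5, 1.0])
new_basis = np.linalg.solve(B.T @ B + delta * np.eye(nbit),
                            B.T @ x + delta * old_basis)
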
Example #14
def BatchNormV2(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
    """
    Batch normalization layer as described in:

    `Batch Normalization: Accelerating Deep Network Training by
    Reducing Internal Covariate Shift <http://arxiv.org/abs/1502.03167>`_.

    :param x: a NHWC or NC tensor.
    :param use_local_stat: bool. whether to use mean/var of this batch or the moving average.
        Default to True in training and False in inference.
    :param decay: decay rate. default to 0.9.
    :param epsilon: default to 1e-5.

    Note that only the first training tower maintains a moving average.
    """
    shape = x.get_shape().as_list()
    assert len(shape) in [2, 4]
    n_out = shape[-1]  # channel
    assert n_out is not None, "Input to BatchNorm cannot have unknown channels!"
    if len(shape) == 2:
        x = tf.reshape(x, [-1, 1, 1, n_out])

    beta = tf.get_variable('beta', [n_out],
                           initializer=tf.constant_initializer())
    gamma = tf.get_variable('gamma', [n_out],
                            initializer=tf.constant_initializer(1.0))
    # x * gamma + beta

    ctx = get_current_tower_context()
    if use_local_stat is None:
        use_local_stat = ctx.is_training
    if use_local_stat != ctx.is_training:
        logger.warn("[BatchNorm] use_local_stat != is_training")

    moving_mean = tf.get_variable('mean/EMA', [n_out],
                                  initializer=tf.constant_initializer(),
                                  trainable=False)
    moving_var = tf.get_variable('variance/EMA', [n_out],
                                 initializer=tf.constant_initializer(),
                                 trainable=False)

    if use_local_stat:
        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(x,
                                                           gamma,
                                                           beta,
                                                           epsilon=epsilon,
                                                           is_training=True)

        # maintain EMA only in the main training tower
        if ctx.is_main_training_tower:
            update_op1 = moving_averages.assign_moving_average(
                moving_mean,
                batch_mean,
                decay,
                zero_debias=False,
                name='mean_ema_op')
            update_op2 = moving_averages.assign_moving_average(
                moving_var,
                batch_var,
                decay,
                zero_debias=False,
                name='var_ema_op')
            add_model_variable(moving_mean)
            add_model_variable(moving_var)
    else:
        assert not ctx.is_training, "In training, local statistics have to be used!"
        # TODO do I need to add_model_variable.
        # consider some fixed-param tasks, such as load model and fine tune one layer

        # fused seems slower in inference
        #xn, _, _ = tf.nn.fused_batch_norm(x, gamma, beta,
        #moving_mean, moving_var,
        #epsilon=epsilon, is_training=False, name='output')
        xn = tf.nn.batch_normalization(x, moving_mean, moving_var, beta, gamma,
                                       epsilon)

    if len(shape) == 2:
        xn = tf.squeeze(xn, [1, 2])

    # TODO for other towers, maybe can make it depend some op later
    if ctx.is_main_training_tower:
        with tf.control_dependencies([update_op1, update_op2]):
            return tf.identity(xn, name='output')
    else:
        return tf.identity(xn, name='output')
Example #15
def BatchNorm(x,
              use_local_stat=None,
              decay=0.9,
              epsilon=1e-5,
              use_scale=True,
              use_bias=True,
              gamma_init=tf.constant_initializer(1.0),
              data_format='NHWC',
              internal_update=False):
    """
    Batch Normalization layer, as described in the paper:
    `Batch Normalization: Accelerating Deep Network Training by
    Reducing Internal Covariate Shift <http://arxiv.org/abs/1502.03167>`_.

    Args:
        x (tf.Tensor): a 4D or 2D tensor. When 4D, the layout should match data_format.
        use_local_stat (bool): whether to use mean/var of the current batch or the moving average.
            Defaults to True in training and False in inference.
        decay (float): decay rate of moving average.
        epsilon (float): epsilon to avoid divide-by-zero.
        use_scale, use_bias (bool): whether to use the extra affine transformation or not.
        gamma_init: initializer for gamma (the scale).
        internal_update (bool): if False, add EMA update ops to
            `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer
            which will be slightly slower.

    Returns:
        tf.Tensor: a tensor named ``output`` with the same shape of x.

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        1. About multi-GPU training: moving averages across GPUs are not aggregated.
           Batch statistics are computed independently.  This is consistent with most frameworks.
        2. Combinations of ``use_local_stat`` and ``ctx.is_training``:
            * ``use_local_stat == is_training``: standard BN, EMA are
                maintained during training and used during inference.
            * ``use_local_stat and not is_training``: still use local (batch)
                statistics in inference.
            * ``not use_local_stat and is_training``: use EMA to normalize in
                training. This is useful when you load a pre-trained BN and
                don't want to fine tune the EMA. EMA will not be updated in
                this case.
    """
    shape = x.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4]
    if ndims == 2:
        data_format = 'NHWC'
    if data_format == 'NCHW':
        n_out = shape[1]
    else:
        n_out = shape[-1]  # channel
    assert n_out is not None, "Input to BatchNorm cannot have unknown channels!"
    beta, gamma, moving_mean, moving_var = get_bn_variables(
        n_out, use_scale, use_bias, gamma_init)

    ctx = get_current_tower_context()
    if use_local_stat is None:
        use_local_stat = ctx.is_training
    use_local_stat = bool(use_local_stat)

    if use_local_stat:
        if ndims == 2:
            x = tf.reshape(x,
                           [-1, 1, 1, n_out])  # fused_bn only takes 4D input
            # fused_bn has error using NCHW? (see #190)

        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
            x,
            gamma,
            beta,
            epsilon=epsilon,
            is_training=True,
            data_format=data_format)

        if ndims == 2:
            xn = tf.squeeze(xn, [1, 2])
    else:
        if ctx.is_training:
            assert get_tf_version_number() >= 1.4, \
                "Fine tuning a BatchNorm model with fixed statistics is only " \
                "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
            if ctx.is_main_training_tower:  # only warn in first tower
                logger.warn(
                    "[BatchNorm] Using moving_mean/moving_variance in training."
                )
            # Using moving_mean/moving_variance in training, which means we
            # loaded a pre-trained BN and only fine-tuning the affine part.
            xn, _, _ = tf.nn.fused_batch_norm(x,
                                              gamma,
                                              beta,
                                              mean=moving_mean,
                                              variance=moving_var,
                                              epsilon=epsilon,
                                              data_format=data_format,
                                              is_training=False)
        else:
            # non-fused op is faster for inference  # TODO test if this is still true
            if ndims == 4 and data_format == 'NCHW':
                [g, b, mm, mv] = [
                    reshape_for_bn(_, ndims, n_out, data_format)
                    for _ in [gamma, beta, moving_mean, moving_var]
                ]
                xn = tf.nn.batch_normalization(x, mm, mv, b, g, epsilon)
            else:
                # avoid the reshape if possible (when channel is the last dimension)
                xn = tf.nn.batch_normalization(x, moving_mean, moving_var,
                                               beta, gamma, epsilon)

    # maintain EMA only on one GPU is OK, even in replicated mode.
    # because training time doesn't use EMA
    if ctx.is_main_training_tower:
        add_model_variable(moving_mean)
        add_model_variable(moving_var)
    if ctx.is_main_training_tower and use_local_stat:
        ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var,
                            decay, internal_update)
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(mean=moving_mean, variance=moving_var)
    if use_scale:
        vh.gamma = gamma
    if use_bias:
        vh.beta = beta
    return ret
Example #16
def QuantizedActiv(x, nbit=2):
    """
    Quantize activation.
    Args:
        x (tf.Tensor): a 4D tensor.
        nbit (int): number of bits of quantized activation. Defaults to 2.
    Returns:
        tf.Tensor with attribute `variables`.
    Variable Names:
    * ``basis``: basis of quantized activation.
    Note:
        About multi-GPU training: moving averages across GPUs are not aggregated.
        Batch statistics are computed by main training tower. This is consistent with most frameworks.
    """
    init_basis = [(NORM_PPF_0_75 * 2 / (2**nbit - 1)) * (2.**i)
                  for i in range(nbit)]  # initialize the basis vector
    init_basis = tf.constant_initializer(init_basis)
    bit_dims = [nbit, 1]
    num_levels = 2**nbit  # number of quantization levels
    # initialize level multiplier

    # Build the quantization level codes: a list of 2**nbit entries, each an nbit-bit {0, 1} code.
    # For example, level 7 corresponds to the code {bit 0,1,2,3: 1,1,1,0}.
    init_level_multiplier = []
    for i in range(0, num_levels):
        level_multiplier_i = [0. for j in range(nbit)]
        level_number = i
        for j in range(nbit):
            level_multiplier_i[j] = float(level_number % 2)
            level_number = level_number // 2
        init_level_multiplier.append(level_multiplier_i)
    # Build the threshold multiplier: 2**nbit - 1 rows, each a list of 2**nbit values.
    # For example: [[0.5, 0.5, 0, ..., 0], [0, 0.5, 0.5, 0, ..., 0], ..., [0, ..., 0, 0.5, 0.5]]
    # initialize threshold multiplier
    init_thrs_multiplier = []
    for i in range(1, num_levels):
        thrs_multiplier_i = [0. for j in range(num_levels)]
        thrs_multiplier_i[i - 1] = 0.5
        thrs_multiplier_i[i] = 0.5
        init_thrs_multiplier.append(thrs_multiplier_i)
    # init_thrs_multiplier shape: 15 x 16 (for nbit=4)
    with tf.variable_scope('ActivationQuantization'):
        basis = tf.get_variable('basis',
                                bit_dims,
                                tf.float32,
                                initializer=init_basis,
                                trainable=False)

        ctx = get_current_tower_context()  # current tower context
        # calculate levels and sort
        level_codes = tf.constant(init_level_multiplier)
        levels = tf.matmul(
            level_codes,
            basis)  # V*B gives the quantized value of each level, e.g. 0, v1, v2, v1+v2, v3, v3+v1
        levels, sort_id = tf.nn.top_k(tf.transpose(levels, [1, 0]),
                                      num_levels)  # sort in descending order
        levels = tf.reverse(levels, [-1])  # reverse to ascending order
        sort_id = tf.reverse(sort_id, [-1])  # corresponding indices, ascending
        levels = tf.transpose(
            levels, [1, 0])  # quantized value of each level; for 4 bits, levels has shape 16 x 1
        sort_id = tf.transpose(sort_id, [1, 0])
        # calculate threshold
        thrs_multiplier = tf.constant(init_thrs_multiplier)  # shape: 15 x 16
        thrs = tf.matmul(
            thrs_multiplier, levels
        )  # thresholds between adjacent levels, (q(l-1) + q(l)) / 2 in the paper; shape: 15x16 * 16x1 = 15x1
        # calculate output y and its binary code; allocate the quantized values and codes, e.g. for x of shape 1x5x2x2
        y = tf.zeros_like(x)  # output
        reshape_x = tf.reshape(x, [-1])  # shape: 1*5*2*2 = 20
        zero_dims = tf.stack([tf.shape(reshape_x)[0], nbit])  # shape: 20 x 4
        bits_y = tf.fill(zero_dims, 0.)
        zero_y = tf.zeros_like(x)
        zero_bits_y = tf.fill(zero_dims, 0.)
        for i in range(num_levels - 1):
            g = tf.greater(x, thrs[i])  # compare against threshold i
            y = tf.where(g, zero_y + levels[i + 1],
                         y)  # entries above this threshold take the quantized value of the next level
            bits_y = tf.where(tf.reshape(g, [-1]),
                              zero_bits_y + level_codes[sort_id[i + 1][0]],
                              bits_y)  # and the binary code of that level
        # training
        if ctx.is_main_training_tower:
            BT = tf.matrix_transpose(bits_y)  # shape: 4 x 20
            # calculate BTxB
            BTxB = []
            for i in range(nbit):
                for j in range(nbit):
                    BTxBij = tf.multiply(BT[i], BT[j])
                    BTxBij = tf.reduce_sum(BTxBij)
                    BTxB.append(BTxBij)
            BTxB = tf.reshape(tf.stack(values=BTxB), [nbit, nbit])
            BTxB_inv = tf.matrix_inverse(BTxB)
            # calculate BTxX
            BTxX = []
            for i in range(nbit):
                BTxXi0 = tf.multiply(BT[i], reshape_x)
                BTxXi0 = tf.reduce_sum(BTxXi0)
                BTxX.append(BTxXi0)
            BTxX = tf.reshape(tf.stack(values=BTxX), [nbit, 1])

            new_basis = tf.matmul(BTxB_inv, BTxX)  # calculate new basis
            # create moving averages op
            update_moving_basis = moving_averages.assign_moving_average(
                basis, new_basis, MOVING_AVERAGES_FACTOR)
            add_model_variable(basis)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_moving_basis)

            for i in range(nbit):
                tf.summary.scalar('basis%d' % i, new_basis[i][0])

        x_clip = tf.minimum(x, levels[num_levels - 1])  # gradient clip
        y = x_clip + tf.stop_gradient(-x_clip) + tf.stop_gradient(
            y)  # gradient: y=clip(x)
        y.variables = VariableHolder(basis=basis)
        return y
Example #17
def BatchNorm(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
    """
    Batch normalization layer, as described in the paper:
    `Batch Normalization: Accelerating Deep Network Training by
    Reducing Internal Covariate Shift <http://arxiv.org/abs/1502.03167>`_.

    Args:
        x (tf.Tensor): a NHWC or NC tensor.
        use_local_stat (bool): whether to use mean/var of the current batch or the moving average.
            Defaults to True in training and False in inference.
        decay (float): decay rate of moving average.
        epsilon (float): epsilon to avoid divide-by-zero.

    Returns:
        tf.Tensor: a tensor named ``output`` with the same shape of x.

    Variable Names:

    * ``beta``: the bias term.
    * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        In multi-tower training, only the first training tower maintains a moving average.
        This is consistent with most frameworks.
    """
    shape = x.get_shape().as_list()
    assert len(shape) in [2, 4]
    n_out = shape[-1]  # channel
    assert n_out is not None, "Input to BatchNorm cannot have unknown channels!"
    if len(shape) == 2:
        x = tf.reshape(x, [-1, 1, 1, n_out])

    beta = tf.get_variable('beta', [n_out],
                           initializer=tf.constant_initializer())
    gamma = tf.get_variable('gamma', [n_out],
                            initializer=tf.constant_initializer(1.0))
    # x * gamma + beta

    ctx = get_current_tower_context()
    if use_local_stat is None:
        use_local_stat = ctx.is_training
    if use_local_stat != ctx.is_training:
        logger.warn("[BatchNorm] use_local_stat != is_training")

    moving_mean = tf.get_variable('mean/EMA', [n_out],
                                  initializer=tf.constant_initializer(),
                                  trainable=False)
    moving_var = tf.get_variable('variance/EMA', [n_out],
                                 initializer=tf.constant_initializer(),
                                 trainable=False)

    if use_local_stat:
        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(x,
                                                           gamma,
                                                           beta,
                                                           epsilon=epsilon,
                                                           is_training=True)

        # maintain EMA only in the main training tower
        if ctx.is_main_training_tower:
            # TODO a way to use debias in multitower.
            update_op1 = moving_averages.assign_moving_average(
                moving_mean,
                batch_mean,
                decay,
                zero_debias=False,
                name='mean_ema_op')
            update_op2 = moving_averages.assign_moving_average(
                moving_var,
                batch_var,
                decay,
                zero_debias=False,
                name='var_ema_op')
            add_model_variable(moving_mean)
            add_model_variable(moving_var)
    else:
        assert not ctx.is_training, "In training, local statistics have to be used!"
        # TODO do I need to add_model_variable.
        # consider some fixed-param tasks, such as load model and fine tune one layer

        # fused seems slower in inference
        # xn, _, _ = tf.nn.fused_batch_norm(x, gamma, beta,
        #   moving_mean, moving_var,
        #   epsilon=epsilon, is_training=False, name='output')
        xn = tf.nn.batch_normalization(x, moving_mean, moving_var, beta, gamma,
                                       epsilon)

    if len(shape) == 2:
        xn = tf.squeeze(xn, [1, 2])

    # TODO for other towers, maybe can make it depend some op later
    # TODO update it later (similar to slim) might be faster?
    if ctx.is_main_training_tower:
        with tf.control_dependencies([update_op1, update_op2]):
            return tf.identity(xn, name='output')
    else:
        return tf.identity(xn, name='output')
Example #18
def BatchNorm(inputs,
              training=None,
              momentum=0.9,
              epsilon=1e-5,
              center=True,
              scale=True,
              gamma_initializer=tf.ones_initializer(),
              data_format='channels_last',
              internal_update=False):
    """
    Mostly equivalent to `tf.layers.batch_normalization`, but differs in
    the following ways:
    1. Accepts `data_format` rather than `axis`. For 2D input, this argument will be ignored.
    2. Default value for `momentum` and `epsilon` is different.
    3. Default value for `training` is automatically obtained from `TowerContext`.
    4. Support the `internal_update` option.
    Args:
        internal_update (bool): if False, add EMA update ops to
            `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer
            by control dependencies.
    Variable Names:
    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.
    Note:
        1. About multi-GPU training: moving averages across GPUs are not aggregated.
           Batch statistics are computed independently.  This is consistent with most frameworks.
        2. Combinations of ``training`` and ``ctx.is_training``:
            * ``training == ctx.is_training``: standard BN, EMA are
                maintained during training and used during inference. This is
                the default.
            * ``training and not ctx.is_training``: still use batch statistics in inference.
            * ``not training and ctx.is_training``: use EMA to normalize in
                training. This is useful when you load a pre-trained BN and
                don't want to fine tune the EMA. EMA will not be updated in
                this case.
    """
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4]
    if ndims == 2:
        data_format = 'NHWC'
    if data_format == 'NCHW':
        n_out = shape[1]
    else:
        n_out = shape[-1]  # channel
    assert n_out is not None, "Input to BatchNorm cannot have unknown channels!"
    beta, gamma, moving_mean, moving_var = get_bn_variables(
        n_out, scale, center, gamma_initializer)

    ctx = get_current_tower_context()
    use_local_stat = training
    if use_local_stat is None:
        use_local_stat = ctx.is_training
    use_local_stat = bool(use_local_stat)

    if use_local_stat:
        if ndims == 2:
            inputs = tf.reshape(
                inputs, [-1, 1, 1, n_out])  # fused_bn only takes 4D input
            # fused_bn has error using NCHW? (see #190)

        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
            inputs,
            gamma,
            beta,
            epsilon=epsilon,
            is_training=True,
            data_format=data_format)

        if ndims == 2:
            xn = tf.squeeze(xn, [1, 2])
    else:
        if ctx.is_training:
            assert get_tf_version_tuple() >= (1, 4), \
                "Fine tuning a BatchNorm model with fixed statistics is only " \
                "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
            if ctx.is_main_training_tower:  # only warn in first tower
                logger.warn(
                    "[BatchNorm] Using moving_mean/moving_variance in training."
                )
            # Using moving_mean/moving_variance in training, which means we
            # loaded a pre-trained BN and only fine-tuning the affine part.
            xn, _, _ = tf.nn.fused_batch_norm(inputs,
                                              gamma,
                                              beta,
                                              mean=moving_mean,
                                              variance=moving_var,
                                              epsilon=epsilon,
                                              data_format=data_format,
                                              is_training=False)
        else:
            if ndims == 4:
                xn, _, _ = tf.nn.fused_batch_norm(inputs,
                                                  gamma,
                                                  beta,
                                                  mean=moving_mean,
                                                  variance=moving_var,
                                                  epsilon=epsilon,
                                                  data_format=data_format,
                                                  is_training=False)
            else:
                xn = tf.nn.batch_normalization(inputs, moving_mean, moving_var,
                                               beta, gamma, epsilon)

    # maintain EMA only on one GPU is OK, even in replicated mode.
    # because training time doesn't use EMA
    if ctx.is_main_training_tower:
        add_model_variable(moving_mean)
        add_model_variable(moving_var)
    if ctx.is_main_training_tower and use_local_stat:
        ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var,
                            momentum, internal_update)
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(mean=moving_mean, variance=moving_var)
    if scale:
        vh.gamma = gamma
    if center:
        vh.beta = beta
    return ret
Example #19
def BatchRenorm(x, rmax, dmax, decay=0.9, epsilon=1e-5,
                use_scale=True, use_bias=True, gamma_init=None,
                data_format='channels_last'):
    """
    Batch Renormalization layer, as described in the paper:
    `Batch Renormalization: Towards Reducing Minibatch Dependence in Batch-Normalized Models
    <https://arxiv.org/abs/1702.03275>`_.
    This implementation is a wrapper around `tf.layers.batch_normalization`.

    Args:
        x (tf.Tensor): a NHWC or NC tensor.
        rmax, dmax (tf.Tensor): a scalar tensor, the maximum allowed corrections.
        decay (float): decay rate of moving average.
        epsilon (float): epsilon to avoid divide-by-zero.
        use_scale, use_bias (bool): whether to use the extra affine transformation or not.

    Returns:
        tf.Tensor: a tensor named ``output`` with the same shape of x.

    Variable Names:

    * ``beta``: the bias term.
    * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``.
    * ``moving_mean, renorm_mean, renorm_mean_weight``: See TF documentation.
    * ``moving_variance, renorm_stddev, renorm_stddev_weight``: See TF documentation.
    """

    shape = x.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4]
    if ndims == 2:
        data_format = 'channels_last'    # error using NCHW? (see #190)
        x = tf.reshape(x, [-1, 1, 1, shape[1]])

    ctx = get_current_tower_context()
    coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
    layer = tf.layers.BatchNormalization(
        axis=1 if data_format == 'channels_first' else 3,
        momentum=decay, epsilon=epsilon,
        center=use_bias, scale=use_scale,
        renorm=True,
        renorm_clipping={
            'rmin': 1.0 / rmax,
            'rmax': rmax,
            'dmax': dmax},
        renorm_momentum=0.99,
        gamma_initializer=gamma_init,
        fused=False)
    xn = layer.apply(x, training=ctx.is_training, scope=tf.get_variable_scope())

    if ctx.is_main_training_tower:
        for v in layer.non_trainable_variables:
            add_model_variable(v)
    else:
        # only run UPDATE_OPS in the first tower
        restore_collection(coll_bk)

    if ndims == 2:
        xn = tf.squeeze(xn, [1, 2])
    ret = tf.identity(xn, name='output')

    # TODO not sure whether to add moving_mean/moving_var to VH now
    vh = ret.variables = VariableHolder()
    if use_scale:
        vh.gamma = layer.gamma
    if use_bias:
        vh.beta = layer.beta
    return ret
Example #20
def BatchNormV1(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
    shape = x.get_shape().as_list()
    assert len(shape) in [2, 4]

    n_out = shape[-1]  # channel
    assert n_out is not None
    beta = tf.get_variable('beta', [n_out],
                           initializer=tf.constant_initializer())
    gamma = tf.get_variable('gamma', [n_out],
                            initializer=tf.constant_initializer(1.0))

    if len(shape) == 2:
        batch_mean, batch_var = tf.nn.moments(x, [0], keep_dims=False)
    else:
        batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], keep_dims=False)
    # just to make a clear name.
    batch_mean = tf.identity(batch_mean, 'mean')
    batch_var = tf.identity(batch_var, 'variance')

    emaname = 'EMA'
    ctx = get_current_tower_context()
    if use_local_stat is None:
        use_local_stat = ctx.is_training
    if use_local_stat != ctx.is_training:
        logger.warn("[BatchNorm] use_local_stat != is_training")

    if use_local_stat:
        # training tower
        if ctx.is_training:
            # reuse = tf.get_variable_scope().reuse
            with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                # BatchNorm in reuse scope can be tricky! Moving mean/variance are not reused
                with tf.name_scope(None):  # https://github.com/tensorflow/tensorflow/issues/2740
                    # if reuse=True, try to find and use the existing statistics
                    # how to use multiple tensors to update one EMA? seems impossible
                    ema = tf.train.ExponentialMovingAverage(decay=decay, name=emaname)
                    ema_apply_op = ema.apply([batch_mean, batch_var])
                    ema_mean, ema_var = ema.average(batch_mean), ema.average(batch_var)
                    if ctx.is_main_training_tower:
                        # inside main training tower
                        add_model_variable(ema_mean)
                        add_model_variable(ema_var)
    else:
        # no apply() is called here, no magic vars will get created,
        # no reuse issue will happen
        assert not ctx.is_training
        with tf.name_scope(None):
            ema = tf.train.ExponentialMovingAverage(decay=decay, name=emaname)
            mean_var_name = ema.average_name(batch_mean)
            var_var_name = ema.average_name(batch_var)
            if ctx.is_main_tower:
                # main tower, but needs to use global stat. global stat must be from outside
                # when reuse=True, the desired variable name could
                # actually be different, because a different var is created
                # for different reuse tower
                ema_mean = tf.get_variable('mean/' + emaname, [n_out])
                ema_var = tf.get_variable('variance/' + emaname, [n_out])
            else:
                # use statistics in another tower
                G = tf.get_default_graph()
                ema_mean = ctx.find_tensor_in_main_tower(G, mean_var_name + ':0')
                ema_var = ctx.find_tensor_in_main_tower(G, var_var_name + ':0')

    if use_local_stat:
        batch = tf.cast(tf.shape(x)[0], tf.float32)
        mul = tf.where(tf.equal(batch, 1.0), 1.0, batch / (batch - 1))
        batch_var = batch_var * mul  # use unbiased variance estimator in training

        with tf.control_dependencies([ema_apply_op] if ctx.is_training else []):
            # only apply EMA op if is_training
            return tf.nn.batch_normalization(
                x, batch_mean, batch_var, beta, gamma, epsilon, 'output')
    else:
        return tf.nn.batch_normalization(
            x, ema_mean, ema_var, beta, gamma, epsilon, 'output')
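
The mul factor above turns the biased variance returned by tf.nn.moments into an unbiased estimate via N / (N - 1). A quick NumPy check of that correction (values are arbitrary):

import numpy as np

x = np.random.randn(16)
biased = x.var()                             # ddof=0, what tf.nn.moments computes
unbiased = biased * len(x) / (len(x) - 1)
assert np.allclose(unbiased, x.var(ddof=1))  # matches the Bessel-corrected variance
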
Example #21
def BatchRenorm(x,
                rmax,
                dmax,
                momentum=0.9,
                epsilon=1e-5,
                center=True,
                scale=True,
                gamma_initializer=None,
                data_format='channels_last'):
    """
    Batch Renormalization layer, as described in the paper:
    `Batch Renormalization: Towards Reducing Minibatch Dependence in Batch-Normalized Models
    <https://arxiv.org/abs/1702.03275>`_.
    This implementation is a wrapper around `tf.layers.batch_normalization`.

    Args:
        x (tf.Tensor): a NHWC or NC tensor.
        rmax, dmax (tf.Tensor): a scalar tensor, the maximum allowed corrections.
        momentum (float): momentum of moving average.
        epsilon (float): epsilon to avoid divide-by-zero.
        center, scale (bool): whether to use the extra affine transformation or not.

    Returns:
        tf.Tensor: a tensor named ``output`` with the same shape of x.

    Variable Names:

    * ``beta``: the bias term.
    * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``.
    * ``moving_mean, renorm_mean, renorm_mean_weight``: See TF documentation.
    * ``moving_variance, renorm_stddev, renorm_stddev_weight``: See TF documentation.
    """

    shape = x.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4]
    if ndims == 2:
        data_format = 'channels_first'

    ctx = get_current_tower_context()
    coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
    layer = tf.layers.BatchNormalization(
        axis=1 if data_format == 'channels_first' else 3,
        momentum=momentum,
        epsilon=epsilon,
        center=center,
        scale=scale,
        renorm=True,
        renorm_clipping={
            'rmin': 1.0 / rmax,
            'rmax': rmax,
            'dmax': dmax
        },
        renorm_momentum=0.99,
        gamma_initializer=gamma_initializer,
        fused=False,
        _reuse=tf.get_variable_scope().reuse)
    xn = layer.apply(x,
                     training=ctx.is_training,
                     scope=tf.get_variable_scope())

    if ctx.is_main_training_tower:
        for v in layer.non_trainable_variables:
            add_model_variable(v)
    else:
        # only run UPDATE_OPS in the first tower
        restore_collection(coll_bk)

    if ndims == 2:
        xn = tf.squeeze(xn, [1, 2])
    ret = tf.identity(xn, name='output')

    # TODO not sure whether to add moving_mean/moving_var to VH now
    vh = ret.variables = VariableHolder()
    if scale:
        vh.gamma = layer.gamma
    if center:
        vh.beta = layer.beta
    return ret
Example #22
def sync_batch_norm(inputs,
                    decay=0.999,
                    axis=-1,
                    epsilon=0.001,
                    activation_fn=None,
                    updates_collections=tf.GraphKeys.UPDATE_OPS,
                    is_training=True,
                    reuse=None,
                    variables_collections=None,
                    trainable=True,
                    scope=None,
                    num_dev=1):
    '''
    num_dev is how many GPUs you use.
    this function is from https://github.com/jianlong-yuan/syncbn-tensorflow/blob/master/syncbn.py
    '''
    # shape of inputs is [batch, height, width, depth]
    num_outputs = inputs.get_shape().as_list()[-1]
    # print (f"num_outputs = {num_outputs}")	# 3

    if scope is None:
        scope = 'batch_normalization'

    with tf.variable_scope(scope, reuse=reuse):
        # initializer, gamma and beta is trainable, moving_mean and moving_var is not
        gamma = tf.get_variable(name='gamma',
                                shape=[num_outputs],
                                dtype=tf.float32,
                                initializer=tf.constant_initializer(1.0),
                                trainable=trainable,
                                collections=variables_collections)

        beta = tf.get_variable(name='beta',
                               shape=[num_outputs],
                               dtype=tf.float32,
                               initializer=tf.constant_initializer(0.0),
                               trainable=trainable,
                               collections=variables_collections)

        moving_mean = tf.get_variable(name='moving_mean',
                                      shape=[num_outputs],
                                      dtype=tf.float32,
                                      initializer=tf.constant_initializer(0.0),
                                      trainable=False,
                                      collections=variables_collections)

        moving_var = tf.get_variable(name='moving_variance',
                                     shape=[num_outputs],
                                     dtype=tf.float32,
                                     initializer=tf.constant_initializer(1.0),
                                     trainable=False,
                                     collections=variables_collections)

        # is_training and trainable are combined with a logical AND,
        # the same as math_ops.logical_and() in
        # https://github.com/tensorflow/tensorflow/blob/
        # 508f76b1d9925304cedd56d51480ec380636cb82/tensorflow/
        # python/keras/layers/normalization.py#L621
        if is_training and trainable:
            # only one GPU
            if num_dev == 1:
                mean, var = tf.nn.moments(inputs, axes=axis)
            # multi GPUs
            else:
                # average the batch statistics across GPUs
                shared_name = tf.get_variable_scope().name
                batch_mean = tf.reduce_mean(inputs, axis=axis)
                batch_mean = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
                batch_mean_square = tf.reduce_mean(tf.square(inputs),
                                                   axis=axis)
                batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean_square,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean_square') * (1.0 /
                                                                      num_dev)
                mean = batch_mean
                var = batch_mean_square - tf.square(batch_mean)
            outputs = tf.nn.batch_normalization(inputs, mean, var, beta, gamma,
                                                epsilon)

            # print (outputs.device)	# /device:GPU:1

            # those code block is executed in every GPUs
            # just assign moving_mean and moving_var in GPU:0
            if int(outputs.device[-1]) == 0:
                update_moving_mean_op = tf.assign(
                    moving_mean, moving_mean * decay + mean * (1 - decay))
                update_moving_var_op = tf.assign(
                    moving_var, moving_var * decay + var * (1 - decay))
                add_model_variable(moving_mean)
                add_model_variable(moving_var)

                if updates_collections is None:
                    with tf.control_dependencies(
                        [update_moving_mean_op, update_moving_var_op]):
                        outputs = tf.identity(outputs)
                else:
                    tf.add_to_collections(updates_collections,
                                          update_moving_mean_op)
                    tf.add_to_collections(updates_collections,
                                          update_moving_var_op)
                    outputs = tf.identity(outputs)
            else:
                outputs = tf.identity(outputs)
        else:
            outputs, _, _ = tf.nn.fused_batch_norm(inputs,
                                                   gamma,
                                                   beta,
                                                   mean=moving_mean,
                                                   variance=moving_var,
                                                   epsilon=epsilon,
                                                   is_training=False)

        if activation_fn is not None:
            outputs = activation_fn(outputs)

        return outputs
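# A quick sanity check of the statistic aggregation used above: with equal batch
# sizes per GPU, averaging the per-device E[x] and E[x^2] and applying
# Var[x] = E[x^2] - E[x]^2 reproduces the statistics of the full batch.
# This is a minimal NumPy sketch with made-up shapes, not code from the snippet above.
import numpy as np

shards = [np.random.randn(8, 16).astype(np.float32) for _ in range(3)]  # per-GPU shards

mean_per_dev = [s.mean(axis=0) for s in shards]            # local E[x]
sqmean_per_dev = [(s ** 2).mean(axis=0) for s in shards]   # local E[x^2]

mean = np.mean(mean_per_dev, axis=0)                       # "all-reduce" average
var = np.mean(sqmean_per_dev, axis=0) - mean ** 2

full = np.concatenate(shards, axis=0)
print(np.allclose(mean, full.mean(axis=0), atol=1e-5))     # True
print(np.allclose(var, full.var(axis=0), atol=1e-5))       # True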
Example #23
0
def syncBatchNorm(inputs,
                  axis=-1,
                  momentum=0.99,
                  epsilon=0.001,
                  updates_collections=tf.GraphKeys.UPDATE_OPS,
                  reuse=None,
                  variables_collections=None,
                  training=False,
                  trainable=True,
                  name=None,
                  GPUNumber=1):
    '''
    This function is from https://github.com/jianlong-yuan/syncbn-tensorflow/blob/master/syncbn.py
    '''
    shapeList = inputs.get_shape().as_list()
    num_outputs = shapeList[axis]
    # print (f"num_outputs = {num_outputs}")	# 512
    axes = [i for i in range(len(shapeList))]
    # when the input is 1-D, axes = [] and this still works
    del axes[axis]
    # print (f"axes = {axes}")	# [0, 1, 2]

    if name is None:
        name = 'batch_normalization'

    with tf.variable_scope(name, reuse=reuse) as scope:
        # initializers: gamma and beta are trainable; moving_mean and moving_var are not
        gamma = tf.get_variable(name='gamma',
                                shape=[num_outputs],
                                dtype=tf.float32,
                                initializer=tf.constant_initializer(1.0),
                                trainable=trainable,
                                collections=variables_collections)

        beta = tf.get_variable(name='beta',
                               shape=[num_outputs],
                               dtype=tf.float32,
                               initializer=tf.constant_initializer(0.0),
                               trainable=trainable,
                               collections=variables_collections)

        moving_mean = tf.get_variable(name='moving_mean',
                                      shape=[num_outputs],
                                      dtype=tf.float32,
                                      initializer=tf.constant_initializer(0.0),
                                      trainable=False,
                                      collections=variables_collections)

        moving_var = tf.get_variable(name='moving_variance',
                                     shape=[num_outputs],
                                     dtype=tf.float32,
                                     initializer=tf.constant_initializer(1.0),
                                     trainable=False,
                                     collections=variables_collections)

        def branchTrue():
            '''
            Normalize with the batch mean and batch variance (synchronized across GPUs when needed).
            '''
            # only one GPU
            if GPUNumber == 1:
                batch_mean = tf.reduce_mean(inputs,
                                            axis=axes,
                                            name="batch_mean")
                batch_mean_square = tf.reduce_mean(tf.square(inputs),
                                                   axis=axes)
            # multi GPUs
            else:
                # average the batch statistics across GPUs
                shared_name = re.sub('tower[0-9]+/', '',
                                     tf.get_variable_scope().name)
                batch_mean = tf.reduce_mean(inputs, axis=axes)

                # Utilize NCCL
                batch_mean = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean,
                    reduction='sum',
                    num_devices=GPUNumber,
                    shared_name=shared_name + '_NCCL_mean') * (1.0 / GPUNumber)
                batch_mean_square = tf.reduce_mean(tf.square(inputs),
                                                   axis=axes)
                batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean_square,
                    reduction='sum',
                    num_devices=GPUNumber,
                    shared_name=shared_name +
                    '_NCCL_mean_square') * (1.0 / GPUNumber)

            batch_var = batch_mean_square - tf.square(batch_mean)

            outputs = tf.nn.batch_normalization(inputs, batch_mean, batch_var,
                                                beta, gamma, epsilon)

            return outputs, batch_mean, batch_var

        def branchFalse():
            '''
            Normalize with moving_mean and moving_var (inference path; statistics are not updated).
            '''
            outputs = tf.nn.batch_normalization(inputs, moving_mean,
                                                moving_var, beta, gamma,
                                                epsilon)

            # return the moving statistics themselves, so the EMA update below is a no-op:
            # batch_mean + (moving_mean - batch_mean) * momentum == moving_mean
            # when batch_mean == moving_mean
            with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
                batch_mean = tf.get_variable("moving_mean")
                batch_var = tf.get_variable("moving_variance")
            return outputs, batch_mean, batch_var

        outputs, batch_mean, batch_var = tf.cond(
            tf.math.logical_and(training, trainable), branchTrue, branchFalse)

        # this block is executed on every GPU;
        # only update moving_mean and moving_var on GPU:0
        if int(outputs.device[-1]) == 0:
            update_moving_mean_op = tf.assign(
                moving_mean,
                batch_mean + (moving_mean - batch_mean) * momentum)
            update_moving_var_op = tf.assign(
                moving_var, batch_var + (moving_var - batch_var) * momentum)
            add_model_variable(moving_mean)
            add_model_variable(moving_var)

            if updates_collections is None:
                with tf.control_dependencies(
                    [update_moving_mean_op, update_moving_var_op]):
                    outputs = tf.identity(outputs)
            else:
                tf.add_to_collections(updates_collections,
                                      update_moving_mean_op)
                tf.add_to_collections(updates_collections,
                                      update_moving_var_op)
                outputs = tf.identity(outputs)
        else:
            outputs = tf.identity(outputs)

        return outputs
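# The tf.cond above hinges on both branches returning tensors of the same shape
# and dtype: one branch normalizes with batch statistics, the other with the
# stored moving statistics. A stripped-down sketch of that pattern (TF 1.x;
# names and shapes here are illustrative, not taken from syncBatchNorm):
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 64])
training = tf.placeholder(tf.bool, [])

moving_mean = tf.get_variable('demo_moving_mean', [64],
                              initializer=tf.zeros_initializer(), trainable=False)
moving_var = tf.get_variable('demo_moving_var', [64],
                             initializer=tf.ones_initializer(), trainable=False)

def use_batch_stats():
    mean, var = tf.nn.moments(x, axes=[0])
    return tf.nn.batch_normalization(x, mean, var, None, None, 1e-3)

def use_moving_stats():
    return tf.nn.batch_normalization(x, moving_mean, moving_var, None, None, 1e-3)

y = tf.cond(training, use_batch_stats, use_moving_stats)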
Example #24
0
File: nn.py  Project: peternara/m-DAN
def BatchNorm(x,
              use_local_stat=False,
              decay=0.9,
              epsilon=1e-5,
              use_scale=True,
              use_bias=True,
              gamma_init=tf.constant_initializer(1.0),
              data_format='NCHW',
              internal_update=False,
              scope="bn"):
    global is_training
    with tf.variable_scope(scope):
        shape = x.get_shape().as_list()
        ndims = len(shape)
        assert ndims in [2, 4]
        if ndims == 2:
            data_format = 'NHWC'
        if data_format == 'NCHW':
            n_out = shape[1]
        else:
            n_out = shape[-1]  # channel
        assert n_out is not None, "Input to BatchNorm cannot have unknown channels!"
        beta, gamma, moving_mean, moving_var = get_bn_variables(
            n_out, use_scale, use_bias, gamma_init)

        use_local_stat = bool(use_local_stat)

        if use_local_stat:
            if ndims == 2:
                x = tf.reshape(
                    x, [-1, 1, 1, n_out])  # fused_bn only takes 4D input
                # fused_bn has error using NCHW? (see #190)

            xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
                x,
                gamma,
                beta,
                epsilon=epsilon,
                is_training=True,
                data_format=data_format)

            if ndims == 2:
                xn = tf.squeeze(xn, [1, 2])
        else:
            if is_training:  # so ugly
                #assert get_tf_version_number() >= 1.4, \
                #	"Fine tuning a BatchNorm model with fixed statistics is only " \
                #	"supported after https://github.com/tensorflow/tensorflow/pull/12580 "
                #if ctx.is_main_training_tower:  # only warn in first tower
                #	logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")
                # Using moving_mean/moving_variance in training, which means we
                # loaded a pre-trained BN and only fine-tuning the affine part.
                xn, _, _ = tf.nn.fused_batch_norm(x,
                                                  gamma,
                                                  beta,
                                                  mean=moving_mean,
                                                  variance=moving_var,
                                                  epsilon=epsilon,
                                                  data_format=data_format,
                                                  is_training=False)
            else:
                # non-fused op is faster for inference  # TODO test if this is still true
                if ndims == 4 and data_format == 'NCHW':
                    [g, b, mm, mv] = [
                        reshape_for_bn(_, ndims, n_out, data_format)
                        for _ in [gamma, beta, moving_mean, moving_var]
                    ]
                    xn = tf.nn.batch_normalization(x, mm, mv, b, g, epsilon)
                else:
                    # avoid the reshape if possible (when channel is the last dimension)
                    xn = tf.nn.batch_normalization(x, moving_mean, moving_var,
                                                   beta, gamma, epsilon)

        # maintaining the EMA on only one GPU is OK, even in replicated mode,
        # because the EMA is not used at training time
        #if ctx.is_main_training_tower:
        add_model_variable(moving_mean)
        add_model_variable(moving_var)
        if use_local_stat:  # and ctx.is_main_training_tower:
            ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean,
                                moving_var, decay, internal_update)
        else:
            ret = tf.identity(xn, name='output')

        return ret
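# fused_batch_norm only accepts 4D input, which is why the 2D (NC) case above is
# reshaped to [-1, 1, 1, C] and squeezed back afterwards. A minimal sketch of
# just that round trip (variable names are illustrative):
import tensorflow as tf

x2d = tf.placeholder(tf.float32, [None, 128])              # NC input
n_out = x2d.get_shape().as_list()[-1]
gamma = tf.get_variable('demo_gamma', [n_out], initializer=tf.ones_initializer())
beta = tf.get_variable('demo_beta', [n_out], initializer=tf.zeros_initializer())

x4d = tf.reshape(x2d, [-1, 1, 1, n_out])                   # fused_batch_norm needs 4D
xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
    x4d, gamma, beta, epsilon=1e-5, is_training=True, data_format='NHWC')
xn = tf.squeeze(xn, [1, 2])                                # back to NC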
Example #25
0
def QuantizedActiv(x, nbit=2):
    """
    Quantize activation.
    Args:
        x (tf.Tensor): a 4D tensor.
        nbit (int): number of bits of quantized activation. Defaults to 2.
    Returns:
        tf.Tensor with attribute `variables`.
    Variable Names:
    * ``basis``: basis of quantized activation.
    Note:
        About multi-GPU training: moving averages across GPUs are not aggregated.
        Batch statistics are computed by main training tower. This is consistent with most frameworks.
    """
    init_basis = [(NORM_PPF_0_75 * 2 / (2**nbit - 1)) * (2.**i)
                  for i in range(nbit)]
    init_basis = tf.constant_initializer(init_basis)

    bit_dims = [nbit, 1]
    num_levels = 2**nbit
    # initialize level multiplier
    init_level_multiplier = []
    for i in range(0, num_levels):
        level_multiplier_i = [0. for j in range(nbit)]
        level_number = i
        for j in range(nbit):
            level_multiplier_i[j] = float(level_number % 2)
            level_number = level_number // 2
        init_level_multiplier.append(level_multiplier_i)
    # initialize threshold multiplier
    init_thrs_multiplier = []
    for i in range(1, num_levels):
        thrs_multiplier_i = [0. for j in range(num_levels)]
        thrs_multiplier_i[i - 1] = 0.5
        thrs_multiplier_i[i] = 0.5
        init_thrs_multiplier.append(thrs_multiplier_i)

    with tf.variable_scope('ActivationQuantization'):
        basis = tf.get_variable('basis',
                                bit_dims,
                                tf.float32,
                                initializer=init_basis,
                                trainable=False)

        ctx = get_current_tower_context()  # current tower context
        # calculate levels and sort
        level_codes = tf.constant(init_level_multiplier)
        levels = tf.matmul(level_codes, basis)
        levels, sort_id = tf.nn.top_k(tf.transpose(levels, [1, 0]), num_levels)
        levels = tf.reverse(levels, [-1])
        sort_id = tf.reverse(sort_id, [-1])
        levels = tf.transpose(levels, [1, 0])
        sort_id = tf.transpose(sort_id, [1, 0])
        # calculate threshold
        thrs_multiplier = tf.constant(init_thrs_multiplier)
        thrs = tf.matmul(thrs_multiplier, levels)
        # calculate output y and its binary code
        y = tf.zeros_like(x)  # output
        reshape_x = tf.reshape(x, [-1])
        zero_dims = tf.stack([tf.shape(reshape_x)[0], nbit])
        bits_y = tf.fill(zero_dims, 0.)
        zero_y = tf.zeros_like(x)
        zero_bits_y = tf.fill(zero_dims, 0.)
        for i in range(num_levels - 1):
            g = tf.greater(x, thrs[i])
            y = tf.where(g, zero_y + levels[i + 1], y)
            bits_y = tf.where(tf.reshape(g, [-1]),
                              zero_bits_y + level_codes[sort_id[i + 1][0]],
                              bits_y)
        # training
        if ctx.is_main_training_tower:
            BT = tf.matrix_transpose(bits_y)
            # calculate BTxB
            BTxB = []
            for i in range(nbit):
                for j in range(nbit):
                    BTxBij = tf.multiply(BT[i], BT[j])
                    BTxBij = tf.reduce_sum(BTxBij)
                    # all dimensions are reduced; a scalar tensor is returned
                    BTxB.append(BTxBij)
            BTxB = tf.reshape(tf.stack(values=BTxB), [nbit, nbit])
            # 1) naive
            # BTxB_inv = tf.matrix_inverse(BTxB)

            # 2) try/except -> doesn't work well due to poor tf.matrix_inverse
            # try:
            #     BTxB_inv = tf.matrix_inverse(BTxB, adjoint=None, name=None)
            # except:
            #     BTxB_ttt = tf.add(BTxB, tf.math.scalar_mul(tf.identity((BTxB.shape)), 1e-4))
            #     BTxB_inv = tf.matrix_inverse(BTxB_ttt, adjoint=None, name=None)

            # calculate BTxX
            BTxX = []
            for i in range(nbit):
                BTxXi0 = tf.multiply(BT[i], reshape_x)
                BTxXi0 = tf.reduce_sum(BTxXi0)
                BTxX.append(BTxXi0)
            BTxX = tf.reshape(tf.stack(values=BTxX), [nbit, 1])

            # new_basis = tf.matmul(BTxB_inv, BTxX)  # calculate new basis
            # 3) gaussian elimination
            new_basis = tf.linalg.lstsq(BTxB,
                                        BTxX,
                                        fast=False,
                                        l2_regularizer=1e-5)

            # create moving averages op
            update_moving_basis = moving_averages.assign_moving_average(
                basis, new_basis, MOVING_AVERAGES_FACTOR)
            add_model_variable(basis)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_moving_basis)

            for i in range(nbit):
                tf.summary.scalar('basis%d' % i, new_basis[i][0])

        x_clip = tf.minimum(x, levels[num_levels - 1])  # gradient clip
        y = x_clip + tf.stop_gradient(-x_clip) + tf.stop_gradient(
            y)  # gradient: y=clip(x)
        y.variables = VariableHolder(basis=basis)
        return y
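# The level/threshold bookkeeping above is easier to see in isolation: every
# quantization level is a binary combination of the basis entries, and the
# thresholds are midpoints between consecutive sorted levels. A NumPy sketch for
# nbit = 2; the 0.6745 value stands in for NORM_PPF_0_75 (the 75% quantile of a
# standard normal), which is defined elsewhere in the source file.
import numpy as np

nbit = 2
num_levels = 2 ** nbit
norm_ppf_0_75 = 0.6745                                     # assumed constant

basis = np.array([[(norm_ppf_0_75 * 2 / (2 ** nbit - 1)) * (2.0 ** i)]
                  for i in range(nbit)])                   # [nbit, 1], as in init_basis

codes = np.array([[float((i >> j) & 1) for j in range(nbit)]
                  for i in range(num_levels)])             # binary code of each level (LSB first)

levels = np.sort((codes @ basis).ravel())                  # ascending quantization levels
thresholds = 0.5 * (levels[:-1] + levels[1:])              # midpoints, as in init_thrs_multiplier

print(levels)        # approx. [0.     0.4497 0.8993 1.349 ]
print(thresholds)    # approx. [0.2248 0.6745 1.1242]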
Example #26
0
def BatchNorm(inputs, training=None, momentum=0.9, epsilon=1e-5,
              center=True, scale=True,
              gamma_initializer=tf.ones_initializer(),
              data_format='channels_last',
              internal_update=False):
    """
    Mostly equivalent to `tf.layers.batch_normalization`, but differs in
    the following:
    1. Accepts `data_format` rather than `axis`. For 2D input, this argument will be ignored.
    2. Default value for `momentum` and `epsilon` is different.
    3. Default value for `training` is automatically obtained from `TowerContext`.
    4. Support the `internal_update` option.
    Args:
        internal_update (bool): if False, add EMA update ops to
            `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer
            by control dependencies.
    Variable Names:
    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.
    Note:
        1. About multi-GPU training: moving averages across GPUs are not aggregated.
           Batch statistics are computed independently.  This is consistent with most frameworks.
        2. Combinations of ``training`` and ``ctx.is_training``:
            * ``training == ctx.is_training``: standard BN, EMA are
                maintained during training and used during inference. This is
                the default.
            * ``training and not ctx.is_training``: still use batch statistics in inference.
            * ``not training and ctx.is_training``: use EMA to normalize in
                training. This is useful when you load a pre-trained BN and
                don't want to fine tune the EMA. EMA will not be updated in
                this case.
    """
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4]
    if ndims == 2:
        data_format = 'NHWC'
    if data_format == 'NCHW':
        n_out = shape[1]
    else:
        n_out = shape[-1]  # channel
    assert n_out is not None, "Input to BatchNorm cannot have unknown channels!"
    beta, gamma, moving_mean, moving_var = get_bn_variables(n_out, scale, center, gamma_initializer)

    ctx = get_current_tower_context()
    use_local_stat = training
    if use_local_stat is None:
        use_local_stat = ctx.is_training
    use_local_stat = bool(use_local_stat)

    if use_local_stat:
        if ndims == 2:
            inputs = tf.reshape(inputs, [-1, 1, 1, n_out])    # fused_bn only takes 4D input
            # fused_bn has error using NCHW? (see #190)

        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
            inputs, gamma, beta, epsilon=epsilon,
            is_training=True, data_format=data_format)

        if ndims == 2:
            xn = tf.squeeze(xn, [1, 2])
    else:
        if ctx.is_training:
            assert get_tf_version_number() >= 1.4, \
                "Fine tuning a BatchNorm model with fixed statistics is only " \
                "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
            if ctx.is_main_training_tower:  # only warn in first tower
                logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")
            # Using moving_mean/moving_variance in training, which means we
            # loaded a pre-trained BN and only fine-tuning the affine part.
            xn, _, _ = tf.nn.fused_batch_norm(
                inputs, gamma, beta,
                mean=moving_mean, variance=moving_var, epsilon=epsilon,
                data_format=data_format, is_training=False)
        else:
            if ndims == 4:
                xn, _, _ = tf.nn.fused_batch_norm(
                    inputs, gamma, beta,
                    mean=moving_mean, variance=moving_var, epsilon=epsilon,
                    data_format=data_format, is_training=False)
            else:
                # avoid the reshape if possible (when channel is the last dimension)
                xn = tf.nn.batch_normalization(
                    inputs, moving_mean, moving_var, beta, gamma, epsilon)

    # maintaining the EMA on only one GPU is OK, even in replicated mode,
    # because the EMA is not used at training time
    if ctx.is_main_training_tower:
        add_model_variable(moving_mean)
        add_model_variable(moving_var)
    if ctx.is_main_training_tower and use_local_stat:
        ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, momentum, internal_update)
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(mean=moving_mean, variance=moving_var)
    if scale:
        vh.gamma = gamma
    if center:
        vh.beta = beta
    return ret
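# With internal_update=False, the EMA update ops land in tf.GraphKeys.UPDATE_OPS
# and it is the training script's job to run them. A small reminder of the usual
# TF 1.x pattern; the toy loss below exists only to make the sketch self-contained.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 4])
w = tf.get_variable('demo_w', [4, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)    # EMA updates registered by BN layers
optimizer = tf.train.GradientDescentOptimizer(0.1)
with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(loss)                    # updates run with every training step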
Example #27
0
def BatchNorm(inputs,
              axis=None,
              training=None,
              momentum=0.9,
              epsilon=1e-5,
              center=True,
              scale=True,
              beta_initializer=tf.zeros_initializer(),
              gamma_initializer=tf.ones_initializer(),
              virtual_batch_size=None,
              data_format='channels_last',
              internal_update=False):
    """
    Mostly equivalent to `tf.layers.batch_normalization`, but different in
    the following:

    1. Accepts `data_format` when `axis` is None. For 2D input, this argument will be ignored.
    2. Default value for `momentum` and `epsilon` is different.
    3. Default value for `training` is automatically obtained from `TowerContext`.
    4. Support the `internal_update` option.

    Args:
        internal_update (bool): if False, add EMA update ops to
            `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer
            by control dependencies.

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        1. About multi-GPU training: moving averages across GPUs are not aggregated.
           Batch statistics are computed independently.  This is consistent with most frameworks.
        2. Combinations of ``training`` and ``ctx.is_training``:
            * ``training == ctx.is_training``: standard BN, EMA are
                maintained during training and used during inference. This is
                the default.
            * ``training and not ctx.is_training``: still use batch statistics in inference.
            * ``not training and ctx.is_training``: use EMA to normalize in
                training. This is useful when you load a pre-trained BN and
                don't want to fine tune the EMA. EMA will not be updated in
                this case.
    """
    # parse shapes
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4], ndims

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        else:
            axis = 1 if data_format == 'NCHW' else 3

    # parse training/ctx
    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_number()
    if not training and ctx.is_training:
        assert TF_version >= 1.4, \
            "Fine tuning a BatchNorm model with fixed statistics is only " \
            "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
        if ctx.is_main_training_tower:  # only warn in first tower
            logger.warn(
                "[BatchNorm] Using moving_mean/moving_variance in training.")
        # Using moving_mean/moving_variance in training, which means we
        # loaded a pre-trained BN and only fine-tuning the affine part.

    coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
    with rename_get_variable({
            'moving_mean': 'mean/EMA',
            'moving_variance': 'variance/EMA'
    }):
        if TF_version >= 1.5:
            layer = tf.layers.BatchNormalization(
                axis=axis,
                momentum=momentum,
                epsilon=epsilon,
                center=center,
                scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                virtual_batch_size=virtual_batch_size,
                fused=True)
        else:
            assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            layer = tf.layers.BatchNormalization(
                axis=axis,
                momentum=momentum,
                epsilon=epsilon,
                center=center,
                scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                fused=True)
        xn = layer.apply(inputs,
                         training=training,
                         scope=tf.get_variable_scope())

    # maintaining the EMA on only one GPU is OK, even in replicated mode,
    # because the EMA is not used at training time
    if ctx.is_main_training_tower:
        for v in layer.non_trainable_variables:
            add_model_variable(v)
    if not ctx.is_main_training_tower or internal_update:
        restore_collection(coll_bk)

    if training and internal_update:
        assert layer.updates
        with tf.control_dependencies(layer.updates):
            ret = tf.identity(xn, name='output')
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(
        moving_mean=layer.moving_mean,
        mean=layer.moving_mean,  # for backward-compatibility
        moving_variance=layer.moving_variance,
        variance=layer.moving_variance)  # for backward-compatibility
    if scale:
        vh.gamma = layer.gamma
    if center:
        vh.beta = layer.beta
    return ret
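# backup_collection / restore_collection are used above so that towers other than
# the main one (or layers using internal_update) do not leave their EMA update ops
# in UPDATE_OPS. A rough stand-in for that idea using plain TF collections; this is
# only a sketch, not tensorpack's actual implementation.
import tensorflow as tf

def backup_update_ops():
    # snapshot the current contents of the UPDATE_OPS collection
    return list(tf.get_collection(tf.GraphKeys.UPDATE_OPS))

def restore_update_ops(snapshot):
    # drop anything appended to UPDATE_OPS since the snapshot was taken
    coll = tf.get_collection_ref(tf.GraphKeys.UPDATE_OPS)
    del coll[:]
    coll.extend(snapshot)

snapshot = backup_update_ops()
# ... build a non-main tower here; its BN layers may append EMA update ops ...
restore_update_ops(snapshot)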
Example #28
0
def BatchNorm_SplitGPU(x, use_local_stat=None, decay=0.9, epsilon=1e-5,
              use_scale=True, use_bias=True,
              gamma_init=tf.constant_initializer(1.0), data_format='NHWC',
              internal_update=False, split_num = 1):
    """
    Batch normalization in which the batch is split into `split_num` groups,
    each normalized with its own statistics; the groups are stacked along the
    channel axis so a single fused_batch_norm call handles all of them.
    """
    print(split_num)
    if data_format == 'channels_last':
       data_format = 'NHWC'
    assert data_format == 'NHWC'
    shape = x.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4]
    if ndims == 2:
        data_format = 'NHWC'
    if data_format == 'NCHW':
        n_out = shape[1]
    else:
        n_out = shape[-1]  # channel
    assert n_out is not None, "Input to BatchNorm cannot have unknown channels!"
    beta, gamma, moving_mean, moving_var = get_bn_variables(n_out, use_scale, use_bias, gamma_init)

    ctx = get_current_tower_context()
    if use_local_stat is None:
        use_local_stat = ctx.is_training
    use_local_stat = bool(use_local_stat)
    
    if use_local_stat:
        if ndims == 2:
            x = tf.reshape(x, [-1, 1, 1, n_out])    # fused_bn only takes 4D input
            # fused_bn has error using NCHW? (see #190)    
        
        inputs = tf.concat(tf.split(x, split_num, 0), -1) # N/S_n x H x W x C*S_n
        beta_, gamma_ = None, None
        beta_ = tf.reshape([beta]*split_num, [-1])
        gamma_ = tf.reshape([gamma]*split_num, [-1])
        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(inputs, gamma_, beta_,epsilon=epsilon,is_training=True, data_format=data_format)
        xn = tf.concat(tf.split(xn, split_num, 3), 0)      
        
        """
        """
        # inputs = tf.concat(tf.split(x, split_num, 0), -1) # N/split_num x H x W x C*split_num
        # axis = [0, 1, 2]
        # batch_mean, batch_var = tf.nn.moments(inputs, axis) # C*split_num
        # beta_, gamma_ = None, None
        # beta_ = tf.reshape([beta]*split_num, [-1])
        # gamma_ = tf.reshape([gamma]*split_num, [-1])
        # xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta_, gamma_, epsilon)
        # xn = tf.concat(tf.split(xn, split_num, 3), 0)

        if ndims == 2:
            xn = tf.squeeze(xn, [1, 2])
    else:
        if ctx.is_training:
            assert get_tf_version_number() >= 1.4, \
                "Fine tuning a BatchNorm model with fixed statistics is only " \
                "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
            if ctx.is_main_training_tower:  # only warn in first tower
                logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")
            # Using moving_mean/moving_variance in training, which means we
            # loaded a pre-trained BN and only fine-tuning the affine part.
            xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
                x, gamma, beta,
                mean=moving_mean, variance=moving_var, epsilon=epsilon,
                data_format=data_format, is_training=False)
        else:
           
            if ndims == 4 and data_format == 'NCHW':
                [g, b, mm, mv] = [reshape_for_bn(_, ndims, n_out, data_format)
                                  for _ in [gamma, beta, moving_mean, moving_var]]
                xn = tf.nn.batch_normalization(x, mm, mv, b, g, epsilon)
                batch_mean = tf.concat([moving_mean] * split_num, 0)
                batch_var = tf.concat([moving_var] * split_num, 0)
            else:
                # avoid the reshape if possible (when channel is the last dimension)
                xn = tf.nn.batch_normalization(
                    x, moving_mean, moving_var, beta, gamma, epsilon)
                batch_mean = tf.concat([moving_mean] * split_num, 0)
                batch_var = tf.concat([moving_var] * split_num, 0)

    # maintaining the EMA on only one GPU is OK, even in replicated mode,
    # because the EMA is not used at training time
    if ctx.is_main_training_tower:
        add_model_variable(moving_mean)
        add_model_variable(moving_var)
    if ctx.is_main_training_tower and use_local_stat:
        ret = update_bn_ema(xn, batch_mean[:n_out], batch_var[:n_out], moving_mean, moving_var, decay, internal_update)
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(mean=moving_mean, variance=moving_var)
    if use_scale:
        vh.gamma = gamma
    if use_bias:
        vh.beta = beta
    assert batch_mean is not None, 'batch_mean outputs is None'
    return ret
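# The trick in BatchNorm_SplitGPU is purely a reshaping one: the batch is split
# into split_num groups along axis 0 and the groups are stacked along the channel
# axis, so a single fused_batch_norm call computes separate statistics per group.
# A NumPy sketch of the round trip with made-up shapes:
import numpy as np

split_num = 2
x_demo = np.random.randn(8, 4, 4, 3).astype(np.float32)            # N=8, H=W=4, C=3

groups = np.concatenate(np.split(x_demo, split_num, axis=0), axis=-1)
print(groups.shape)                                                 # (4, 4, 4, 6)

mean = groups.mean(axis=(0, 1, 2))                                  # per group-and-channel
print(np.allclose(mean[:3], x_demo[:4].mean(axis=(0, 1, 2))))       # True: first group
print(np.allclose(mean[3:], x_demo[4:].mean(axis=(0, 1, 2))))       # True: second group

restored = np.concatenate(np.split(groups, split_num, axis=-1), axis=0)
print(np.array_equal(restored, x_demo))                             # True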
Example #29
0
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
              center=True, scale=True,
              beta_initializer=tf.zeros_initializer(),
              gamma_initializer=tf.ones_initializer(),
              virtual_batch_size=None,
              data_format='channels_last',
              internal_update=False):
    """
    Mostly equivalent to `tf.layers.batch_normalization`, but different in
    the following:

    1. Accepts `data_format` when `axis` is None. For 2D input, this argument will be ignored.
    2. Default value for `momentum` and `epsilon` is different.
    3. Default value for `training` is automatically obtained from `TowerContext`.
    4. Support the `internal_update` option.

    Args:
        internal_update (bool): if False, add EMA update ops to
            `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer
            by control dependencies.

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        1. About multi-GPU training: moving averages across GPUs are not aggregated.
           Batch statistics are computed independently.  This is consistent with most frameworks.
        2. Combinations of ``training`` and ``ctx.is_training``:
            * ``training == ctx.is_training``: standard BN, EMA are
                maintained during training and used during inference. This is
                the default.
            * ``training and not ctx.is_training``: still use batch statistics in inference.
            * ``not training and ctx.is_training``: use EMA to normalize in
                training. This is useful when you load a pre-trained BN and
                don't want to fine tune the EMA. EMA will not be updated in
                this case.
    """
    # parse shapes
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4], ndims

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        else:
            axis = 1 if data_format == 'NCHW' else 3

    # parse training/ctx
    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_number()
    if not training and ctx.is_training:
        assert TF_version >= 1.4, \
            "Fine tuning a BatchNorm model with fixed statistics is only " \
            "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
        if ctx.is_main_training_tower:  # only warn in first tower
            logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")
        # Using moving_mean/moving_variance in training, which means we
        # loaded a pre-trained BN and only fine-tuning the affine part.

    coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
    with rename_get_variable(
            {'moving_mean': 'mean/EMA',
             'moving_variance': 'variance/EMA'}):
        if TF_version >= 1.5:
            layer = tf.layers.BatchNormalization(
                axis=axis,
                momentum=momentum, epsilon=epsilon,
                center=center, scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                virtual_batch_size=virtual_batch_size,
                fused=True
            )
        else:
            assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            layer = tf.layers.BatchNormalization(
                axis=axis,
                momentum=momentum, epsilon=epsilon,
                center=center, scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                fused=True
            )
        xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope())

    # maintaining the EMA on only one GPU is OK, even in replicated mode,
    # because the EMA is not used at training time
    if ctx.is_main_training_tower:
        for v in layer.non_trainable_variables:
            add_model_variable(v)
    if not ctx.is_main_training_tower or internal_update:
        restore_collection(coll_bk)

    if training and internal_update:
        assert layer.updates
        with tf.control_dependencies(layer.updates):
            ret = tf.identity(xn, name='output')
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(
        moving_mean=layer.moving_mean,
        mean=layer.moving_mean,  # for backward-compatibility
        moving_variance=layer.moving_variance,
        variance=layer.moving_variance)  # for backward-compatibility
    if scale:
        vh.gamma = layer.gamma
    if center:
        vh.beta = layer.beta
    return ret
Example #30
0
def sync_batch_norm(inputs,
                    is_training=True,
                    scope=None,
                    red_axises=[0, 1, 2],
                    bn_decay=0.999,
                    epsilon=0.001,
                    activation_fn=None,
                    updates_collections=tf.GraphKeys.UPDATE_OPS,
                    reuse=None,
                    variables_collections=None,
                    is_trainable=True,
                    num_dev=3):
    '''
    num_dev is the number of GPUs used.
    '''
    # red_axises = [0, 1, 2]
    num_outputs = inputs.get_shape().as_list()[-1]

    if scope is None:
        scope = 'BatchNorm'

    with tf.variable_scope(scope, 'BatchNorm', reuse=reuse):

        gamma = tf.get_variable(name='gamma',
                                shape=[num_outputs],
                                dtype=tf.float32,
                                initializer=tf.constant_initializer(1.0),
                                trainable=is_trainable,
                                collections=variables_collections)

        beta = tf.get_variable(name='beta',
                               shape=[num_outputs],
                               dtype=tf.float32,
                               initializer=tf.constant_initializer(0.0),
                               trainable=is_trainable,
                               collections=variables_collections)

        moving_mean = tf.get_variable(name='moving_mean',
                                      shape=[num_outputs],
                                      dtype=tf.float32,
                                      initializer=tf.constant_initializer(0.0),
                                      trainable=False,
                                      collections=variables_collections)

        moving_var = tf.get_variable(name='moving_variance',
                                     shape=[num_outputs],
                                     dtype=tf.float32,
                                     initializer=tf.constant_initializer(1.0),
                                     trainable=False,
                                     collections=variables_collections)

        if is_training and is_trainable:
            if num_dev == 1:
                mean, var = tf.nn.moments(inputs, red_axises)
            else:
                shared_name = tf.get_variable_scope().name
                batch_mean = tf.reduce_mean(inputs, axis=red_axises)
                batch_mean_square = tf.reduce_mean(tf.square(inputs),
                                                   axis=red_axises)
                batch_mean = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
                batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean_square,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean_square') * (1.0 /
                                                                      num_dev)
                mean = batch_mean
                var = batch_mean_square - tf.square(batch_mean)
            outputs = tf.nn.batch_normalization(inputs, mean, var, beta, gamma,
                                                epsilon)

            if int(outputs.device[-1]) == 0:
                update_moving_mean_op = tf.assign(
                    moving_mean,
                    moving_mean * bn_decay + mean * (1 - bn_decay))
                update_moving_var_op = tf.assign(
                    moving_var, moving_var * bn_decay + var * (1 - bn_decay))
                add_model_variable(moving_mean)
                add_model_variable(moving_var)

                if updates_collections is None:
                    with tf.control_dependencies(
                        [update_moving_mean_op, update_moving_var_op]):
                        outputs = tf.identity(outputs)
                else:
                    tf.add_to_collections(updates_collections,
                                          update_moving_mean_op)
                    tf.add_to_collections(updates_collections,
                                          update_moving_var_op)
                    outputs = tf.identity(outputs)
            else:
                outputs = tf.identity(outputs)

        else:
            outputs, _, _ = tf.nn.fused_batch_norm(inputs,
                                                   gamma,
                                                   beta,
                                                   mean=moving_mean,
                                                   variance=moving_var,
                                                   epsilon=epsilon,
                                                   is_training=False)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
    return outputs
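# The assign-based EMA update above (moving = moving * decay + batch * (1 - decay))
# only takes effect if the assign op actually runs, either via updates_collections
# or a control dependency on the output. A tiny runnable sketch of the
# control-dependency form, with made-up values:
import tensorflow as tf

decay = 0.9
demo_moving = tf.get_variable('demo_moving_mean', initializer=tf.constant([0.0, 0.0]),
                              trainable=False)
demo_batch = tf.constant([1.0, 2.0])

update_op = tf.assign(demo_moving, demo_moving * decay + demo_batch * (1 - decay))
with tf.control_dependencies([update_op]):
    output = tf.identity(demo_batch)        # fetching `output` forces the update to run

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(output)
    print(sess.run(demo_moving))            # approx. [0.1 0.2]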
Example #31
0
def BatchNormV1(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
    shape = x.get_shape().as_list()
    assert len(shape) in [2, 4]

    n_out = shape[-1]  # channel
    assert n_out is not None
    beta = tf.get_variable('beta', [n_out],
                           initializer=tf.constant_initializer())
    gamma = tf.get_variable('gamma', [n_out],
                            initializer=tf.constant_initializer(1.0))

    if len(shape) == 2:
        batch_mean, batch_var = tf.nn.moments(x, [0], keep_dims=False)
    else:
        batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], keep_dims=False)
    # just to make a clear name.
    batch_mean = tf.identity(batch_mean, 'mean')
    batch_var = tf.identity(batch_var, 'variance')

    emaname = 'EMA'
    ctx = get_current_tower_context()
    if use_local_stat is None:
        use_local_stat = ctx.is_training
    if use_local_stat != ctx.is_training:
        logger.warn("[BatchNorm] use_local_stat != is_training")

    if use_local_stat:
        # training tower
        if ctx.is_training:
            # reuse = tf.get_variable_scope().reuse
            with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                # BatchNorm in reuse scope can be tricky! Moving mean/variance are not reused
                with tf.name_scope(
                        None
                ):  # https://github.com/tensorflow/tensorflow/issues/2740
                    # TODO if reuse=True, try to find and use the existing statistics
                    # how to use multiple tensors to update one EMA? seems impossible
                    ema = tf.train.ExponentialMovingAverage(decay=decay,
                                                            name=emaname)
                    ema_apply_op = ema.apply([batch_mean, batch_var])
                    ema_mean, ema_var = ema.average(batch_mean), ema.average(
                        batch_var)
                    if ctx.is_main_training_tower:
                        # inside main training tower
                        add_model_variable(ema_mean)
                        add_model_variable(ema_var)
    else:
        # no apply() is called here, no magic vars will get created,
        # no reuse issue will happen
        assert not ctx.is_training
        with tf.name_scope(None):
            ema = tf.train.ExponentialMovingAverage(decay=decay, name=emaname)
            mean_var_name = ema.average_name(batch_mean)
            var_var_name = ema.average_name(batch_var)
            if ctx.is_main_tower:
                # main tower, but needs to use global stat. global stat must be from outside
                # TODO when reuse=True, the desired variable name could
                # actually be different, because a different var is created
                # for different reuse tower
                ema_mean = tf.get_variable('mean/' + emaname, [n_out])
                ema_var = tf.get_variable('variance/' + emaname, [n_out])
            else:
                # use statistics in another tower
                G = tf.get_default_graph()
                ema_mean = ctx.find_tensor_in_main_tower(
                    G, mean_var_name + ':0')
                ema_var = ctx.find_tensor_in_main_tower(G, var_var_name + ':0')

    if use_local_stat:
        batch = tf.cast(tf.shape(x)[0], tf.float32)
        mul = tf.where(tf.equal(batch, 1.0), 1.0, batch / (batch - 1))
        batch_var = batch_var * mul  # use unbiased variance estimator in training

        with tf.control_dependencies(
            [ema_apply_op] if ctx.is_training else []):
            # only apply EMA op if is_training
            return tf.nn.batch_normalization(x, batch_mean, batch_var, beta,
                                             gamma, epsilon, 'output')
    else:
        return tf.nn.batch_normalization(x, ema_mean, ema_var, beta, gamma,
                                         epsilon, 'output')
Example #32
0
def BatchRenorm(x, rmax, dmax, decay=0.9, epsilon=1e-5,
                use_scale=True, use_bias=True, data_format='NHWC'):
    """
    Batch Renormalization layer, as described in the paper:
    `Batch Renormalization: Towards Reducing Minibatch Dependence in Batch-Normalized Models
    <https://arxiv.org/abs/1702.03275>`_.
    This implementation is a wrapper around `tf.layers.batch_normalization`.

    Args:
        x (tf.Tensor): a NHWC or NC tensor.
        rmax, dmax (tf.Tensor): a scalar tensor, the maximum allowed corrections.
        decay (float): decay rate of moving average.
        epsilon (float): epsilon to avoid divide-by-zero.
        use_scale, use_bias (bool): whether to use the extra affine transformation or not.

    Returns:
        tf.Tensor: a tensor named ``output`` with the same shape of x.

    Variable Names:

    * ``beta``: the bias term.
    * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``.
    * ``moving_mean, renorm_mean, renorm_mean_weight``: See TF documentation.
    * ``moving_variance, renorm_stddev, renorm_stddev_weight``: See TF documentation.
    """

    shape = x.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4]
    if ndims == 2:
        data_format = 'NHWC'    # error using NCHW? (see #190)
        x = tf.reshape(x, [-1, 1, 1, shape[1]])
    if data_format == 'NCHW':
        n_out = shape[1]
    else:
        n_out = shape[-1]  # channel
    assert n_out is not None, "Input to BatchRenorm cannot have unknown channels!"

    ctx = get_current_tower_context()
    coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
    layer = tf.layers.BatchNormalization(
        axis=1 if data_format == 'NCHW' else 3,
        momentum=decay, epsilon=epsilon,
        center=use_bias, scale=use_scale,
        renorm=True,
        renorm_clipping={
            'rmin': 1.0 / rmax,
            'rmax': rmax,
            'dmax': dmax},
        renorm_momentum=0.99,
        fused=False)
    xn = layer.apply(x, training=ctx.is_training, scope=tf.get_variable_scope())

    if ctx.has_own_variables:
        # Only apply update in this case.
        # Add these EMA to model_variables so that they will be synced
        # properly by replicated trainers.
        for v in layer.non_trainable_variables:
            add_model_variable(v)
    else:
        # Don't need update if we are sharing variables from an existing tower
        restore_collection(coll_bk)

    if ndims == 2:
        xn = tf.squeeze(xn, [1, 2])
    ret = tf.identity(xn, name='output')

    # TODO not sure whether to add moving_mean/moving_var to VH now
    vh = ret.variables = VariableHolder()
    if use_scale:
        vh.gamma = layer.gamma
    if use_bias:
        vh.beta = layer.beta
    return ret
Example #33
0
def BatchNorm(inputs,
              axis=None,
              training=None,
              momentum=0.9,
              epsilon=1e-5,
              center=True,
              scale=True,
              beta_initializer=tf.zeros_initializer(),
              gamma_initializer=tf.ones_initializer(),
              virtual_batch_size=None,
              data_format='channels_last',
              internal_update=False,
              sync_statistics=None):
    """
    Almost equivalent to `tf.layers.batch_normalization`, but different (and more powerful)
    in the following:

    1. Accepts an alternative `data_format` option when `axis` is None. For 2D input, this argument will be ignored.
    2. Default value for `momentum` and `epsilon` is different.
    3. Default value for `training` is automatically obtained from tensorpack's `TowerContext`, but can be overwritten.
    4. Support the `internal_update` option, which enables the use of BatchNorm layer inside conditionals.
    5. Support the `sync_statistics` option, which is very useful in small-batch models.

    Args:
        internal_update (bool): if False, add EMA update ops to
          `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies.
          They are very similar in speed, but `internal_update=True` can be used
          when you have conditionals in your model, or when you have multiple networks to train.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
        sync_statistics (str or None): one of None, "nccl", or "horovod".

          By default (None), it uses statistics of the input tensor to normalize.
          This is the standard way BatchNorm was done in most frameworks.

          When set to "nccl", this layer must be used under tensorpack's multi-GPU trainers.
          It uses the aggregated statistics of the whole batch (across all GPUs) to normalize.

          When set to "horovod", this layer must be used under tensorpack's :class:`HorovodTrainer`.
          It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize.
          Note that on single machine this is significantly slower than the "nccl" implementation.

          This implementation averages the per-GPU E[x] and E[x^2] among GPUs to compute
          global mean & variance. Therefore each GPU needs to have the same batch size.

          This option has no effect when not training.

          This option is also known as "Cross-GPU BatchNorm" as mentioned in:
          `MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222.

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        Combinations of ``training`` and ``ctx.is_training``:

        * ``training == ctx.is_training``: standard BN, EMA are maintained during training
          and used during inference. This is the default.
        * ``training and not ctx.is_training``: still use batch statistics in inference.
        * ``not training and ctx.is_training``: use EMA to normalize in
          training. This is useful when you load a pre-trained BN and
          don't want to fine tune the EMA. EMA will not be updated in
          this case.
    """
    # parse shapes
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4], ndims
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        else:
            axis = 1 if data_format == 'NCHW' else 3
    else:
        data_format = 'NCHW' if axis == 1 else 'NHWC'
    num_chan = shape[axis]

    # parse training/ctx
    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_tuple()
    if not training and ctx.is_training:
        assert TF_version >= (1, 4), \
            "Fine tuning a BatchNorm model with fixed statistics is only " \
            "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
        if ctx.is_main_training_tower:  # only warn in first tower
            logger.warn(
                "[BatchNorm] Using moving_mean/moving_variance in training.")
        # Using moving_mean/moving_variance in training, which means we
        # loaded a pre-trained BN and only fine-tuning the affine part.

    if sync_statistics is None or not (training and ctx.is_training):
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable({
                'moving_mean': 'mean/EMA',
                'moving_variance': 'variance/EMA'
        }):
            tf_args = dict(axis=axis,
                           momentum=momentum,
                           epsilon=epsilon,
                           center=center,
                           scale=scale,
                           beta_initializer=beta_initializer,
                           gamma_initializer=gamma_initializer,
                           fused=(ndims == 4 and axis in [1, 3]),
                           _reuse=tf.get_variable_scope().reuse)
            if TF_version >= (1, 5):
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            layer = tf.layers.BatchNormalization(**tf_args)
            xn = layer.apply(inputs,
                             training=training,
                             scope=tf.get_variable_scope())

        # maintaining the EMA on only one GPU is OK, even in replicated mode,
        # because during training the EMA isn't used
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                add_model_variable(v)
        if not ctx.is_main_training_tower or internal_update:
            restore_collection(coll_bk)

        if training and internal_update:
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # for backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # for backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta
    else:
        red_axis = [0] if ndims == 2 else (
            [0, 2, 3] if axis == 1 else [0, 1, 2])

        new_shape = None  # don't need to reshape unless ...
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            if six.PY3 and TF_version <= (1, 9) and ctx.is_main_training_tower:
                logger.warn(
                    "A TensorFlow bug will cause cross-GPU BatchNorm to fail. "
                    "Apply this patch: https://github.com/tensorflow/tensorflow/pull/20360"
                )

            from tensorflow.contrib.nccl.ops import gen_nccl_ops
            shared_name = re.sub('tower[0-9]+/', '',
                                 tf.get_variable_scope().name)
            num_dev = ctx.total
            if num_dev == 1:
                logger.warn(
                    "BatchNorm(sync_statistics='nccl') is used with only one tower!"
                )
            else:
                batch_mean = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
                batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                    input=batch_mean_square,
                    reduction='sum',
                    num_devices=num_dev,
                    shared_name=shared_name + '_NCCL_mean_square') * (1.0 /
                                                                      num_dev)
        elif sync_statistics == 'horovod':
            # Require https://github.com/uber/horovod/pull/331
            import horovod.tensorflow as hvd
            batch_mean = hvd.allreduce(batch_mean, average=True)
            batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            # Using fused_batch_norm(is_training=False) is actually slightly faster,
            # but hopefully this call will be JITed in the future.
            xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var,
                                           tf.reshape(beta, new_shape),
                                           tf.reshape(gamma, new_shape),
                                           epsilon)
        else:
            xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta,
                                           gamma, epsilon)

        if ctx.is_main_training_tower:
            ret = update_bn_ema(xn, batch_mean_vec, batch_var_vec, moving_mean,
                                moving_var, momentum, internal_update)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,  # for backward-compatibility
            moving_variance=moving_var,
            variance=moving_var)  # for backward-compatibility
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret
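
A small NumPy sketch (not part of the snippet above; the array shapes are made up) of why the sync path all-reduces E[x] and E[x^2] instead of per-GPU variances: with equal per-GPU batch sizes, E[x^2] - E[x]^2 computed from the averaged moments equals the biased variance of the combined batch, which is exactly what `batch_var` needs to be.

import numpy as np

# four "GPUs", each holding an equally sized shard of the batch
per_gpu = [np.random.randn(32, 64) for _ in range(4)]

mean = np.mean([x.mean(axis=0) for x in per_gpu], axis=0)            # all-reduced E[x]
mean_sq = np.mean([(x ** 2).mean(axis=0) for x in per_gpu], axis=0)  # all-reduced E[x^2]
var = mean_sq - mean ** 2                                            # same formula as batch_var above

full_batch = np.concatenate(per_gpu, axis=0)
assert np.allclose(var, full_batch.var(axis=0))  # matches the variance over the whole batch
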
Example #34
def BatchNorm(x, use_local_stat=None, decay=0.9, epsilon=1e-5):
    """
    Batch normalization layer as described in:

    `Batch Normalization: Accelerating Deep Network Training by
    Reducing Internal Covariate Shift <http://arxiv.org/abs/1502.03167>`_.

    :param x: an NHWC or NC tensor
    :param use_local_stat: bool. Whether to use the mean/var of the current batch or the moving average.
        Defaults to True in training and False in inference.
    :param decay: decay rate of the moving average. Defaults to 0.9.
    :param epsilon: epsilon to avoid divide-by-zero. Defaults to 1e-5.
    """

    shape = x.get_shape().as_list()
    assert len(shape) in [2, 4]

    n_out = shape[-1]  # channel
    assert n_out is not None
    beta = tf.get_variable('beta', [n_out], initializer=tf.zeros_initializer)
    gamma = tf.get_variable('gamma', [n_out], initializer=tf.ones_initializer)

    if len(shape) == 2:
        batch_mean, batch_var = tf.nn.moments(x, [0], keep_dims=False)
    else:
        batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], keep_dims=False)
    # just to make a clear name.
    batch_mean = tf.identity(batch_mean, 'mean')
    batch_var = tf.identity(batch_var, 'variance')

    emaname = 'EMA'
    ctx = get_current_tower_context()
    if use_local_stat is None:
        use_local_stat = ctx.is_training
    if use_local_stat != ctx.is_training:
        logger.warn("[BatchNorm] use_local_stat != is_training")

    if use_local_stat:
        # training tower
        with tf.name_scope(
                None):  # https://github.com/tensorflow/tensorflow/issues/2740
            ema = tf.train.ExponentialMovingAverage(decay=decay, name=emaname)
            ema_apply_op = ema.apply([batch_mean, batch_var])
            ema_mean, ema_var = ema.average(batch_mean), ema.average(batch_var)
            if ctx.is_main_training_tower:
                # inside main training tower
                add_model_variable(ema_mean)
                add_model_variable(ema_var)
    else:
        if ctx.is_main_tower:
            # not training, but main tower. need to create the vars
            with tf.name_scope(None):
                ema = tf.train.ExponentialMovingAverage(decay=decay,
                                                        name=emaname)
                ema_apply_op = ema.apply([batch_mean, batch_var])
                ema_mean, ema_var = ema.average(batch_mean), ema.average(
                    batch_var)
        else:
            # use statistics in another tower
            G = tf.get_default_graph()
            # figure out the var name
            with tf.name_scope(None):
                ema = tf.train.ExponentialMovingAverage(decay=decay,
                                                        name=emaname)
                mean_var_name = ema.average_name(batch_mean) + ':0'
                var_var_name = ema.average_name(batch_var) + ':0'
            ema_mean = ctx.find_tensor_in_main_tower(G, mean_var_name)
            ema_var = ctx.find_tensor_in_main_tower(G, var_var_name)
            #logger.info("In prediction, using {} instead of {} for {}".format(
            #mean_name, ema_mean.name, batch_mean.name))

    if use_local_stat:
        with tf.control_dependencies([ema_apply_op]):
            batch = tf.cast(tf.shape(x)[0], tf.float32)
            mul = tf.select(tf.equal(batch, 1.0), 1.0, batch / (batch - 1))
            batch_var = batch_var * mul  # use unbiased variance estimator in training
            return tf.nn.batch_normalization(x, batch_mean, batch_var, beta,
                                             gamma, epsilon, 'output')
    else:
        return tf.nn.batch_normalization(x, ema_mean, ema_var, beta, gamma,
                                         epsilon, 'output')
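
A quick NumPy check (not from the snippet above; the array is made up) of the `batch / (batch - 1)` factor applied before normalization: `tf.nn.moments` returns the biased variance, and this multiplier turns it into the unbiased estimator.

import numpy as np

x = np.random.randn(8, 16)
n = x.shape[0]
biased = x.var(axis=0)           # what tf.nn.moments computes (divides by n)
unbiased = biased * n / (n - 1)  # the `mul` correction used above
assert np.allclose(unbiased, x.var(axis=0, ddof=1))
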
Example #35
def BatchNorm(x, use_local_stat=None, decay=0.9, epsilon=1e-5,
              use_scale=True, use_bias=True,
              gamma_init=tf.constant_initializer(1.0),
              data_format='channels_last',
              internal_update=False):
    """
    Batch Normalization layer, as described in the paper:
    `Batch Normalization: Accelerating Deep Network Training by
    Reducing Internal Covariate Shift <http://arxiv.org/abs/1502.03167>`_.

    Args:
        x (tf.Tensor): a 4D or 2D tensor. When 4D, the layout should match data_format.
        use_local_stat (bool): whether to use mean/var of the current batch or the moving average.
            Defaults to True in training and False in inference.
        decay (float): decay rate of moving average.
        epsilon (float): epsilon to avoid divide-by-zero.
        use_scale, use_bias (bool): whether to use the extra affine transformation or not.
        gamma_init: initializer for gamma (the scale).
        internal_update (bool): if False, add EMA update ops to
            `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer
            which will be slightly slower.

    Returns:
        tf.Tensor: a tensor named ``output`` with the same shape as x.

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        1. About multi-GPU training: moving averages across GPUs are not aggregated.
           Batch statistics are computed independently.  This is consistent with most frameworks.
        2. Combinations of ``use_local_stat`` and ``ctx.is_training``:
            * ``use_local_stat == is_training``: standard BN; the EMAs are
                maintained during training and used during inference.
            * ``use_local_stat and not is_training``: still use local (batch)
                statistics in inference.
            * ``not use_local_stat and is_training``: use the EMA to normalize in
                training. This is useful when you load a pre-trained BN and
                don't want to fine-tune the EMA. The EMA will not be updated in
                this case.
    """
    data_format = get_data_format(data_format, tfmode=False)
    shape = x.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4]
    if ndims == 2:
        data_format = 'NHWC'
    if data_format == 'NCHW':
        n_out = shape[1]
    else:
        n_out = shape[-1]  # channel
    assert n_out is not None, "Input to BatchNorm cannot have unknown channels!"
    beta, gamma, moving_mean, moving_var = get_bn_variables(n_out, use_scale, use_bias, gamma_init)

    ctx = get_current_tower_context()
    if use_local_stat is None:
        use_local_stat = ctx.is_training
    use_local_stat = bool(use_local_stat)

    if use_local_stat:
        if ndims == 2:
            x = tf.reshape(x, [-1, 1, 1, n_out])    # fused_bn only takes 4D input
            # fused_bn has error using NCHW? (see #190)

        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
            x, gamma, beta, epsilon=epsilon,
            is_training=True, data_format=data_format)

        if ndims == 2:
            xn = tf.squeeze(xn, [1, 2])
    else:
        if ctx.is_training:
            assert get_tf_version_number() >= 1.4, \
                "Fine tuning a BatchNorm model with fixed statistics is only " \
                "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
            if ctx.is_main_training_tower:  # only warn in first tower
                logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")
            # Using moving_mean/moving_variance in training, which means we
            # loaded a pre-trained BN and only fine-tuning the affine part.
            xn, _, _ = tf.nn.fused_batch_norm(
                x, gamma, beta,
                mean=moving_mean, variance=moving_var, epsilon=epsilon,
                data_format=data_format, is_training=False)
        else:
            if ndims == 4:
                xn, _, _ = tf.nn.fused_batch_norm(
                    x, gamma, beta,
                    mean=moving_mean, variance=moving_var, epsilon=epsilon,
                    data_format=data_format, is_training=False)
            else:
                # avoid the reshape if possible (when channel is the last dimension)
                xn = tf.nn.batch_normalization(
                    x, moving_mean, moving_var, beta, gamma, epsilon)

    # Maintaining the EMA on only one GPU is OK, even in replicated mode,
    # because the EMA is not used at training time.
    if ctx.is_main_training_tower:
        add_model_variable(moving_mean)
        add_model_variable(moving_var)
    if ctx.is_main_training_tower and use_local_stat:
        ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay, internal_update)
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(mean=moving_mean, variance=moving_var)
    if use_scale:
        vh.gamma = gamma
    if use_bias:
        vh.beta = beta
    return ret
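
A minimal sketch of how `internal_update=False` is expected to be consumed; the toy graph below (`xn`, `w_demo`, the optimizer and the loss) is an assumption, not part of the snippet above. With `internal_update=False` the EMA assignments are only added to `tf.GraphKeys.UPDATE_OPS`, so the training op must be made to depend on that collection explicitly:

import tensorflow as tf

# stand-in graph; in practice `xn` would be the output of the BatchNorm layer above,
# built with internal_update=False inside a training tower
xn = tf.random_normal([32, 10])
w = tf.get_variable('w_demo', [10, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(xn, w)))
optimizer = tf.train.GradientDescentOptimizer(0.1)

# run the collected EMA update ops together with the gradient step
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(loss)
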
Example #36
File: ops.py  Project: q7800067/DSNet
def sync_batch_norm(inputs,
                    decay=0.999,
                    center=True,
                    scale=False,
                    epsilon=0.001,
                    activation_fn=None,
                    updates_collections=tf.GraphKeys.UPDATE_OPS,
                    is_training=True,
                    reuse=None,
                    variables_collections=None,
                    outputs_collections=None,
                    trainable=True,
                    scope=None,
                    num_dev=1):
  '''
  num_dev is the number of GPUs to synchronize statistics across.
  '''

  from tensorflow.contrib.nccl.ops import gen_nccl_ops
  from tensorflow.contrib.framework import add_model_variable
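  # Other names used below (tf, ops, nn, variable_scope, utils, _build_variable_getter)
  # come from module-level imports in the original ops.py that are not included in
  # this excerpt; presumably something like (an assumption, not verified):
  #   import tensorflow as tf
  #   from tensorflow.python.framework import ops
  #   from tensorflow.python.ops import nn, variable_scope
  #   from tensorflow.contrib.layers.python.layers import utils
  #   from tensorflow.contrib.layers.python.layers.layers import _build_variable_getter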

  red_axises = [0, 1, 2]
  num_outputs = inputs.get_shape().as_list()[-1]

  if scope is None:
    scope = 'BatchNorm'

  layer_variable_getter = _build_variable_getter()
  with variable_scope.variable_scope(
      scope,
      'BatchNorm',
      reuse=reuse,
      custom_getter=layer_variable_getter) as sc:

    gamma = tf.get_variable(name='gamma', shape=[num_outputs], dtype=tf.float32,
                            initializer=tf.constant_initializer(1.0), trainable=trainable,
                            collections=variables_collections)

    beta  = tf.get_variable(name='beta', shape=[num_outputs], dtype=tf.float32,
                            initializer=tf.constant_initializer(0.0), trainable=trainable,
                            collections=variables_collections)

    moving_mean = tf.get_variable(name='moving_mean', shape=[num_outputs], dtype=tf.float32,
                                initializer=tf.constant_initializer(0.0), trainable=False,
                                collections=variables_collections)
                                
    moving_var = tf.get_variable(name='moving_variance', shape=[num_outputs], dtype=tf.float32,
                                initializer=tf.constant_initializer(1.0), trainable=False,
                                collections=variables_collections)

    if is_training and trainable:
      
      if num_dev == 1:
        mean, var = tf.nn.moments(inputs, red_axises)
      else:
        shared_name = tf.get_variable_scope().name
        batch_mean        = tf.reduce_mean(inputs, axis=red_axises)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axises)
        batch_mean        = gen_nccl_ops.nccl_all_reduce(
          input=batch_mean,
          reduction='sum',
          num_devices=num_dev,
          shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
        batch_mean_square = gen_nccl_ops.nccl_all_reduce(
          input=batch_mean_square,
          reduction='sum',
          num_devices=num_dev,
          shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev)
        mean              = batch_mean
        var               = batch_mean_square - tf.square(batch_mean)
      outputs = tf.nn.batch_normalization(inputs, mean, var, beta, gamma, epsilon)

      # only the tower whose device string ends in '0' (e.g. '/gpu:0') updates the
      # moving averages; note this check assumes a single-digit device index
      if int(outputs.device[-1]) == 0:
        update_moving_mean_op = tf.assign(moving_mean, moving_mean * decay + mean * (1 - decay))
        update_moving_var_op  = tf.assign(moving_var,  moving_var  * decay + var  * (1 - decay))
        add_model_variable(moving_mean)
        add_model_variable(moving_var)
        
        if updates_collections is None:
          with tf.control_dependencies([update_moving_mean_op, update_moving_var_op]):
            outputs = tf.identity(outputs)
        else:
          ops.add_to_collections(updates_collections, update_moving_mean_op)
          ops.add_to_collections(updates_collections, update_moving_var_op)
          outputs = tf.identity(outputs)
      else:
        outputs = tf.identity(outputs)

    else:
      outputs, _, _ = nn.fused_batch_norm(inputs, gamma, beta, mean=moving_mean, variance=moving_var, epsilon=epsilon, is_training=False)

    if activation_fn is not None:
      outputs = activation_fn(outputs)

    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
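
A hypothetical call site for the function above; the tower loop, the device placement and the `images` tensor are assumptions and not from the original project. Every GPU's tower must call `sync_batch_norm` with the same scope and the same `num_dev` so that the NCCL all-reduce ops can pair up across devices:

num_gpus = 4
outputs = []
for i in range(num_gpus):
    with tf.device('/gpu:%d' % i), \
         tf.variable_scope(tf.get_variable_scope(), reuse=(i > 0)):
        images = tf.random_normal([8, 224, 224, 3])   # stand-in per-GPU input
        outputs.append(sync_batch_norm(images, decay=0.999, center=True, scale=True,
                                       is_training=True, scope='bn1', num_dev=num_gpus))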