def convolutional(input_data, filters_shape, trainable, name, downsample=False, activate=True, bn=True):
    with tf.variable_scope(name):
        if downsample:
            pad_h, pad_w = (filters_shape[0] - 2) // 2 + 1, (filters_shape[1] - 2) // 2 + 1
            paddings = tf.constant([[0, 0], [pad_h, pad_h], [pad_w, pad_w], [0, 0]])
            input_data = tf.pad(input_data, paddings, 'CONSTANT')
            strides = (1, 2, 2, 1)
            padding = 'VALID'
        else:
            strides = (1, 1, 1, 1)
            padding = "SAME"

        weight = tf.get_variable(name='weight', dtype=tf.float32, trainable=True,
                                 shape=filters_shape,
                                 initializer=tf.random_normal_initializer(stddev=0.01))
        conv = tf.nn.conv2d(input=input_data, filter=weight, strides=strides, padding=padding)

        if bn:
            conv = tf.layers.batch_normalization(
                conv,
                beta_initializer=tf.zeros_initializer(),
                gamma_initializer=tf.ones_initializer(),
                moving_mean_initializer=tf.zeros_initializer(),
                moving_variance_initializer=tf.ones_initializer(),
                training=trainable)
        else:
            bias = tf.get_variable(name='bias', shape=filters_shape[-1], trainable=True,
                                   dtype=tf.float32, initializer=tf.constant_initializer(0.0))
            conv = tf.nn.bias_add(conv, bias)

        if activate:
            conv = tf.nn.leaky_relu(conv, alpha=0.1)

    return conv

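# Illustrative usage sketch for `convolutional` above (not from the original source);
# assumes TF 1.x graph mode with `import tensorflow as tf` and an NHWC input batch.
# With downsample=True and a 3x3 kernel, pad_h = pad_w = 1, so the explicit 'CONSTANT'
# padding plus the 'VALID' stride-2 convolution halves the spatial resolution.
def _example_convolutional_usage():
    images = tf.placeholder(tf.float32, [None, 416, 416, 3], name='images')
    # filters_shape is (kernel_h, kernel_w, in_channels, out_channels).
    conv1 = convolutional(images, (3, 3, 3, 32), trainable=True, name='conv1')
    conv2 = convolutional(conv1, (3, 3, 32, 64), trainable=True, name='conv2',
                          downsample=True)  # 416x416 -> 208x208
    return conv2
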
def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon):
    """Batch normalization on `input_layer` without tf.layers."""
    # We make this function as similar as possible to the
    # tf.contrib.layers.batch_norm, to minimize the differences between using
    # layers and not using layers.
    shape = input_layer.shape
    num_channels = shape[3] if self.data_format == 'NHWC' else shape[1]
    beta = self.get_variable('beta', [num_channels], tf.float32, tf.float32,
                             initializer=tf.zeros_initializer())
    if use_scale:
        gamma = self.get_variable('gamma', [num_channels], tf.float32, tf.float32,
                                  initializer=tf.ones_initializer())
    else:
        gamma = tf.constant(1.0, tf.float32, [num_channels])
    # For moving variables, we use tf.get_variable instead of self.get_variable,
    # since self.get_variable returns the result of tf.cast which we cannot
    # assign to.
    moving_mean = tf.get_variable('moving_mean', [num_channels], tf.float32,
                                  initializer=tf.zeros_initializer(),
                                  trainable=False)
    moving_variance = tf.get_variable('moving_variance', [num_channels], tf.float32,
                                      initializer=tf.ones_initializer(),
                                      trainable=False)
    if self.phase_train:
        bn, batch_mean, batch_variance = tf.nn.fused_batch_norm(
            input_layer, gamma, beta, epsilon=epsilon,
            data_format=self.data_format, is_training=True)
        mean_update = moving_averages.assign_moving_average(
            moving_mean, batch_mean, decay=decay, zero_debias=False)
        variance_update = moving_averages.assign_moving_average(
            moving_variance, batch_variance, decay=decay, zero_debias=False)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)
    else:
        bn, _, _ = tf.nn.fused_batch_norm(input_layer, gamma, beta,
                                          mean=moving_mean,
                                          variance=moving_variance,
                                          epsilon=epsilon,
                                          data_format=self.data_format,
                                          is_training=False)
    return bn

def separable_conv_block(self, input, dw_filter, output_channel, strides, name):
    """
    Params:
        input: a 4-D input tensor
        dw_filter: a 4-D tuple: [filter_width, filter_height, in_channels, multiplier]
        output_channel: output channel of the separable_conv_block
        strides: a 4-D list: [1, strides, strides, 1]
    """
    with tf.variable_scope(name):
        # Depthwise convolution, followed by batch norm and leaky ReLU.
        dw_weight = tf.get_variable(
            name='dw_filter', dtype=tf.float32, trainable=True, shape=dw_filter,
            initializer=tf.random_normal_initializer(stddev=0.01))
        dw = tf.nn.depthwise_conv2d(input=input, filter=dw_weight, strides=strides,
                                    padding="SAME", name='Conv/dw')
        bn_dw = tf.layers.batch_normalization(
            dw,
            beta_initializer=tf.zeros_initializer(),
            gamma_initializer=tf.ones_initializer(),
            moving_mean_initializer=tf.zeros_initializer(),
            moving_variance_initializer=tf.ones_initializer(),
            training=self.trainable, name='dw/bn')
        relu = tf.nn.leaky_relu(bn_dw, 0.1)

        # 1x1 pointwise convolution, followed by batch norm and leaky ReLU.
        weight = tf.get_variable(
            name='weight', dtype=tf.float32, trainable=True,
            shape=(1, 1, dw_filter[2] * dw_filter[3], output_channel),
            initializer=tf.random_normal_initializer(stddev=0.01))
        conv = tf.nn.conv2d(input=relu, filter=weight, strides=[1, 1, 1, 1],
                            padding="SAME", name="conv/s1")
        bn_pt = tf.layers.batch_normalization(
            conv,
            beta_initializer=tf.zeros_initializer(),
            gamma_initializer=tf.ones_initializer(),
            moving_mean_initializer=tf.zeros_initializer(),
            moving_variance_initializer=tf.ones_initializer(),
            training=self.trainable, name='pt/bn')
        return tf.nn.leaky_relu(bn_pt, 0.1)

def _bn(x, is_train, global_step=None, name='bn'):
    moving_average_decay = 0.9
    # moving_average_decay = 0.99
    # moving_average_decay_init = 0.99
    with tf.variable_scope(name):
        decay = moving_average_decay
        # if global_step is None:
        #     decay = moving_average_decay
        # else:
        #     decay = tf.cond(tf.greater(global_step, 100),
        #                     lambda: tf.constant(moving_average_decay, tf.float32),
        #                     lambda: tf.constant(moving_average_decay_init, tf.float32))
        batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2])
        with tf.device('/CPU:0'):
            mu = tf.get_variable('mu', batch_mean.get_shape(), tf.float32,
                                 initializer=tf.zeros_initializer(), trainable=False)
            sigma = tf.get_variable('sigma', batch_var.get_shape(), tf.float32,
                                    initializer=tf.ones_initializer(), trainable=False)
            beta = tf.get_variable('beta', batch_mean.get_shape(), tf.float32,
                                   initializer=tf.zeros_initializer())
            gamma = tf.get_variable('gamma', batch_var.get_shape(), tf.float32,
                                    initializer=tf.ones_initializer())
        # BN when training
        update = 1.0 - decay
        # with tf.control_dependencies([tf.Print(decay, [decay])]):
        #     update_mu = mu.assign_sub(update * (mu - batch_mean))
        update_mu = mu.assign_sub(update * (mu - batch_mean))
        update_sigma = sigma.assign_sub(update * (sigma - batch_var))
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_mu)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_sigma)

        mean, var = tf.cond(is_train,
                            lambda: (batch_mean, batch_var),
                            lambda: (mu, sigma))
        bn = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-5)
        # bn = tf.nn.batch_normalization(x, batch_mean, batch_var, beta, gamma, 1e-5)
        # bn = tf.contrib.layers.batch_norm(inputs=x, decay=decay,
        #     updates_collections=[tf.GraphKeys.UPDATE_OPS], center=True,
        #     scale=True, epsilon=1e-5, is_training=is_train, trainable=True)
    return bn

def batch_norm(inputs, bn_param, scale=True, momentum=0.99, epsilon=1e-5, name='batch_norm'):
    with tf.variable_scope(name):
        beta = _variable('beta', [inputs.get_shape()[-1]],
                         initializer=tf.zeros_initializer(), trainable=True)
        if scale:
            gamma = _variable('gamma', [inputs.get_shape()[-1]],
                              initializer=tf.ones_initializer(), trainable=True)
        else:
            gamma = None

        reduced_dim = [i for i in range(len(inputs.get_shape()) - 1)]
        batch_mean, batch_var = tf.nn.moments(inputs, reduced_dim, keep_dims=False)

        # moving average of the populations
        pop_mean = _variable('pop_mean', shape=[inputs.get_shape()[-1]],
                             initializer=tf.zeros_initializer(), trainable=False)
        pop_var = _variable('pop_var', shape=[inputs.get_shape()[-1]],
                            initializer=tf.ones_initializer(), trainable=False)
        pop_mean_op = tf.assign(pop_mean,
                                pop_mean * momentum + batch_mean * (1 - momentum))
        pop_var_op = tf.assign(pop_var,
                               pop_var * momentum + batch_var * (1 - momentum))
        tf.add_to_collection('batch_norm_update', pop_mean_op)
        tf.add_to_collection('batch_norm_update', pop_var_op)

        # for training, bn_param[0] = 0
        # for evaluation, bn_param[0] = 1
        mean = bn_param[0] * pop_mean + (1 - bn_param[0]) * batch_mean
        var = bn_param[0] * pop_var + (1 - bn_param[0]) * batch_var
        return tf.nn.batch_normalization(inputs, mean, var, beta, gamma, epsilon)

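# Illustrative usage sketch for `batch_norm` above (not from the original source).
# `bn_param` is read as a length-1 float vector fed at run time: bn_param[0] = 0
# selects the batch statistics (training) and bn_param[0] = 1 selects the moving
# averages (evaluation). The moving-average ops live in the 'batch_norm_update'
# collection and must be run alongside the train op. Assumes `_variable` behaves
# like tf.get_variable; `train_op`, `sess`, and `feed_dict` stand in for the
# caller's own training setup.
def _example_batch_norm_usage(features, train_op, sess, feed_dict):
    bn_param = tf.placeholder(tf.float32, [1], name='bn_param')
    normed = batch_norm(features, bn_param)
    bn_updates = tf.get_collection('batch_norm_update')
    # Training step: use batch statistics and refresh the population estimates.
    train_feed = dict(feed_dict)
    train_feed[bn_param] = [0.0]
    sess.run([train_op] + bn_updates, feed_dict=train_feed)
    # Evaluation: rely on the accumulated population statistics.
    eval_feed = dict(feed_dict)
    eval_feed[bn_param] = [1.0]
    return sess.run(normed, feed_dict=eval_feed)
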
def decoder(self, z):
    nl = tf.nn.leaky_relu
    z_has_timesteps = (z.get_shape().ndims == 3)
    if z_has_timesteps:
        sh = tf.shape(z)
        z = flatten_two_dims(z)
    with tf.variable_scope(self.scope + "decoder"):
        z = small_deconvnet(z, nl=nl, ch=4 if self.spherical_obs else 8,
                            positional_bias=True)
        if z_has_timesteps:
            z = unflatten_first_dim(z, sh)
        if self.spherical_obs:
            scale = tf.get_variable(name="scale", shape=(), dtype=tf.float32,
                                    initializer=tf.ones_initializer())
            scale = tf.maximum(scale, -4.)
            scale = tf.nn.softplus(scale)
            scale = scale * tf.ones_like(z)
        else:
            z, scale = tf.split(z, 2, -1)
            scale = tf.nn.softplus(scale)
        # scale = tf.Print(scale, [scale])
        return tf.distributions.Normal(loc=z, scale=scale)

def primer_norm(x, dim, epsilon=1e-6, name="layer_prepostprocess"):
    """Primer normalization over dimension `dim`.

    Args:
        x: a mtf.Tensor whose shape contains `dim`.
        dim: a mtf.Dimension.
        epsilon: a floating point number.
        name: a string used for tf.variable_scope.

    Returns:
        a mtf.Tensor with same shape as x.
    """
    with tf.variable_scope(name + "/primer_norm"):
        scale = mtf.get_variable(
            x.mesh, "primer_norm_scale", mtf.Shape([dim]),
            initializer=tf.ones_initializer(), activation_dtype=x.dtype)
        bias = mtf.get_variable(
            x.mesh, "primer_norm_bias", mtf.Shape([dim]),
            initializer=tf.zeros_initializer(), activation_dtype=x.dtype)
        reduced_shape = x.shape - dim
        mean = mtf.reduce_mean(x, output_shape=reduced_shape)
        mean_centered_x = x - mean
        pseudo_variance = mtf.reduce_mean(x * mean_centered_x,
                                          output_shape=reduced_shape)
        norm_x = mean_centered_x * mtf.rsqrt(pseudo_variance + epsilon)
        return norm_x * scale + bias

def batch_norm_relu(inputs, is_training, relu=True, init_zero=False,
                    data_format='channels_last'):
    """Performs a batch normalization followed by a ReLU.

    Args:
        inputs: `Tensor` of shape `[batch, channels, ...]`.
        is_training: `bool` for whether the model is training.
        relu: `bool` if False, omits the ReLU operation.
        init_zero: `bool` if True, initializes scale parameter of batch
            normalization with 0 instead of 1 (default).
        data_format: `str` either "channels_first" for
            `[batch, channels, height, width]` or "channels_last" for
            `[batch, height, width, channels]`.

    Returns:
        A normalized `Tensor` with the same `data_format`.
    """
    del data_format
    if init_zero:
        gamma_initializer = tf.zeros_initializer()
    else:
        gamma_initializer = tf.ones_initializer()
    # Mapping from batch norm parameters to the collections they belong to;
    # only the moving statistics are placed in the 'moving_vars' collection.
    var = {
        'beta': None,
        'gamma': None,
        'moving_mean': ['moving_vars'],
        'moving_variance': ['moving_vars'],
    }

def batch_norm_relu(inputs, is_training, relu=True, init_zero=False,
                    data_format='channels_first'):
    """Performs a batch normalization followed by a ReLU."""
    if init_zero:
        gamma_initializer = tf.zeros_initializer()
    else:
        gamma_initializer = tf.ones_initializer()

    if data_format == 'channels_first':
        axis = 1
    else:
        axis = 3

    inputs = tf.layers.batch_normalization(inputs=inputs, axis=axis,
                                           momentum=0.9, epsilon=1e-5,
                                           center=True, scale=True,
                                           training=is_training, fused=True,
                                           gamma_initializer=gamma_initializer)
    if relu:
        inputs = tf.nn.relu(inputs)
    return inputs

def apply_norm(x, epsilon=1e-6):
    """Applies layer normalization to x.

    Based on "Layer Normalization": https://arxiv.org/abs/1607.06450

    Args:
        x: <float>[..., input_size]
        epsilon: Used to avoid division by 0.

    Returns:
        <float>[..., input_size]
    """
    input_size = x.get_shape()[-1]
    with tf.variable_scope("layer_norm", values=[x]):
        scale = tf.get_variable("layer_norm_scale", [input_size],
                                initializer=tf.ones_initializer())
        bias = tf.get_variable("layer_norm_bias", [input_size],
                               initializer=tf.zeros_initializer())
        mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
        variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
        norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
        result = norm_x * scale + bias
        return result

def batch_norm(x, is_training, bn_decay):
    input_dims = x.get_shape()[-1].value
    moment_dims = list(range(len(x.get_shape()) - 1))
    beta = tf.Variable(tf.zeros_initializer()(shape=[input_dims]), dtype=tf.float32,
                       trainable=True, name='beta')
    gamma = tf.Variable(tf.ones_initializer()(shape=[input_dims]), dtype=tf.float32,
                        trainable=True, name='gamma')
    batch_mean, batch_var = tf.nn.moments(x, moment_dims, name='moments')

    decay = bn_decay if bn_decay is not None else 0.9
    ema = tf.train.ExponentialMovingAverage(decay=decay)
    # Operator that maintains moving averages of variables.
    ema_apply_op = tf.cond(is_training,
                           lambda: ema.apply([batch_mean, batch_var]),
                           lambda: tf.no_op())

    # Update moving average and return current batch's avg and var.
    def mean_var_with_update():
        with tf.control_dependencies([ema_apply_op]):
            return tf.identity(batch_mean), tf.identity(batch_var)

    # ema.average returns the Variable holding the average of var.
    mean, var = tf.cond(is_training,
                        mean_var_with_update,
                        lambda: (ema.average(batch_mean), ema.average(batch_var)))
    x = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
    return x

def _CreateLSTMPruneVariables(lstm_obj, input_depth, h_depth):
    """Function to create additional variables for pruning."""
    mask = lstm_obj.add_variable(name="mask",
                                 shape=[input_depth + h_depth, 4 * h_depth],
                                 initializer=tf.ones_initializer(),
                                 trainable=False,
                                 dtype=lstm_obj.dtype)
    threshold = lstm_obj.add_variable(name="threshold",
                                      shape=[],
                                      initializer=tf.zeros_initializer(),
                                      trainable=False,
                                      dtype=lstm_obj.dtype)
    # Add old_weight, old_old_weight, gradient for gradient based pruning.
    old_weight = lstm_obj.add_variable(
        name="old_weight",
        shape=[input_depth + h_depth, 4 * h_depth],
        initializer=tf.zeros_initializer(),
        trainable=False,
        dtype=lstm_obj.dtype)
    old_old_weight = lstm_obj.add_variable(
        name="old_old_weight",
        shape=[input_depth + h_depth, 4 * h_depth],
        initializer=tf.zeros_initializer(),
        trainable=False,
        dtype=lstm_obj.dtype)
    gradient = lstm_obj.add_variable(
        name="gradient",
        shape=[input_depth + h_depth, 4 * h_depth],
        initializer=tf.zeros_initializer(),
        trainable=False,
        dtype=lstm_obj.dtype)
    return mask, threshold, old_weight, old_old_weight, gradient

def testMinimalRun(self):
    x = basic.TrainableVariable(
        shape=(), initializers={'w': tf.ones_initializer()})()
    x2 = x ** 2.0
    min_value = 0.5
    constr = optimization_constraints.OptimizationConstraints().add(
        x > min_value)

    self.assertFalse(constr._is_connected)
    loss = moving_average.MovingAverage()(
        x2 + tf.random.normal((), stddev=1.0)) + constr()
    self.assertTrue(constr._is_connected)
    with self.assertRaisesRegexp(ValueError, 'Cannot add further constraints'):
        constr.add(x > min_value)
    with self.assertRaisesRegexp(ValueError, 'Cannot add further constraints'):
        constr.add_geq(x, min_value)
    with self.assertRaisesRegexp(ValueError, 'Cannot add further constraints'):
        constr.add_leq(min_value < x)

    opt = tf.train.AdamOptimizer(1e-2, beta1=0.0)
    update = opt.minimize(loss)
    with tf.control_dependencies([update]):
        x2 = tf.identity(x2)

    with tf.train.MonitoredSession() as sess:
        for _ in range(500):
            v, _ = sess.run([x2, update])
    self.assertAllClose(v, min_value ** 2)

def batch_norm_relu(inputs, is_training, relu=True, init_zero=False,
                    center=True, scale=True, data_format='channels_first'):
    """Performs a batch normalization followed by a ReLU.

    Args:
        inputs: `Tensor` of shape `[batch, channels, ...]`.
        is_training: `bool` for whether the model is training.
        relu: `bool` if False, omits the ReLU operation.
        init_zero: `bool` if True, initializes scale parameter of batch
            normalization with 0 instead of 1 (default).
        center: `bool` whether to add learnable bias factor.
        scale: `bool` whether to add learnable scaling factor.
        data_format: `str` either "channels_first" for
            `[batch, channels, height, width]` or "channels_last" for
            `[batch, height, width, channels]`.

    Returns:
        A normalized `Tensor` with the same `data_format`.
    """
    if init_zero:
        gamma_initializer = tf.zeros_initializer()
    else:
        gamma_initializer = tf.ones_initializer()

    if data_format == 'channels_first':
        axis = 1
    else:
        axis = 3

    if FLAGS.global_bn:
        bn_foo = BatchNormalization(axis=axis,
                                    momentum=FLAGS.batch_norm_decay,
                                    epsilon=BATCH_NORM_EPSILON,
                                    center=center,
                                    scale=scale,
                                    fused=False,
                                    gamma_initializer=gamma_initializer)
        inputs = bn_foo(inputs, training=is_training)
    else:
        print("using the default batch norm, not the globally normalized one")
        # Done: pass in the batch_norm_decay
        inputs = tf.layers.batch_normalization(
            inputs=inputs,
            axis=axis,
            momentum=FLAGS.batch_norm_decay,
            epsilon=BATCH_NORM_EPSILON,
            center=center,
            scale=scale,
            training=is_training,
            fused=True,
            gamma_initializer=gamma_initializer)

    if relu:
        inputs = tf.nn.relu(inputs)
    return inputs

def layer_norm_vars(filters, layer_idx, total_layers):
    """Create Variables for layer norm."""
    if total_layers == 0:
        scale = tf.get_variable("gamma", filters,
                                initializer=tf.ones_initializer())
        bias = tf.get_variable("beta", filters,
                               initializer=tf.zeros_initializer())
    else:
        scale = tf.get_variable("gamma", [total_layers, filters],
                                initializer=tf.ones_initializer())
        bias = tf.get_variable("beta", [total_layers, filters],
                               initializer=tf.zeros_initializer())
        scale = tf.gather(scale, layer_idx)
        bias = tf.gather(bias, layer_idx)
    return scale, bias

def layer_norm_op(inputs, norm_shape=None, begin_norm_axis=-1, center=True,
                  scale=True, activation_fn=None, reuse=None, trainable=True,
                  name=None):
    """Custom Layer Normalization layer."""
    if norm_shape is None:
        # If `norm_shape` is not provided, use `begin_norm_axis` to infer it.
        norm_shape = inputs.shape[begin_norm_axis:]
    elif isinstance(norm_shape, int):
        # If `norm_shape` is provided as an int, convert it to a list.
        norm_shape = [norm_shape]

    with tf.variable_scope(name, "layer_norm", [inputs], reuse=reuse):
        inputs_rank = inputs.shape.ndims
        if inputs_rank is None:
            raise ValueError("Inputs %s has undefined rank." % inputs.name)
        dtype = inputs.dtype.base_dtype
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta = tf.get_variable(
                "beta", shape=norm_shape, dtype=dtype,
                initializer=tf.zeros_initializer(), trainable=trainable)
        if scale:
            gamma = tf.get_variable(
                "gamma", shape=norm_shape, dtype=dtype,
                initializer=tf.ones_initializer(), trainable=trainable)
        # Compute the moments over the trailing dimensions covered by
        # `norm_shape`.
        norm_axes = list(range(inputs_rank - len(norm_shape), inputs_rank))
        mean, variance = tf.nn.moments(inputs, norm_axes, keep_dims=True)
        # Compute layer normalization using the batch_normalization function.
        # Note that epsilon must be increased for float16 due to the limited
        # representable range.
        variance_epsilon = 1e-8 if dtype != tf.float16 else 1e-3
        outputs = tf.nn.batch_normalization(inputs, mean, variance,
                                            offset=beta, scale=gamma,
                                            variance_epsilon=variance_epsilon)
        outputs.set_shape(inputs.shape)
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return outputs

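# Illustrative usage sketch for `layer_norm_op` above (not from the original source):
# with the defaults, moments are taken over the last axis only; passing a wider
# `norm_shape` (or a more negative `begin_norm_axis`) normalizes over more trailing axes.
def _example_layer_norm_op_usage(hidden_states):
    # hidden_states: [batch, seq_len, hidden]; normalize over `hidden` only.
    normed_last = layer_norm_op(hidden_states, name='ln_last')
    # Normalize jointly over the last two axes (seq_len and hidden) instead.
    normed_wide = layer_norm_op(hidden_states, begin_norm_axis=-2, name='ln_wide')
    return normed_last, normed_wide
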
def batch_norm_dist_template(inputs, is_training, scope, moments_dims, bn_decay):
    """The batch normalization for distributed training.

    Args:
        inputs: Tensor, k-D input ... x C, could be BC or BHWC or BDHWC
        is_training: boolean tf.Variable, true indicates training phase
        scope: string, variable scope
        moments_dims: a list of ints, indicating the dimensions for moments
            calculation
        bn_decay: float or float tensor variable, controlling the moving
            average weight
    Returns:
        normed: batch-normalized maps
    """
    with tf.variable_scope(scope) as sc:
        num_channels = inputs.get_shape()[-1].value
        beta = _variable_on_cpu('beta', [num_channels],
                                initializer=tf.zeros_initializer())
        gamma = _variable_on_cpu('gamma', [num_channels],
                                 initializer=tf.ones_initializer())
        pop_mean = _variable_on_cpu('pop_mean', [num_channels],
                                    initializer=tf.zeros_initializer(),
                                    trainable=False)
        pop_var = _variable_on_cpu('pop_var', [num_channels],
                                   initializer=tf.ones_initializer(),
                                   trainable=False)

        def train_bn_op():
            batch_mean, batch_var = tf.nn.moments(inputs, moments_dims,
                                                  name='moments')
            decay = bn_decay if bn_decay is not None else 0.9
            train_mean = tf.assign(pop_mean,
                                   pop_mean * decay + batch_mean * (1 - decay))
            train_var = tf.assign(pop_var,
                                  pop_var * decay + batch_var * (1 - decay))
            with tf.control_dependencies([train_mean, train_var]):
                return tf.nn.batch_normalization(inputs, batch_mean, batch_var,
                                                 beta, gamma, 1e-3)

        def test_bn_op():
            return tf.nn.batch_normalization(inputs, pop_mean, pop_var,
                                             beta, gamma, 1e-3)

        normed = tf.cond(is_training, train_bn_op, test_bn_op)
        return normed

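# Illustrative usage sketch for `batch_norm_dist_template` above (not from the
# original source). `is_training` is a boolean tensor; in the training branch the
# population statistics are refreshed via control dependencies, so no separate
# update op has to be run. Assumes `_variable_on_cpu` behaves like tf.get_variable
# pinned to the CPU.
def _example_batch_norm_dist_usage(point_features):
    # point_features: [batch, num_points, channels] -> moments over axes [0, 1].
    is_training = tf.placeholder(tf.bool, shape=(), name='is_training')
    normed = batch_norm_dist_template(point_features, is_training, scope='bn',
                                      moments_dims=[0, 1], bn_decay=0.9)
    return normed, is_training
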
def __init__(self, num_actions, quantile_embedding_dim, **kwargs):
    # This weights_initializer gives action 0 a higher weight, ensuring
    # that it gets picked by the argmax.
    super(MockImplicitQuantileNetwork, self).__init__(**kwargs)
    self.num_actions = num_actions
    self.layer = tf.keras.layers.Dense(
        self.num_actions,
        kernel_initializer=tf.ones_initializer(),
        bias_initializer=tf.zeros_initializer())

def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))
    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32,
                            tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32,
                           tf.zeros_initializer())
    return gamma * normalized + beta

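# Illustrative usage sketch for `layer_norm` above (not from the original source):
# the function calls tf.get_variable for 'gamma'/'beta' without opening its own
# scope, so each call site must provide a distinct variable scope to avoid
# variable-name collisions.
def _example_layer_norm_usage(x):
    with tf.variable_scope('block_0/ln'):
        x = layer_norm(x)
    with tf.variable_scope('block_1/ln'):
        x = layer_norm(x)
    return x
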
def testInitializers(self):
    inputs = tf.ones(dtype=tf.float32, shape=[self.batch_size, self.in_size])
    prev_state = tf.ones(dtype=tf.float32,
                         shape=[self.batch_size, self.hidden_size])

    with self.assertRaisesRegexp(KeyError, "Invalid initializer keys.*"):
        snt.VanillaRNN(name="rnn",
                       hidden_size=self.hidden_size,
                       initializers={"invalid": None})

    err = "Initializer for 'w' is not a callable function"
    with self.assertRaisesRegexp(TypeError, err):
        snt.VanillaRNN(
            name="rnn",
            hidden_size=self.hidden_size,
            initializers={"in_to_hidden": {"w": tf.zeros([10, 10])}})

    # Nested initializer.
    valid_initializers = {
        "in_to_hidden": {
            "w": tf.ones_initializer(),
        },
        "hidden_to_hidden": {
            "b": tf.ones_initializer(),
        },
    }

    vanilla_rnn = snt.VanillaRNN(name="rnn",
                                 hidden_size=self.hidden_size,
                                 initializers=valid_initializers)
    vanilla_rnn(inputs, prev_state)
    init = tf.global_variables_initializer()

    self.evaluate(init)
    w_v, b_v = self.evaluate([
        vanilla_rnn.in_to_hidden_linear.w,
        vanilla_rnn.hidden_to_hidden_linear.b,
    ])
    self.assertAllClose(w_v, np.ones([self.in_size, self.hidden_size]))
    self.assertAllClose(b_v, np.ones([self.hidden_size]))

def __init__(self, num_actions, **kwargs):
    # This weights_initializer gives action 0 a higher weight, ensuring
    # that it gets picked by the argmax.
    super(MockDQNNetwork, self).__init__(**kwargs)
    weights_initializer = np.tile(
        np.arange(num_actions, 0, -1), (stack_size, 1))
    self.layer = tf.keras.layers.Dense(
        num_actions,
        kernel_initializer=tf.constant_initializer(weights_initializer),
        bias_initializer=tf.ones_initializer())

def __init__(self, dim, eps=1e-5, name=None):
    super(LayerNorm, self).__init__(name=name)
    self.eps = eps
    with self.variable_scope:
        self.g = tf.get_variable('g', shape=[dim],
                                 initializer=tf.ones_initializer())
        self.b = tf.get_variable('b', shape=[dim],
                                 initializer=tf.zeros_initializer())

def batch_norm_act(inputs,
                   is_training_bn: bool,
                   act_type: Union[Text, None],
                   init_zero: bool = False,
                   data_format: Text = 'channels_last',
                   momentum: float = 0.99,
                   epsilon: float = 1e-3,
                   strategy: Text = None,
                   name: Text = None,
                   batch_norm_trainable: bool = True):
    """Performs a batch normalization followed by a non-linear activation.

    Args:
        inputs: `Tensor` of shape `[batch, channels, ...]`.
        is_training_bn: `bool` for whether the model is training.
        act_type: non-linear activation function type. If None, omits the
            activation operation.
        init_zero: `bool` if True, initializes scale parameter of batch
            normalization with 0 instead of 1 (default).
        data_format: `str` either "channels_first" for
            `[batch, channels, height, width]` or "channels_last" for
            `[batch, height, width, channels]`.
        momentum: `float`, momentum of batch norm.
        epsilon: `float`, small value for numerical stability.
        strategy: string to specify training strategy for TPU/GPU/CPU.
        name: the name of the batch normalization layer.
        batch_norm_trainable: `bool`, if False the batch statistics will not
            be updated.

    Returns:
        A normalized `Tensor` with the same `data_format`.
    """
    if init_zero:
        gamma_initializer = tf.zeros_initializer()
    else:
        gamma_initializer = tf.ones_initializer()

    if data_format == 'channels_first':
        axis = 1
    else:
        axis = 3

    inputs = batch_normalization(inputs=inputs,
                                 axis=axis,
                                 momentum=momentum,
                                 epsilon=epsilon,
                                 center=True,
                                 scale=True,
                                 trainable=batch_norm_trainable,
                                 training=is_training_bn,
                                 strategy=strategy,
                                 gamma_initializer=gamma_initializer,
                                 name=name)

    if act_type:
        inputs = activation_fn(inputs, act_type)
    return inputs

def batch_norm_relu(inputs,
                    is_training,
                    relu=True,
                    swish=False,
                    init_zero=False,
                    bn_decay=BATCH_NORM_DECAY,
                    bn_epsilon=BATCH_NORM_EPSILON,
                    data_format='channels_first'):
    """Performs a batch normalization followed by a ReLU.

    Args:
        inputs: `Tensor` of shape `[batch, channels, ...]`.
        is_training: `bool` for whether the model is training.
        relu: `bool` if False, omits the ReLU operation.
        swish: `bool`. True to use swish activation function, False for ReLU.
        init_zero: `bool` if True, initializes scale parameter of batch
            normalization with 0 instead of 1 (default).
        bn_decay: `float` batch norm decay parameter to use.
        bn_epsilon: `float` batch norm epsilon parameter to use.
        data_format: `str` either "channels_first" for
            `[batch, channels, height, width]` or "channels_last" for
            `[batch, height, width, channels]`.

    Returns:
        A normalized `Tensor` with the same `data_format`.
    """
    if init_zero:
        gamma_initializer = tf.zeros_initializer()
    else:
        gamma_initializer = tf.ones_initializer()

    # Only 'channels_last' is supported by this implementation.
    assert data_format == 'channels_last'
    if data_format == 'channels_first':
        axis = 1
    else:
        axis = -1

    inputs = tf.layers.batch_normalization(
        inputs=inputs,
        axis=axis,
        momentum=bn_decay,
        epsilon=bn_epsilon,
        center=True,
        scale=True,
        training=is_training,
        fused=True,
        gamma_initializer=gamma_initializer)

    if swish:
        inputs = tf.keras.activations.swish(inputs)
    elif relu:
        inputs = tf.nn.relu(inputs)
    return inputs

def __call__(self, inputs, relu=True, init_zero=False, is_training=False,
             name=None):
    """Builds layers for a batch normalization followed by a ReLU.

    Args:
        inputs: `Tensor` of shape `[batch, channels, ...]`.
        relu: `bool` if False, omits the ReLU operation.
        init_zero: `bool` if True, initializes scale parameter of batch
            normalization with 0. If False, initialize it with 1.
        is_training: `bool`, True if the model is in training mode.
        name: `str` name for the operation.

    Returns:
        A normalized `Tensor` with the same `data_format`.
    """
    if init_zero:
        gamma_initializer = tf.zeros_initializer()
    else:
        gamma_initializer = tf.ones_initializer()

    if self._use_sync_bn:
        inputs = cross_replica_batch_normalization(
            inputs=inputs,
            momentum=self._momentum,
            epsilon=self._epsilon,
            center=True,
            scale=True,
            training=(is_training and self._trainable),
            trainable=self._trainable,
            fused=True,
            gamma_initializer=gamma_initializer,
            num_distributed_groups=1,
            name=name)
    else:
        inputs = tf.layers.batch_normalization(
            inputs=inputs,
            momentum=self._momentum,
            epsilon=self._epsilon,
            center=True,
            scale=True,
            training=(is_training and self._trainable),
            trainable=self._trainable,
            fused=True,
            gamma_initializer=gamma_initializer,
            name=name)

    if relu:
        inputs = tf.nn.relu(inputs)
    return inputs

def addBiases(self, inp1, inp2, dim, bInitial=0, name=""):
    # Additive bias, shifted by bInitial.
    with tf.variable_scope("additiveBiases" + name):
        b = tf.get_variable("biases", shape=(dim,),
                            initializer=tf.zeros_initializer()) + bInitial
    # Multiplicative biases scaling the two inputs and their elementwise interaction.
    with tf.variable_scope("multiplicativeBias" + name):
        beta = tf.get_variable("biases", shape=(3 * dim,),
                               initializer=tf.ones_initializer())

    Wx, Uh, inter = tf.split(beta * tf.concat([inp1, inp2, inp1 * inp2], axis=1),
                             num_or_size_splits=3, axis=1)
    output = Wx + Uh + inter + b
    return output

def batch_norm_relu(inputs,
                    is_training_bn,
                    relu=True,
                    init_zero=False,
                    data_format='channels_last',
                    momentum=0.99,
                    epsilon=1e-3,
                    use_tpu=False,
                    name=None):
    """Performs a batch normalization followed by a ReLU.

    Args:
        inputs: `Tensor` of shape `[batch, channels, ...]`.
        is_training_bn: `bool` for whether the model is training.
        relu: `bool` if False, omits the ReLU operation.
        init_zero: `bool` if True, initializes scale parameter of batch
            normalization with 0 instead of 1 (default).
        data_format: `str` either "channels_first" for
            `[batch, channels, height, width]` or "channels_last" for
            `[batch, height, width, channels]`.
        momentum: `float`, momentum of batch norm.
        epsilon: `float`, small value for numerical stability.
        use_tpu: `bool`, whether to use the TPU version of batch norm.
        name: the name of the batch normalization layer.

    Returns:
        A normalized `Tensor` with the same `data_format`.
    """
    if init_zero:
        gamma_initializer = tf.zeros_initializer()
    else:
        gamma_initializer = tf.ones_initializer()

    if data_format == 'channels_first':
        axis = 1
    else:
        axis = 3

    inputs = tpu_batch_normalization(
        inputs=inputs,
        axis=axis,
        momentum=momentum,
        epsilon=epsilon,
        center=True,
        scale=True,
        training=is_training_bn,
        use_tpu=use_tpu,
        gamma_initializer=gamma_initializer,
        name=name)

    if relu:
        inputs = relu_fn(inputs)
    return inputs

def get_initializer(initializer, dtype):
    if initializer == 'zeros':
        return tf.zeros_initializer(dtype=dtype)
    elif initializer == 'ones':
        return tf.ones_initializer(dtype=dtype)
    elif initializer == 'vs':
        return tf.variance_scaling_initializer(dtype=dtype)
    elif initializer == 'xavier':
        return tf.glorot_normal_initializer(dtype=dtype)
    elif initializer == 'he':
        return tf.variance_scaling_initializer(dtype=dtype)
    else:
        raise NotImplementedError

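# Illustrative usage sketch for `get_initializer` above (not from the original
# source); note that 'vs' and 'he' both resolve to tf.variance_scaling_initializer.
def _example_get_initializer_usage(num_units):
    kernel_init = get_initializer('he', tf.float32)
    bias_init = get_initializer('zeros', tf.float32)
    kernel = tf.get_variable('kernel', [num_units, num_units], tf.float32,
                             initializer=kernel_init)
    bias = tf.get_variable('bias', [num_units], tf.float32,
                           initializer=bias_init)
    return kernel, bias
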
def Conv2D(inputs, filters, name, strides=[1, 1, 1, 1], padding="SAME",
           pooling=None, activation="leaky", trainable=True, bn=False):
    with tf.variable_scope(name):
        weight = tf.get_variable(name='weight', dtype=tf.float32, trainable=True,
                                 shape=filters,
                                 initializer=tf.random_normal_initializer(stddev=0.01))
        output = tf.nn.conv2d(inputs, weight, strides=strides, padding=padding)

        # 2x2 pooling window with stride 2, matching the pooling strides.
        if pooling == "max":
            output = tf.nn.max_pool(output, ksize=[1, 2, 2, 1],
                                    strides=[1, 2, 2, 1], padding="SAME")
        elif pooling == "avg":
            output = tf.nn.avg_pool(output, ksize=[1, 2, 2, 1],
                                    strides=[1, 2, 2, 1], padding="SAME")

        if bn:
            output = tf.layers.batch_normalization(
                output,
                beta_initializer=tf.zeros_initializer(),
                gamma_initializer=tf.ones_initializer(),
                moving_mean_initializer=tf.zeros_initializer(),
                moving_variance_initializer=tf.ones_initializer(),
                training=trainable)

        if activation == "leaky":
            output = tf.nn.leaky_relu(output)
        elif activation == "relu":
            output = tf.nn.relu(output)
        return output

def _create_continuous_state_encoder(self, s_size, h_size, num_streams, activation, num_layers):
    """
    Builds a set of hidden state encoders.
    :param s_size: state input size.
    :param h_size: Hidden layer size.
    :param num_streams: Number of state streams to construct.
    :param activation: What type of activation function to use for layers.
    :param num_layers: Number of hidden layers per stream.
    :return: List of hidden layer tensors.
    """
    self.state_in = tf.placeholder(shape=[None, s_size[0], s_size[1], s_size[2]],
                                   dtype=tf.float32, name='state')

    if self.normalize > 0:
        self.running_mean = tf.get_variable("running_mean", [s_size],
                                            trainable=False, dtype=tf.float32,
                                            initializer=tf.zeros_initializer())
        self.running_variance = tf.get_variable("running_variance", [s_size],
                                                trainable=False, dtype=tf.float32,
                                                initializer=tf.ones_initializer())
        self.norm_running_variance = tf.get_variable("norm_running_variance", [s_size],
                                                     trainable=False, dtype=tf.float32,
                                                     initializer=tf.ones_initializer())
        self.normalized_state = tf.clip_by_value(
            (self.state_in - self.running_mean) / tf.sqrt(self.norm_running_variance),
            -5, 5, name="normalized_state")

        self.new_mean = tf.placeholder(shape=[s_size], dtype=tf.float32,
                                       name='new_mean')
        self.new_variance = tf.placeholder(shape=[s_size], dtype=tf.float32,
                                           name='new_variance')
        self.update_mean = tf.assign(self.running_mean, self.new_mean)
        self.update_variance = tf.assign(self.running_variance, self.new_variance)
        self.update_norm_variance = tf.assign(
            self.norm_running_variance,
            self.running_variance / (tf.cast(self.global_step, tf.float32) + 1))
    else:
        self.normalized_state = self.state_in

    streams = []
    for i in range(num_streams):
        hidden = self.normalized_state
        for j in range(num_layers):
            hidden = tf.layers.dense(hidden, h_size, use_bias=False,
                                     activation=activation)
        streams.append(hidden)
    return streams