def __init__(self,
             relu=True,
             init_zero=False,
             center=True,
             scale=True,
             data_format='channels_last',
             **kwargs):
    """Create the wrapped batch-normalization layer.

    Args:
        relu: stored on `self.relu`; this constructor does not use it
            otherwise.
        init_zero: if truthy, gamma is initialized with zeros, otherwise
            with ones.
        center: forwarded to `tf.keras.layers.BatchNormalization`.
        scale: forwarded to `tf.keras.layers.BatchNormalization`.
        data_format: `'channels_first'` selects axis 1; any other value
            selects the last axis (-1).
        **kwargs: forwarded to the parent class constructor.
    """
    super(BatchNormRelu, self).__init__(**kwargs)
    self.relu = relu

    gamma_initializer = (
        tf.zeros_initializer() if init_zero else tf.ones_initializer())
    norm_axis = 1 if data_format == 'channels_first' else -1

    self.bn = tf.keras.layers.BatchNormalization(
        axis=norm_axis,
        momentum=BATCH_NORM_DECAY,
        epsilon=BATCH_NORM_EPSILON,
        center=center,
        scale=scale,
        fused=False,
        gamma_initializer=gamma_initializer)
def test_shared_sequence_non_sequence_into_input_layer(self):
    """Feed one shared embedding to SequenceFeatures and DenseFeatures.

    A sequence column and a non-sequence column share a ones-initialized,
    4-dimensional embedding with the 'sum' combiner. Both are fed the same
    sparse ids and the resulting dense outputs are checked.
    """
    non_seq_column = tf.feature_column.categorical_column_with_identity(
        'non_seq', num_buckets=10)
    seq_column = (
        tf.feature_column.sequence_categorical_column_with_identity(
            'seq', num_buckets=10))
    shared_non_seq, shared_seq = tf.feature_column.shared_embeddings(
        [non_seq_column, seq_column],
        dimension=4,
        combiner='sum',
        initializer=tf.ones_initializer(),
        shared_embedding_collection_name='shared')

    def make_ids():
        # Row 0 holds ids 0 and 1; row 1 holds id 2 only.
        return tf.SparseTensor(
            indices=[[0, 0], [0, 1], [1, 0]],
            values=[0, 1, 2],
            dense_shape=[2, 2])

    features = {'seq': make_ids(), 'non_seq': make_ids()}

    # Run the shared columns through the sequence and the dense feature
    # layers respectively.
    seq_input, seq_length = ksfc.SequenceFeatures([shared_seq])(features)
    non_seq_input = dense_features.DenseFeatures(
        [shared_non_seq])(features)

    expected_seq = [[[1, 1, 1, 1], [1, 1, 1, 1]],
                    [[1, 1, 1, 1], [0, 0, 0, 0]]]
    expected_seq_length = [2, 1]
    expected_non_seq = [[2, 2, 2, 2], [1, 1, 1, 1]]

    with self.cached_session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        output_seq, output_seq_length, output_non_seq = sess.run(
            [seq_input, seq_length, non_seq_input])
        self.assertAllEqual(output_seq, expected_seq)
        self.assertAllEqual(output_seq_length, expected_seq_length)
        self.assertAllEqual(output_non_seq, expected_non_seq)
def __init__(self,
             num_groups=None,
             group_size=None,
             eps=1e-5,
             beta_init=tf.zeros_initializer(),
             gamma_init=tf.ones_initializer(),
             **kwargs):
    """Record the group-normalization hyper-parameters.

    If neither `num_groups` nor `group_size` is supplied, `num_groups`
    falls back to 32.

    Args:
        num_groups: int, how many channel groups to normalize over.
        group_size: int, how large each normalization group is.
        eps: float, small constant added to avoid dividing by sqrt(0).
        beta_init: initializer for the bias; zeros by default.
        gamma_init: initializer for the scale; ones by default.
        **kwargs: remaining tf.keras.layers.Layer arguments.
    """
    super(GroupNormalization, self).__init__(**kwargs)

    nothing_specified = num_groups is None and group_size is None

    self._num_groups = 32 if nothing_specified else num_groups
    self._group_size = group_size
    self._eps = eps
    self._beta_init = beta_init
    self._gamma_init = gamma_init
def __init__(self, hdim, dtype=tf.float32, name="LayerNorm"):
    """Create the `beta` and `gamma` variables, each of shape `[hdim]`.

    Args:
        hdim: length of the `beta` and `gamma` vectors.
        dtype: dtype of both variables; also stored on `self._dtype`.
        name: passed to the parent constructor and used as the variable
            scope under which both variables are created.
    """
    super(NormLayer, self).__init__(name=name)
    self._dtype = dtype

    param_shape = [hdim]
    with tf.compat.v1.variable_scope(name):
        # beta starts at zero and gamma at one.
        self.beta = tf.compat.v1.get_variable(
            "beta",
            param_shape,
            dtype=dtype,
            initializer=tf.zeros_initializer())
        self.gamma = tf.compat.v1.get_variable(
            "gamma",
            param_shape,
            dtype=dtype,
            initializer=tf.ones_initializer())
def __init__(self,
             relu=True,
             init_zero=False,
             center=True,
             scale=True,
             data_format='channels_last',
             **kwargs):
    """Create the wrapped (optionally synchronized) batch-norm layer.

    `FLAGS.global_bn` selects `SyncBatchNormalization`; otherwise a regular
    `BatchNormalization` with `fused=False` is built. Both receive the same
    axis, momentum, epsilon, center, scale and gamma initializer.

    Args:
        relu: stored on `self.relu`; this constructor does not use it
            otherwise.
        init_zero: if truthy, gamma is initialized with zeros, otherwise
            with ones.
        center: forwarded to the batch-normalization layer.
        scale: forwarded to the batch-normalization layer.
        data_format: `'channels_first'` selects axis 1; any other value
            selects the last axis (-1).
        **kwargs: forwarded to the parent class constructor.
    """
    super(BatchNormRelu, self).__init__(**kwargs)
    self.relu = relu

    gamma_initializer = (
        tf.zeros_initializer() if init_zero else tf.ones_initializer())
    norm_axis = 1 if data_format == 'channels_first' else -1

    use_sync_bn = FLAGS.global_bn
    shared_args = dict(
        axis=norm_axis,
        momentum=FLAGS.batch_norm_decay,
        epsilon=BATCH_NORM_EPSILON,
        center=center,
        scale=scale,
        gamma_initializer=gamma_initializer)

    # TODO(srbs): Set fused=True
    # Batch normalization layers with fused=True only support 4D input
    # tensors.
    if use_sync_bn:
        self.bn = tf.keras.layers.experimental.SyncBatchNormalization(
            **shared_args)
    else:
        self.bn = tf.keras.layers.BatchNormalization(
            fused=False, **shared_args)
def call(self, input_tensor):
    """Normalize `input_tensor` over its last axis, then scale and shift.

    `self.beta` and `self.gamma` are created here, sized to the last
    dimension of the input, whenever they are still None.

    Args:
        input_tensor: value accepted by `tf.convert_to_tensor`.

    Returns:
        The normalized tensor, with its shape set from `get_shape_list`.
    """
    inputs = tf.convert_to_tensor(input_tensor)
    shape = get_shape_list(inputs)
    base_dtype = inputs.dtype.base_dtype
    last_axis = len(shape) - 1
    params_shape = [shape[last_axis]]

    # Create the offset (beta) and scale (gamma) parameters on first use.
    if self.beta is None:
        self.beta = tf.compat.v1.get_variable(
            "beta",
            shape=params_shape,
            dtype=base_dtype,
            initializer=tf.zeros_initializer(),
            trainable=True)
        self._trainable_weights.append(self.beta)
    if self.gamma is None:
        self.gamma = tf.compat.v1.get_variable(
            "gamma",
            shape=params_shape,
            dtype=base_dtype,
            initializer=tf.ones_initializer(),
            trainable=True)
        self._trainable_weights.append(self.gamma)

    # Statistics are taken along the last axis only.
    mean, variance = tf.nn.moments(inputs, [last_axis], keepdims=True)

    # float16 has a limited representable range, so it needs a larger
    # epsilon than the other dtypes.
    variance_epsilon = 1e-3 if base_dtype == tf.float16 else 1e-12

    # Layer normalization expressed through the batch_normalization op.
    outputs = tf.nn.batch_normalization(
        inputs,
        mean,
        variance,
        offset=self.beta,
        scale=self.gamma,
        variance_epsilon=variance_epsilon)
    outputs.set_shape(shape)
    return outputs
def test_generate_candidates(self,
                             want_names,
                             want_subnetwork_losses,
                             want_mixture_weight_losses,
                             want_complexities,
                             learn_mixture_weights=False,
                             initial_num_layers=0,
                             previous_ensemble=None):
    """Train each generated candidate for one step and check the results.

    For every builder yielded by `generator.generate_candidates`, the test
    builds the subnetwork, its train op and a mixture-weight train op, runs
    each train op once, and records the builder name, both losses and the
    subnetwork complexity.

    Args:
        want_names: expected builder names, in generation order.
        want_subnetwork_losses: expected subnetwork losses (atol=1e-3).
        want_mixture_weight_losses: expected mixture-weight losses
            (atol=1e-3).
        want_complexities: expected subnetwork complexities (atol=1e-3).
        learn_mixture_weights: forwarded to `simple_dnn.Generator`.
        initial_num_layers: forwarded to `simple_dnn.Generator`.
        previous_ensemble: forwarded to `generate_candidates` and
            `build_subnetwork`.
    """
    feature_columns = [tf.feature_column.numeric_column("x")]
    generator = simple_dnn.Generator(
        feature_columns=feature_columns,
        optimizer=tf.compat.v1.train.GradientDescentOptimizer(.1),
        layer_size=3,
        initial_num_layers=initial_num_layers,
        learn_mixture_weights=learn_mixture_weights,
        seed=42)
    with context.graph_mode(), tf.Graph().as_default() as g:
        iteration_step = tf.compat.v1.train.create_global_step()
        features = {"x": [[1.], [2.]]}
        labels = tf.constant([[0.], [1.]])
        names = []
        subnetwork_losses = []
        mixture_weight_losses = []
        complexities = []
        for builder in generator.generate_candidates(
                previous_ensemble,
                # The following arguments are not used by
                # simple_dnn.BuilderGenerator's generate_candidates.
                iteration_number=0,
                previous_ensemble_reports=[],
                all_reports=[]):
            names.append(builder.name)

            # 1. Build subnetwork graph.
            subnetwork = builder.build_subnetwork(
                features,
                logits_dimension=1,
                training=True,
                iteration_step=iteration_step,
                summary=tf.summary,
                previous_ensemble=previous_ensemble)

            # 2. Build subnetwork train ops.
            subnetwork_loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=subnetwork.logits, labels=labels))
            subnetwork_train_op = builder.build_subnetwork_train_op(
                subnetwork,
                subnetwork_loss,
                var_list=None,
                labels=labels,
                iteration_step=iteration_step,
                summary=tf.summary,
                previous_ensemble=None)

            # 3. Build mixture weight train ops.

            # Stop gradients since mixture weights should not propagate
            # beyond the top layer.
            subnetwork_logits = tf.stop_gradient(subnetwork.logits)

            # Mixture weight will initialize to a one-valued scalar.
            mixture_weight_logits = tf.compat.v1.layers.dense(
                subnetwork_logits,
                units=1,
                use_bias=False,
                kernel_initializer=tf.ones_initializer())
            mixture_weight_loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=mixture_weight_logits, labels=labels))
            mixture_weight_train_op = builder.build_mixture_weights_train_op(
                mixture_weight_loss,
                var_list=None,
                labels=labels,
                logits=mixture_weight_logits,
                iteration_step=iteration_step,
                summary=tf.summary)

            # `test_session` is deprecated in favor of `session()` /
            # `cached_session()`; `session` is used because an explicit
            # graph is supplied.
            with self.session(graph=g) as sess:
                sess.run(tf.compat.v1.global_variables_initializer())
                sess.run(subnetwork_train_op)
                sess.run(mixture_weight_train_op)
                subnetwork_losses.append(sess.run(subnetwork_loss))
                mixture_weight_losses.append(sess.run(mixture_weight_loss))
                complexities.append(sess.run(subnetwork.complexity))

    self.assertEqual(want_names, names)
    self.assertAllClose(want_subnetwork_losses, subnetwork_losses, atol=1e-3)
    self.assertAllClose(
        want_mixture_weight_losses, mixture_weight_losses, atol=1e-3)
    self.assertAllClose(want_complexities, complexities, atol=1e-3)