def get_sentence_order_output(albert_config, input_tensor, labels):
    """Build the two-way sentence-order classification head and its loss.

    Label 0 means "next sentence" and label 1 means "random sentence". The
    weight matrix created here is not used after pre-training.

    Args:
        albert_config: Config object; `hidden_size` and `initializer_range`
            are read from it.
        input_tensor: Tensor multiplied against the transposed
            `[2, hidden_size]` weight matrix (presumably
            `[batch, hidden_size]` -- confirm against the caller).
        labels: Integer class ids; flattened to 1-D before one-hot encoding.

    Returns:
        A tuple `(loss, per_example_loss, log_probs)`: the mean loss, the
        per-example negative log-likelihood, and the log-softmax over the two
        classes.
    """
    with tf.variable_scope("cls/seq_relationship"):
        weight_init = modeling.create_initializer(
            albert_config.initializer_range)
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, albert_config.hidden_size],
            initializer=weight_init)
        output_bias = tf.get_variable(
            "output_bias", shape=[2], initializer=tf.zeros_initializer())

        # Simple binary classification: affine projection + log-softmax.
        scores = tf.nn.bias_add(
            tf.matmul(input_tensor, output_weights, transpose_b=True),
            output_bias)
        log_probs = tf.nn.log_softmax(scores, axis=-1)

        flat_labels = tf.reshape(labels, [-1])
        targets = tf.one_hot(flat_labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(targets * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, log_probs)
def binary_logits(self, hidden, scope="binary", reuse=False):
    """Compute per-element binary classification logits.

    `hidden` is first passed through a GELU-activated dense projection of
    width `net_config.d_model`, then reduced to one logit per element with a
    learned weight vector and a scalar bias.

    Args:
        hidden: Input activations. After the projection they are contracted
            with the einsum pattern "bid,d->bi", so a rank-3 tensor is
            expected at that point.
        scope: Prefix for the two variable scopes (`<scope>_proj` and
            `<scope>_loss`).
        reuse: Forwarded to both `tf.variable_scope` calls.

    Returns:
        The logits, always as `tf.float32`.
    """
    model_dim = self.net_config.d_model
    initializer = self.get_initializer()

    with tf.variable_scope("{}_proj".format(scope), reuse=reuse):
        hidden = ops.dense(
            hidden,
            model_dim,
            activation=ops.get_activation("gelu"),
            initializer=initializer)

    with tf.variable_scope("{}_loss".format(scope), reuse=reuse):
        weight = tf.get_variable(
            "weight", [model_dim], dtype=hidden.dtype, initializer=initializer)
        bias = tf.get_variable(
            "bias", [1], dtype=hidden.dtype,
            initializer=tf.zeros_initializer())
        logits = tf.einsum("bid,d->bi", hidden, weight) + bias

        # The loss is always computed in float32, whatever the compute dtype.
        if logits.dtype != tf.float32:
            logits = tf.cast(logits, tf.float32)

    return logits
def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
    """Compute masked-LM logits (adapted from run_pretraining.py).

    Args:
        input_tensor: Encoder output passed to `gather_indexes` together with
            `mlm_positions`.
        albert_config: Config object; `embedding_size`, `hidden_act`,
            `initializer_range` and `vocab_size` are read from it.
        mlm_positions: Positions handed to `gather_indexes`.
        output_weights: Output projection matrix; the same weights as the
            input embeddings.

    Returns:
        The logits tensor: the transformed activations multiplied by the
        transposed `output_weights`, plus a per-token output bias.
    """
    gathered = gather_indexes(input_tensor, mlm_positions)

    with tf.variable_scope("cls/predictions"):
        # One more non-linear transformation before the output layer. This
        # matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            transformed = tf.layers.dense(
                gathered,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range))
            transformed = modeling.layer_norm(transformed)

        # The output weights are tied to the input embeddings, but each token
        # gets its own output-only bias.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer())
        projected = tf.matmul(transformed, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(projected, output_bias)

    return logits
def _create_user_terms(self, users, N):
    """Create the user terms of the base model plus an implicit-feedback term.

    Args:
        users: Indices used both by the parent implementation and to gather
            rows from the combined implicit-feedback embeddings.
        N: Sparse ids passed to `tf.nn.embedding_lookup_sparse`
            (presumably the items each user interacted with -- confirm
            against the caller).

    Returns:
        A tuple `(p_u, b_u, y_u)`: the two terms produced by the parent class
        and the gathered implicit-feedback term.
    """
    p_u, b_u = super(SVDPP, self)._create_user_terms(users)

    with tf.variable_scope('user'):
        feedback_table = tf.get_variable(
            name='implict_feedback_embedding',
            shape=[self.num_items, self.num_factors],
            initializer=tf.zeros_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(self.reg_y_u))

        # 'sqrtn' combiner: sum of the looked-up rows divided by the square
        # root of the number of rows.
        combined = tf.nn.embedding_lookup_sparse(
            feedback_table, N, sp_weights=None, combiner='sqrtn')
        y_u = tf.gather(combined, users, name='y_u')

    return p_u, b_u, y_u
def _build_tiled_linear(self, inputs, input_name_and_sizes,
                        output_name_and_sizes, add_bias):
    """Build one linear output per entry of `output_name_and_sizes`.

    For every output, each input is multiplied by its own weight
    `W_<input>_<output>` via `tf.sparse_tensor_dense_matmul` and the products
    are summed; the sum is transposed and, optionally, a bias is added.

    Args:
        inputs: Input tensors, paired positionally with
            `input_name_and_sizes`.
        input_name_and_sizes: Iterable of `(name, size)` pairs for the inputs.
        output_name_and_sizes: Iterable of `(name, size)` pairs for the
            outputs.
        add_bias: Whether to add a bias `B_<output>` to each output.

    Returns:
        A list with one tensor per output, in the order given.
    """
    outputs = []
    for output_name, output_size in output_name_and_sizes:
        total = 0.0
        for tensor, (input_name, input_size) in zip(inputs,
                                                    input_name_and_sizes):
            weight = self._get_variable(
                'W_{}_{}'.format(input_name, output_name),
                shape=[output_size, input_size])
            total += tf.sparse_tensor_dense_matmul(
                weight, tensor, adjoint_b=True)
        total = tf.transpose(total)

        if add_bias:
            # Biases are dense, so they come from the base class's
            # _get_variable rather than this class's override.
            bias = super(SparseTiledLinear, self)._get_variable(
                'B_{}'.format(output_name),
                shape=[output_size],
                default_initializer=tf.zeros_initializer())
            total += bias

        outputs.append(total)
    return outputs
def get_data_and_params():
    """Set up the MNIST input dataset, hyper-parameters and model variables.

    Returns:
        A tuple `(dataset, opt, hparams, w, b)`: a repeated, shuffled and
        batched `tf.data.Dataset` of `(flattened image / 256, one-hot label)`
        pairs, a gradient-descent optimizer, the hyper-parameters, and the
        weight (`[784, 10]`) and bias (`[10]`) variables.
    """
    (train_x, train_y), _ = tf.keras.datasets.mnist.load_data()
    tf.set_random_seed(0)
    hparams = contrib_training.HParams(
        batch_size=200, learning_rate=0.1, train_steps=101)

    def _to_model_inputs(images, labels):
        # Flatten each image to 784 floats scaled by 1/256 and one-hot encode
        # the labels over the 10 classes.
        flat = tf.to_float(tf.reshape(images, (-1, 28 * 28))) / 256.0
        return (flat, tf.one_hot(tf.squeeze(labels), 10))

    dataset = (
        tf.data.Dataset.from_tensor_slices((train_x, train_y))
        .repeat()
        .shuffle(hparams.batch_size * 10)
        .batch(hparams.batch_size)
        .map(_to_model_inputs))

    w = tf.get_variable('w0', (28 * 28, 10))
    b = tf.get_variable('b0', (10,), initializer=tf.zeros_initializer())
    opt = tf.train.GradientDescentOptimizer(hparams.learning_rate)

    return dataset, opt, hparams, w, b
def conv(batch_input, out_channels, stride, filterSize=4, initScale=0.02,
         useXavier=False, paddingSize=1, useBias=False):
    """Apply an explicitly zero-padded 2-D convolution.

    The input is padded with `paddingSize` zeros on both sides of its two
    middle axes and then convolved with padding="VALID".

    Args:
        batch_input: 4-D input tensor; its last axis is read as the channel
            count and must be statically known.
        out_channels: Number of output channels.
        stride: Stride applied on both middle axes.
        filterSize: Side length of the square kernel.
        initScale: Standard deviation of the normal kernel initializer, or the
            multiplier on the Xavier-style deviation when `useXavier` is set.
        useXavier: If true, use `sqrt(2 / (in + out)) * initScale` as the
            standard deviation.
        paddingSize: Number of zeros added on each side of the middle axes.
        useBias: If true, add a learned per-channel offset.

    Returns:
        The convolution output tensor.
    """
    with tf.variable_scope("conv"):
        in_channels = int(batch_input.get_shape()[-1])

        if useXavier:
            stddev = np.sqrt(
                2.0 / (int(in_channels) + int(out_channels))) * initScale
        else:
            stddev = initScale

        kernel = tf.get_variable(
            "filter",
            [filterSize, filterSize, in_channels, out_channels],
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(0, stddev))

        edge = [paddingSize, paddingSize]
        padded_input = tf.pad(
            batch_input, [[0, 0], edge, edge, [0, 0]],
            mode="CONSTANT")  #SYMMETRIC
        output = tf.nn.conv2d(
            padded_input, kernel, [1, stride, stride, 1], padding="VALID")

        if useBias:
            offset = tf.get_variable(
                "offset",
                [1, 1, 1, out_channels],
                dtype=tf.float32,
                initializer=tf.zeros_initializer())
            output = output + offset

        return output
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps,
             reuse=False):  #pylint: disable=W0613
    """Build separate two-layer tanh MLPs for the policy and value function.

    Args:
        sess: Session used by the `step` and `value` callables.
        ob_space: Observation space; its `shape` defines the placeholder.
        ac_space: Action space; `shape[0]` is the action dimension.
        nbatch: Leading (batch) dimension of the observation placeholder.
        nsteps: Unused here.
        reuse: Forwarded to the "model" variable scope.
    """
    ob_shape = (nbatch, ) + ob_space.shape
    actdim = ac_space.shape[0]
    X = tf.placeholder(tf.float32, ob_shape, name='Ob')  #obs
    hidden_scale = np.sqrt(2)

    with tf.variable_scope("model", reuse=reuse):
        # Policy tower.
        pi_hidden = tf.tanh(fc(X, 'pi_fc1', nh=64, init_scale=hidden_scale))
        pi_hidden = tf.tanh(
            fc(pi_hidden, 'pi_fc2', nh=64, init_scale=hidden_scale))
        pi = fc(pi_hidden, 'pi', actdim, init_scale=0.01)

        # Value tower.
        vf_hidden = tf.tanh(fc(X, 'vf_fc1', nh=64, init_scale=hidden_scale))
        vf_hidden = tf.tanh(
            fc(vf_hidden, 'vf_fc2', nh=64, init_scale=hidden_scale))
        vf = fc(vf_hidden, 'vf', 1)[:, 0]

        logstd = tf.get_variable(
            name="logstd",
            shape=[1, actdim],
            initializer=tf.zeros_initializer())

    # `pi * 0.0 + logstd` broadcasts the single log-std row across the batch.
    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        # Sample an action and evaluate its value and negative log-prob.
        action, value_estimate, neglogp = sess.run(
            [a0, vf, neglogp0], {X: ob})
        return action, value_estimate, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self,
             filters,
             kernel_size,
             strides=(1, 1),
             padding='same',
             data_format='channels_last',
             activation=None,
             use_bias=True,
             kernel_initializer=None,
             bias_initializer=tf.zeros_initializer(),
             kernel_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             bias_constraint=None,
             trainable=True,
             name=None,
             **kwargs):
    """Initialise the layer by delegating to `_Conv.__init__`.

    All arguments are forwarded unchanged to `_Conv.__init__`. Afterwards
    `self.neuron_scale` is set from `_get_neuron_scale(filters, kernel_size)`.
    """
    conv_options = dict(
        strides=strides,
        padding=padding,
        data_format=data_format,
        activation=activation,
        use_bias=use_bias,
        kernel_initializer=kernel_initializer,
        bias_initializer=bias_initializer,
        kernel_regularizer=kernel_regularizer,
        bias_regularizer=bias_regularizer,
        activity_regularizer=activity_regularizer,
        kernel_constraint=kernel_constraint,
        bias_constraint=bias_constraint,
        trainable=trainable,
        name=name)
    # Extra keyword arguments cannot collide with the named ones above.
    conv_options.update(kwargs)

    _Conv.__init__(self, filters, kernel_size, **conv_options)
    self.neuron_scale = _get_neuron_scale(self.filters, self.kernel_size)
def model_fn(model, features, labels, mode):
    """Build a single ReLU layer and hand its activations to `model`.

    Args:
        model: Project object exposing `send`, `minimize` and `make_spec`
            (semantics are defined outside this block).
        features: Dict of input tensors; only `features['x']` is used, and it
            is multiplied by a `[392, 128]` weight matrix.
        labels: Only printed for debugging.
        mode: One of the `tf.estimator.ModeKeys` values.

    Returns:
        Whatever `model.make_spec` returns for the given mode: a training
        spec with `train_op`, an evaluation spec with a placeholder loss, or a
        prediction spec exposing `act1_f`.
    """
    x = features['x']
    print(features, labels, mode)

    # Floor division keeps the dimension an int: under Python 3 `28 * 28 / 2`
    # is the float 392.0, which is not a valid variable shape dimension.
    w1f = tf.get_variable(
        'w1f',
        shape=[28 * 28 // 2, 128],
        dtype=tf.float32,
        initializer=tf.random_uniform_initializer(-0.01, 0.01))
    b1f = tf.get_variable(
        'b1f',
        shape=[128],
        dtype=tf.float32,
        initializer=tf.zeros_initializer())

    act1_f = tf.nn.relu(tf.nn.bias_add(tf.matmul(x, w1f), b1f))

    if mode == tf.estimator.ModeKeys.TRAIN:
        # The gradient w.r.t. the activations is obtained through
        # `model.send` and used as `grad_loss` when minimising.
        gact1_f = model.send('act1_f', act1_f, require_grad=True)
        optimizer = tf.train.GradientDescentOptimizer(0.1)
        train_op = model.minimize(
            optimizer,
            act1_f,
            grad_loss=gact1_f,
            global_step=tf.train.get_or_create_global_step())
        logging.info("trainning")
        return model.make_spec(
            mode, loss=tf.math.reduce_mean(act1_f), train_op=train_op)

    # Reached for both EVAL and PREDICT.
    logging.info("eval")

    if mode == tf.estimator.ModeKeys.EVAL:
        model.send('act1_f', act1_f, require_grad=False)
        # Placeholder loss: the mean activation, not a real objective.
        fake_loss = tf.reduce_mean(act1_f)
        return model.make_spec(mode=mode, loss=fake_loss)

    # mode == tf.estimator.ModeKeys.PREDICT:
    return model.make_spec(mode=mode, predictions={'act1_f': act1_f})
def zero_hidden_model(X_train, y_train, X_test, y_test, iter_num=2000):
    """Train a logistic-regression model (no hidden layer) and score it.

    The model is trained full-batch with Adam (learning rate 0.001) on a
    sigmoid cross-entropy loss. The cost is recorded every 10th iteration.

    Args:
        X_train: Training features; `X_train.shape[1]` is the feature count.
        y_train: Training labels, compared element-wise with the 0/1
            predictions.
        X_test: Test features.
        y_test: Test labels.
        iter_num: Number of training iterations.

    Returns:
        `((iter_list, cost_list), (train_acc, test_acc))`: the recorded
        iteration indices and costs, and the final train/test accuracy.
    """
    tf.reset_default_graph()

    n_feature = X_train.shape[1]
    X = tf.placeholder(tf.float32, shape=(None, n_feature))
    Y = tf.placeholder(tf.float32, shape=(None))

    w1 = tf.get_variable(
        name='w1',
        shape=(n_feature, 1),
        dtype=tf.float32,
        initializer=tf.keras.initializers.glorot_uniform())
    b1 = tf.get_variable(
        name='b1',
        shape=(1, 1),
        dtype=tf.float32,
        initializer=tf.zeros_initializer())

    z = tf.reshape(tf.add(tf.matmul(X, w1), b1), [-1])
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=z))
    opt = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    init = tf.global_variables_initializer()

    iter_list, cost_list = [], []

    with tf.Session() as sess:
        sess.run(init)

        for step in range(iter_num):
            _, cost = sess.run(
                [opt, loss], feed_dict={X: X_train, Y: y_train})
            if step % 10 == 0:
                iter_list.append(step)
                cost_list.append(cost)

        def _predict(features):
            # A positive logit corresponds to a sigmoid output above 0.5.
            return np.array(
                sess.run(z, feed_dict={X: features}) > 0, dtype=int)

        train_predict = _predict(X_train)
        test_predict = _predict(X_test)
        train_acc = np.sum(train_predict == y_train) / len(train_predict)
        test_acc = np.sum(test_predict == y_test) / len(test_predict)

    return (iter_list, cost_list), (train_acc, test_acc)
def _target_network(self, obs):
    """Implements the random target network used by RND.

    Three leaky-ReLU convolutions followed by a linear fully connected layer.
    All layers are created with `trainable=False`, orthogonal weights
    (gain sqrt(2)) and zero biases.

    Args:
        obs: Input tensor fed to the first convolution.

    Returns:
        The embedding tensor with `self.embedding_size` output units.
    """
    # (num_outputs, kernel_size, stride) for each convolution, in order.
    conv_specs = (
        (32, [8, 8], 4),
        (64, [4, 4], 2),
        (64, [3, 3], 1),
    )

    with slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            trainable=False,
            weights_initializer=tf.orthogonal_initializer(gain=np.sqrt(2)),
            biases_initializer=tf.zeros_initializer()):
        net = obs
        for num_outputs, kernel_size, stride in conv_specs:
            net = slim.conv2d(
                net,
                num_outputs,
                kernel_size,
                stride=stride,
                activation_fn=tf.nn.leaky_relu)

        net = slim.flatten(net)
        embedding = slim.fully_connected(
            net, self.embedding_size, activation_fn=None)

    return embedding
def stage_1(lr, inputs, labels):
    """First pipeline stage: bump the iteration counter, apply fc1 + dropout.

    Relies on names from the enclosing scope: `iterations_per_dense_grad`,
    `fc_layers`, `dense_grad_enabled`, `training`, `rand_ops` and `droprate`.

    Args:
        lr: Passed through unchanged.
        inputs: Input to the 'fc1' layer.
        labels: Passed through unchanged.

    Returns:
        `(lr, labels, drop1, last_itr)`, where `last_itr` is a boolean tensor
        that is true when the incremented counter is a multiple of
        `iterations_per_dense_grad`.
    """
    # Counter that tracks the last iteration of each dense-gradient period.
    with tf.variable_scope("counter", reuse=tf.AUTO_REUSE, use_resource=True):
        counter = tf.get_variable(
            "iterations",
            shape=[],
            dtype=tf.int32,
            trainable=False,
            initializer=tf.zeros_initializer())
        incremented = tf.assign_add(counter, 1)
        remainder = tf.math.floormod(incremented, iterations_per_dense_grad)
        last_itr = tf.equal(remainder, 0)

    hidden = fc_layers['fc1'](inputs, dense_grad_enabled and last_itr)

    # Use the IPU optimised version of dropout (training only).
    drop1 = rand_ops.dropout(hidden, rate=droprate) if training else hidden

    return lr, labels, drop1, last_itr
def create_visualencoder(self, x):
    """Build the stack of ELU convolutions that encodes the visual input.

    Configuration is read from `self.settings`:
      * "pad_visuals": if truthy, `self.apply_visual_pad` is applied first.
      * "visualencoder_n_convs": number of convolution layers.
      * "visualencoder_n_filters" / "visualencoder_filter_sizes": per-layer
        filter counts and kernel sizes.
      * "visualencoder_peepholes" + "peephole_convs": layers whose output is
        concatenated with their own input along the last axis.
      * "visualencoder_poolings": layers after which a 2x2 max-pool op is
        created (see the review note below).

    Args:
        x: Input tensor to the first convolution.

    Returns:
        The output tensor of the last layer (after the optional peephole
        concatenation).
    """
    with tf.variable_scope("visualencoder", reuse=tf.AUTO_REUSE) as vs:
        if self.settings["pad_visuals"]:
            x = self.apply_visual_pad(x)
        for n in range(self.settings['visualencoder_n_convs']):
            y = tf.layers.conv2d(
                x,
                self.settings["visualencoder_n_filters"][n],
                self.settings["visualencoder_filter_sizes"][n],
                name='visualencoder_layer{}'.format(n),
                padding='same',
                activation=tf.nn.elu,
                kernel_initializer=tf.keras.initializers.glorot_uniform(),
                bias_initializer=tf.zeros_initializer(),
            )
            # Peephole: keep the layer input alongside its output.
            if n in self.settings[
                    "visualencoder_peepholes"] and self.settings[
                        "peephole_convs"]:
                x = tf.concat([y, x], axis=-1)
            else:
                x = y
            # NOTE(review): the pooled tensor is assigned to `y`, but `y` is
            # overwritten by the next convolution and `x` is what gets
            # returned, so this pooling never reaches the output. Probably
            # `x` was meant to be pooled -- confirm before changing, since a
            # fix alters the output shape.
            if n in self.settings["visualencoder_poolings"]:
                y = tf.layers.max_pooling2d(y, 2, 2, padding='same')
    return x
def mlm_weight(config, sequence_output, embedding_table,
               scope='cls/predictions'):
    """Compute masked-LM logits over the vocabulary.

    Args:
        config: Config object; `embedding_size`, `hidden_act`,
            `initializer_range` and `vocab_size` are read from it.
        sequence_output: Encoder output fed to the transform layer.
        embedding_table: Embedding matrix reused (transposed) as the output
            projection.
        scope: Variable scope under which the variables are created.

    Returns:
        The logits tensor: the transformed activations multiplied by the
        transposed `embedding_table`, plus a per-token output bias.
    """
    with tf.variable_scope(scope):
        # One more non-linear transformation before the output layer. This
        # matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            transformed = tf.layers.dense(
                sequence_output,
                units=config.embedding_size,
                activation=get_activation(config.hidden_act),
                kernel_initializer=create_initializer(
                    config.initializer_range))
            transformed = layer_norm(transformed)

        # The output weights are tied to the input embeddings, but each token
        # gets its own output-only bias.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[config.vocab_size],
            initializer=tf.zeros_initializer())
        projected = tf.matmul(transformed, embedding_table, transpose_b=True)
        logits = tf.nn.bias_add(projected, output_bias)

    return logits
def scale_gaussian_prior(name, z, logscale_factor=3.0, trainable=True):
    """Returns N(s^i * z^i, std^i) where s^i and std^i are per-component.

    s^i is a learnable multiplier initialised to one. The log of std^i is
    initialised to zero (so std^i starts at one) and is optionally learnable.

    Args:
        name: variable scope.
        z: input tensor; its shape, as reported by
            `common_layers.shape_list`, is used for both variables.
        logscale_factor: multiplier applied to the log-scale variable before
            exponentiation; equivalent to scaling up the learning rate by this
            factor.
        trainable: Whether or not std^i is learnt.

    Returns:
        A `tfp.distributions.Normal` with `loc = s * z` and
        `scale = exp(log_scale * logscale_factor)`.
    """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        latent_shape = common_layers.shape_list(z)

        multiplier = tf.get_variable(
            "latent_multiplier",
            shape=latent_shape,
            dtype=tf.float32,
            initializer=tf.ones_initializer())
        raw_log_scale = tf.get_variable(
            "log_scale_latent",
            shape=latent_shape,
            dtype=tf.float32,
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        scaled_log_scale = raw_log_scale * logscale_factor
        return tfp.distributions.Normal(
            loc=multiplier * z, scale=tf.exp(scaled_log_scale))
def get_logits(bert_config, input_tensor, output_weights, positions):
    """Get logits for the masked LM.

    Args:
        bert_config: Config object; `hidden_size`, `embedding_size`,
            `hidden_act`, `initializer_range` and `vocab_size` are read.
        input_tensor: Encoder output passed to `gather_indexes` together with
            `positions`.
        output_weights: Output projection; the same weights as the input
            embeddings. When `hidden_size != embedding_size`, extra learned
            columns are concatenated along axis 1.
        positions: Positions handed to `gather_indexes`.

    Returns:
        The logits tensor.
    """
    gathered = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # One more non-linear transformation before the output layer. This
        # matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            transformed = tf.layers.dense(
                gathered,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            transformed = modeling.layer_norm(transformed)

        # The output weights are tied to the input embeddings, but each token
        # gets its own output-only bias.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())

        if bert_config.hidden_size != bert_config.embedding_size:
            # Widen the embedding matrix with learned columns so its width
            # matches the hidden size.
            extra_width = bert_config.hidden_size - bert_config.embedding_size
            extra_output_weights = tf.get_variable(
                name="extra_output_weights",
                shape=[bert_config.vocab_size, extra_width],
                initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            output_weights = tf.concat(
                [output_weights, extra_output_weights], axis=1)

        projected = tf.matmul(transformed, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(projected, output_bias)

    return logits
def build_controller(self):
    """Create the RNN and output projections for controlling the stack.

    Sets `self.rnn`, the input projection weights and bias, and the
    (projection, bias) pairs for the push, pop, value and output heads.
    """
    with tf.name_scope("controller"):
        self.rnn = contrib.rnn().BasicRNNCell(self._num_units)

        # The input projection consumes one embedding per read head plus one.
        input_width = self._embedding_size * (self._num_read_heads + 1)
        self._input_proj = self.add_variable(
            "input_projection_weights",
            shape=[input_width, self._num_units],
            dtype=self.dtype)
        self._input_bias = self.add_variable(
            "input_projection_bias",
            shape=[self._num_units],
            initializer=tf.zeros_initializer(dtype=self.dtype))

        push = self.add_scalar_projection("push", self._num_write_heads)
        self._push_proj, self._push_bias = push

        pop = self.add_scalar_projection("pop", self._num_write_heads)
        self._pop_proj, self._pop_bias = pop

        value = self.add_vector_projection("value", self._num_write_heads)
        self._value_proj, self._value_bias = value

        output = self.add_vector_projection("output", 1)
        self._output_proj, self._output_bias = output
def layer_norm(inputs, scope='ln', epsilon=1e-8):
    '''Applies layer normalization. See https://arxiv.org/abs/1607.06450.

    The inputs are normalised over their last dimension and then scaled and
    shifted by the learned `gamma` and `beta` parameters.

    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension
        has `batch_size`.
      scope: Optional scope for `variable_scope`.
      epsilon: A floating number. A very small number added to the variance
        to prevent a ZeroDivision error. Defaults to the previously
        hard-coded value, so existing callers are unaffected.

    Returns:
      A tensor with the same shape and data dtype as `inputs`.
    '''
    with tf.variable_scope(scope):
        # Parameters have the size of the last (feature) dimension only.
        params_shape = inputs.get_shape()[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable(
            "beta", params_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable(
            "gamma", params_shape, initializer=tf.ones_initializer())

        normalized = (inputs - mean) / ((variance + epsilon)**(.5))
        outputs = gamma * normalized + beta

    return outputs
def __init__(self,
             reward_range,
             observation_space,
             action_space,
             frame_stack_size,
             frame_height,
             frame_width,
             initial_frame_chooser,
             batch_size,
             model_name,
             model_hparams,
             model_dir,
             intrinsic_reward_scale=0.0,
             sim_video_dir=None):
    """Batch of environments inside the TensorFlow graph.

    Args:
      reward_range: Indexable; element 0 is stored as the minimum reward and
        the whole value is passed to `DummyWorldModelProblem`.
      observation_space: Forwarded to the base-class constructor.
      action_space: Forwarded to the base-class constructor and to
        `DummyWorldModelProblem`.
      frame_stack_size: Stored as the number of frames; also passed to
        `HistoryBuffer`.
      frame_height: Passed to `DummyWorldModelProblem`.
      frame_width: Passed to `DummyWorldModelProblem`.
      initial_frame_chooser: Passed to `HistoryBuffer`.
      batch_size: Number of environments in the batch.
      model_name: Name looked up in the model registry.
      model_hparams: Model hyper-parameters; shallow-copied before being
        modified here.
      model_dir: Stored on the instance; not otherwise used in this method.
      intrinsic_reward_scale: Stored on the instance.
      sim_video_dir: Optional directory; when set, it is created and the
        video-related state is initialised.
    """
    super(SimulatedBatchEnv, self).__init__(observation_space, action_space)

    self._ffmpeg_works = common_video.ffmpeg_works()
    self.batch_size = batch_size
    self._min_reward = reward_range[0]
    self._num_frames = frame_stack_size
    self._intrinsic_reward_scale = intrinsic_reward_scale
    # Scalar int32 counter, excluded from training.
    self._episode_counter = tf.get_variable("episode_counter",
                                            initializer=tf.zeros(
                                                (), dtype=tf.int32),
                                            trainable=False,
                                            dtype=tf.int32)
    if sim_video_dir:
        self._video_every_epochs = 100
        self._video_dir = sim_video_dir
        self._video_writer = None
        self._video_counter = 0
        tf.gfile.MakeDirs(self._video_dir)
        # True whenever the episode counter is a multiple of
        # `_video_every_epochs`.
        self._video_condition = tf.equal(
            self._episode_counter.read_value() % self._video_every_epochs, 0)
    else:
        self._video_condition = tf.constant(False, dtype=tf.bool, shape=())

    # Work on a shallow copy so the caller's hparams object is not given the
    # problem hparams or the `force_full_predict` flag.
    model_hparams = copy.copy(model_hparams)
    problem = DummyWorldModelProblem(action_space, reward_range, frame_height,
                                     frame_width)
    trainer_lib.add_problem_hparams(model_hparams, problem)
    model_hparams.force_full_predict = True
    self._model = registry.model(model_name)(model_hparams,
                                             tf.estimator.ModeKeys.PREDICT)

    self.history_buffer = HistoryBuffer(initial_frame_chooser,
                                        self.observ_shape, self.observ_dtype,
                                        self._num_frames, self.batch_size)

    # Non-trainable buffer with one observation per environment.
    self._observ = tf.Variable(tf.zeros((batch_size, ) + self.observ_shape,
                                        self.observ_dtype),
                               trainable=False)

    # Scalar, zero-initialised, non-trainable variable.
    # NOTE(review): presumably a flag telling the model to reset -- confirm
    # where it is read.
    self._reset_model = tf.get_variable("reset_model", [],
                                        trainable=False,
                                        initializer=tf.zeros_initializer())

    self._model_dir = model_dir
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """See base class.

    For every (gradient, parameter) pair this creates per-parameter slot
    variables, optionally adds weight decay to the gradient, updates a
    moving average `m` of the gradient, optionally rescales the step by the
    ratio of the parameter norm to the update norm, and assigns the new
    parameter value.

    Args:
      grads_and_vars: Iterable of `(grad, param)` pairs; pairs in which
        either element is `None` are skipped.
      global_step: Not used by this implementation.
      name: Name given to the returned grouped op.

    Returns:
      A `tf.group` op that runs all the assignments.
    """
    assignments = []
    for (grad, param) in grads_and_vars:
        if grad is None or param is None:
            continue

        param_name = self._get_variable_name(param.name)

        # Moving average of the gradient, same shape as the parameter.
        m = tf.get_variable(name=six.ensure_str(param_name) + "/m",
                            shape=param.shape.as_list(),
                            dtype=tf.float32,
                            trainable=False,
                            initializer=tf.zeros_initializer())

        # Note: shape is not passed here explicitly since tf.get_variable
        # complains when you do that while passing a Tensor as an initializer.
        prev_w_norm = tf.get_variable(
            name=six.ensure_str(param_name) + "/prev_w_norm",
            dtype=tf.float32,
            trainable=False,
            initializer=lambda w=param: tf.norm(w.initialized_value(), ord=2))

        # Previous step's eta and beta; passed to `compute_x` when
        # `use_igt` is set.
        prev_eta = tf.get_variable(name=six.ensure_str(param_name) +
                                   "/prev_eta",
                                   shape=[],
                                   dtype=tf.float32,
                                   trainable=False,
                                   initializer=tf.zeros_initializer())
        prev_beta = tf.get_variable(name=six.ensure_str(param_name) +
                                    "/prev_beta",
                                    shape=[],
                                    dtype=tf.float32,
                                    trainable=False,
                                    initializer=tf.zeros_initializer())

        if self._do_use_weight_decay(param_name):
            grad += self.weight_decay_rate * param

        if self.use_adaptive:
            # Running statistics used to derive a per-parameter eta and beta.
            grad_squared_sum = tf.get_variable(
                name=six.ensure_str(param_name) + "/grad_squared_sum",
                shape=[],
                dtype=tf.float32,
                trainable=False,
                initializer=tf.zeros_initializer())

            max_grad = tf.get_variable(name=six.ensure_str(param_name) +
                                       "/max_grad",
                                       shape=[],
                                       dtype=tf.float32,
                                       trainable=False,
                                       initializer=tf.zeros_initializer())

            iteration = tf.get_variable(name=six.ensure_str(param_name) +
                                        "/iteration",
                                        shape=[],
                                        dtype=tf.float32,
                                        trainable=False,
                                        initializer=tf.zeros_initializer())

            # NOTE(review): despite its name, this accumulates the gradient
            # norm itself, not the squared norm -- confirm this is intended.
            next_grad_squared_sum = grad_squared_sum + tf.norm(grad, 2)
            next_iteration = iteration + 1
            next_max_grad = tf.maximum(max_grad, tf.norm(grad, 2))

            assignments.extend([
                grad_squared_sum.assign(next_grad_squared_sum),
                iteration.assign(next_iteration),
                max_grad.assign(next_max_grad)
            ])

            # Intuitively we should be able to leave g_sum=next_grad_squared_sum,
            # but current theory needs this extra t^1/4 max_grad term.
            g_sum = next_grad_squared_sum + tf.pow(next_iteration,
                                                   0.25) * next_max_grad

            # eta = learning_rate / (t^3 * g_sum^2)^(1/7)
            eta = self.learning_rate / tf.pow(
                tf.pow(next_iteration, 3.0) * tf.pow(g_sum, 2.0), 1.0 / 7.0)
            # a = min(1, 1 / (t * eta^2 * g_sum)); beta is its complement.
            a = tf.minimum(
                1.0, 1.0 / (next_iteration * tf.pow(eta, 2.0) * g_sum))
            beta = 1.0 - a
        else:
            eta = self.learning_rate
            beta = self.beta

        # Exponential moving average of the gradient.
        next_m = (tf.multiply(beta, m) + tf.multiply(1.0 - beta, grad))

        ratio = 1.0
        w_norm = tf.norm(param, ord=2)
        if self._do_layer_adaptation(param_name):
            g_norm = tf.norm(next_m, ord=2)
            # gamma * ||w|| / ||m|| when both norms are positive, otherwise
            # gamma * 1.0.
            ratio = self.gamma * tf.where(
                tf.math.greater(w_norm, 0),
                tf.where(tf.math.greater(g_norm, 0), (w_norm / g_norm), 1.0),
                1.0)
        normalized_m_with_lr = ratio * eta * next_m

        if self.use_igt:
            # `compute_x` is defined outside this block; it receives the
            # previous step's state and returns the point the update is
            # applied to.
            prev_x = self.compute_x(param_name, param, m, prev_w_norm,
                                    prev_eta, prev_beta)
            next_x = prev_x - normalized_m_with_lr
            next_param = next_x + tf.divide(
                tf.multiply(beta, normalized_m_with_lr), beta - 1.0)
        else:
            next_param = param - normalized_m_with_lr

        assignments.extend([
            param.assign(next_param),
            m.assign(next_m),
            prev_w_norm.assign(w_norm),
            prev_eta.assign(eta),
            prev_beta.assign(beta)
        ])
    return tf.group(*assignments, name=name)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, multilabel, sent_rels, sentiment,
                 entailment_rels, entailment, corr_rels, correlation):
    """Creates a classification model.

    A BERT encoder's pooled output is projected to `num_labels` logits. The
    loss is the mean cross-entropy (sigmoid for multilabel, softmax
    otherwise) plus three regularisers built from pairwise squared
    differences between the predicted label probabilities.

    Args:
      bert_config: Configuration passed to `modeling.BertModel`.
      is_training: Whether to build the model in training mode; also enables
        dropout (keep probability 0.9) on the pooled output.
      input_ids: Passed to `modeling.BertModel`.
      input_mask: Passed to `modeling.BertModel`.
      segment_ids: Passed to `modeling.BertModel` as `token_type_ids`.
      labels: Target labels; cast to float32 and used directly as the
        cross-entropy targets.
      num_labels: Number of output classes.
      multilabel: If true, use per-label sigmoid cross-entropy; otherwise
        softmax cross-entropy.
      sent_rels: Constant multiplied element-wise with the pairwise distances
        (presumably a [num_labels, num_labels] relation matrix -- confirm).
      sentiment: Scalar weight of the sentiment regulariser.
      entailment_rels: Same role as `sent_rels`, for entailment.
      entailment: Scalar weight of the entailment regulariser.
      corr_rels: Same role as `sent_rels`, for correlation.
      correlation: Scalar weight of the correlation regulariser.

    Returns:
      A tuple `(loss, per_example_loss, output_layer, logits, probabilities)`.
    """
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids)

    # Here, we are doing a classification task on the entire segment. For
    # token-level output, use model.get_sequece_output() instead.
    output_layer = model.get_pooled_output()

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        # with open('Debug_file_1.txt', 'a+') as infile:
        #   print(logits, file=infile)

        # Labels both for single and multilabel classification
        labels = tf.cast(labels, tf.float32)

        if multilabel:
            probabilities = tf.nn.sigmoid(logits)
            tf.logging.info("num_labels:{};logits:{};labels:{}".format(
                num_labels, logits, labels))
            per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=labels, logits=logits)
        else:
            probabilities = tf.nn.softmax(logits, axis=-1)
            per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=labels, logits=logits)
        loss = tf.reduce_mean(per_example_loss)

        # Add regularization based on label relations prior
        # probs_exp inserts an axis at position 1; tiling repeats the
        # probability row `num_labels` times along that axis.
        probs_exp = tf.expand_dims(probabilities, 1)
        m = tf.tile(probs_exp, [1, num_labels, 1])
        probs_exp_t = tf.transpose(probs_exp, perm=[0, 2, 1])

        # Subtract each prediction from all others:
        # Example (with batch size=1):
        # tiled predictions: [0.1] [0.1] [0.1]
        #                    [0.2] [0.2] [0.2]
        #                    [0.3] [0.3] [0.3]
        # subtract [0.1, 0.2, 0.3] row-wise
        # result:  [0.0] [-.1] [-.2]  --> row represents difference between
        #                                 emotion 1 and all other emotions
        #          [0.1] [0.0] [-.1]
        #          [0.2] [0.1] [0.0]
        dists = tf.square(tf.subtract(m, probs_exp_t))  # square distances
        dists = tf.transpose(dists, perm=[0, 2, 1])

        # Sentiment-based regularization
        sent_reg = tf.multiply(
            tf.constant(sentiment),
            tf.reduce_mean(
                tf.multiply(dists, tf.constant(sent_rels, dtype=tf.float32))))
        tf.summary.scalar("sentiment_regularization", sent_reg)
        loss += sent_reg

        # Entailment-based regularization
        ent_reg = tf.multiply(
            tf.constant(entailment),
            tf.reduce_mean(
                tf.multiply(dists,
                            tf.constant(entailment_rels, dtype=tf.float32))))
        tf.summary.scalar("entailment_regularization", ent_reg)
        loss += ent_reg

        # Correlation-based regularization
        corr_reg = tf.multiply(
            tf.constant(correlation),
            tf.reduce_mean(
                tf.multiply(dists, tf.constant(corr_rels, dtype=tf.float32))))
        tf.summary.scalar("correlation_regularization", corr_reg)
        loss += corr_reg

        tf.summary.scalar("loss", loss)

        return (loss, per_example_loss, output_layer, logits, probabilities)
def define_vars(self) -> dict:
    """Create the weights and biases of four conv and two dense layers.

    Every weight is a float32 variable with Glorot-uniform initialisation;
    every bias is a float32 variable initialised to zero and sized to the
    last dimension of its layer's weight.

    Returns:
        A dict mapping `"<layer>_weights"` / `"<layer>_biases"` to the
        corresponding variable, for conv1..conv4, fc1 and fc2.
    """

    def _weights(name, shape):
        return tf.get_variable(
            name=name,
            dtype=tf.float32,
            shape=shape,
            initializer=tf.glorot_uniform_initializer(),
        )

    def _biases(name, size):
        return tf.get_variable(
            name=name,
            dtype=tf.float32,
            shape=[size],
            initializer=tf.zeros_initializer(),
        )

    # Number of inputs to the first dense layer.
    fc1_inputs = (((IMAGE_SIZE - 2) // 2 - 2) // 2)**2 * 64

    # (layer name, weight shape), in creation order.
    layer_shapes = [
        ("conv1", [3, 3, NUM_CHANNELS, 32]),
        ("conv2", [3, 3, 32, 32]),
        ("conv3", [3, 3, 32, 64]),
        ("conv4", [3, 3, 64, 64]),
        ("fc1", [fc1_inputs, 512]),
        ("fc2", [512, NUM_CLASSES]),
    ]

    variables = {}
    for layer, weight_shape in layer_shapes:
        weights_name = layer + "_weights"
        biases_name = layer + "_biases"
        variables[weights_name] = _weights(weights_name, weight_shape)
        variables[biases_name] = _biases(biases_name, weight_shape[-1])
    return variables
def build(self, _):
    """Create the layer-norm scale (ones) and bias (zeros) variables.

    Both variables have shape `[self.hidden_size]`. The positional argument
    is ignored. Sets `self.built` to True when done.
    """
    param_shape = [self.hidden_size]
    self.scale = tf.get_variable(
        "layer_norm_scale", param_shape, initializer=tf.ones_initializer())
    self.bias = tf.get_variable(
        "layer_norm_bias", param_shape, initializer=tf.zeros_initializer())
    self.built = True
def XceptionModel(input_image, num_classes, is_training = False, data_format='channels_last', name_prefix='', use_bn=True):
    """Build an Xception-style classifier and return its softmax output.

    The graph consists of an entry flow (two plain convolutions followed by
    three residual blocks of separable convolutions), a middle flow of eight
    residual blocks, and an exit flow (one residual block, two separable
    convolutions, average pooling and a dense softmax layer).

    Batch normalisation goes through the project helper `batch_norm_`, and
    the repeated ReLU / separable-conv / batch-norm unit through
    `relu_separable_bn_block`; both are defined outside this block.

    Args:
        input_image: Input tensor, laid out according to `data_format`.
        num_classes: Number of units in the final dense layer.
        is_training: Forwarded to the batch-norm helpers.
        data_format: 'channels_last' or 'channels_first'; selects the
            batch-norm axis and the axes squeezed after pooling.
        name_prefix: String prepended to every layer name.
        use_bn: Forwarded to the batch-norm helpers.

    Returns:
        The output of the final dense layer with softmax activation.
    """
    bn_axis = -1 if data_format == 'channels_last' else 1

    # Entry Flow
    inputs = tf.layers.conv2d(input_image, 32, (3, 3), use_bias=False, name=name_prefix+'block1_conv1', strides=(2, 2),
                              padding='valid', data_format=data_format, activation=None,
                              kernel_initializer=tf.initializers.glorot_uniform(),
                              bias_initializer=tf.zeros_initializer())
    inputs = batch_norm_(inputs, name=name_prefix+'block1_conv1_bn', axis=bn_axis, training=is_training, reuse=None, use_bn=use_bn)
    inputs = tf.nn.relu(inputs, name=name_prefix+'block1_conv1_act')

    inputs = tf.layers.conv2d(inputs, 64, (3, 3), use_bias=False, name=name_prefix+'block1_conv2', strides=(1, 1),
                              padding='valid', data_format=data_format, activation=None,
                              kernel_initializer=tf.initializers.glorot_uniform(),
                              bias_initializer=tf.zeros_initializer())
    inputs = batch_norm_(inputs, name=name_prefix+'block1_conv2_bn', axis=bn_axis, training=is_training, reuse=None, use_bn=use_bn)
    inputs = tf.nn.relu(inputs, name=name_prefix+'block1_conv2_act')

    # Shortcut branch for block 2: strided 1x1 convolution + batch norm.
    residual = tf.layers.conv2d(inputs, 128, (1, 1), use_bias=False, name=name_prefix+'conv2d_1', strides=(2, 2),
                                padding='same', data_format=data_format, activation=None,
                                kernel_initializer=tf.initializers.glorot_uniform(),
                                bias_initializer=tf.zeros_initializer())
    residual = batch_norm_(residual, name=name_prefix+'batch_normalization_1', axis=bn_axis, training=is_training, reuse=None, use_bn=use_bn)

    inputs = tf.layers.separable_conv2d(inputs, 128, (3, 3),
                                        strides=(1, 1), padding='same',
                                        data_format=data_format,
                                        activation=None, use_bias=False,
                                        depthwise_initializer=tf.initializers.glorot_uniform(),
                                        pointwise_initializer=tf.initializers.glorot_uniform(),
                                        bias_initializer=tf.zeros_initializer(),
                                        name=name_prefix+'block2_sepconv1', reuse=None)
    # NOTE(review): this batch norm follows 'block2_sepconv1' but is named
    # 'block1_sepconv1_bn'; a previously commented-out variant of this call
    # used 'block2_sepconv1_bn'. Looks like a typo, but renaming changes
    # variable names and would break existing checkpoints -- confirm first.
    inputs = batch_norm_(inputs, name=name_prefix+'block1_sepconv1_bn', axis=bn_axis, training=is_training, reuse=None, use_bn=use_bn)

    inputs = relu_separable_bn_block(inputs, 128, name_prefix+'block2_sepconv2', is_training, data_format, use_bn=use_bn)

    inputs = tf.layers.max_pooling2d(inputs, pool_size=(3, 3), strides=(2, 2),
                                     padding='same', data_format=data_format,
                                     name=name_prefix+'block2_pool')
    inputs = tf.add(inputs, residual, name=name_prefix+'residual_add_0')

    # Shortcut branch for block 3.
    residual = tf.layers.conv2d(inputs, 128, (1, 1), use_bias=False, name=name_prefix+'conv2d_2', strides=(2, 2),
                                padding='same', data_format=data_format, activation=None,
                                kernel_initializer=tf.initializers.glorot_uniform(),
                                bias_initializer=tf.zeros_initializer())
    residual = batch_norm_(residual, name=name_prefix+'batch_normalization_2', axis=bn_axis, training=is_training, reuse=None, use_bn=use_bn)

    inputs = relu_separable_bn_block(inputs, 128, name_prefix+'block3_sepconv1', is_training, data_format, use_bn=use_bn)
    inputs = relu_separable_bn_block(inputs, 128, name_prefix+'block3_sepconv2', is_training, data_format, use_bn=use_bn)

    inputs = tf.layers.max_pooling2d(inputs, pool_size=(3, 3), strides=(2, 2),
                                     padding='same', data_format=data_format,
                                     name=name_prefix+'block3_pool')
    inputs = tf.add(inputs, residual, name=name_prefix+'residual_add_1')

    # Shortcut branch for block 4.
    residual = tf.layers.conv2d(inputs, 256, (1, 1), use_bias=False, name=name_prefix+'conv2d_3', strides=(2, 2),
                                padding='same', data_format=data_format, activation=None,
                                kernel_initializer=tf.initializers.glorot_uniform(),
                                bias_initializer=tf.zeros_initializer())
    residual = batch_norm_(residual, name=name_prefix+'batch_normalization_3', axis=bn_axis, training=is_training, reuse=None, use_bn=use_bn)

    inputs = relu_separable_bn_block(inputs, 256, name_prefix+'block4_sepconv1', is_training, data_format, use_bn=use_bn)
    inputs = relu_separable_bn_block(inputs, 256, name_prefix+'block4_sepconv2', is_training, data_format, use_bn=use_bn)

    inputs = tf.layers.max_pooling2d(inputs, pool_size=(3, 3), strides=(2, 2),
                                     padding='same', data_format=data_format,
                                     name=name_prefix+'block4_pool')
    inputs = tf.add(inputs, residual, name=name_prefix+'residual_add_2')

    # Middle Flow
    # Eight identity-shortcut blocks, named block5 .. block12.
    for index in range(8):
        residual = inputs
        prefix = name_prefix+'block' + str(index + 5)

        inputs = relu_separable_bn_block(inputs, 256, prefix + '_sepconv1', is_training, data_format, use_bn=use_bn)
        inputs = relu_separable_bn_block(inputs, 256, prefix + '_sepconv2', is_training, data_format, use_bn=use_bn)
        inputs = relu_separable_bn_block(inputs, 256, prefix + '_sepconv3', is_training, data_format, use_bn=use_bn)
        inputs = tf.add(inputs, residual, name=prefix + '_residual_add')

    # Exit Flow
    residual = tf.layers.conv2d(inputs, 512, (1, 1), use_bias=False, name=name_prefix+'conv2d_4', strides=(2, 2),
                                padding='same', data_format=data_format, activation=None,
                                kernel_initializer=tf.initializers.glorot_uniform(),
                                bias_initializer=tf.zeros_initializer())
    residual = batch_norm_(residual, name=name_prefix+'batch_normalization_4', axis=bn_axis, training=is_training, reuse=None, use_bn=use_bn)

    inputs = relu_separable_bn_block(inputs, 512, name_prefix+'block13_sepconv1', is_training, data_format, use_bn=use_bn)
    inputs = relu_separable_bn_block(inputs, 512, name_prefix+'block13_sepconv2', is_training, data_format, use_bn=use_bn)

    inputs = tf.layers.max_pooling2d(inputs, pool_size=(3, 3), strides=(2, 2),
                                     padding='same', data_format=data_format,
                                     name=name_prefix+'block13_pool')
    inputs = tf.add(inputs, residual, name=name_prefix+'residual_add_3')

    inputs = tf.layers.separable_conv2d(inputs, 728, (3, 3),
                                        strides=(1, 1), padding='same',
                                        data_format=data_format,
                                        activation=None, use_bias=False,
                                        depthwise_initializer=tf.initializers.glorot_uniform(),
                                        pointwise_initializer=tf.initializers.glorot_uniform(),
                                        bias_initializer=tf.zeros_initializer(),
                                        name=name_prefix+'block14_sepconv1', reuse=None)
    inputs = batch_norm_(inputs, name=name_prefix+'block14_sepconv1_bn', axis=bn_axis, training=is_training, reuse=None, use_bn=use_bn)
    inputs = tf.nn.relu(inputs, name=name_prefix+'block14_sepconv1_act')

    inputs = tf.layers.separable_conv2d(inputs, 728, (3, 3),
                                        strides=(1, 1), padding='same',
                                        data_format=data_format,
                                        activation=None, use_bias=False,
                                        depthwise_initializer=tf.initializers.glorot_uniform(),
                                        pointwise_initializer=tf.initializers.glorot_uniform(),
                                        bias_initializer=tf.zeros_initializer(),
                                        name=name_prefix+'block14_sepconv2', reuse=None)
    inputs = batch_norm_(inputs, name=name_prefix+'block14_sepconv2_bn', axis=bn_axis, training=is_training, reuse=None, use_bn=use_bn)
    inputs = tf.nn.relu(inputs, name=name_prefix+'block14_sepconv2_act')

    # The pooling-size helper is given a channels-last view of the tensor;
    # the pooling itself runs on the tensor in its original layout.
    if data_format == 'channels_first':
        channels_last_inputs = tf.transpose(inputs, [0, 2, 3, 1])
    else:
        channels_last_inputs = inputs

    inputs = tf.layers.average_pooling2d(inputs, pool_size = reduced_kernel_size_for_small_input(channels_last_inputs, [10, 10]), strides = 1, padding='valid', data_format=data_format, name=name_prefix+'avg_pool')

    # Drop the two spatial axes.
    # NOTE(review): tf.squeeze requires both to be of size 1, i.e. the
    # pooling window must cover the whole feature map -- confirm for the
    # input sizes in use.
    if data_format == 'channels_first':
        inputs = tf.squeeze(inputs, axis=[2, 3])
    else:
        inputs = tf.squeeze(inputs, axis=[1, 2])

    outputs = tf.layers.dense(inputs, num_classes,
                              activation=tf.nn.softmax, use_bias=True,
                              kernel_initializer=tf.initializers.glorot_uniform(),
                              bias_initializer=tf.zeros_initializer(),
                              name=name_prefix+'dense', reuse=None)

    return outputs
def conv2d(
    x,
    kernel_size,
    stride,
    channels,
    is_training,
    scope='conv2d',
    batch_norm=False,
    residual=False,
    gated=False,
    activation_fn=tf.nn.relu,
    resize=False,
    transpose=False,
    stacked_layers=1,
):
    """2D-Conv with optional batch_norm, gating, residual.

    Args:
      x: Tensor input [MB, H, W, CH].
      kernel_size: List [H, W].
      stride: List [H, W].
      channels: Int, output channels.
      is_training: Whether to collect stats for BatchNorm.
      scope: Enclosing scope name.
      batch_norm: Apply batch normalization
      residual: Residual connections, have stacked_layers >= 2.
      gated: Gating ala Wavenet.
      activation_fn: Nonlinearity function.
      resize: On transposed convolution, do ImageResize instead of
        conv_transpose.
      transpose: Use conv_transpose instead of conv.
      stacked_layers: Number of layers before a residual connection.

    Returns:
      x: Tensor output.
    """
    # Keep the block input around for the residual connection.
    x0 = x
    # Choose convolution function
    conv_fn = slim.conv2d_transpose if transpose else slim.conv2d
    # Double output channels for gates: one half is the signal, the other
    # half is squashed by a sigmoid and multiplies it.
    num_outputs = channels * 2 if gated else channels
    normalizer_fn = slim.batch_norm if batch_norm else None

    with tf.variable_scope(scope + '_Layer'):
        # Apply a stack of convolutions before adding the residual.
        # NOTE(review): every stacked layer passes the same `scope` to
        # conv_fn -- confirm the resulting variable naming is what is wanted
        # when stacked_layers > 1.
        for layer_idx in range(stacked_layers):
            with slim.arg_scope(
                    slim_batchnorm_arg_scope(is_training, activation_fn=None)):
                # Use interpolation to upsample instead of conv_transpose
                if transpose and resize:
                    unused_mb, h, w, unused_ch = x.get_shape().as_list()
                    x = tf.image.resize_images(
                        x, size=[h * stride[0], w * stride[1]], method=0)
                    # The resize already applied the stride.
                    stride_conv = [1, 1]
                else:
                    stride_conv = stride

                x = conv_fn(
                    inputs=x,
                    stride=stride_conv,
                    kernel_size=kernel_size,
                    num_outputs=num_outputs,
                    normalizer_fn=normalizer_fn,
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope,
                )

                if gated:
                    with tf.variable_scope('Gated'):
                        x1, x2 = x[:, :, :, :channels], x[:, :, :, channels:]
                        if activation_fn:
                            x1, x2 = activation_fn(x1), tf.sigmoid(x2)
                        else:
                            x2 = tf.sigmoid(x2)
                        x = x1 * x2

                # Apply residual to last layer before the last nonlinearity
                if residual and (layer_idx == stacked_layers - 1):
                    with tf.variable_scope('Residual'):
                        # Don't upsample residual in time: the skip path is
                        # only added when the convolution is unstrided.
                        if stride[0] == 1 and stride[1] == 1:
                            channels_in = x0.get_shape().as_list()[-1]
                            # Make n_channels match for residual
                            if channels != channels_in:
                                x0 = slim.conv2d(
                                    inputs=x0,
                                    stride=[1, 1],
                                    kernel_size=[1, 1],
                                    num_outputs=channels,
                                    normalizer_fn=None,
                                    activation_fn=None,
                                    # Pass an initializer instance, matching
                                    # the main convolution above.
                                    biases_initializer=tf.zeros_initializer(),
                                    scope=scope + '_residual',
                                )
                            x += x0

            # Gated layers already applied their nonlinearity above.
            if activation_fn and not gated:
                x = activation_fn(x)
    return x
def output_module_1(outputs):
    """Turn the model outputs into softmax logits.

    Reads `config`, `self`, `embedding`, `output_initializer`, `batch_size`
    and the shape constants `tb_h`, `t_b_v`, `t_bk_o`, `tbk_o` from the
    enclosing scope.
    NOTE(review): the `t`/`b`/`h`/`o`/`v` suffixes presumably stand for
    time, batch, hidden, output-embedding and vocab -- confirm against the
    definitions of the shape constants.

    Args:
      outputs: Value convertible with `tf.convert_to_tensor`.

    Returns:
      A pair `(logits, outputs_t_b_h)`: the logits reshaped to `t_b_v`, and
      the converted `outputs` after the optional output dropout.
    """
    with tf.variable_scope('om', initializer=output_initializer):
        # Create the matrix and bias for the final projection into the
        # softmax.
        if config.share_input_and_output_embeddings:
            assert config.embed_once, 'Not implemented.'
            # Reuse the input embedding; it is multiplied transposed.
            softmax_weights = embedding
            softmax_weights_transpose = True
        else:
            softmax_weights = tf.get_variable(
                'weights',
                [config.output_embedding_size, config.vocab_size],
                dtype=tf.float32)
            softmax_weights_transpose = False
        softmax_bias = tf.get_variable(
            'bias',
            [1, config.vocab_size],
            initializer=tf.zeros_initializer(),
            dtype=tf.float32)

        def to_softmax(x, dropout=self.downprojected_output_dropout):
            # Optional dropout, then the temperature-scaled affine projection.
            if dropout is not None:
                keep_prob = 1.0 - dropout
                if config.shared_mask_dropout:
                    x = tf.reshape(x, t_bk_o)
                    # same mask for all time steps
                    shared_noise_shape = [
                        1,
                        batch_size * (config.mos_num_components or 1),
                        config.output_embedding_size,
                    ]
                    x = tf.nn.dropout(
                        x, keep_prob, noise_shape=shared_noise_shape)
                    x = tf.reshape(x, tbk_o)
                else:
                    x = tf.nn.dropout(x, keep_prob)
            projected = tf.matmul(
                x, softmax_weights, transpose_b=softmax_weights_transpose)
            return self.softmax_temperature * (projected + softmax_bias)

        last_hidden_size = utils.ensure_list(config.hidden_size)[-1]
        outputs_t_b_h = tf.convert_to_tensor(outputs)
        if self.output_dropout is not None:
            output_keep_prob = 1.0 - self.output_dropout
            if config.shared_mask_dropout:
                outputs_t_b_h = tf.nn.dropout(
                    outputs_t_b_h,
                    output_keep_prob,
                    noise_shape=[1, batch_size, last_hidden_size])
            else:
                outputs_t_b_h = tf.nn.dropout(outputs_t_b_h, output_keep_prob)
        outputs_tb_h = tf.reshape(outputs_t_b_h, tb_h)

        if config.mos_num_components != 0:
            # Mixture of softmaxes: the helper drives `to_softmax` itself.
            logits_tb_v = utils.mixture_of_softmaxes(
                outputs_tb_h,
                config.mos_num_components,
                config.output_embedding_size,
                to_softmax)
        elif config.output_embedding_size == last_hidden_size:
            # Sizes already agree: no down-projection, and no dropout in
            # `to_softmax` (dropout is passed as None).
            logits_tb_v = to_softmax(outputs_tb_h, None)
        else:
            downprojected_outputs_tb_o = utils.linear(
                outputs_tb_h,
                config.output_embedding_size,
                False,
                initializer=utils.orthogonal_initializer(),
                scope='projection')
            logits_tb_v = to_softmax(downprojected_outputs_tb_o)

        return tf.reshape(logits_tb_v, t_b_v), outputs_t_b_h
def build_and_train(iterations, log_stride, test=False):
    """Construct the data, model, loss and optimizer then train.

    Args:
      iterations: Number of training steps to run.
      log_stride: Train loss and test accuracy are evaluated, logged and
        recorded every `log_stride` iterations.
      test: If True, use small hard-coded sizes instead of the FLAGS values.

    Returns:
      A tuple `(steps, train_losses, test_accs)` of equally long lists
      holding the iteration index, train loss and test accuracy at every
      logged iteration.
    """
    # Test mode settings.
    batch_size = 2 if test else FLAGS.batch_size
    num_mems = 2 if test else FLAGS.num_mems
    # The head and block counts were previously both read from
    # FLAGS.num_mems. Use their own flags; the flag definitions are outside
    # this function, so fall back to the old value if a flag is not defined.
    num_heads = 1 if test else getattr(FLAGS, "num_heads", FLAGS.num_mems)
    num_blocks = 1 if test else getattr(FLAGS, "num_blocks", FLAGS.num_mems)
    head_size = 4 if test else FLAGS.head_size
    num_objects = 2 if test else FLAGS.num_objects
    num_features = 4 if test else FLAGS.num_features
    mlp_size = (20,) if test else (256, 256, 256, 256)

    with tf.Graph().as_default():
        t0 = time.time()

        # Initialize the dataset.
        dataset = dataset_nth_farthest.NthFarthest(
            batch_size, num_objects, num_features)

        # Create the model.
        core = snt.RelationalMemory(
            mem_slots=num_mems,
            head_size=head_size,
            num_heads=num_heads,
            num_blocks=num_blocks,
            gate_style=FLAGS.gate_style)
        final_mlp = snt.nets.MLP(output_sizes=mlp_size, activate_final=True)
        model = SequenceModel(
            core=core, target_size=num_objects, final_mlp=final_mlp)
        tf.logging.info(
            "Instantiated models ({:3f})".format(time.time() - t0))

        # Get train and test data.
        inputs_train, labels_train = dataset.get_batch()
        inputs_test, labels_test = dataset.get_batch()

        # Define target accuracy.
        def compute_accuracy(logits, targets, name="accuracy"):
            correct_pred = tf.cast(
                tf.equal(tf.cast(targets, tf.int64), tf.argmax(logits, 1)),
                tf.float32)
            return tf.reduce_mean(correct_pred, name=name)

        # Define the loss & accuracy.
        def loss_fn(inputs, labels):
            """Creates the loss and the exports."""
            logits = model(inputs)
            labels = tf.cast(labels, tf.int32)
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=labels))
            accuracy = compute_accuracy(logits, labels)
            return loss, accuracy

        # Get training step counter.
        global_step = tf.get_variable(
            name="global_step",
            shape=[],
            dtype=tf.int64,
            initializer=tf.zeros_initializer(),
            trainable=False,
            collections=[
                tf.GraphKeys.GLOBAL_VARIABLES,
                tf.GraphKeys.GLOBAL_STEP,
            ])

        # Create the optimizer. The decayed rate is floored at
        # FLAGS.min_learning_rate. decay_steps is clamped to at least 1 so
        # that FLAGS.epochs < 100 does not produce a zero divisor in the
        # decay schedule.
        decay_steps = max(1, FLAGS.epochs // 100)
        learning_rate_op = tf.reduce_max([
            tf.train.exponential_decay(
                FLAGS.learning_rate,
                global_step,
                decay_steps=decay_steps,
                decay_rate=0.9,
                staircase=False),
            FLAGS.min_learning_rate,
        ])
        optimizer = tf.train.AdamOptimizer(learning_rate_op)
        train_loss, _ = loss_fn(inputs_train, labels_train)
        step_op = optimizer.minimize(train_loss, global_step=global_step)

        # Compute test accuracy
        logits_test = model(inputs_test)
        labels_test = tf.cast(labels_test, tf.int32)
        test_acc = compute_accuracy(logits_test, labels_test)

        tf.logging.info(
            "Created losses and optimizers ({:3f})".format(time.time() - t0))

        # Begin Training.
        t0 = time.time()
        train_losses = []
        steps = []
        test_accs = []
        tf.logging.info("Starting training ({:3f})".format(time.time() - t0))
        with tf.train.SingularMonitoredSession() as sess:
            for it in six.moves.range(iterations):
                sess.run([step_op, learning_rate_op])
                if it % log_stride == 0:
                    loss_v, acc_v = sess.run([train_loss, test_acc])
                    elapsed = time.time() - t0
                    tf.logging.info(
                        "iter: {:2d}, train loss {:3f}; test acc {:3f} ({:3f})"
                        .format(it, loss_v, acc_v, elapsed))
                    train_losses.append(loss_v)
                    steps.append(it)
                    test_accs.append(acc_v)
    return steps, train_losses, test_accs
def evonorm(inputs,
            is_training,
            layer=LAYER_EVONORM_B0,
            nonlinearity=True,
            init_zero=False,
            decay=MOVING_AVERAGE_DECAY,
            epsilon=EPSILON,
            num_groups=32,
            data_format='channels_first'):
    """Apply an EvoNorm transformation (an alternative to BN-ReLU).

    Hanxiao Liu, Andrew Brock, Karen Simonyan, Quoc V. Le.
    Evolving Normalization-Activation Layers.
    https://arxiv.org/abs/2004.02967

    Args:
      inputs: `Tensor` whose shape is either `[batch, channels, ...]` with
        the "channels_first" format or `[batch, height, width, channels]`
        with the "channels_last" format.
      is_training: `bool` for whether the model is training.
      layer: `String` specifies the EvoNorm instantiation.
      nonlinearity: `bool` if False, apply an affine transform only.
      init_zero: `bool` if True, the scale parameter (gamma) starts at 0
        instead of 1 (default).
      decay: `float` a scalar decay used in the moving average.
      epsilon: `float` a small float added to variance to avoid dividing by
        zero.
      num_groups: `int` the number of groups per layer, used only when
        `layer` == LAYER_EVONORM_S0.
      data_format: `str` either "channels_first" for
        `[batch, channels, height, width]` or "channels_last" for
        `[batch, height, width, channels]`.

    Returns:
      A normalized `Tensor` with the same `data_format`.

    Raises:
      ValueError: if `nonlinearity` is True and `layer` is neither
        LAYER_EVONORM_S0 nor LAYER_EVONORM_B0.
    """
    gamma_initializer = (
        tf.zeros_initializer() if init_zero else tf.ones_initializer())

    # Per-channel parameters: size 1 everywhere except the channel axis, so
    # they broadcast against `inputs`.
    channel_axis = 3 if data_format == 'channels_last' else 1
    channel_dim = inputs.shape[channel_axis]
    var_shape = tuple(
        channel_dim if axis == channel_axis else 1 for axis in range(4))
    param_dtype = inputs.dtype

    with tf.variable_scope(None, default_name='evonorm'):
        beta = tf.get_variable(
            'beta',
            shape=var_shape,
            dtype=param_dtype,
            initializer=tf.zeros_initializer())
        gamma = tf.get_variable(
            'gamma',
            shape=var_shape,
            dtype=param_dtype,
            initializer=gamma_initializer)

        if nonlinearity:
            v = tf.get_variable(
                'v',
                shape=var_shape,
                dtype=param_dtype,
                initializer=tf.ones_initializer())
            if layer == LAYER_EVONORM_S0:
                group_std = _group_std(
                    inputs,
                    epsilon=epsilon,
                    data_format=data_format,
                    num_groups=num_groups)
                gate = tf.nn.sigmoid(v * inputs)
                inputs = inputs * gate / group_std
            elif layer == LAYER_EVONORM_B0:
                batch_std = _batch_std(
                    inputs,
                    decay=decay,
                    epsilon=epsilon,
                    data_format=data_format,
                    training=is_training)
                scaled = v * inputs
                instance_std = _instance_std(
                    inputs, epsilon=epsilon, data_format=data_format)
                inputs = inputs / tf.maximum(batch_std, scaled + instance_std)
            else:
                raise ValueError('Unknown EvoNorm layer: {}'.format(layer))

    # Final affine transform, applied with or without the nonlinearity.
    return inputs * gamma + beta
def model_creation(neurons, nb_features, nb_targets, learning_rate):
    """Build a fully connected sigmoid network and an Adam training op.

    Creates an interactive session, input/target placeholders, one
    weight/bias pair per hidden layer plus an output pair, and initializes
    all variables.

    Args:
      neurons: Sequence with the width of each hidden layer; must contain at
        least one entry.
      nb_features: Number of input features (second dimension of `X`).
      nb_targets: Number of target values (second dimension of `Y`).
      learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
      A tuple `(X, Y, sess, opt, mse, layers_dict)`:
        X, Y: float32 placeholders of shape `[None, nb_features]` and
          `[None, nb_targets]`.
        sess: the `tf.InteractiveSession`, with variables initialized.
        opt: the Adam minimize op.
        mse: the loss tensor. Despite the name, it is the square root of the
          mean squared difference between prediction and target.
        layers_dict: dict holding the variables and layer tensors under the
          keys `weight_hidden_<i>`, `bias_hidden_<i>`, `weight_out`,
          `bias_out`, `hidden_layer_<i>` and `output_layer`. `output_layer`
          is the absolute value of the transposed output, i.e. shaped
          `[nb_targets, batch]`.

    Raises:
      ValueError: if `neurons` is empty.
    """
    # Validate before any TensorFlow state is created, so a bad call does
    # not leave an open session behind.
    if len(neurons) < 1:
        raise ValueError("You must have at least one hidden layer")

    # Session
    sess = tf.InteractiveSession()

    # Placeholders
    X = tf.placeholder(tf.float32, shape=[None, nb_features])
    Y = tf.placeholder(tf.float32, shape=[None, nb_targets])

    weight_initializer = tf.variance_scaling_initializer(
        mode="fan_avg", distribution="uniform", scale=1)
    bias_initializer = tf.zeros_initializer()

    layers_dict = {}

    # Hidden weights and biases: each layer's input width is the previous
    # layer's width, starting from the feature count.
    fan_in = nb_features
    for layer_idx, width in enumerate(neurons):
        layers_dict["weight_hidden_" + str(layer_idx)] = tf.Variable(
            weight_initializer([fan_in, width]))
        layers_dict["bias_hidden_" + str(layer_idx)] = tf.Variable(
            bias_initializer([width]))
        fan_in = width

    # Output weights and bias
    layers_dict["weight_out"] = tf.Variable(
        weight_initializer([neurons[-1], nb_targets]))
    layers_dict["bias_out"] = tf.Variable(bias_initializer([nb_targets]))

    # Hidden layers, chained from the input placeholder.
    activations = X
    for layer_idx in range(len(neurons)):
        activations = tf.sigmoid(
            tf.add(
                tf.matmul(activations,
                          layers_dict["weight_hidden_" + str(layer_idx)]),
                layers_dict["bias_hidden_" + str(layer_idx)]))
        layers_dict["hidden_layer_" + str(layer_idx)] = activations

    # Output layer (kept transposed, as callers receive it that way).
    layers_dict["output_layer"] = tf.abs(
        tf.transpose(
            tf.add(
                tf.matmul(activations, layers_dict["weight_out"]),
                layers_dict["bias_out"])))

    # Cost function. The prediction is [nb_targets, batch] while Y is
    # [batch, nb_targets]; transpose Y so the two are compared element-wise
    # instead of being broadcast against each other.
    mse = tf.sqrt(
        tf.reduce_mean(
            tf.squared_difference(layers_dict["output_layer"],
                                  tf.transpose(Y))))

    # Optimizer
    opt = tf.train.AdamOptimizer(learning_rate).minimize(mse)

    # Init
    sess.run(tf.global_variables_initializer())

    return (X, Y, sess, opt, mse, layers_dict)