Example #1
def get_sentence_order_output(albert_config, input_tensor, labels):
    """Get loss and log probs for the next sentence prediction."""

    # Simple binary classification. Note that 0 is "next sentence" and 1 is
    # "random sentence". This weight matrix is not used after pre-training.
    with tf.variable_scope("cls/seq_relationship"):
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, albert_config.hidden_size],
            initializer=modeling.create_initializer(
                albert_config.initializer_range))
        output_bias = tf.get_variable("output_bias",
                                      shape=[2],
                                      initializer=tf.zeros_initializer())

        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, log_probs)
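A minimal usage sketch for the head above (assumes TF 1.x graph mode and the same `modeling`/`albert_config` objects the function relies on; the placeholder tensors and label wiring below are illustrative, not from the original):

# Hypothetical wiring of the sentence-order head (albert_config is assumed to exist).
pooled_output = tf.placeholder(tf.float32, [None, albert_config.hidden_size])  # [batch, hidden]
sop_labels = tf.placeholder(tf.int32, [None])                                  # one label per example
sop_loss, sop_per_example_loss, sop_log_probs = get_sentence_order_output(
    albert_config, pooled_output, sop_labels)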
Example #2
  def binary_logits(self, hidden, scope="binary", reuse=False):
    """Compute per-element bianry classification logits."""
    net_config = self.net_config
    initializer = self.get_initializer()
    with tf.variable_scope("{}_proj".format(scope), reuse=reuse):
      hidden = ops.dense(
          hidden,
          net_config.d_model,
          activation=ops.get_activation("gelu"),
          initializer=initializer)

    with tf.variable_scope("{}_loss".format(scope), reuse=reuse):
      binary_w = tf.get_variable("weight", [net_config.d_model],
                                 dtype=hidden.dtype, initializer=initializer)

      binary_b = tf.get_variable("bias", [1], dtype=hidden.dtype,
                                 initializer=tf.zeros_initializer())

      logits = tf.einsum("bid,d->bi", hidden, binary_w) + binary_b
      if logits.dtype != tf.float32:
        # Always use float32 for loss
        logits = tf.cast(logits, tf.float32)
    return logits
Example #3
def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
    """From run_pretraining.py."""
    input_tensor = gather_indexes(input_tensor, mlm_positions)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[albert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits
Example #4
    def _create_user_terms(self, users, N):
        num_users = self.num_users
        num_items = self.num_items
        num_factors = self.num_factors

        p_u, b_u = super(SVDPP, self)._create_user_terms(users)

        with tf.variable_scope('user'):
            implicit_feedback_embeddings = tf.get_variable(
                name='implict_feedback_embedding',
                shape=[num_items, num_factors],
                initializer=tf.zeros_initializer(),
                regularizer=tf.contrib.layers.l2_regularizer(self.reg_y_u))

            y_u = tf.gather(tf.nn.embedding_lookup_sparse(
                implicit_feedback_embeddings,
                N,
                sp_weights=None,
                combiner='sqrtn'),
                            users,
                            name='y_u')

        return p_u, b_u, y_u
Example #5
 def _build_tiled_linear(self, inputs, input_name_and_sizes,
                         output_name_and_sizes, add_bias):
     results = []
     for output_name, output_size in output_name_and_sizes:
         r = 0.0
         for input_, (input_name,
                      input_size) in zip(inputs, input_name_and_sizes):
             name = 'W_{}_{}'.format(input_name, output_name)
             weight = self._get_variable(name,
                                         shape=[output_size, input_size])
             r += tf.sparse_tensor_dense_matmul(weight,
                                                input_,
                                                adjoint_b=True)
         r = tf.transpose(r)
         if add_bias:
             # Biases are dense, hence we call _get_variable of the base
             # class.
             r += super(SparseTiledLinear, self)._get_variable(
                 'B_{}'.format(output_name),
                 shape=[output_size],
                 default_initializer=tf.zeros_initializer())
         results.append(r)
     return results
Example #6
def get_data_and_params():
  """Set up input dataset and variables."""
  (train_x, train_y), _ = tf.keras.datasets.mnist.load_data()
  tf.set_random_seed(0)
  hparams = contrib_training.HParams(
      batch_size=200,
      learning_rate=0.1,
      train_steps=101,
  )
  dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
  dataset = dataset.repeat()
  dataset = dataset.shuffle(hparams.batch_size * 10)
  dataset = dataset.batch(hparams.batch_size)

  def reshape_ex(x, y):
    return (tf.to_float(tf.reshape(x, (-1, 28 * 28))) / 256.0,
            tf.one_hot(tf.squeeze(y), 10))

  dataset = dataset.map(reshape_ex)
  w = tf.get_variable('w0', (28 * 28, 10))
  b = tf.get_variable('b0', (10,), initializer=tf.zeros_initializer())
  opt = tf.train.GradientDescentOptimizer(hparams.learning_rate)
  return dataset, opt, hparams, w, b
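A hedged end-to-end sketch using the helper above (assumes TF 1.x graph mode; the softmax loss and session loop are illustrative additions, not part of the original):

dataset, opt, hparams, w, b = get_data_and_params()
x, y = dataset.make_one_shot_iterator().get_next()  # x: [batch, 784], y: one-hot [batch, 10]
logits = tf.matmul(x, w) + b
loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits)
train_op = opt.minimize(loss)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(hparams.train_steps):
    sess.run(train_op)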
Example #7
def conv(batch_input,
         out_channels,
         stride,
         filterSize=4,
         initScale=0.02,
         useXavier=False,
         paddingSize=1,
         useBias=False):
    with tf.variable_scope("conv"):
        in_height, in_width, in_channels = [
            batch_input.get_shape()[1],
            batch_input.get_shape()[2],
            int(batch_input.get_shape()[-1])
        ]
        filter = tf.get_variable(
            "filter", [filterSize, filterSize, in_channels, out_channels],
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(
                0,
                np.sqrt(2.0 / (int(in_channels) + int(out_channels))) *
                initScale) if useXavier else tf.random_normal_initializer(
                    0, initScale))

        padded_input = tf.pad(batch_input,
                              [[0, 0], [paddingSize, paddingSize],
                               [paddingSize, paddingSize], [0, 0]],
                              mode="CONSTANT")  #SYMMETRIC
        conv = tf.nn.conv2d(padded_input,
                            filter, [1, stride, stride, 1],
                            padding="VALID")

        if useBias:
            offset = tf.get_variable("offset", [1, 1, 1, out_channels],
                                     dtype=tf.float32,
                                     initializer=tf.zeros_initializer())
            conv = conv + offset
        return conv
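A short, hedged usage sketch for the conv helper above (TF 1.x; the input size and scope name are illustrative):

images = tf.placeholder(tf.float32, [None, 64, 64, 3])
with tf.variable_scope("block1"):
    # With filterSize=4, paddingSize=1 and stride=2, a 64x64 input maps to 32x32.
    features = conv(images, out_channels=32, stride=2)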
Example #8
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  #pylint: disable=W0613
        ob_shape = (nbatch, ) + ob_space.shape
        actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  #obs
        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            pi = fc(h2, 'pi', actdim, init_scale=0.01)
            h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(h2, 'vf', 1)[:, 0]
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #9
 def __init__(self,
              filters,
              kernel_size,
              strides=(1, 1),
              padding='same',
              data_format='channels_last',
              activation=None,
              use_bias=True,
              kernel_initializer=None,
              bias_initializer=tf.zeros_initializer(),
              kernel_regularizer=None,
              bias_regularizer=None,
              activity_regularizer=None,
              kernel_constraint=None,
              bias_constraint=None,
              trainable=True,
              name=None,
              **kwargs):
     _Conv.__init__(self,
                    filters,
                    kernel_size,
                    strides=strides,
                    padding=padding,
                    data_format=data_format,
                    activation=activation,
                    use_bias=use_bias,
                    kernel_initializer=kernel_initializer,
                    bias_initializer=bias_initializer,
                    kernel_regularizer=kernel_regularizer,
                    bias_regularizer=bias_regularizer,
                    activity_regularizer=activity_regularizer,
                    kernel_constraint=kernel_constraint,
                    bias_constraint=bias_constraint,
                    trainable=trainable,
                    name=name,
                    **kwargs)
     self.neuron_scale = _get_neuron_scale(self.filters, self.kernel_size)
Example #10
def model_fn(model, features, labels, mode):
    x = features['x']
    print(features, labels, mode)

    w1f = tf.get_variable('w1f',
                          shape=[28 * 28 // 2, 128],
                          dtype=tf.float32,
                          initializer=tf.random_uniform_initializer(
                              -0.01, 0.01))
    b1f = tf.get_variable('b1f',
                          shape=[128],
                          dtype=tf.float32,
                          initializer=tf.zeros_initializer())

    act1_f = tf.nn.relu(tf.nn.bias_add(tf.matmul(x, w1f), b1f))

    if mode == tf.estimator.ModeKeys.TRAIN:
        gact1_f = model.send('act1_f', act1_f, require_grad=True)
        optimizer = tf.train.GradientDescentOptimizer(0.1)
        train_op = model.minimize(
            optimizer,
            act1_f,
            grad_loss=gact1_f,
            global_step=tf.train.get_or_create_global_step())
        logging.info("trainning")
        return model.make_spec(mode,
                               loss=tf.math.reduce_mean(act1_f),
                               train_op=train_op)

    logging.info("eval")
    if mode == tf.estimator.ModeKeys.EVAL:
        model.send('act1_f', act1_f, require_grad=False)
        fake_loss = tf.reduce_mean(act1_f)
        return model.make_spec(mode=mode, loss=fake_loss)

    # mode == tf.estimator.ModeKeys.PREDICT:
    return model.make_spec(mode=mode, predictions={'act1_f': act1_f})
Example #11
def zero_hidden_model(X_train, y_train, X_test, y_test, iter_num=2000):
    tf.reset_default_graph()
    n_feature = X_train.shape[1]
    X = tf.placeholder(tf.float32, shape=(None, n_feature))
    Y = tf.placeholder(tf.float32, shape=(None))
    w1 = tf.get_variable(name='w1',
                         shape=(n_feature, 1),
                         dtype=tf.float32,
                         initializer=tf.keras.initializers.glorot_uniform())
    b1 = tf.get_variable(name='b1',
                         shape=(1, 1),
                         dtype=tf.float32,
                         initializer=tf.zeros_initializer())
    z = tf.reshape(tf.add(tf.matmul(X, w1), b1), [-1])
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=z))
    opt = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    init = tf.global_variables_initializer()
    train_acc = 0.
    test_acc = 0.
    iter_list = []
    cost_list = []
    with tf.Session() as sess:
        sess.run(init)
        for iter_i in range(iter_num):
            _, cost = sess.run([opt, loss], feed_dict={X: X_train, Y: y_train})
            if iter_i % 10 == 0:
                iter_list.append(iter_i)
                cost_list.append(cost)
        train_predict = np.array(sess.run(z, feed_dict={X: X_train}) > 0,
                                 dtype=int)
        test_predict = np.array(sess.run(z, feed_dict={X: X_test}) > 0,
                                dtype=int)
        train_acc = np.sum(train_predict == y_train) / len(train_predict)
        test_acc = np.sum(test_predict == y_test) / len(test_predict)
    return (iter_list, cost_list), (train_acc, test_acc)
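A hedged call with synthetic data (the shapes and labels below are made up purely for illustration):

import numpy as np
rng = np.random.RandomState(0)
X_tr = rng.rand(200, 12).astype(np.float32)
y_tr = (rng.rand(200) > 0.5).astype(np.float32)
X_te = rng.rand(50, 12).astype(np.float32)
y_te = (rng.rand(50) > 0.5).astype(np.float32)
(iters, costs), (train_acc, test_acc) = zero_hidden_model(X_tr, y_tr, X_te, y_te, iter_num=500)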
Example #12
 def _target_network(self, obs):
     """Implements the random target network used by RND."""
     with slim.arg_scope(
         [slim.conv2d, slim.fully_connected],
             trainable=False,
             weights_initializer=tf.orthogonal_initializer(gain=np.sqrt(2)),
             biases_initializer=tf.zeros_initializer()):
         net = slim.conv2d(obs,
                           32, [8, 8],
                           stride=4,
                           activation_fn=tf.nn.leaky_relu)
         net = slim.conv2d(net,
                           64, [4, 4],
                           stride=2,
                           activation_fn=tf.nn.leaky_relu)
         net = slim.conv2d(net,
                           64, [3, 3],
                           stride=1,
                           activation_fn=tf.nn.leaky_relu)
         net = slim.flatten(net)
         embedding = slim.fully_connected(net,
                                          self.embedding_size,
                                          activation_fn=None)
     return embedding
Example #13
    def stage_1(lr, inputs, labels):
        # Gen counter to keep track of last-iteration for dense-gradient computation
        with tf.variable_scope("counter",
                               reuse=tf.AUTO_REUSE,
                               use_resource=True):
            itr_counter = tf.get_variable("iterations",
                                          shape=[],
                                          dtype=tf.int32,
                                          trainable=False,
                                          initializer=tf.zeros_initializer())
            inc = tf.assign_add(itr_counter, 1)
            mod_itrs = tf.math.floormod(inc, iterations_per_dense_grad)
            last_itr = tf.equal(mod_itrs, 0)

        fc1 = fc_layers['fc1']
        relu1 = fc1(inputs, dense_grad_enabled and last_itr)

        # Use the IPU optimised version of dropout:
        if training:
            drop1 = rand_ops.dropout(relu1, rate=droprate)
        else:
            drop1 = relu1

        return lr, labels, drop1, last_itr
Example #14
 def create_visualencoder(self, x):
     with tf.variable_scope("visualencoder", reuse=tf.AUTO_REUSE) as vs:
         if self.settings["pad_visuals"]:
             x = self.apply_visual_pad(x)
         for n in range(self.settings['visualencoder_n_convs']):
             y = tf.layers.conv2d(
                 x,
                 self.settings["visualencoder_n_filters"][n],
                 self.settings["visualencoder_filter_sizes"][n],
                 name='visualencoder_layer{}'.format(n),
                 padding='same',
                 activation=tf.nn.elu,
                 kernel_initializer=tf.keras.initializers.glorot_uniform(),
                 bias_initializer=tf.zeros_initializer(),
             )
             if n in self.settings[
                     "visualencoder_peepholes"] and self.settings[
                         "peephole_convs"]:
                 x = tf.concat([y, x], axis=-1)
             else:
                 x = y
             if n in self.settings["visualencoder_poolings"]:
                 y = tf.layers.max_pooling2d(y, 2, 2, padding='same')
     return x
Example #15
def mlm_weight(config,
               sequence_output,
               embedding_table,
               scope='cls/predictions'):
    with tf.variable_scope(scope):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                sequence_output,
                units=config.embedding_size,
                activation=get_activation(config.hidden_act),
                kernel_initializer=create_initializer(
                    config.initializer_range))
            input_tensor = layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, embedding_table, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits
Example #16
def scale_gaussian_prior(name, z, logscale_factor=3.0, trainable=True):
  """Returns N(s^i * z^i, std^i) where s^i and std^i are pre-component.

  s^i is a learnable parameter with identity initialization.
  std^i is optionally learnable with identity initialization.

  Args:
    name: variable scope.
    z: input_tensor
    logscale_factor: equivalent to scaling up the learning_rate by a factor
                     of logscale_factor.
    trainable: Whether or not std^i is learnt.
  """
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    z_shape = common_layers.shape_list(z)
    latent_multiplier = tf.get_variable(
        "latent_multiplier", shape=z_shape, dtype=tf.float32,
        initializer=tf.ones_initializer())
    log_scale = tf.get_variable(
        "log_scale_latent", shape=z_shape, dtype=tf.float32,
        initializer=tf.zeros_initializer(), trainable=trainable)
    log_scale = log_scale * logscale_factor
    return tfp.distributions.Normal(
        loc=latent_multiplier * z, scale=tf.exp(log_scale))
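A hedged usage sketch (assumes the same `common_layers`/`tfp` imports the function uses; the latent shape is illustrative and must be fully static because the per-component variables share it):

z = tf.random_normal([16, 8, 8, 32])
prior = scale_gaussian_prior("latent_prior", z, trainable=True)
log_p = prior.log_prob(z)   # same shape as z
sample = prior.sample()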
Example #17
def get_logits(bert_config, input_tensor, output_weights, positions):
    """Get logits for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())

        if bert_config.hidden_size != bert_config.embedding_size:
            extra_output_weights = tf.get_variable(
                name="extra_output_weights",
                shape=[
                    bert_config.vocab_size,
                    bert_config.hidden_size - bert_config.embedding_size
                ],
                initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            output_weights = tf.concat([output_weights, extra_output_weights],
                                       axis=1)
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        return logits
Example #18
 def build_controller(self):
     """Create the RNN and output projections for controlling the stack.
 """
     with tf.name_scope("controller"):
         self.rnn = contrib.rnn().BasicRNNCell(self._num_units)
         self._input_proj = self.add_variable(
             "input_projection_weights",
             shape=[
                 self._embedding_size * (self._num_read_heads + 1),
                 self._num_units
             ],
             dtype=self.dtype)
         self._input_bias = self.add_variable(
             "input_projection_bias",
             shape=[self._num_units],
             initializer=tf.zeros_initializer(dtype=self.dtype))
         self._push_proj, self._push_bias = self.add_scalar_projection(
             "push", self._num_write_heads)
         self._pop_proj, self._pop_bias = self.add_scalar_projection(
             "pop", self._num_write_heads)
         self._value_proj, self._value_bias = self.add_vector_projection(
             "value", self._num_write_heads)
         self._output_proj, self._output_bias = self.add_vector_projection(
             "output", 1)
Example #19
def layer_norm(inputs, scope='ln'):
    '''Applies layer normalization. See https://arxiv.org/abs/1607.06450.

    inputs: A tensor with 2 or more dimensions, where the first dimension is `batch_size`.
    scope: Optional scope for `variable_scope`.

    Note: `epsilon`, a small constant that prevents division by zero, is fixed
    at 1e-8 inside the function rather than passed as an argument.

    Returns:
      A tensor with the same shape and dtype as `inputs`.
    '''
    epsilon = 1e-8
    with tf.variable_scope(scope):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]
        # [-1] means last dimension
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable("beta",
                               params_shape,
                               initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma",
                                params_shape,
                                initializer=tf.ones_initializer())
        normalized = (inputs - mean) / ((variance + epsilon)**(.5))
        outputs = gamma * normalized + beta
    return outputs
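A hedged usage sketch for the layer_norm helper above (TF 1.x; shapes are illustrative):

x = tf.placeholder(tf.float32, [None, 50, 512])    # [batch, time, hidden]
y = layer_norm(x, scope="encoder_ln")              # same shape and dtype as x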
Example #20
    def __init__(self,
                 reward_range,
                 observation_space,
                 action_space,
                 frame_stack_size,
                 frame_height,
                 frame_width,
                 initial_frame_chooser,
                 batch_size,
                 model_name,
                 model_hparams,
                 model_dir,
                 intrinsic_reward_scale=0.0,
                 sim_video_dir=None):
        """Batch of environments inside the TensorFlow graph."""
        super(SimulatedBatchEnv, self).__init__(observation_space,
                                                action_space)

        self._ffmpeg_works = common_video.ffmpeg_works()
        self.batch_size = batch_size
        self._min_reward = reward_range[0]
        self._num_frames = frame_stack_size
        self._intrinsic_reward_scale = intrinsic_reward_scale
        self._episode_counter = tf.get_variable("episode_counter",
                                                initializer=tf.zeros(
                                                    (), dtype=tf.int32),
                                                trainable=False,
                                                dtype=tf.int32)
        if sim_video_dir:
            self._video_every_epochs = 100
            self._video_dir = sim_video_dir
            self._video_writer = None
            self._video_counter = 0
            tf.gfile.MakeDirs(self._video_dir)
            self._video_condition = tf.equal(
                self._episode_counter.read_value() % self._video_every_epochs,
                0)
        else:
            self._video_condition = tf.constant(False, dtype=tf.bool, shape=())

        model_hparams = copy.copy(model_hparams)
        problem = DummyWorldModelProblem(action_space, reward_range,
                                         frame_height, frame_width)
        trainer_lib.add_problem_hparams(model_hparams, problem)
        model_hparams.force_full_predict = True
        self._model = registry.model(model_name)(model_hparams,
                                                 tf.estimator.ModeKeys.PREDICT)

        self.history_buffer = HistoryBuffer(initial_frame_chooser,
                                            self.observ_shape,
                                            self.observ_dtype,
                                            self._num_frames, self.batch_size)

        self._observ = tf.Variable(tf.zeros((batch_size, ) + self.observ_shape,
                                            self.observ_dtype),
                                   trainable=False)

        self._reset_model = tf.get_variable("reset_model", [],
                                            trainable=False,
                                            initializer=tf.zeros_initializer())

        self._model_dir = model_dir
Example #21
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """See base class."""
        assignments = []
        for (grad, param) in grads_and_vars:
            if grad is None or param is None:
                continue

            param_name = self._get_variable_name(param.name)

            m = tf.get_variable(name=six.ensure_str(param_name) + "/m",
                                shape=param.shape.as_list(),
                                dtype=tf.float32,
                                trainable=False,
                                initializer=tf.zeros_initializer())

            # Note: shape is not passed here explicitly since tf.get_variable
            # complains when you do that while passing a Tensor as an initializer.
            prev_w_norm = tf.get_variable(
                name=six.ensure_str(param_name) + "/prev_w_norm",
                dtype=tf.float32,
                trainable=False,
                initializer=lambda w=param: tf.norm(w.initialized_value(),
                                                    ord=2))

            prev_eta = tf.get_variable(name=six.ensure_str(param_name) +
                                       "/prev_eta",
                                       shape=[],
                                       dtype=tf.float32,
                                       trainable=False,
                                       initializer=tf.zeros_initializer())
            prev_beta = tf.get_variable(name=six.ensure_str(param_name) +
                                        "/prev_beta",
                                        shape=[],
                                        dtype=tf.float32,
                                        trainable=False,
                                        initializer=tf.zeros_initializer())

            if self._do_use_weight_decay(param_name):
                grad += self.weight_decay_rate * param

            if self.use_adaptive:
                grad_squared_sum = tf.get_variable(
                    name=six.ensure_str(param_name) + "/grad_squared_sum",
                    shape=[],
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())

                max_grad = tf.get_variable(name=six.ensure_str(param_name) +
                                           "/max_grad",
                                           shape=[],
                                           dtype=tf.float32,
                                           trainable=False,
                                           initializer=tf.zeros_initializer())

                iteration = tf.get_variable(name=six.ensure_str(param_name) +
                                            "/iteration",
                                            shape=[],
                                            dtype=tf.float32,
                                            trainable=False,
                                            initializer=tf.zeros_initializer())

                next_grad_squared_sum = grad_squared_sum + tf.norm(grad, 2)
                next_iteration = iteration + 1
                next_max_grad = tf.maximum(max_grad, tf.norm(grad, 2))
                assignments.extend([
                    grad_squared_sum.assign(next_grad_squared_sum),
                    iteration.assign(next_iteration),
                    max_grad.assign(next_max_grad)
                ])

                # Intuitively we should be able to leave g_sum=next_grad_squared_sum,
                # but current theory needs this extra t^1/4 max_grad term.
                g_sum = next_grad_squared_sum + tf.pow(next_iteration,
                                                       0.25) * next_max_grad

                eta = self.learning_rate / tf.pow(
                    tf.pow(next_iteration, 3.0) * tf.pow(g_sum, 2.0),
                    1.0 / 7.0)
                a = tf.minimum(
                    1.0, 1.0 / (next_iteration * tf.pow(eta, 2.0) * g_sum))
                beta = 1.0 - a
            else:
                eta = self.learning_rate
                beta = self.beta

            next_m = (tf.multiply(beta, m) + tf.multiply(1.0 - beta, grad))

            ratio = 1.0
            w_norm = tf.norm(param, ord=2)
            if self._do_layer_adaptation(param_name):
                g_norm = tf.norm(next_m, ord=2)
                ratio = self.gamma * tf.where(
                    tf.math.greater(w_norm, 0),
                    tf.where(tf.math.greater(g_norm, 0),
                             (w_norm / g_norm), 1.0), 1.0)
            normalized_m_with_lr = ratio * eta * next_m

            if self.use_igt:
                prev_x = self.compute_x(param_name, param, m, prev_w_norm,
                                        prev_eta, prev_beta)
                next_x = prev_x - normalized_m_with_lr
                next_param = next_x + tf.divide(
                    tf.multiply(beta, normalized_m_with_lr), beta - 1.0)
            else:
                next_param = param - normalized_m_with_lr
            assignments.extend([
                param.assign(next_param),
                m.assign(next_m),
                prev_w_norm.assign(w_norm),
                prev_eta.assign(eta),
                prev_beta.assign(beta)
            ])
        return tf.group(*assignments, name=name)
Example #22
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, multilabel, sent_rels, sentiment,
                 entailment_rels, entailment, corr_rels, correlation):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids)

  # Here, we are doing a classification task on the entire segment. For
  # token-level output, use model.get_sequence_output() instead.
  output_layer = model.get_pooled_output()

  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    # with open('Debug_file_1.txt', 'a+') as infile:
    # 	print(logits, file=infile)

    # Labels both for single and multilabel classification
    labels = tf.cast(labels, tf.float32)

    if multilabel:
      probabilities = tf.nn.sigmoid(logits)
      tf.logging.info("num_labels:{};logits:{};labels:{}".format(
          num_labels, logits, labels))
      per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
          labels=labels, logits=logits)
    else:
      probabilities = tf.nn.softmax(logits, axis=-1)
      per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
          labels=labels, logits=logits)
    loss = tf.reduce_mean(per_example_loss)

    # Add regularization based on label relations prior
    probs_exp = tf.expand_dims(probabilities, 1)
    m = tf.tile(probs_exp, [1, num_labels, 1])
    probs_exp_t = tf.transpose(probs_exp, perm=[0, 2, 1])

    # Subtract each prediction from all others:
    # Example (with batch size=1):
    #     tiled predictions: [0.1] [0.1] [0.1]
    #                        [0.2] [0.2] [0.2]
    #                        [0.3] [0.3] [0.3]
    #     subtract [0.1, 0.2, 0.3] row-wise
    #     result:   [0.0] [-.1] [-.2] --> row represents difference between
    #                                     emotion 1 and all other emotions
    #               [0.1] [0.0] [-.1]
    #               [0.2] [0.1] [0.0]
    dists = tf.square(tf.subtract(m, probs_exp_t))  # square distances
    dists = tf.transpose(dists, perm=[0, 2, 1])

    # Sentiment-based regularization
    sent_reg = tf.multiply(
        tf.constant(sentiment),
        tf.reduce_mean(
            tf.multiply(dists, tf.constant(sent_rels, dtype=tf.float32))))
    tf.summary.scalar("sentiment_regularization", sent_reg)
    loss += sent_reg

    # Entailment-based regularization
    ent_reg = tf.multiply(
        tf.constant(entailment),
        tf.reduce_mean(
            tf.multiply(dists, tf.constant(entailment_rels, dtype=tf.float32))))
    tf.summary.scalar("entailment_regularization", ent_reg)
    loss += ent_reg

    # Correlation-based regularization
    corr_reg = tf.multiply(
        tf.constant(correlation),
        tf.reduce_mean(
            tf.multiply(dists, tf.constant(corr_rels, dtype=tf.float32))))
    tf.summary.scalar("correlation_regularization", corr_reg)
    loss += corr_reg

    tf.summary.scalar("loss", loss)

    return (loss, per_example_loss, output_layer, logits, probabilities)
Example #23
 def define_vars(self) -> dict:
     return {
         "conv1_weights":
         tf.get_variable(
             name="conv1_weights",
             dtype=tf.float32,
             shape=[3, 3, NUM_CHANNELS, 32],
             initializer=tf.glorot_uniform_initializer(),
         ),
         "conv1_biases":
         tf.get_variable(
             name="conv1_biases",
             dtype=tf.float32,
             shape=[32],
             initializer=tf.zeros_initializer(),
         ),
         "conv2_weights":
         tf.get_variable(
             name="conv2_weights",
             dtype=tf.float32,
             shape=[3, 3, 32, 32],
             initializer=tf.glorot_uniform_initializer(),
         ),
         "conv2_biases":
         tf.get_variable(
             name="conv2_biases",
             dtype=tf.float32,
             shape=[32],
             initializer=tf.zeros_initializer(),
         ),
         "conv3_weights":
         tf.get_variable(
             name="conv3_weights",
             dtype=tf.float32,
             shape=[3, 3, 32, 64],
             initializer=tf.glorot_uniform_initializer(),
         ),
         "conv3_biases":
         tf.get_variable(
             name="conv3_biases",
             dtype=tf.float32,
             shape=[64],
             initializer=tf.zeros_initializer(),
         ),
         "conv4_weights":
         tf.get_variable(
             name="conv4_weights",
             dtype=tf.float32,
             shape=[3, 3, 64, 64],
             initializer=tf.glorot_uniform_initializer(),
         ),
         "conv4_biases":
         tf.get_variable(
             name="conv4_biases",
             dtype=tf.float32,
             shape=[64],
             initializer=tf.zeros_initializer(),
         ),
         "fc1_weights":
         tf.get_variable(
             name="fc1_weights",
             dtype=tf.float32,
             shape=[(((IMAGE_SIZE - 2) // 2 - 2) // 2)**2 * 64, 512],
             initializer=tf.glorot_uniform_initializer(),
         ),
         "fc1_biases":
         tf.get_variable(name="fc1_biases",
                         dtype=tf.float32,
                         shape=[512],
                         initializer=tf.zeros_initializer()),
         "fc2_weights":
         tf.get_variable(
             name="fc2_weights",
             dtype=tf.float32,
             shape=[512, NUM_CLASSES],
             initializer=tf.glorot_uniform_initializer(),
         ),
         "fc2_biases":
         tf.get_variable(
             name="fc2_biases",
             dtype=tf.float32,
             shape=[NUM_CLASSES],
             initializer=tf.zeros_initializer(),
         ),
     }
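A hedged sketch of consuming the variable dict above in a forward pass (the padding choice and layer order here are assumptions, not taken from the original):

params = self.define_vars()   # inside the same model class
net = tf.nn.conv2d(images, params["conv1_weights"], strides=[1, 1, 1, 1], padding="SAME")
net = tf.nn.relu(tf.nn.bias_add(net, params["conv1_biases"]))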
Example #24
 def build(self, _):
     self.scale = tf.get_variable("layer_norm_scale", [self.hidden_size],
                                  initializer=tf.ones_initializer())
     self.bias = tf.get_variable("layer_norm_bias", [self.hidden_size],
                                 initializer=tf.zeros_initializer())
     self.built = True
Example #25
def XceptionModel(input_image, num_classes, is_training = False, data_format='channels_last', name_prefix='', use_bn=True):
    bn_axis = -1 if data_format == 'channels_last' else 1

    # Entry Flow
    inputs = tf.layers.conv2d(input_image, 32, (3, 3), use_bias=False, name=name_prefix+'block1_conv1', strides=(2, 2),
                padding='valid', data_format=data_format, activation=None,
                kernel_initializer=tf.initializers.glorot_uniform(),
                bias_initializer=tf.zeros_initializer())
    
    
    inputs =  batch_norm_(inputs, name=name_prefix+'block1_conv1_bn', axis=bn_axis,training=is_training,reuse=None, use_bn=use_bn)
    
#     inputs = tf.layers.batch_normalization(inputs, momentum=BN_MOMENTUM, name=name_prefix+'block1_conv1_bn', axis=bn_axis,
#                             epsilon=BN_EPSILON, training=is_training, reuse=None, fused=USE_FUSED_BN)
    inputs = tf.nn.relu(inputs, name=name_prefix+'block1_conv1_act')

    inputs = tf.layers.conv2d(inputs, 64, (3, 3), use_bias=False, name=name_prefix+'block1_conv2', strides=(1, 1),
                padding='valid', data_format=data_format, activation=None,
                kernel_initializer=tf.initializers.glorot_uniform(),
                bias_initializer=tf.zeros_initializer())
    
    inputs =  batch_norm_(inputs, name=name_prefix+'block1_conv2_bn', axis=bn_axis,training=is_training,reuse=None, use_bn=use_bn)
#     inputs = tf.layers.batch_normalization(inputs, momentum=BN_MOMENTUM, name=name_prefix+'block1_conv2_bn', axis=bn_axis,
#                             epsilon=BN_EPSILON, training=is_training, reuse=None, fused=USE_FUSED_BN)
    inputs = tf.nn.relu(inputs, name=name_prefix+'block1_conv2_act')

    residual = tf.layers.conv2d(inputs, 128, (1, 1), use_bias=False, name=name_prefix+'conv2d_1', strides=(2, 2),
                padding='same', data_format=data_format, activation=None,
                kernel_initializer=tf.initializers.glorot_uniform(),
                bias_initializer=tf.zeros_initializer())
    
    residual =  batch_norm_(residual, name=name_prefix+'batch_normalization_1', axis=bn_axis,training=is_training,reuse=None, use_bn=use_bn)
#     residual = tf.layers.batch_normalization(residual, momentum=BN_MOMENTUM, name=name_prefix+'batch_normalization_1', axis=bn_axis,
#                             epsilon=BN_EPSILON, training=is_training, reuse=None, fused=USE_FUSED_BN)

    inputs = tf.layers.separable_conv2d(inputs, 128, (3, 3),
                        strides=(1, 1), padding='same',
                        data_format=data_format,
                        activation=None, use_bias=False,
                        depthwise_initializer=tf.initializers.glorot_uniform(),
                        pointwise_initializer=tf.initializers.glorot_uniform(),
                        bias_initializer=tf.zeros_initializer(),
                        name=name_prefix+'block2_sepconv1', reuse=None)
    inputs =  batch_norm_(inputs, name=name_prefix+'block2_sepconv1_bn', axis=bn_axis,training=is_training,reuse=None, use_bn=use_bn)
#     inputs = tf.layers.batch_normalization(inputs, momentum=BN_MOMENTUM, name=name_prefix+'block2_sepconv1_bn', axis=bn_axis,
#                             epsilon=BN_EPSILON, training=is_training, reuse=None, fused=USE_FUSED_BN)

    inputs = relu_separable_bn_block(inputs, 128, name_prefix+'block2_sepconv2', is_training, data_format, use_bn=use_bn)

    inputs = tf.layers.max_pooling2d(inputs, pool_size=(3, 3), strides=(2, 2),
                                    padding='same', data_format=data_format,
                                    name=name_prefix+'block2_pool')

    inputs = tf.add(inputs, residual, name=name_prefix+'residual_add_0')
    residual = tf.layers.conv2d(inputs, 128, (1, 1), use_bias=False, name=name_prefix+'conv2d_2', strides=(2, 2),
                padding='same', data_format=data_format, activation=None,
                kernel_initializer=tf.initializers.glorot_uniform(),
                bias_initializer=tf.zeros_initializer())
    
    residual =  batch_norm_(residual, name=name_prefix+'batch_normalization_2', axis=bn_axis,training=is_training,reuse=None, use_bn=use_bn)
#     residual = tf.layers.batch_normalization(residual, momentum=BN_MOMENTUM, name=name_prefix+'batch_normalization_2', axis=bn_axis,
#                             epsilon=BN_EPSILON, training=is_training, reuse=None, fused=USE_FUSED_BN)

    inputs = relu_separable_bn_block(inputs, 128, name_prefix+'block3_sepconv1', is_training, data_format, use_bn=use_bn)
    inputs = relu_separable_bn_block(inputs, 128, name_prefix+'block3_sepconv2', is_training, data_format, use_bn=use_bn)

    inputs = tf.layers.max_pooling2d(inputs, pool_size=(3, 3), strides=(2, 2),
                                    padding='same', data_format=data_format,
                                    name=name_prefix+'block3_pool')
    inputs = tf.add(inputs, residual, name=name_prefix+'residual_add_1')

    residual = tf.layers.conv2d(inputs, 256, (1, 1), use_bias=False, name=name_prefix+'conv2d_3', strides=(2, 2),
                padding='same', data_format=data_format, activation=None,
                kernel_initializer=tf.initializers.glorot_uniform(),
                bias_initializer=tf.zeros_initializer())
    residual =  batch_norm_(residual, name=name_prefix+'batch_normalization_3', axis=bn_axis,training=is_training,reuse=None, use_bn=use_bn)
#     residual = tf.layers.batch_normalization(residual, momentum=BN_MOMENTUM, name=name_prefix+'batch_normalization_3', axis=bn_axis,
#                             epsilon=BN_EPSILON, training=is_training, reuse=None, fused=USE_FUSED_BN)

    inputs = relu_separable_bn_block(inputs, 256, name_prefix+'block4_sepconv1', is_training, data_format, use_bn=use_bn)
    inputs = relu_separable_bn_block(inputs, 256, name_prefix+'block4_sepconv2', is_training, data_format, use_bn=use_bn)

    inputs = tf.layers.max_pooling2d(inputs, pool_size=(3, 3), strides=(2, 2),
                                    padding='same', data_format=data_format,
                                    name=name_prefix+'block4_pool')
    inputs = tf.add(inputs, residual, name=name_prefix+'residual_add_2')
    # Middle Flow
    for index in range(8):
        residual = inputs
        prefix = name_prefix+'block' + str(index + 5)

        inputs = relu_separable_bn_block(inputs, 256, prefix + '_sepconv1', is_training, data_format, use_bn=use_bn)
        inputs = relu_separable_bn_block(inputs, 256, prefix + '_sepconv2', is_training, data_format, use_bn=use_bn)
        inputs = relu_separable_bn_block(inputs, 256, prefix + '_sepconv3', is_training, data_format, use_bn=use_bn)
        inputs = tf.add(inputs, residual, name=prefix + '_residual_add')
    # Exit Flow
    residual = tf.layers.conv2d(inputs, 512, (1, 1), use_bias=False, name=name_prefix+'conv2d_4', strides=(2, 2),
                padding='same', data_format=data_format, activation=None,
                kernel_initializer=tf.initializers.glorot_uniform(),
                bias_initializer=tf.zeros_initializer())
    residual =  batch_norm_(residual, name=name_prefix+'batch_normalization_4', axis=bn_axis,training=is_training,reuse=None, use_bn=use_bn)
#     residual = tf.layers.batch_normalization(residual, momentum=BN_MOMENTUM, name=name_prefix+'batch_normalization_4', axis=bn_axis,
#                             epsilon=BN_EPSILON, training=is_training, reuse=None, fused=USE_FUSED_BN)

    inputs = relu_separable_bn_block(inputs, 512, name_prefix+'block13_sepconv1', is_training, data_format, use_bn=use_bn)
    inputs = relu_separable_bn_block(inputs, 512, name_prefix+'block13_sepconv2', is_training, data_format, use_bn=use_bn)

    inputs = tf.layers.max_pooling2d(inputs, pool_size=(3, 3), strides=(2, 2),
                                    padding='same', data_format=data_format,
                                    name=name_prefix+'block13_pool')
    inputs = tf.add(inputs, residual, name=name_prefix+'residual_add_3')

    inputs = tf.layers.separable_conv2d(inputs, 728, (3, 3),
                        strides=(1, 1), padding='same',
                        data_format=data_format,
                        activation=None, use_bias=False,
                        depthwise_initializer=tf.initializers.glorot_uniform(),
                        pointwise_initializer=tf.initializers.glorot_uniform(),
                        bias_initializer=tf.zeros_initializer(),
                        name=name_prefix+'block14_sepconv1', reuse=None)
    inputs =  batch_norm_(inputs, name=name_prefix+'block14_sepconv1_bn', axis=bn_axis,training=is_training,reuse=None, use_bn=use_bn)
#     inputs = tf.layers.batch_normalization(inputs, momentum=BN_MOMENTUM, name=name_prefix+'block14_sepconv1_bn', axis=bn_axis,
#                             epsilon=BN_EPSILON, training=is_training, reuse=None, fused=USE_FUSED_BN)
    inputs = tf.nn.relu(inputs, name=name_prefix+'block14_sepconv1_act')

    inputs = tf.layers.separable_conv2d(inputs, 728, (3, 3),
                        strides=(1, 1), padding='same',
                        data_format=data_format,
                        activation=None, use_bias=False,
                        depthwise_initializer=tf.initializers.glorot_uniform(),
                        pointwise_initializer=tf.initializers.glorot_uniform(),
                        bias_initializer=tf.zeros_initializer(),
                        name=name_prefix+'block14_sepconv2', reuse=None)
    inputs =  batch_norm_(inputs, name=name_prefix+'block14_sepconv2_bn', axis=bn_axis,training=is_training,reuse=None, use_bn=use_bn)

#     inputs = tf.layers.batch_normalization(inputs, momentum=BN_MOMENTUM, name=name_prefix+'block14_sepconv2_bn', axis=bn_axis,
#                             epsilon=BN_EPSILON, training=is_training, reuse=None, fused=USE_FUSED_BN)
    inputs = tf.nn.relu(inputs, name=name_prefix+'block14_sepconv2_act')

    if data_format == 'channels_first':
        channels_last_inputs = tf.transpose(inputs, [0, 2, 3, 1])
    else:
        channels_last_inputs = inputs

    inputs = tf.layers.average_pooling2d(inputs, pool_size = reduced_kernel_size_for_small_input(channels_last_inputs, [10, 10]), strides = 1, padding='valid', data_format=data_format, name=name_prefix+'avg_pool')

    if data_format == 'channels_first':
        inputs = tf.squeeze(inputs, axis=[2, 3])
    else:
        inputs = tf.squeeze(inputs, axis=[1, 2])

    outputs = tf.layers.dense(inputs, num_classes,
                            activation=tf.nn.softmax, use_bias=True,
                            kernel_initializer=tf.initializers.glorot_uniform(),
                            bias_initializer=tf.zeros_initializer(),
                            name=name_prefix+'dense', reuse=None)

    return outputs
Example #26
def conv2d(
    x,
    kernel_size,
    stride,
    channels,
    is_training,
    scope='conv2d',
    batch_norm=False,
    residual=False,
    gated=False,
    activation_fn=tf.nn.relu,
    resize=False,
    transpose=False,
    stacked_layers=1,
):
    """2D-Conv with optional batch_norm, gating, residual.

  Args:
    x: Tensor input [MB, H, W, CH].
    kernel_size: List [H, W].
    stride: List [H, W].
    channels: Int, output channels.
    is_training: Whether to collect stats for BatchNorm.
    scope: Enclosing scope name.
    batch_norm: Apply batch normalization
    residual: Residual connections, have stacked_layers >= 2.
    gated: Gating ala Wavenet.
    activation_fn: Nonlinearity function.
    resize: On transposed convolution, do ImageResize instead of conv_transpose.
    transpose: Use conv_transpose instead of conv.
    stacked_layers: Number of layers before a residual connection.

  Returns:
    x: Tensor output.
  """
    # For residual
    x0 = x
    # Choose convolution function
    conv_fn = slim.conv2d_transpose if transpose else slim.conv2d
    # Double output channels for gates
    num_outputs = channels * 2 if gated else channels
    normalizer_fn = slim.batch_norm if batch_norm else None

    with tf.variable_scope(scope + '_Layer'):
        # Apply a stack of convolutions Before adding residual
        for layer_idx in range(stacked_layers):
            with slim.arg_scope(
                    slim_batchnorm_arg_scope(is_training, activation_fn=None)):
                # Use interpolation to upsample instead of conv_transpose
                if transpose and resize:
                    unused_mb, h, w, unused_ch = x.get_shape().as_list()
                    x = tf.image.resize_images(
                        x, size=[h * stride[0], w * stride[1]], method=0)
                    stride_conv = [1, 1]
                else:
                    stride_conv = stride

                x = conv_fn(
                    inputs=x,
                    stride=stride_conv,
                    kernel_size=kernel_size,
                    num_outputs=num_outputs,
                    normalizer_fn=normalizer_fn,
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope,
                )

                if gated:
                    with tf.variable_scope('Gated'):
                        x1, x2 = x[:, :, :, :channels], x[:, :, :, channels:]
                        if activation_fn:
                            x1, x2 = activation_fn(x1), tf.sigmoid(x2)
                        else:
                            x2 = tf.sigmoid(x2)
                        x = x1 * x2

                # Apply residual to last layer  before the last nonlinearity
                if residual and (layer_idx == stacked_layers - 1):
                    with tf.variable_scope('Residual'):
                        # Don't upsample residual in time
                        if stride[0] == 1 and stride[1] == 1:
                            channels_in = x0.get_shape().as_list()[-1]
                            # Make n_channels match for residual
                            if channels != channels_in:
                                x0 = slim.conv2d(
                                    inputs=x0,
                                    stride=[1, 1],
                                    kernel_size=[1, 1],
                                    num_outputs=channels,
                                    normalizer_fn=None,
                                    activation_fn=None,
                                    biases_initializer=tf.zeros_initializer,
                                    scope=scope + '_residual',
                                )
                                x += x0
                            else:
                                x += x0
                if activation_fn and not gated:
                    x = activation_fn(x)
        return x
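A hedged usage sketch (assumes TF 1.x, `tf.contrib.slim`, and the `slim_batchnorm_arg_scope` helper referenced inside the function; shapes are illustrative):

x_in = tf.placeholder(tf.float32, [None, 128, 128, 1])
net = conv2d(x_in, kernel_size=[4, 4], stride=[2, 2], channels=32,
             is_training=True, scope='enc1', batch_norm=True)
net = conv2d(net, kernel_size=[4, 4], stride=[2, 2], channels=64,
             is_training=True, scope='enc2', batch_norm=True, gated=True)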
Example #27
File: lm.py Project: yyht/lamb
        def output_module_1(outputs):
            with tf.variable_scope('om', initializer=output_initializer):
                # Create the matrix and bias for the final projection into the softmax.
                if config.share_input_and_output_embeddings:
                    assert config.embed_once, 'Not implemented.'
                    softmax_weights = embedding
                    softmax_weights_transpose = True
                else:
                    softmax_weights = tf.get_variable(
                        'weights',
                        [config.output_embedding_size, config.vocab_size],
                        dtype=tf.float32)
                    softmax_weights_transpose = False
                softmax_bias = tf.get_variable(
                    'bias', [1, config.vocab_size],
                    initializer=tf.zeros_initializer(),
                    dtype=tf.float32)

                def to_softmax(x, dropout=self.downprojected_output_dropout):
                    if dropout is not None:
                        if not config.shared_mask_dropout:
                            x = tf.nn.dropout(x, 1.0 - dropout)
                        else:
                            x = tf.reshape(x, t_bk_o)
                            x = tf.nn.dropout(
                                x,
                                1.0 - dropout,
                                # same mask for all time steps
                                noise_shape=[
                                    1, batch_size *
                                    (config.mos_num_components or 1),
                                    config.output_embedding_size
                                ])
                            x = tf.reshape(x, tbk_o)
                    return (self.softmax_temperature *
                            (tf.matmul(x,
                                       softmax_weights,
                                       transpose_b=softmax_weights_transpose) +
                             softmax_bias))

                last_hidden_size = utils.ensure_list(config.hidden_size)[-1]
                outputs_t_b_h = tf.convert_to_tensor(outputs)
                if self.output_dropout is not None:
                    if not config.shared_mask_dropout:
                        outputs_t_b_h = tf.nn.dropout(
                            outputs_t_b_h, 1.0 - self.output_dropout)
                    else:
                        outputs_t_b_h = tf.nn.dropout(
                            outputs_t_b_h,
                            1.0 - self.output_dropout,
                            noise_shape=[1, batch_size, last_hidden_size])
                outputs_tb_h = tf.reshape(outputs_t_b_h, tb_h)

                if config.mos_num_components == 0:
                    if config.output_embedding_size == last_hidden_size:
                        return (tf.reshape(to_softmax(outputs_tb_h, None),
                                           t_b_v), outputs_t_b_h)
                    else:
                        downprojected_outputs_tb_o = utils.linear(
                            outputs_tb_h,
                            config.output_embedding_size,
                            False,
                            initializer=utils.orthogonal_initializer(),
                            scope='projection')
                        logits_tb_v = to_softmax(downprojected_outputs_tb_o)
                        return tf.reshape(logits_tb_v, t_b_v), outputs_t_b_h
                else:
                    logits_tb_v = utils.mixture_of_softmaxes(
                        outputs_tb_h, config.mos_num_components,
                        config.output_embedding_size, to_softmax)
                    return tf.reshape(logits_tb_v, t_b_v), outputs_t_b_h
Example #28
0
def build_and_train(iterations, log_stride, test=False):
    """Construct the data, model, loss and optimizer then train."""

    # Test mode settings.
    batch_size = 2 if test else FLAGS.batch_size
    num_mems = 2 if test else FLAGS.num_mems
    num_heads = 1 if test else FLAGS.num_heads
    num_blocks = 1 if test else FLAGS.num_blocks
    head_size = 4 if test else FLAGS.head_size
    num_objects = 2 if test else FLAGS.num_objects
    num_features = 4 if test else FLAGS.num_features
    mlp_size = (20, ) if test else (256, 256, 256, 256)

    with tf.Graph().as_default():
        t0 = time.time()

        # Initialize the dataset.
        dataset = dataset_nth_farthest.NthFarthest(batch_size, num_objects,
                                                   num_features)

        # Create the model.
        core = snt.RelationalMemory(mem_slots=num_mems,
                                    head_size=head_size,
                                    num_heads=num_heads,
                                    num_blocks=num_blocks,
                                    gate_style=FLAGS.gate_style)

        final_mlp = snt.nets.MLP(output_sizes=mlp_size, activate_final=True)

        model = SequenceModel(core=core,
                              target_size=num_objects,
                              final_mlp=final_mlp)

        tf.logging.info("Instantiated models ({:3f})".format(time.time() - t0))

        # Get train and test data.
        inputs_train, labels_train = dataset.get_batch()
        inputs_test, labels_test = dataset.get_batch()

        # Define target accuracy.
        def compute_accuracy(logits, targets, name="accuracy"):
            correct_pred = tf.cast(
                tf.equal(tf.cast(targets, tf.int64), tf.argmax(logits, 1)),
                tf.float32)
            return tf.reduce_mean(correct_pred, name=name)

        # Define the loss & accuracy.
        def loss_fn(inputs, labels):
            """Creates the loss and the exports."""
            logits = model(inputs)
            labels = tf.cast(labels, tf.int32)
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                               labels=labels))
            accuracy = compute_accuracy(logits, labels)
            return loss, accuracy

        # Get training step counter.
        global_step = tf.get_variable(name="global_step",
                                      shape=[],
                                      dtype=tf.int64,
                                      initializer=tf.zeros_initializer(),
                                      trainable=False,
                                      collections=[
                                          tf.GraphKeys.GLOBAL_VARIABLES,
                                          tf.GraphKeys.GLOBAL_STEP
                                      ])

        # Create the optimizer.
        learning_rate_op = tf.reduce_max([
            tf.train.exponential_decay(FLAGS.learning_rate,
                                       global_step,
                                       decay_steps=FLAGS.epochs // 100,
                                       decay_rate=0.9,
                                       staircase=False),
            FLAGS.min_learning_rate
        ])
        optimizer = tf.train.AdamOptimizer(learning_rate_op)
        train_loss, _ = loss_fn(inputs_train, labels_train)
        step_op = optimizer.minimize(train_loss, global_step=global_step)

        # Compute test accuracy
        logits_test = model(inputs_test)
        labels_test = tf.cast(labels_test, tf.int32)
        test_acc = compute_accuracy(logits_test, labels_test)

        tf.logging.info(
            "Created losses and optimizers ({:3f})".format(time.time() - t0))

        # Begin Training.
        t0 = time.time()
        train_losses = []
        steps = []
        test_accs = []
        tf.logging.info("Starting training ({:3f})".format(time.time() - t0))
        with tf.train.SingularMonitoredSession() as sess:
            for it in six.moves.range(iterations):
                sess.run([step_op, learning_rate_op])
                if it % log_stride == 0:
                    loss_v, acc_v = sess.run([train_loss, test_acc])
                    elapsed = time.time() - t0
                    tf.logging.info(
                        "iter: {:2d}, train loss {:3f}; test acc {:3f} ({:3f})"
                        .format(it, loss_v, acc_v, elapsed))
                    train_losses.append(loss_v)
                    steps.append(it)
                    test_accs.append(acc_v)
    return steps, train_losses, test_accs
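
As a side note, the learning-rate construction above is an exponential decay with a hard floor: taking tf.reduce_max over the decayed value and FLAGS.min_learning_rate simply keeps the rate from falling below the minimum. A plain-Python restatement for clarity; the hyper-parameter values below are made up.

def decayed_lr(step, initial_lr=1e-3, min_lr=8e-5, decay_steps=100, decay_rate=0.9):
    # staircase=False means continuous decay: decay_rate ** (step / decay_steps)
    lr = initial_lr * decay_rate ** (step / decay_steps)
    return max(lr, min_lr)  # the tf.reduce_max above plays the role of this floor

for step in (0, 500, 5000):
    print(step, decayed_lr(step))
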
Example #29
0
def evonorm(inputs,
            is_training,
            layer=LAYER_EVONORM_B0,
            nonlinearity=True,
            init_zero=False,
            decay=MOVING_AVERAGE_DECAY,
            epsilon=EPSILON,
            num_groups=32,
            data_format='channels_first'):
    """Apply an EvoNorm transformation (an alternative to BN-ReLU).

     Hanxiao Liu, Andrew Brock, Karen Simonyan, Quoc V. Le.
     Evolving Normalization-Activation Layers.
     https://arxiv.org/abs/2004.02967

  Args:
    inputs: `Tensor` whose shape is either `[batch, channels, ...]` with
        the "channels_first" format or `[batch, height, width, channels]`
        with the "channels_last" format.
    is_training: `bool` for whether the model is training.
    layer: `String` specifies the EvoNorm instantiation.
    nonlinearity: `bool` if False, apply an affine transform only.
    init_zero: `bool` if True, initializes scale parameter of batch
        normalization with 0 instead of 1 (default).
    decay: `float` a scalar decay used in the moving average.
    epsilon: `float` a small float added to variance to avoid dividing by zero.
    num_groups: `int` the number of groups per layer, used only when `layer` ==
        LAYER_EVONORM_S0.
    data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last for `[batch, height, width, channels]`.

  Returns:
    A normalized `Tensor` with the same `data_format`.
  """
    if init_zero:
        gamma_initializer = tf.zeros_initializer()
    else:
        gamma_initializer = tf.ones_initializer()

    if data_format == 'channels_last':
        var_shape = (1, 1, 1, inputs.shape[3])
    else:
        var_shape = (1, inputs.shape[1], 1, 1)
    with tf.variable_scope(None, default_name='evonorm'):
        beta = tf.get_variable('beta',
                               shape=var_shape,
                               dtype=inputs.dtype,
                               initializer=tf.zeros_initializer())
        gamma = tf.get_variable('gamma',
                                shape=var_shape,
                                dtype=inputs.dtype,
                                initializer=gamma_initializer)
        if nonlinearity:
            v = tf.get_variable('v',
                                shape=var_shape,
                                dtype=inputs.dtype,
                                initializer=tf.ones_initializer())
            if layer == LAYER_EVONORM_S0:
                den = _group_std(inputs,
                                 epsilon=epsilon,
                                 data_format=data_format,
                                 num_groups=num_groups)
                inputs = inputs * tf.nn.sigmoid(v * inputs) / den
            elif layer == LAYER_EVONORM_B0:
                left = _batch_std(inputs,
                                  decay=decay,
                                  epsilon=epsilon,
                                  data_format=data_format,
                                  training=is_training)
                right = v * inputs + _instance_std(
                    inputs, epsilon=epsilon, data_format=data_format)
                inputs = inputs / tf.maximum(left, right)
            else:
                raise ValueError('Unknown EvoNorm layer: {}'.format(layer))
    return inputs * gamma + beta
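
To make the LAYER_EVONORM_B0 branch above concrete, here is a hedged NumPy restatement of the B0 transform for "channels_last" inputs. It deliberately ignores the moving-average tracking that _batch_std presumably performs for inference; only the per-batch math is shown.

import numpy as np

def evonorm_b0_sketch(x, v, gamma, beta, eps=1e-5):
    # x: [batch, height, width, channels]; v, gamma, beta broadcast as [1, 1, 1, channels].
    batch_std = np.sqrt(x.var(axis=(0, 1, 2), keepdims=True) + eps)   # over batch + spatial
    instance_std = np.sqrt(x.var(axis=(1, 2), keepdims=True) + eps)   # per sample, over spatial
    denominator = np.maximum(batch_std, v * x + instance_std)
    return x / denominator * gamma + beta
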
Example #30
0
def model_creation(neurons, nb_features, nb_targets, learning_rate):
    # Session
    sess = tf.InteractiveSession()

    # Placeholders
    X = tf.placeholder(tf.float32, shape=[None, nb_features])
    Y = tf.placeholder(tf.float32, shape=[None, nb_targets])

    # The network must contain at least one hidden layer
    if len(neurons) < 1:
        raise ValueError("You must have at least one hidden layer")

    weight_initializer = tf.variance_scaling_initializer(
        mode="fan_avg", distribution="uniform", scale=1)
    bias_initializer = tf.zeros_initializer()
    layers_dict = {}  # holds every weight, bias and layer tensor, keyed by name

    # Hidden weights and biases
    for id in range(len(neurons)):
        if id == 0:
            layers_dict["weight_hidden_" + str(id)] = tf.Variable(
                weight_initializer([nb_features, neurons[id]]))
            layers_dict["bias_hidden_" + str(id)] = tf.Variable(
                bias_initializer([neurons[id]]))
        else:
            layers_dict["weight_hidden_" + str(id)] = tf.Variable(
                weight_initializer([neurons[id - 1], neurons[id]]))
            layers_dict["bias_hidden_" + str(id)] = tf.Variable(
                bias_initializer([neurons[id]]))

    # Output layer weights and bias
    layers_dict["weight_out"] = tf.Variable(
        weight_initializer([neurons[-1], nb_targets]))
    layers_dict["bias_out"] = tf.Variable(bias_initializer([nb_targets]))

    # Hidden layers
    for id in range(len(neurons)):
        if id == 0:
            layers_dict["hidden_layer_" + str(id)] = tf.sigmoid(
                tf.add(tf.matmul(X, layers_dict["weight_hidden_" + str(id)]),
                       layers_dict["bias_hidden_" + str(id)]))
        else:
            layers_dict["hidden_layer_" + str(id)] = tf.sigmoid(
                tf.add(
                    tf.matmul(layers_dict["hidden_layer_" + str(id - 1)],
                              layers_dict["weight_hidden_" + str(id)]),
                    layers_dict["bias_hidden_" + str(id)]))

    # Output layer, kept as [batch, nb_targets] so it matches the shape of Y
    layers_dict["output_layer"] = tf.abs(
        tf.add(
            tf.matmul(layers_dict["hidden_layer_" + str(len(neurons) - 1)],
                      layers_dict["weight_out"]),
            layers_dict["bias_out"]))

    # Cost function: root-mean-squared error between predictions and targets
    mse = tf.sqrt(
        tf.reduce_mean(tf.squared_difference(layers_dict["output_layer"], Y)))

    # Optimizer
    opt = tf.train.AdamOptimizer(learning_rate).minimize(mse)

    # Init
    sess.run(tf.global_variables_initializer())

    return (X, Y, sess, opt, mse, layers_dict)
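
A hedged usage sketch of model_creation with synthetic data, assuming the same TF1-style graph/session environment as the function itself; the layer sizes, feature counts, and the tiny training loop below are illustrative only.

import numpy as np

X, Y, sess, opt, mse, _ = model_creation(neurons=[64, 32], nb_features=10,
                                         nb_targets=1, learning_rate=1e-3)

x_train = np.random.rand(256, 10).astype(np.float32)
y_train = np.random.rand(256, 1).astype(np.float32)

for epoch in range(100):
    _, cost = sess.run([opt, mse], feed_dict={X: x_train, Y: y_train})

print("final training RMSE:", cost)
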