Example #1
def create_model(arch, batch_size, dtype, optimizer):
  with tf.device('gpu:0'):
    model, features = get_model(transformer.transformer_big(), batch_size=batch_size)
    out_logits, _ = model(features)
    out_logits = tf.squeeze(out_logits, axis=[2, 3])
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
        labels=tf.reshape(features["targets"], [-1]))
    loss = tf.reduce_mean(loss)
    if optimizer == 'GD':
        train_op = tf.train.GradientDescentOptimizer(1e-3).minimize(loss)
    elif optimizer == 'Adam':
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
    else:
        raise ValueError('unsupported optimizer: %s' % optimizer)
    init = tf.initializers.global_variables()
    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        total_parameters += variable_parameters
    print('total num. of parameters: %d' % total_parameters)
  print(tf.trainable_variables())
  return train_op, init
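For context, a minimal driver for the example above might look like the following sketch. It assumes the project's get_model() and VOCAB_SIZE are in scope and uses the TF1 graph/session API; the argument values are purely illustrative.

import tensorflow as tf

def run_training(num_steps=100):
    # Build the graph once, then run the returned ops in a session.
    train_op, init = create_model(arch='transformer_big', batch_size=2048,
                                  dtype=tf.float32, optimizer='Adam')
    with tf.Session() as sess:
        sess.run(init)
        for _ in range(num_steps):
            sess.run(train_op)  # one optimizer step per call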
Example #2
def evolved_transformer_deep():
  """Deep parameters for Evolved Transformer model on WMT."""
  hparams = add_evolved_transformer_hparams(transformer.transformer_big())
  hparams.num_encoder_layers = 9
  hparams.num_decoder_layers = 10
  hparams.hidden_size = 640
  return hparams
Example #3
def transformer_delib_big_v2():
    """HParams for transfomer big delibnet model on WMT."""
    hparams = transformer.transformer_big()
    hparams.add_hparam("delib_layers", "")
    hparams.add_hparam("update_delib_only", True)
    hparams.shared_embedding_and_softmax_weights = int(False)
    return hparams
Example #4
def transformer_tall9():
    hparams = transformer.transformer_big()
    hparams.hidden_size = 768
    hparams.filter_size = 3072
    hparams.num_hidden_layers = 9
    hparams.num_heads = 12
    return hparams
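The hparams functions collected on this page normally carry a registration decorator in their source projects (omitted in the snippets here). A sketch of the usual tensor2tensor pattern, using a hypothetical set name for illustration:

from tensor2tensor.models import transformer
from tensor2tensor.utils import registry

@registry.register_hparams
def transformer_tall9_example():
    """Registered under its function name; selectable via --hparams_set."""
    hparams = transformer.transformer_big()
    hparams.hidden_size = 768
    hparams.num_hidden_layers = 9
    return hparams

Once registered, such a set is typically picked on the command line, e.g. t2t-trainer --model=transformer --hparams_set=transformer_tall9_example together with the usual --problem, --data_dir and --output_dir flags.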
Example #5
File: model.py  Project: thtrieu/dab
def transformer_tall9():
    hparams = transformer.transformer_big()
    hparams.hidden_size = 768
    hparams.filter_size = 3072
    hparams.num_hidden_layers = 9
    hparams.num_heads = 12
    hparams.add_hparam("extra_tokens", 8)
    return hparams
Example #6
def zhen_wmt17_transformer_rl_delta_setting():
    # beam search + reward shaping
    hparams = transformer.transformer_big()
    hparams.shared_embedding_and_softmax_weights = 0
    hparams.layer_prepostprocess_dropout = 0.05
    hparams.learning_rate = 0.1
    hparams.rl = True
    hparams.delta_reward = True  # reward shaping
    return hparams
Example #7
def transformer_textclass_big():
    hparams = transformer.transformer_big()
    hparams.layer_prepostprocess_dropout = 0.1
    hparams.learning_rate_warmup_steps = 50
    hparams.learning_rate_constant = 6.25e-6
    hparams.learning_rate_schedule = ("linear_warmup*constant*linear_decay")
    # Set train steps to learning_rate_decay_steps or less
    hparams.learning_rate_decay_steps = 20000
    return hparams
Example #8
def zhen_wmt17_transformer_rl_total_setting():
    # beam search + terminal reward
    hparams = transformer.transformer_big()
    hparams.shared_embedding_and_softmax_weights = 0
    hparams.layer_prepostprocess_dropout = 0.05
    hparams.learning_rate = 0.1
    hparams.rl = True
    hparams.delta_reward = False  # terminal reward
    return hparams
Example #9
def eten_transformer_rl_total_setting_random():
    # multinomial sampling + terminal reward
    hparams = transformer.transformer_big()
    hparams.shared_embedding_and_softmax_weights = 0
    hparams.layer_prepostprocess_dropout = 0.05
    hparams.learning_rate = 0.1
    hparams.sampling_method = "random"  # multinomial sampling
    hparams.rl = True
    hparams.delta_reward = False  # terminal reward
    return hparams
Example #10
def transformer_revnet_base():
    """Base hparams for TransformerRevnet."""
    hparams = transformer.transformer_big()

    # Use settings from transformer_n_da:
    # "n" = layer norm before each sublayer, "da" = dropout then residual add after it.
    hparams.layer_preprocess_sequence = "n"
    hparams.layer_postprocess_sequence = "da"
    hparams.learning_rate = 0.4

    return hparams
Example #11
def zhen_wmt17_transformer_rl_delta_setting_random():
    # multinomial sampling + reward shaping
    hparams = transformer.transformer_big()
    hparams.shared_embedding_and_softmax_weights = 0
    hparams.layer_prepostprocess_dropout = 0.05
    hparams.learning_rate = 0.1
    hparams.sampling_method = "random"  # multinomial sampling
    hparams.rl = True
    hparams.delta_reward = True  # reward shaping
    return hparams
Example #12
def transformer_revnet_base():
  """Base hparams for TransformerRevnet."""
  hparams = transformer.transformer_big()

  # Use settings from transformer_n_da
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"
  hparams.learning_rate = 0.4

  return hparams
Example #13
def zhen_wmt17_transformer_rl_delta_setting_random_baseline():
    hparams = transformer.transformer_big()
    hparams.shared_embedding_and_softmax_weights = 0
    hparams.layer_prepostprocess_dropout = 0.05
    hparams.learning_rate = 0.1
    hparams.sampling_method = "random"
    hparams.baseline_loss_weight = 1.0
    hparams.training_loss_weight = 0.0
    hparams.rl = True
    hparams.delta_reward = True
    return hparams
Example #14
def eten_transformer_rl_delta_setting_random_mrt():
    # multinomial sampling + reward shaping + mrt
    hparams = transformer.transformer_big()
    hparams.shared_embedding_and_softmax_weights = 0
    hparams.layer_prepostprocess_dropout = 0.05
    hparams.learning_rate = 0.1
    hparams.sampling_method = "random"  # multinomial sampling
    hparams.mrt_samples = 50  # mrt samples candidates num
    hparams.rl = True
    hparams.delta_reward = True  # reward shaping
    return hparams
Example #15
def eten_transformer_rl_delta_setting_random_mle():
    hparams = transformer.transformer_big()
    hparams.shared_embedding_and_softmax_weights = 0
    hparams.layer_prepostprocess_dropout = 0.05
    hparams.learning_rate = 0.1
    hparams.sampling_method = "random"
    hparams.combine_mle = True
    hparams.mle_training_loss_weight = 0.3
    hparams.training_loss_weight = 0.7
    hparams.rl = True
    hparams.delta_reward = True
    return hparams
Example #16
def transformer_big_gec():
    hparams = transformer_big()
    hparams.max_length = 150
    hparams.batch_size = 2048
    hparams.learning_rate = 0.0002
    hparams.learning_rate_warmup_steps = 8000
    hparams.layer_prepostprocess_dropout = 0.3
    hparams.attention_dropout = 0.1
    hparams.relu_dropout = 0.1
    hparams.label_smoothing = 0.1
    hparams.learning_rate_schedule = (
        "constant*linear_warmup*rsqrt_decay*rsqrt_hidden_size")
    hparams.shared_embedding_and_softmax_weights = True

    return hparams
Example #17
def sparse_transformer_imagenet64x64():
    """HParams for training image_imagenet64_gen_flat_rev."""
    hparams = transformer.transformer_big()

    hparams.num_heads = 8
    hparams.max_length = 64 * 64 * 3

    # Batch size refers to examples (not tokens).
    hparams.batch_size = 1
    hparams.shared_embedding_and_softmax_weights = False

    hparams.num_hidden_layers = 3
    hparams.attention_dropout = 0.1
    hparams.layer_prepostprocess_dropout = 0.2
    hparams.relu_dropout = 0.1
    hparams.label_smoothing = 0.0

    ##
    ### Memory usage & TPU hparams.
    ##

    # Adafactor uses less memory than Adam. Switch to Adafactor with
    # its recommended learning rate scheme.
    hparams.optimizer = "Adafactor"
    hparams.learning_rate_schedule = "rsqrt_decay"
    hparams.learning_rate_warmup_steps = 10000

    # Using noise broadcast in the dropout layers saves memory during training.
    hparams.attention_dropout_broadcast_dims = "0,1"  # batch, heads
    hparams.relu_dropout_broadcast_dims = "1"  # length
    hparams.layer_prepostprocess_dropout_broadcast_dims = "1"  # length

    # Avoid an expensive concat on TPU.
    hparams.symbol_modality_num_shards = 1

    hparams.add_hparam("sparse_attention_mode", "masked")
    hparams.add_hparam("sparse_attention_type", "band_and_decay")
    hparams.add_hparam("band_size", 256)
    hparams.add_hparam("sparsity", 0.95)
    return hparams
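The custom keys at the end of this example rely on the HParams add_hparam pattern. A small sketch of the behaviour, assuming the tensor2tensor / tf.contrib HParams implementation used throughout these examples:

from tensor2tensor.models import transformer

hparams = transformer.transformer_big()
hparams.add_hparam("band_size", 256)   # a new key becomes a regular attribute
print(hparams.band_size)               # -> 256
hparams.band_size = 128                # existing keys are plain assignments
try:
    hparams.add_hparam("band_size", 64)
except ValueError:
    print("add_hparam refuses to redefine an existing key")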
Example #18
    def encode(self, indexed_batch, mask):
        """Take in a batch of encoded sentences formed of word indices, return a batch of sentence vectors."""

        word_embeddings = tf.get_variable(
            name='word_embeddings',
            shape=[FLAGS.vocab_size, FLAGS.wembedding_size],
            initializer=tf.random_uniform_initializer(),
            trainable=True)

        # Mask the padded word embeddings
        words = tf.nn.embedding_lookup(word_embeddings, indexed_batch)
        words = tf.multiply(
            words, tf.cast(tf.expand_dims(mask, -1), dtype=tf.float32))
        words = tf.expand_dims(words, 1)

        transformer_params = transformer.transformer_big()
        transformer_params.num_heads = 5
        transformer_params.hidden_size = FLAGS.wembedding_size

        # Transformer encoder outputs shape [BatchSize MaxLength HiddenSize]
        tfmr = transformer.Transformer(transformer_params,
                                       mode=tf.estimator.ModeKeys.TRAIN)
        target_space_id = tf.constant(1, dtype=tf.int32)
        encoder_output, _ = tfmr.encode(words, target_space_id,
                                        transformer_params)

        # Use a linear transform to map onto shape [BatchSize, SentenceEmbeddingSize]
        encoder_output = tf.reshape(encoder_output, [FLAGS.batch_size, -1])
        matrix_shape = [
            FLAGS.wembedding_size * FLAGS.max_sentence_length,
            FLAGS.embedding_size
        ]
        matrix = tf.random_normal(matrix_shape,
                                  dtype=tf.float32,
                                  name='linear_layer')
        linear_transform = tf.Variable(matrix)
        sentence_embeddings = tf.matmul(encoder_output, linear_transform)
        return sentence_embeddings
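A hypothetical call site for the encode() method above (the enclosing class and its FLAGS definitions are not shown on this page); the shapes follow the code: both inputs are [batch_size, max_sentence_length].

import tensorflow as tf

indexed_batch = tf.placeholder(
    tf.int32, shape=[FLAGS.batch_size, FLAGS.max_sentence_length])
mask = tf.placeholder(
    tf.int32, shape=[FLAGS.batch_size, FLAGS.max_sentence_length])
# "encoder" stands in for an instance of the (unshown) enclosing class.
sentence_embeddings = encoder.encode(indexed_batch, mask)
# sentence_embeddings: [FLAGS.batch_size, FLAGS.embedding_size]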
Example #19
def transformer_big_large_batch128():
    hparams = transformer_big()
    hparams.batch_size = 1024
    hparams.optimizer_multistep_accumulate_steps = 128
    hparams.optimizer = "MultistepAdam"
    return hparams
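A rough reading of the settings above (assuming tensor2tensor's MultistepAdam, which accumulates gradients over optimizer_multistep_accumulate_steps mini-batches before applying one Adam update, and that batch_size counts tokens for text problems):

batch_size_tokens = 1024
accumulate_steps = 128
effective_batch_tokens = batch_size_tokens * accumulate_steps
print(effective_batch_tokens)  # 131072 tokens contribute to each parameter update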
Example #20
def universal_transformer_base1():
  hparams = transformer.transformer_big()
  hparams = update_hparams_for_universal_transformer(hparams)
  return hparams
Example #21
def universal_transformer_big1():
  hparams = transformer.transformer_big()
  hparams = update_hparams_for_universal_transformer(hparams)
  hparams.hidden_size = 2048
  hparams.filter_size = 8192
  return hparams
Example #22
def transformer_big_bs1():
    hparams = transformer.transformer_big()
    hparams.add_hparam("block_size", 1)
    return hparams
Example #23
def universal_transformer_big():
  hparams = transformer.transformer_big()
  hparams = update_hparams_for_universal_transformer(hparams)
  hparams.hidden_size = 2048
  hparams.filter_size = 8192
  return hparams
Example #24
def universal_transformer_base():
  hparams = transformer.transformer_big()
  hparams = update_hparams_for_universal_transformer(hparams)
  return hparams
Example #25
def r_transformer_big():
    hparams = transformer.transformer_big()
    hparams = update_hparams_for_r_transformer(hparams)
    return hparams
Example #26
def evolved_transformer_big():
    """Big parameters for Evolved Transformer model on WMT."""
    return add_evolved_transformer_hparams(transformer.transformer_big())
Example #27
def transformer_big_large_batch2():
    hparams = transformer_big()
    #hparams.batch_size = 2048
    hparams.optimizer_multistep_accumulate_steps = 2
    hparams.optimizer = "MultistepAdam"
    return hparams
Example #28
def universal_transformer_base_tpu():
    hparams = transformer.transformer_big()
    hparams = update_hparams_for_universal_transformer(hparams)
    transformer.update_hparams_for_tpu(hparams)
    hparams.add_step_timing_signal = False
    return hparams