def create_model(arch, batch_size, dtype, optimizer):
  with tf.device('gpu:0'):
    model, features = get_model(transformer.transformer_big(),
                                batch_size=batch_size)
    out_logits, _ = model(features)
    out_logits = tf.squeeze(out_logits, axis=[2, 3])
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
        labels=tf.reshape(features["targets"], [-1]))
    loss = tf.reduce_mean(loss)
    if optimizer == 'GD':
      train_op = tf.train.GradientDescentOptimizer(1e-3).minimize(loss)
    elif optimizer == 'Adam':
      train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
    init = tf.initializers.global_variables()

    total_parameters = 0
    for variable in tf.trainable_variables():
      # shape is an array of tf.Dimension
      shape = variable.get_shape()
      variable_parameters = 1
      for dim in shape:
        variable_parameters *= dim.value
      total_parameters += variable_parameters
    print('total num. of parameters: %d' % total_parameters)
    print(tf.trainable_variables())
    return train_op, init
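A minimal sketch of how create_model could be driven in a TF1 graph/session workflow; the argument values and the step count are illustrative assumptions, not part of the original benchmark code:

import tensorflow as tf

# Illustrative only: arch and dtype are accepted but not used by create_model above.
train_op, init = create_model(arch='transformer_big', batch_size=32,
                              dtype=tf.float32, optimizer='Adam')
with tf.Session() as sess:
  sess.run(init)
  for _ in range(100):  # illustrative number of benchmark steps
    sess.run(train_op)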
def evolved_transformer_deep():
  """Deep parameters for Evolved Transformer model on WMT."""
  hparams = add_evolved_transformer_hparams(transformer.transformer_big())
  hparams.num_encoder_layers = 9
  hparams.num_decoder_layers = 10
  hparams.hidden_size = 640
  return hparams
def transformer_delib_big_v2():
  """HParams for transformer big delibnet model on WMT."""
  hparams = transformer.transformer_big()
  hparams.add_hparam("delib_layers", "")
  hparams.add_hparam("update_delib_only", True)
  hparams.shared_embedding_and_softmax_weights = int(False)
  return hparams
def transformer_tall9():
  hparams = transformer.transformer_big()
  hparams.hidden_size = 768
  hparams.filter_size = 3072
  hparams.num_hidden_layers = 9
  hparams.num_heads = 12
  return hparams
def transformer_tall9():
  hparams = transformer.transformer_big()
  hparams.hidden_size = 768
  hparams.filter_size = 3072
  hparams.num_hidden_layers = 9
  hparams.num_heads = 12
  hparams.add_hparam("extra_tokens", 8)
  return hparams
def zhen_wmt17_transformer_rl_delta_setting():
  # beam search + reward shaping
  hparams = transformer.transformer_big()
  hparams.shared_embedding_and_softmax_weights = 0
  hparams.layer_prepostprocess_dropout = 0.05
  hparams.learning_rate = 0.1
  hparams.rl = True
  hparams.delta_reward = True  # reward shaping
  return hparams
def transformer_textclass_big():
  hparams = transformer.transformer_big()
  hparams.layer_prepostprocess_dropout = 0.1
  hparams.learning_rate_warmup_steps = 50
  hparams.learning_rate_constant = 6.25e-6
  hparams.learning_rate_schedule = ("linear_warmup*constant*linear_decay")
  # Set train steps to learning_rate_decay_steps or less.
  hparams.learning_rate_decay_steps = 20000
  return hparams
def zhen_wmt17_transformer_rl_total_setting():
  # beam search + terminal reward
  hparams = transformer.transformer_big()
  hparams.shared_embedding_and_softmax_weights = 0
  hparams.layer_prepostprocess_dropout = 0.05
  hparams.learning_rate = 0.1
  hparams.rl = True
  hparams.delta_reward = False  # terminal reward
  return hparams
def eten_transformer_rl_total_setting_random():
  # multinomial sampling + terminal reward
  hparams = transformer.transformer_big()
  hparams.shared_embedding_and_softmax_weights = 0
  hparams.layer_prepostprocess_dropout = 0.05
  hparams.learning_rate = 0.1
  hparams.sampling_method = "random"  # multinomial sampling
  hparams.rl = True
  hparams.delta_reward = False  # terminal reward
  return hparams
def transformer_revnet_base():
  """Base hparams for TransformerRevnet."""
  hparams = transformer.transformer_big()

  # Use settings from transformer_n_da.
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"
  hparams.learning_rate = 0.4

  return hparams
def zhen_wmt17_transformer_rl_delta_setting_random():
  # multinomial sampling + reward shaping
  hparams = transformer.transformer_big()
  hparams.shared_embedding_and_softmax_weights = 0
  hparams.layer_prepostprocess_dropout = 0.05
  hparams.learning_rate = 0.1
  hparams.sampling_method = "random"  # multinomial sampling
  hparams.rl = True
  hparams.delta_reward = True  # reward shaping
  return hparams
def zhen_wmt17_transformer_rl_delta_setting_random_baseline():
  hparams = transformer.transformer_big()
  hparams.shared_embedding_and_softmax_weights = 0
  hparams.layer_prepostprocess_dropout = 0.05
  hparams.learning_rate = 0.1
  hparams.sampling_method = "random"
  hparams.baseline_loss_weight = 1.0
  hparams.training_loss_weight = 0.0
  hparams.rl = True
  hparams.delta_reward = True
  return hparams
def eten_transformer_rl_delta_setting_random_mrt():
  # multinomial sampling + reward shaping + MRT
  hparams = transformer.transformer_big()
  hparams.shared_embedding_and_softmax_weights = 0
  hparams.layer_prepostprocess_dropout = 0.05
  hparams.learning_rate = 0.1
  hparams.sampling_method = "random"  # multinomial sampling
  hparams.mrt_samples = 50  # number of MRT candidate samples
  hparams.rl = True
  hparams.delta_reward = True  # reward shaping
  return hparams
def eten_transformer_rl_delta_setting_random_mle():
  hparams = transformer.transformer_big()
  hparams.shared_embedding_and_softmax_weights = 0
  hparams.layer_prepostprocess_dropout = 0.05
  hparams.learning_rate = 0.1
  hparams.sampling_method = "random"
  hparams.combine_mle = True
  hparams.mle_training_loss_weight = 0.3
  hparams.training_loss_weight = 0.7
  hparams.rl = True
  hparams.delta_reward = True
  return hparams
def transformer_big_gec():
  hparams = transformer_big()
  hparams.max_length = 150
  hparams.batch_size = 2048
  hparams.learning_rate = 0.0002
  hparams.learning_rate_warmup_steps = 8000
  hparams.layer_prepostprocess_dropout = 0.3
  hparams.attention_dropout = 0.1
  hparams.relu_dropout = 0.1
  hparams.label_smoothing = 0.1
  hparams.learning_rate_schedule = (
      "constant*linear_warmup*rsqrt_decay*rsqrt_hidden_size")
  hparams.shared_embedding_and_softmax_weights = True
  return hparams
def sparse_transformer_imagenet64x64():
  """HParams for training image_imagenet64_gen_flat_rev."""
  hparams = transformer.transformer_big()
  hparams.num_heads = 8
  hparams.max_length = 64 * 64 * 3
  # Batch size refers to examples (not tokens).
  hparams.batch_size = 1
  hparams.shared_embedding_and_softmax_weights = False
  hparams.num_hidden_layers = 3
  hparams.attention_dropout = 0.1
  hparams.layer_prepostprocess_dropout = 0.2
  hparams.relu_dropout = 0.1
  hparams.label_smoothing = 0.0

  # Memory usage & TPU hparams.
  # Adafactor uses less memory than Adam. Switch to Adafactor with
  # its recommended learning rate scheme.
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "rsqrt_decay"
  hparams.learning_rate_warmup_steps = 10000
  # Using noise broadcast in the dropout layers saves memory during training.
  hparams.attention_dropout_broadcast_dims = "0,1"  # batch, heads
  hparams.relu_dropout_broadcast_dims = "1"  # length
  hparams.layer_prepostprocess_dropout_broadcast_dims = "1"  # length
  # Avoid an expensive concat on TPU.
  hparams.symbol_modality_num_shards = 1

  hparams.add_hparam("sparse_attention_mode", "masked")
  hparams.add_hparam("sparse_attention_type", "band_and_decay")
  hparams.add_hparam("band_size", 256)
  hparams.add_hparam("sparsity", 0.95)
  return hparams
def encode(self, indexed_batch, mask):
  """Take in a batch of encoded sentences formed of word indices, return a
  batch of sentence vectors."""
  word_embeddings = tf.get_variable(
      name='word_embeddings',
      shape=[FLAGS.vocab_size, FLAGS.wembedding_size],
      initializer=tf.random_uniform_initializer(),
      trainable=True)

  # Mask the padded word embeddings.
  words = tf.nn.embedding_lookup(word_embeddings, indexed_batch)
  words = tf.multiply(
      words, tf.cast(tf.expand_dims(mask, -1), dtype=tf.float32))
  words = tf.expand_dims(words, 1)

  transformer_params = transformer.transformer_big()
  transformer_params.num_heads = 5
  transformer_params.hidden_size = FLAGS.wembedding_size

  # Transformer encoder outputs shape [BatchSize, MaxLength, HiddenSize].
  tfmr = transformer.Transformer(transformer_params,
                                 mode=tf.estimator.ModeKeys.TRAIN)
  target_space_id = tf.constant(1, dtype=tf.int32)
  encoder_output, _ = tfmr.encode(words, target_space_id, transformer_params)

  # Use a linear transform to map onto shape [BatchSize, SentenceEmbeddingSize].
  encoder_output = tf.reshape(encoder_output, [FLAGS.batch_size, -1])
  matrix_shape = [
      FLAGS.wembedding_size * FLAGS.max_sentence_length, FLAGS.embedding_size
  ]
  matrix = tf.random_normal(matrix_shape, dtype=tf.float32, name='linear_layer')
  linear_transform = tf.Variable(matrix)
  sentence_embeddings = tf.matmul(encoder_output, linear_transform)
  return sentence_embeddings
def transformer_big_large_batch128():
  hparams = transformer_big()
  hparams.batch_size = 1024
  hparams.optimizer_multistep_accumulate_steps = 128
  hparams.optimizer = "MultistepAdam"
  return hparams
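A quick sanity check on the accumulation setting above, assuming batch_size is counted in tokens (the tensor2tensor default for text problems); this arithmetic is illustrative, not from the original code:

# MultistepAdam applies one parameter update every `accumulate_steps`
# forward/backward passes, so the effective batch is their product.
tokens_per_step = 1024        # hparams.batch_size
accumulate_steps = 128        # hparams.optimizer_multistep_accumulate_steps
effective_tokens_per_update = tokens_per_step * accumulate_steps  # 131072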
def universal_transformer_base1():
  hparams = transformer.transformer_big()
  hparams = update_hparams_for_universal_transformer(hparams)
  return hparams
def universal_transformer_big1():
  hparams = transformer.transformer_big()
  hparams = update_hparams_for_universal_transformer(hparams)
  hparams.hidden_size = 2048
  hparams.filter_size = 8192
  return hparams
def transformer_big_bs1():
  hparams = transformer.transformer_big()
  hparams.add_hparam("block_size", 1)
  return hparams
def universal_transformer_big():
  hparams = transformer.transformer_big()
  hparams = update_hparams_for_universal_transformer(hparams)
  hparams.hidden_size = 2048
  hparams.filter_size = 8192
  return hparams
def universal_transformer_base():
  hparams = transformer.transformer_big()
  hparams = update_hparams_for_universal_transformer(hparams)
  return hparams
def r_transformer_big():
  hparams = transformer.transformer_big()
  hparams = update_hparams_for_r_transformer(hparams)
  return hparams
def evolved_transformer_big():
  """Big parameters for Evolved Transformer model on WMT."""
  return add_evolved_transformer_hparams(transformer.transformer_big())
def transformer_big_large_batch2():
  hparams = transformer_big()
  # hparams.batch_size = 2048
  hparams.optimizer_multistep_accumulate_steps = 2
  hparams.optimizer = "MultistepAdam"
  return hparams
def universal_transformer_base_tpu():
  hparams = transformer.transformer_big()
  hparams = update_hparams_for_universal_transformer(hparams)
  transformer.update_hparams_for_tpu(hparams)
  hparams.add_step_timing_signal = False
  return hparams
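All of the functions above return a tensor2tensor HParams object; in their source repositories they are normally exposed through the hparams registry so a configuration can be selected by name at training time. A minimal sketch of that pattern (the function name below is hypothetical, for illustration only):

from tensor2tensor.models import transformer
from tensor2tensor.utils import registry


@registry.register_hparams
def transformer_big_example():
  """Hypothetical transformer_big variant registered under its own name."""
  hparams = transformer.transformer_big()
  hparams.num_hidden_layers = 8
  return hparams

# Once registered, the set can be chosen on the command line, e.g.:
#   t2t-trainer --model=transformer --hparams_set=transformer_big_example ...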