Example #1
def transformer_base_v2_fake_replicas():
    """Base v2 with warmup steps x8 and learning rate divided by sqrt(8)."""
    hparams = transformer_base_v2()
    hparams.learning_rate_warmup_steps *= 8
    hparams.learning_rate /= math.sqrt(8.0)
    hparams.max_length = 150
    hparams.batch_size = 8192
    return hparams
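These examples are extracted from larger hparams modules, so the imports and the registry decorator that normally surround them are omitted. A minimal sketch of the assumed context, using the standard tensor2tensor module paths (the variant name below is hypothetical, not one of the examples):

import math

from tensor2tensor.models.transformer import transformer_base_v2
from tensor2tensor.utils import registry


@registry.register_hparams  # makes the set selectable via t2t-trainer --hparams_set
def my_base_v2_variant():  # hypothetical name, for illustration only
    hparams = transformer_base_v2()
    hparams.batch_size = 8192
    return hparams

Once registered like this, any of the example functions can be selected by passing its name to t2t-trainer's --hparams_set flag.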
Example #2
def transformer_base_gradout():
    """Transformer with GradOut loss.
  
  Requires the T2T fork from https://github.com/fstahlberg/tensor2tensor
  """
    hparams = transformer_base_v2()
    hparams.target_modality = "symbol:gradout"
    return hparams
Example #3
def transformer_base_gibbs_large_batch8():
    """Replication of Vaswani et al., 2017 on a single 12GB gpu."""
    hparams = transformer_base_v2()
    hparams.optimizer_multistep_accumulate_steps = 8
    hparams.add_hparam("gibbs_self_attention_independence_length", 1)
    hparams.optimizer = "MultistepAdam"
    hparams.num_decoder_layers = 1
    hparams.label_smoothing = 0.0
    return hparams
Example #4
def transformer_bidirectional_large_batch8():
    """Replication of Vaswani et al., 2017 on a single 12GB gpu."""
    hparams = transformer_base_v2()
    hparams.optimizer_multistep_accumulate_steps = 8
    hparams.optimizer = "MultistepAdam"
    hparams.num_decoder_layers = 4
    hparams.label_smoothing = 0.0
    hparams.add_hparam("num_bidirectional_decoder_joint_layers", 2)
    return hparams
Example #5
def transformer_base_gradout_large_batch():
    """Transformer with GradOut loss.
  
  Requires the T2T fork from https://github.com/fstahlberg/tensor2tensor
  """
    hparams = transformer_base_v2()
    hparams.target_modality = "symbol:gradout"
    hparams.optimizer_multistep_accumulate_steps = 8
    hparams.optimizer = "MultistepAdam"
    return hparams
Example #6
def afx_adam():
    """Old version - Adam."""
    hparams = transformer.transformer_base_v2()
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.999
    hparams.symbol_modality_num_shards = 1
    hparams.batch_size = 2048
    hparams.optimizer = "Adam"
    hparams.learning_rate_schedule = (
        "constant*rsqrt_decay*linear_warmup*rsqrt_hidden_size")
    hparams.learning_rate_constant = 2.0
    return hparams
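The learning_rate_schedule string in afx_adam is a "*"-separated product of named factors; T2T evaluates each factor at the current training step and multiplies them together. A rough, framework-free sketch of that product, with the factor formulas paraphrased from tensor2tensor/utils/learning_rate.py (the default argument values are illustrative and may not match what transformer_base_v2 actually sets):

import math


def approx_composed_lr(step, constant=2.0, warmup_steps=8000, hidden_size=512):
    """Approximates "constant*rsqrt_decay*linear_warmup*rsqrt_hidden_size".

    Illustration only; the authoritative factor definitions live in
    tensor2tensor/utils/learning_rate.py and can change between versions.
    """
    linear_warmup = min(1.0, step / float(warmup_steps))
    rsqrt_decay = 1.0 / math.sqrt(max(step, warmup_steps))
    rsqrt_hidden_size = hidden_size ** -0.5
    return constant * linear_warmup * rsqrt_decay * rsqrt_hidden_size

The warmup factor ramps linearly to 1 over warmup_steps, after which the rate decays as 1/sqrt(step), with learning_rate_constant acting as a global multiplier; this is essentially the schedule from Vaswani et al., 2017.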
Example #8
def transformer_chat():
    """Hparams for a chat Transformer; inline comments preserve alternative values."""
    hparams = transformer.transformer_base_v2()
    hparams.num_hidden_layers = 6  # 2
    hparams.hidden_size = 512  # 128
    hparams.filter_size = 2048  # 512
    hparams.num_heads = 8  # 4
    hparams.attention_dropout = 0.6
    hparams.layer_prepostprocess_dropout = 0.1  # 0.6
    hparams.learning_rate = 0.05
    # hparams.learning_rate_constant = 0.05
    hparams.learning_rate_schedule = "legacy"
    return hparams
Example #9
def transformer_sim_net_tiny():
    """Tiny variant of transformer_base_v2 (4 layers, hidden size 256)."""
    hparams = transformer.transformer_base_v2()
    hparams.optimizer_adam_beta2 = 0.997
    hparams.optimizer = "Adam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 4000
    hparams.learning_rate_schedule = "linear_warmup*legacy"
    hparams.num_hidden_layers = 4
    hparams.hidden_size = 256
    hparams.filter_size = 512
    hparams.num_heads = 4
    hparams.batch_size = 4096
    hparams.add_hparam("data_ratio", 4)
    return hparams
Example #10
def transformer_base_2ens_simplefusion():
    """Transformer with Simplefusion.
  
  Requires the T2T fork from https://github.com/fstahlberg/tensor2tensor
  """
    hparams = transformer_base_v2()
    hparams.target_modality = "symbol:simplefusion"
    hparams.input_modalities = "inputs:symbol:simplefusion"
    hparams.add_hparam(
        "ensemble_fusion_mode",
        "share_embeddings")  # prenorm, postnorm, share_embeddings
    hparams.add_hparam("ensemble_hidden_sizes", [512, 512])
    hparams.hidden_size = 512
    hparams.add_hparam("ensemble_enabled", [True, True])
    hparams.add_hparam("ensemble_trainable", [True, True])
    hparams.add_hparam("ensemble_is_lm", [False, False])
    hparams.add_hparam("ensemble_models", ["transformer", "transformer"])
    return hparams
Example #11
def transformer_base_v2_small_lr():
    """Base v2 with the learning rate divided by sqrt(8)."""
    hparams = transformer_base_v2()
    hparams.learning_rate /= math.sqrt(8.0)
    hparams.max_length = 150
    hparams.batch_size = 8192
    return hparams
Example #12
def transformer_base_v2_large_batch32():
    """Base v2 with gradients accumulated over 32 steps via MultistepAdam."""
    hparams = transformer_base_v2()
    hparams.optimizer_multistep_accumulate_steps = 32
    hparams.optimizer = "MultistepAdam"
    return hparams
Example #13
def transformer_base_v2_large_batch8():
    """Replication of Vaswani et al., 2017 on a single 12GB gpu."""
    hparams = transformer_base_v2()
    hparams.optimizer_multistep_accumulate_steps = 8
    hparams.optimizer = "MultistepAdam"
    return hparams
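Note that the large-batch variants above leave batch_size untouched; MultistepAdam accumulates gradients over optimizer_multistep_accumulate_steps mini-batches before applying a single update, so the effective batch grows by that factor. A back-of-the-envelope sketch, assuming batch_size is counted in tokens per GPU and that transformer_base_v2 keeps the usual 4096-token batch (both assumptions, not stated in the snippets):

def effective_tokens_per_update(batch_size, accumulate_steps, num_gpus=1):
    """Tokens contributing to one optimizer update with gradient accumulation."""
    return batch_size * accumulate_steps * num_gpus


# transformer_base_v2_large_batch8 on a single 12GB GPU:
print(effective_tokens_per_update(4096, 8))   # 32768, roughly an 8-GPU batch
# transformer_base_v2_large_batch32:
print(effective_tokens_per_update(4096, 32))  # 131072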