def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = True
  # According to noam, ("n", "da") seems better for harder-to-learn models
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"
  # Hparams used by the transformer_prepare_decoder() function
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("causal_decoder_self_attention", True)

  hparams = common_attention.add_standard_attention_hparams(hparams)

  # Decoder layers type. If set, the num_decoder_layers parameter will be
  # ignored and the number of decoder layers will be deduced from the string.
  # See top file comment for an example of usage.
  hparams.add_hparam("layer_types", "")
  # Default attention type (ex: a, loc, red,...) and feed-forward type (ex: fc,
  # sep, moe,...)
  hparams.add_hparam("default_att", "a")
  hparams.add_hparam("default_ff", "fc")
  return hparams
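
# Example usage (a hedged sketch, not part of the original file): deriving a
# variant hparams set that makes mixture-of-experts the default feed-forward
# layer. The override keys ("default_ff") are the ones defined above; the
# function name transformer_moe_base_moe_ff is hypothetical. The exact syntax
# of the "layer_types" string is described in the file's top comment and is
# not reproduced here.
def transformer_moe_base_moe_ff():
  """Base hparams with MoE feed-forward layers by default (illustrative)."""
  hparams = transformer_moe_base()
  # Every layer now uses a MoE feed-forward block unless "layer_types"
  # explicitly overrides it for individual layers.
  hparams.default_ff = "moe"
  return hparams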