def evolved_transformer_base_tpu():
  """Base parameters for Evolved Transformer model on TPU."""
  hparams = add_evolved_transformer_hparams(transformer.transformer_tpu())
  hparams.learning_rate_constant = 1 / hparams.learning_rate_warmup_steps**0.5
  hparams.learning_rate_schedule = "constant*single_cycle_cos_decay"
  return hparams
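# Illustrative sketch (not library code): rough shape of the
# "constant*single_cycle_cos_decay" schedule set above, assuming the T2T
# convention that '*'-joined schedule factors are multiplied together. The
# exact decay formula and `train_steps` here are hypothetical stand-ins.
import math


def _sketch_learning_rate(step, constant, train_steps):
  """Approximate LR: constant scaled by one cosine cycle decaying 1 -> 0."""
  progress = min(float(step) / train_steps, 1.0)
  return constant * 0.5 * (1.0 + math.cos(math.pi * progress))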
def afx_small():
  """Small transformer model with small batch size for fast step times."""
  hparams = transformer.transformer_tpu()
  hparams.filter_size = 1024
  hparams.num_heads = 4
  hparams.num_hidden_layers = 3
  hparams.batch_size = 512
  return hparams
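# Usage sketch (hypothetical variant, not library code): with the usual
# @registry.register_hparams decorator these sets are selected by name (e.g.
# --hparams_set=afx_small for t2t-trainer), and variants are built by calling
# one hparams function and tweaking the result, as `afx_small` itself does:
def _afx_small_tiny_batch():
  hparams = afx_small()
  hparams.batch_size = 256  # halve the batch for even faster step times
  return hparams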
def lmx_base():
  """Transformer on languagemodel_lm1b32k_packed. 50M Params."""
  hparams = transformer.transformer_tpu()
  # sharing is counterproductive when underparameterized
  hparams.shared_embedding_and_softmax_weights = False
  # we judge by log-ppl, so label smoothing hurts.
  hparams.label_smoothing = 0.0
  # This makes the batch size on GPU the same as on TPU for a packed problem
  # with sequence length 256.
  # TODO(noam): fix the mess that is the data reading pipeline.
  hparams.max_length = 256
  # larger batch since we only have a decoder
  hparams.batch_size = 4096
  # save some memory so we can have a larger model
  hparams.activation_dtype = "bfloat16"
  return hparams
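# Illustrative note (assumption, not library code): for packed text problems
# T2T batch sizes count subword tokens, so the settings above pack roughly
# batch_size / max_length sequences into each batch:
_SKETCH_SEQUENCES_PER_BATCH = 4096 // 256  # = 16, under that assumption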
def neural_assistant_base():
  """HParams for a base neural_assistant model."""
  hparams = transformer.transformer_tpu()
  hparams.add_hparam("pos_weight", 1.0)  # weight for positive triples
  # dot_product or bilinear
  hparams.add_hparam("similarity_fuction", "bilinear")
  hparams.add_hparam("pool_technique", "average")  # average, max pool, or last
  hparams.add_hparam("last_k", 1)  # number of last indices for averaging
  hparams.add_hparam("max_triple_length", 30)  # max length of each triple
  # max number of triples during training
  hparams.add_hparam("train_triple_num", 5000)
  # if False, it's a plain transformer model
  hparams.add_hparam("attend_kb", True)
  hparams.add_hparam("kb_loss_weight", 0.0)  # weight for distant supervision
  hparams.add_hparam("test_triple_num", 28483)  # max number of KB triples
  hparams.add_hparam("margin", 0.0)  # margin for the KB max-margin loss
  # number of negative samples for adversarial training
  hparams.add_hparam("num_negative_samples", 1)
  # weight for the KB selection loss, combined with the language-model loss
  hparams.add_hparam("kb_train_weight", 0.0)
  return hparams
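# Usage sketch (hypothetical variant, not library code): per the attend_kb
# comment above, turning KB attention off reduces the model to a plain
# transformer, which makes a convenient ablation baseline:
def _neural_assistant_no_kb():
  hparams = neural_assistant_base()
  hparams.attend_kb = False  # ignore the KB; behave as a plain transformer
  return hparams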