def transformer_sketch_ranged(rhp):
  """Hyperparameter search space for vizier around transformer_sketch.

  Args:
    rhp: RangedHParams object to populate with search dimensions.
  """
  # Seed the range with the fixed transformer_sketch settings.
  hparams = transformer_sketch()
  common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp)

  log_scale = rhp.LOG_SCALE  # shared scale for log-distributed dimensions
  rhp.set_categorical(
      "ffn_layer", ["conv_hidden_relu_with_sepconv", "conv_hidden_relu"])
  rhp.set_discrete("batch_size", [1024, 2048, 4096])
  rhp.set_discrete("num_hidden_layers", [2, 3, 4, 5, 6])
  rhp.set_discrete(
      "hidden_size", [32, 64, 128, 256, 512, 1024], scale=log_scale)
  rhp.set_discrete("kernel_height", [1, 3, 5, 7])
  rhp.set_discrete("kernel_width", [1, 3, 5, 7])
  rhp.set_discrete("compress_steps", [0, 1, 2])
  rhp.set_float("dropout", 0.0, 0.5)
  rhp.set_float("weight_decay", 1e-4, 0.03, scale=log_scale)
  rhp.set_float("label_smoothing", 0.0, 0.2)
  rhp.set_float("clip_grad_norm", 0.01, 8.0, scale=log_scale)
  rhp.set_float("learning_rate", 0.1, 1.0, scale=log_scale)
  rhp.set_categorical(
      "initializer", ["uniform", "orthogonal", "uniform_unit_scaling"])
  rhp.set_float("initializer_gain", 0.5, 3.5)
  rhp.set_categorical(
      "learning_rate_decay_scheme", ["none", "sqrt", "noam", "exp10k"])
  rhp.set_float("optimizer_adam_epsilon", 1e-7, 1e-2, scale=log_scale)
  rhp.set_float("optimizer_adam_beta1", 0.8, 0.9)
  rhp.set_float("optimizer_adam_beta2", 0.995, 0.999)
  rhp.set_categorical(
      "optimizer",
      ["Adam", "Adagrad", "Momentum", "RMSProp", "SGD", "YellowFin"])
def slicenet_range1(ranged_hparams):
  """Narrow hyperparameter search space around slicenet_params1.

  Args:
    ranged_hparams: RangedHParams object to populate with search dimensions.
  """
  # Start from the fixed slicenet_params1 configuration, then open
  # intervals on a handful of optimization knobs.
  common_hparams.fill_ranged_hparams_from_hparams(
      slicenet_params1(), ranged_hparams)
  ranged_hparams.set_float(
      "clip_grad_norm", 1.0, 10.0, scale=ranged_hparams.LOG_SCALE)
  ranged_hparams.set_float(
      "learning_rate", 0.02, 1.0, scale=ranged_hparams.LOG_SCALE)
  ranged_hparams.set_float("optimizer_adam_beta2", 0.995, 0.998)
  ranged_hparams.set_float("weight_decay", 1.0, 5.0)
def transformer_tpu_range(rhp):
  """Small search space around the transformer_tpu defaults.

  Args:
    rhp: RangedHParams object to populate with search dimensions.
  """
  common_hparams.fill_ranged_hparams_from_hparams(transformer_tpu(), rhp)
  # After starting from base, set intervals for some parameters.
  rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE)
  rhp.set_discrete(
      "learning_rate_warmup_steps", [1000, 2000, 4000, 8000, 16000])
  # Remaining dimensions are plain (linear-scale) float intervals.
  for name, low, high in [
      ("initializer_gain", 0.5, 2.0),
      ("optimizer_adam_beta1", 0.85, 0.95),
      ("optimizer_adam_beta2", 0.97, 0.99),
      ("weight_decay", 0.0, 2.0),
  ]:
    rhp.set_float(name, low, high)
def transformer_tpu_batch_range(rhp):
  """Search over per-shard TPU batch size only, starting from transformer_tpu.

  NOTE(review): a second definition of `transformer_tpu_batch_range` appears
  later in this file and shadows this one at import time — confirm which
  version is intended to be registered.

  Args:
    rhp: RangedHParams object to populate with search dimensions.
  """
  common_hparams.fill_ranged_hparams_from_hparams(transformer_tpu(), rhp)
  rhp.set_discrete("tpu_batch_size_per_shard", [1, 2, 3, 4])
def transformer_tiny_tpu_range(rhp):
  """Two-knob search space around the transformer_tiny_tpu defaults.

  Args:
    rhp: RangedHParams object to populate with search dimensions.
  """
  base = transformer_tiny_tpu()
  common_hparams.fill_ranged_hparams_from_hparams(base, rhp)
  rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE)
  rhp.set_float("weight_decay", 0.0, 2.0)
def transformer_tpu_batch_range(rhp):
  """Search over TPU per-shard batch size and maximum sequence length.

  NOTE(review): this redefines `transformer_tpu_batch_range` from earlier in
  the file, shadowing that version at import time — confirm the duplication
  is intended.

  Args:
    rhp: RangedHParams object to populate with search dimensions.
  """
  common_hparams.fill_ranged_hparams_from_hparams(transformer_tpu(), rhp)
  # Batch sizes: 1, then the even sizes 2 through 14.
  rhp.set_discrete("tpu_batch_size_per_shard", [1] + list(range(2, 16, 2)))
  # Sequence lengths from 128 to 400 in steps of 16.
  rhp.set_discrete("max_length", list(range(128, 416, 16)))