Example #1
def bluenet_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 4096
    hparams.hidden_size = 256
    hparams.dropout = 0.2
    hparams.symbol_dropout = 0.5
    hparams.label_smoothing = 0.1
    hparams.clip_grad_norm = 2.0
    hparams.num_hidden_layers = 8
    hparams.kernel_height = 3
    hparams.kernel_width = 3
    hparams.learning_rate_decay_scheme = "exp10k"
    hparams.learning_rate = 0.05
    hparams.learning_rate_warmup_steps = 3000
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 3.0
    hparams.num_sampled_classes = 0
    hparams.sampling_method = "argmax"
    hparams.optimizer_adam_epsilon = 1e-6
    hparams.optimizer_adam_beta1 = 0.85
    hparams.optimizer_adam_beta2 = 0.997
    hparams.add_hparam("anneal_until", 40000)
    hparams.add_hparam("batch_deviation_loss_factor", 5.0)
    return hparams
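In Tensor2Tensor, an hparams function like the one above is normally exposed through the registry so it can be selected by name, and variants are defined by calling the base function and overriding a few fields. The sketch below illustrates that pattern; the bluenet_big name and its overridden values are hypothetical, and the import path may differ between library versions.

from tensor2tensor.utils import registry


@registry.register_hparams
def bluenet_big():
    """Hypothetical variant: start from the base set and override a few fields."""
    hparams = bluenet_base()
    hparams.hidden_size = 512       # wider than the base 256
    hparams.num_hidden_layers = 12  # deeper than the base 8
    return hparams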
Example #2
def attention_lm_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.hidden_size = 1024
    hparams.batch_size = 8192
    hparams.max_length = 256
    hparams.dropout = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 2000
    hparams.initializer_gain = 1.0
    hparams.num_hidden_layers = 6
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.label_smoothing = 0.0
    hparams.shared_embedding_and_softmax_weights = int(False)

    hparams.add_hparam("filter_size", 4096)  # Add new ones like this.
    # attention-related flags
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("relu_dropout", 0.0)
    hparams.add_hparam("pos", "timing")  # timing, none
    hparams.add_hparam("encoder_full_attention", int(False))
    return hparams
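As the comment next to filter_size indicates, fields that already exist on the object returned by basic_params1() are overridden by plain assignment, while genuinely new fields must be registered with add_hparam. A minimal sketch of the distinction, assuming the tf.contrib.training.HParams behaviour of raising ValueError when add_hparam is given a name that already exists:

hparams = attention_lm_base()

# Existing fields (including ones added earlier with add_hparam) are
# overridden by plain assignment.
hparams.num_heads = 16
hparams.learning_rate = 0.05

# New fields must go through add_hparam; re-adding an existing name fails.
hparams.add_hparam("my_new_flag", int(True))
try:
    hparams.add_hparam("num_heads", 4)
except ValueError:
    pass  # "num_heads" was already added in attention_lm_base().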
Example #3
def shakeshake_cifar10():
  """Parameters for CIFAR-10."""
  hparams = common_hparams.basic_params1()
  # This leads to an effective batch size of 128 when the number of GPUs is 1.
  hparams.batch_size = 4096 * 8
  hparams.hidden_size = 16
  hparams.dropout = 0
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 26
  hparams.kernel_height = -1  # Unused
  hparams.kernel_width = -1  # Unused
  hparams.learning_rate_decay_scheme = "cosine"
  # Model should be run for 700000 steps with batch size 128 (~1800 epochs)
  hparams.learning_rate_cosine_cycle_steps = 700000
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  # TODO(rshin): Adjust so that effective value becomes ~1e-4
  hparams.weight_decay = 3.0
  hparams.optimizer = "Momentum"
  hparams.optimizer_momentum_momentum = 0.9
  hparams.add_hparam("base_filters", 16)
  hparams.add_hparam("shakeshake_type", "batch")
  return hparams
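The cosine decay scheme selected above anneals the learning rate over learning_rate_cosine_cycle_steps steps. As a rough illustration of what those two hparams control, here is one common form of cosine decay; this is not necessarily the library's exact schedule.

import math

def cosine_lr(step, base_lr=0.2, cycle_steps=700000):
    # Decays from base_lr to 0 over one cycle, following a half cosine.
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * (step % cycle_steps) / cycle_steps))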
Example #4
def testNeuralGPU(self):
    hparams = common_hparams.basic_params1()
    batch_size = 3
    input_length = 5
    target_length = input_length
    input_vocab_size = 9
    target_vocab_size = 11
    p_hparams = problem_hparams.test_problem_hparams(
        input_vocab_size, target_vocab_size)
    # Random integer inputs and targets in [0, vocab_size - 1].
    inputs = -1 + np.random.random_integers(
        input_vocab_size, size=(batch_size, input_length, 1, 1))
    targets = -1 + np.random.random_integers(
        target_vocab_size, size=(batch_size, target_length, 1, 1))
    with self.test_session() as session:
        features = {
            "inputs": tf.constant(inputs, dtype=tf.int32),
            "targets": tf.constant(targets, dtype=tf.int32)
        }
        model = neural_gpu.NeuralGPU(hparams, tf.estimator.ModeKeys.TRAIN,
                                     p_hparams)
        sharded_logits, _ = model.model_fn(features)
        logits = tf.concat(sharded_logits, 0)
        session.run(tf.global_variables_initializer())
        res = session.run(logits)
    self.assertEqual(res.shape,
                     (batch_size, target_length, 1, 1, target_vocab_size))
Example #5
def aligned_base():
    """Set of hyperparameters.

  languagemodel_wiki_scramble1k50, 1gpu, 7k steps (10min): log(ppl)_eval = 2.60
  12.0 steps/sec on P100
  8gpu (8x batch), 7k steps: log(ppl)_eval = 2.00

  Returns:
    a hparams object
  """
    hparams = common_hparams.basic_params1()
    hparams.hidden_size = 512
    hparams.batch_size = 5000
    hparams.max_length = 0
    hparams.min_length_bucket = 1024
    hparams.dropout = 0.0
    hparams.layer_prepostprocess_dropout = 0.0
    hparams.label_smoothing = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 2000
    hparams.initializer_gain = 1.0
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.shared_embedding_and_softmax_weights = int(True)
    hparams.add_hparam("ffn_hidden_sizes", "2048")  # Add new ones like this.
    hparams.moe_num_experts = 32
    hparams.layer_preprocess_sequence = "n"
    hparams.layer_postprocess_sequence = "da"
    hparams.add_hparam("layers", "timing," + "conv,att,ffn," * 2)

    # attention-related flags
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("pos", "timing")  # timing, none
    # moe params. local attention moe.
    hparams.add_hparam("attention_local", int(False))
    hparams.add_hparam("attention_moe_k", 2)
    hparams.add_hparam("attention_num_experts", 16)
    hparams.add_hparam("attention_split_batch", int(False))
    # Key, query and value dimensions for the attention
    hparams.add_hparam("attention_kq_size", 128)
    hparams.add_hparam("attention_v_size", 256)
    # Loss coef for load balancing
    hparams.add_hparam("attention_load_balance", 2e-2)
    hparams.add_hparam("diet_experts", int(False))
    hparams.add_hparam("memory_efficient_ffn", int(False))
    hparams.add_hparam("local_attention_window", 128)
    hparams.add_hparam("attention_num_groups", 8)
    hparams.add_hparam("attention_image_summary", int(True))
    return hparams
Example #6
def lstm_seq2seq():
    """hparams for LSTM."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 1024
    hparams.hidden_size = 128
    hparams.num_hidden_layers = 2
    hparams.initializer = "uniform_unit_scaling"
    return hparams
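Small base sets like this one are often adjusted without writing a new function: HParams.parse accepts a comma-separated name=value string, the same format used by the trainer's --hparams flag, and overrides the corresponding fields. A brief sketch:

hparams = lstm_seq2seq()
hparams.parse("hidden_size=256,num_hidden_layers=3,batch_size=512")
assert hparams.hidden_size == 256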
Example #7
def transformer_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.norm_type = "layer"
    hparams.hidden_size = 512
    hparams.batch_size = 4096
    hparams.max_length = 256
    hparams.dropout = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 4000
    hparams.initializer_gain = 1.0
    hparams.num_hidden_layers = 6
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.num_sampled_classes = 0
    hparams.label_smoothing = 0.1
    hparams.shared_embedding_and_softmax_weights = int(True)

    # Add new ones like this.
    hparams.add_hparam("filter_size", 2048)
    # Layer-related flags. If zero, these fall back on hparams.num_hidden_layers.
    hparams.add_hparam("num_encoder_layers", 0)
    hparams.add_hparam("num_decoder_layers", 0)
    # Attention-related flags.
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    hparams.add_hparam("ffn_layer", "conv_hidden_relu")
    hparams.add_hparam("parameter_attention_key_channels", 0)
    hparams.add_hparam("parameter_attention_value_channels", 0)
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("relu_dropout", 0.0)
    hparams.add_hparam("pos", "timing")  # timing, none
    hparams.add_hparam("nbr_decoder_problems", 1)
    hparams.add_hparam("proximity_bias", int(False))
    hparams.add_hparam("use_pad_remover", int(True))
    hparams.add_hparam("self_attention_type", "dot_product")
    hparams.add_hparam("max_relative_position", 0)
    return hparams
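transformer_base is the configuration that most other Transformer hparams sets build on; larger or smaller variants typically call it and adjust a handful of fields. A hypothetical sketch of such a variant (the name and values are illustrative, not definitions from the library):

def transformer_small_example():
    """Hypothetical smaller Transformer derived from transformer_base."""
    hparams = transformer_base()
    hparams.num_hidden_layers = 2
    hparams.hidden_size = 256
    hparams.filter_size = 1024
    hparams.num_heads = 4
    return hparams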
Example #8
def transformer_moe_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.norm_type = "layer"
    hparams.hidden_size = 512
    hparams.batch_size = 4096
    hparams.max_length = 2001
    hparams.max_input_seq_length = 2000
    hparams.max_target_seq_length = 2000
    hparams.dropout = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 4000
    hparams.initializer_gain = 1.0
    hparams.num_hidden_layers = 5
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.num_sampled_classes = 0
    hparams.label_smoothing = 0.0
    hparams.shared_embedding_and_softmax_weights = int(True)

    hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
    # attention-related flags
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    hparams.add_hparam("ffn_layer", "conv_hidden_relu")
    hparams.add_hparam("parameter_attention_key_channels", 0)
    hparams.add_hparam("parameter_attention_value_channels", 0)
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("relu_dropout", 0.0)
    hparams.add_hparam("pos", "timing")  # timing, none
    hparams.add_hparam("nbr_decoder_problems", 1)
    hparams.add_hparam("proximity_bias", int(False))
    # FLAGS RELATED TO MIXTURE-OF-EXPERTS
    # comma-separated list of layer numbers.
    # At each of these layers, we replace the ffn with a mixture of experts.
    hparams.add_hparam("moe_layers_encoder", "2")
    hparams.add_hparam("moe_layers_decoder", "2")
    return hparams
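moe_layers_encoder and moe_layers_decoder are stored as comma-separated strings of layer indices. A one-line sketch of how such a spec would typically be turned into integer indices (illustrative; not the library's own parsing code):

hparams = transformer_moe_base()
# "2" -> [2]; a spec like "1,3" would give [1, 3].
moe_layers = [int(i) for i in hparams.moe_layers_decoder.split(",") if i]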
Example #9
def multimodel_base():
    """Base parameters for MultiModel."""
    hparams = common_hparams.basic_params1()
    hparams.hidden_size = 512
    hparams.batch_size = 2048
    hparams.num_hidden_layers = 4
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 4000
    hparams.initializer_gain = 1.0
    hparams.dropout = 0.1
    hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
    hparams.add_hparam("large_kernel_size", 15)
    hparams.add_hparam("attention_dropout", 0.1)
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("moe_layers", "2")
    hparams.moe_num_experts = 30
    return hparams
Example #10
def slicenet_params1():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 1024
    hparams.hidden_size = 768
    hparams.dropout = 0.5
    hparams.symbol_dropout = 0.2
    hparams.label_smoothing = 0.1
    hparams.clip_grad_norm = 2.0
    hparams.num_hidden_layers = 4
    hparams.kernel_height = 3
    hparams.kernel_width = 1
    hparams.norm_type = "layer"
    hparams.learning_rate_decay_scheme = "exp50k"
    hparams.learning_rate = 0.05
    hparams.learning_rate_warmup_steps = 3000
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 3.0
    hparams.num_sampled_classes = 0
    hparams.sampling_method = "argmax"
    hparams.optimizer_adam_epsilon = 1e-6
    hparams.optimizer_adam_beta1 = 0.85
    hparams.optimizer_adam_beta2 = 0.997
    hparams.add_hparam("large_kernel_size",
                       15)  # New ones are added like this.
    hparams.add_hparam("separability", -2)
    # A dilation scheme, one of _DILATION_SCHEMES.
    hparams.add_hparam("dilation_scheme", "1.1.1.1")
    # A kernel scheme, one of _KERNEL_SCHEMES; overrides large_kernel_size.
    hparams.add_hparam("kernel_scheme", "3.7.15.31")
    hparams.add_hparam("audio_compression", 8)
    # attention-related flags
    hparams.add_hparam("attention_type", "simple")
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    hparams.add_hparam("sim_loss_mult", 0.0)  # Try 10.0 for experiments.
    hparams.add_hparam("attention_dropout", 0.2)
    hparams.shared_embedding_and_softmax_weights = int(True)
    return hparams
Example #11
def gene_expression_conv_base():
  """Hparams for GeneExpressionConv model."""
  hparams = common_hparams.basic_params1()

  batch_size = 10
  output_length = 2048
  inputs_per_output = 128
  chunk_size = 4
  input_length = output_length * inputs_per_output // chunk_size
  hparams.batch_size = input_length * batch_size

  hparams.dropout = 0.1
  hparams.add_hparam("num_conv_layers", 4)
  hparams.add_hparam("num_dconv_layers", 7)
  # The product of these pooling windows should match
  # input_length/target_length.
  hparams.add_hparam("pooling_windows", [2, 2, 2, 4])

  hparams.hidden_size = 256
  hparams.kernel_width = 20
  hparams.add_hparam("stride", 1)
  return hparams
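Here batch_size appears to be expressed in input positions rather than examples: the per-example input length is derived from the output length, the inputs-per-output ratio, and the chunk size, then multiplied by the number of examples per batch. The arithmetic from the function, spelled out:

output_length = 2048
inputs_per_output = 128
chunk_size = 4
input_length = output_length * inputs_per_output // chunk_size  # 65536
examples_per_batch = 10
batch_size = input_length * examples_per_batch                  # 655360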
Example #12
def neural_gpu_params1():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 1024
    hparams.num_hidden_layers = 1
    hparams.hidden_size = 256
    hparams.dropout = 0.1
    hparams.label_smoothing = 0.0
    hparams.clip_grad_norm = 10.0
    hparams.kernel_height = 3
    hparams.kernel_width = 1
    hparams.learning_rate_decay_scheme = "exp50k"
    hparams.learning_rate = 0.02
    hparams.learning_rate_warmup_steps = 3000
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 0.0
    hparams.num_sampled_classes = 0
    hparams.sampling_method = "argmax"
    hparams.optimizer_adam_epsilon = 1e-6
    hparams.optimizer_adam_beta1 = 0.85
    hparams.optimizer_adam_beta2 = 0.997
    return hparams
Example #13
def testLSTMSeq2Seq(self):
    vocab_size = 9
    x = np.random.random_integers(1,
                                  high=vocab_size - 1,
                                  size=(3, 5, 1, 1))
    y = np.random.random_integers(1,
                                  high=vocab_size - 1,
                                  size=(3, 6, 1, 1))
    hparams = common_hparams.basic_params1()
    p_hparams = problem_hparams.test_problem_hparams(
        vocab_size, vocab_size)
    with self.test_session() as session:
        features = {
            "inputs": tf.constant(x, dtype=tf.int32),
            "targets": tf.constant(y, dtype=tf.int32),
        }
        model = lstm.LSTMSeq2seq(hparams, tf.estimator.ModeKeys.TRAIN,
                                 p_hparams)
        sharded_logits, _ = model.model_fn(features)
        logits = tf.concat(sharded_logits, 0)
        session.run(tf.global_variables_initializer())
        res = session.run(logits)
    self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size))
Example #14
def bytenet_base():
    """Set of hyperparameters."""
    hparams = common_hparams.basic_params1()
    hparams.batch_size = 2048
    hparams.hidden_size = 768
    hparams.dropout = 0.2
    hparams.symbol_dropout = 0.2
    hparams.label_smoothing = 0.1
    hparams.clip_grad_norm = 2.0
    hparams.num_hidden_layers = 4
    hparams.kernel_height = 3
    hparams.kernel_width = 1
    hparams.learning_rate_decay_scheme = "exp50k"
    hparams.learning_rate = 0.05
    hparams.learning_rate_warmup_steps = 3000
    hparams.initializer_gain = 1.0
    hparams.weight_decay = 3.0
    hparams.num_sampled_classes = 0
    hparams.sampling_method = "argmax"
    hparams.optimizer_adam_epsilon = 1e-6
    hparams.optimizer_adam_beta1 = 0.85
    hparams.optimizer_adam_beta2 = 0.997
    hparams.add_hparam("num_block_repeat", 4)
    return hparams
Example #15
def attention_lm_moe_base():
    """Set of hyperparameters.

  suitable for 1 gpu.
  on lm1b_32k:
     ~229M params
     0.9 steps/sec on  [GeForce GTX TITAN X]

  Returns:
    a hparams object
  """
    hparams = common_hparams.basic_params1()
    hparams.hidden_size = 1024
    hparams.batch_size = 8192
    hparams.max_length = 256
    hparams.dropout = 0.0
    hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
    hparams.optimizer_adam_epsilon = 1e-9
    hparams.learning_rate_decay_scheme = "noam"
    hparams.learning_rate = 0.1
    hparams.learning_rate_warmup_steps = 2000
    hparams.initializer_gain = 1.0
    hparams.num_hidden_layers = 4
    hparams.initializer = "uniform_unit_scaling"
    hparams.weight_decay = 0.0
    hparams.optimizer_adam_beta1 = 0.9
    hparams.optimizer_adam_beta2 = 0.98
    hparams.num_sampled_classes = 0
    hparams.label_smoothing = 0.0
    hparams.shared_embedding_and_softmax_weights = int(False)
    hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
    hparams.moe_num_experts = 32
    # attention-related flags
    hparams.add_hparam("num_heads", 8)
    hparams.add_hparam("attention_key_channels", 0)
    hparams.add_hparam("attention_value_channels", 0)
    # All hyperparameters ending in "dropout" are automatically set to 0.0
    # when not in training mode.
    hparams.add_hparam("attention_dropout", 0.0)
    hparams.add_hparam("relu_dropout", 0.0)
    hparams.add_hparam("pos", "timing")  # timing, none
    hparams.add_hparam("moe_layers",
                       "2")  # comma separated list of layer numbers
    # moe params. local attention moe.
    # If attention_layers is set, the num_hidden_layers parameter will be ignored
    # and each character of the string will correspond to one attention
    # layer type
    hparams.add_hparam("attention_layers", "")
    hparams.add_hparam("attention_type", AttentionType.MULTIHEAD)
    hparams.add_hparam("attention_local", int(False))
    hparams.add_hparam("attention_moe_k", 2)
    hparams.add_hparam("attention_num_head", 1)
    hparams.add_hparam("attention_num_experts", 16)
    hparams.add_hparam("attention_split_batch", int(False))
    # If attention_exp_factor is set, each input to local_expert_attention (of
    # dimensionality hidden size) is projected into attention_exp_factor smaller
    # inputs, each of dimensionality attention_exp_inputdim. (otherwise
    # attention_exp_inputdim is ignored)
    hparams.add_hparam("attention_exp_factor", 0)
    hparams.add_hparam("attention_exp_inputdim", 128)
    # Key, query and value dimensions for the attention
    hparams.add_hparam("attention_kq_size", 128)
    hparams.add_hparam("attention_v_size", 256)
    # Loss coef for load balancing
    hparams.add_hparam("attention_load_balance", 2e-2)
    hparams.add_hparam("use_sepconv", int(False))
    hparams.add_hparam("diet_experts", int(False))
    hparams.add_hparam("memory_efficient_ffn", int(False))
    # if True, we learn a non-autoregressive model from "inputs" to "targets".
    # if False, we learn an autoregressive model to generate "targets"
    hparams.add_hparam("use_inputs", int(False))
    return hparams
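Several of these sets store boolean flags as int(False) / int(True) rather than Python booleans, presumably so that every hparam value stays a simple numeric or string type that can be overridden from a flat name=value string. At the point of use such a flag is simply truth-tested or converted back; a sketch (the consuming model code is not shown in these examples):

hparams = attention_lm_moe_base()
use_inputs = bool(hparams.use_inputs)              # stored as 0 or 1
attention_is_local = bool(hparams.attention_local)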