def testNeuralGPU(self):
  hparams = common_hparams.basic_params1()
  batch_size = 3
  input_length = 5
  target_length = input_length
  input_vocab_size = 9
  target_vocab_size = 11
  p_hparams = problem_hparams.test_problem_hparams(input_vocab_size,
                                                   target_vocab_size)
  inputs = -1 + np.random.random_integers(
      input_vocab_size, size=(batch_size, input_length, 1, 1))
  targets = -1 + np.random.random_integers(
      target_vocab_size, size=(batch_size, target_length, 1, 1))
  with self.test_session() as session:
    features = {
        "inputs": tf.constant(inputs, dtype=tf.int32),
        "targets": tf.constant(targets, dtype=tf.int32)
    }
    model = neural_gpu.NeuralGPU(hparams, tf.estimator.ModeKeys.TRAIN,
                                 p_hparams)
    logits, _ = model(features)
    session.run(tf.global_variables_initializer())
    res = session.run(logits)
  self.assertEqual(res.shape,
                   (batch_size, target_length, 1, 1, target_vocab_size))

def autoencoder_basic():
  """Basic autoencoder model."""
  hparams = common_hparams.basic_params1()
  hparams.optimizer = "Adam"
  hparams.learning_rate_constant = 0.0002
  hparams.learning_rate_warmup_steps = 500
  hparams.learning_rate_schedule = "constant * linear_warmup"
  hparams.label_smoothing = 0.0
  hparams.batch_size = 128
  hparams.hidden_size = 64
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  hparams.kernel_height = 4
  hparams.kernel_width = 4
  hparams.dropout = 0.1
  hparams.add_hparam("max_hidden_size", 1024)
  hparams.add_hparam("bottleneck_bits", 128)
  hparams.add_hparam("bottleneck_noise", 0.1)
  hparams.add_hparam("bottleneck_warmup_steps", 3000)
  hparams.add_hparam("bottleneck_max_prob", 1.0)
  hparams.add_hparam("sample_height", 32)
  hparams.add_hparam("sample_width", 32)
  hparams.add_hparam("discriminator_batchnorm", True)
  hparams.add_hparam("num_sliced_vecs", 4096)
  hparams.add_hparam("gan_loss_factor", 0.0)
  return hparams

def ppo_base_v1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.learning_rate = 1e-4
  hparams.add_hparam("init_mean_factor", 0.1)
  hparams.add_hparam("init_logstd", 0.1)
  hparams.add_hparam("policy_layers", (100, 100))
  hparams.add_hparam("value_layers", (100, 100))
  hparams.add_hparam("clipping_coef", 0.2)
  hparams.add_hparam("gae_gamma", 0.99)
  hparams.add_hparam("gae_lambda", 0.95)
  hparams.add_hparam("entropy_loss_coef", 0.01)
  hparams.add_hparam("value_loss_coef", 1)
  hparams.add_hparam("optimization_epochs", 15)
  hparams.add_hparam("epoch_length", 200)
  hparams.add_hparam("epochs_num", 2000)
  hparams.add_hparam("eval_every_epochs", 10)
  hparams.add_hparam("save_models_every_epochs", 30)
  hparams.add_hparam("optimization_batch_size", 50)
  hparams.add_hparam("max_gradients_norm", 0.5)
  hparams.add_hparam("intrinsic_reward_scale", 0.)
  hparams.add_hparam("logits_clip", 0.0)
  hparams.add_hparam("dropout_ppo", 0.1)
  hparams.add_hparam("effective_num_agents", None)
  return hparams

def ppo_base_v1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.learning_rate = 1e-4
  hparams.add_hparam("init_mean_factor", 0.1)
  hparams.add_hparam("init_logstd", 0.1)
  hparams.add_hparam("policy_layers", (100, 100))
  hparams.add_hparam("value_layers", (100, 100))
  hparams.add_hparam("num_agents", 30)
  hparams.add_hparam("clipping_coef", 0.2)
  hparams.add_hparam("gae_gamma", 0.99)
  hparams.add_hparam("gae_lambda", 0.95)
  hparams.add_hparam("entropy_loss_coef", 0.01)
  hparams.add_hparam("value_loss_coef", 1)
  hparams.add_hparam("optimization_epochs", 15)
  hparams.add_hparam("epoch_length", 200)
  hparams.add_hparam("epochs_num", 2000)
  hparams.add_hparam("eval_every_epochs", 10)
  hparams.add_hparam("num_eval_agents", 3)
  hparams.add_hparam("video_during_eval", False)
  hparams.add_hparam("save_models_every_epochs", 30)
  hparams.add_hparam("optimization_batch_size", 50)
  hparams.add_hparam("max_gradients_norm", 0.5)
  hparams.add_hparam("simulated_environment", False)
  hparams.add_hparam("simulation_random_starts", False)
  hparams.add_hparam("intrinsic_reward_scale", 0.)
  return hparams

def revnet_base():
  """Default hparams for Revnet."""
  hparams = common_hparams.basic_params1()
  hparams.add_hparam('num_channels', [64, 128, 256, 416])
  hparams.add_hparam('num_layers_per_block', [1, 1, 10, 1])
  hparams.add_hparam('bottleneck', True)
  hparams.add_hparam('first_batch_norm', [False, True, True, True])
  hparams.add_hparam('init_stride', 2)
  hparams.add_hparam('init_kernel_size', 7)
  hparams.add_hparam('init_maxpool', True)
  hparams.add_hparam('strides', [1, 2, 2, 2])
  hparams.add_hparam('num_channels_init_block', 64)
  hparams.add_hparam('dim', '2d')

  # Variable init
  hparams.initializer = 'normal_unit_scaling'
  hparams.initializer_gain = 2.

  # Optimization
  hparams.optimizer = 'Momentum'
  hparams.optimizer_momentum_momentum = 0.9
  hparams.optimizer_momentum_nesterov = True
  hparams.weight_decay = 1e-4
  hparams.clip_grad_norm = 0.0
  # (base_lr=0.1) * (batch_size=128*8 (on TPU, or 8 GPUs)=1024) / (256.)
  hparams.learning_rate = 0.4
  hparams.learning_rate_decay_scheme = 'cosine'
  # For image_imagenet224, 120k training steps, which effectively makes this a
  # cosine decay (i.e. no cycles).
  hparams.learning_rate_cosine_cycle_steps = 120000

  # Can run with a batch size of 128 with Problem ImageImagenet224
  hparams.batch_size = 128
  return hparams

def attention_lm_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = False

  hparams.add_hparam("filter_size", 4096)  # Add new ones like this.
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("encoder_full_attention", False)
  return hparams

def resnet_base():
  """Set of hyperparameters."""
  # For imagenet on TPU:
  # Set train_steps=120000
  # Set eval_steps=48

  # Base
  hparams = common_hparams.basic_params1()

  # Model-specific parameters
  hparams.add_hparam("layer_sizes", [3, 4, 6, 3])
  hparams.add_hparam("filter_sizes", [64, 64, 128, 256, 512])
  hparams.add_hparam("block_fn", "bottleneck")
  hparams.add_hparam("use_nchw", True)

  # Variable init
  hparams.initializer = "normal_unit_scaling"
  hparams.initializer_gain = 2.

  # Optimization
  hparams.optimizer = "Momentum"
  hparams.optimizer_momentum_momentum = 0.9
  hparams.optimizer_momentum_nesterov = True
  hparams.weight_decay = 1e-4
  hparams.clip_grad_norm = 0.0
  # (base_lr=0.1) * (batch_size=128*8 (on TPU, or 8 GPUs)=1024) / (256.)
  hparams.learning_rate = 0.4
  hparams.learning_rate_decay_scheme = "cosine"
  # For image_imagenet224, 120k training steps, which effectively makes this a
  # cosine decay (i.e. no cycles).
  hparams.learning_rate_cosine_cycle_steps = 120000

  hparams.batch_size = 128
  return hparams

def bluenet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 4096
  hparams.hidden_size = 256
  hparams.dropout = 0.2
  hparams.symbol_dropout = 0.5
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 8
  hparams.kernel_height = 3
  hparams.kernel_width = 3
  hparams.learning_rate_decay_scheme = "exp10k"
  hparams.learning_rate = 0.05
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 3.0
  hparams.num_sampled_classes = 0
  hparams.sampling_method = "argmax"
  hparams.optimizer_adam_epsilon = 1e-6
  hparams.optimizer_adam_beta1 = 0.85
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("anneal_until", 40000)
  hparams.add_hparam("batch_deviation_loss_factor", 5.0)
  return hparams

def testSymbolModalityTargetsFactored(self):
  batch_size = 10
  num_datashards = 5
  length = 6
  height = 7
  hidden_size = 9
  vocab_size = 11
  model_hparams = common_hparams.basic_params1()
  model_hparams.factored_logits = True
  model_hparams.hidden_size = hidden_size
  model_hparams.mode = tf.estimator.ModeKeys.TRAIN
  body_output = -1 + np.random.random_integers(
      100, size=(batch_size, length, height, hidden_size))
  targets = -1 + np.random.random_integers(
      vocab_size, size=(batch_size, length, height, 1))
  m = modalities.SymbolModality(model_hparams, vocab_size)
  data_parallelism = expert_utils.Parallelism(
      ["/device:CPU:0"] * num_datashards)
  with self.test_session() as session:
    sharded_body_output = tf.split(tf.to_float(body_output), num_datashards)
    sharded_targets = tf.split(targets, num_datashards)
    sharded_logits = m.top_sharded(sharded_body_output, sharded_targets,
                                   data_parallelism)
    train_loss = m.loss_sharded(sharded_logits, sharded_targets,
                                data_parallelism)
    logits = tf.concat(sharded_logits, 0)
    session.run(tf.global_variables_initializer())
    res1, res2 = session.run((logits, train_loss))
  self.assertEqual(res1.shape, (batch_size, length, height, 1, vocab_size))
  self.assertEqual(res2.shape, ())

def testSymbolTupleModalityInputs(self):
  """Adapted from tensor2tensor/layers/modalities_test.py."""
  batch_size = 10
  num_datashards = 5
  length = 5
  vocab_size = [2000, 500, 2500]
  hidden_size = 9
  model_hparams = common_hparams.basic_params1()
  model_hparams.hidden_size = hidden_size
  model_hparams.mode = tf.estimator.ModeKeys.TRAIN
  x = np.stack([
      -1 + np.random.random_integers(
          vocab_size[i], size=(batch_size, length, 1))
      for i in range(len(vocab_size))
  ], axis=3)
  m = modalities.SymbolTupleModality(model_hparams, vocab_size)
  data_parallelism = expert_utils.Parallelism(
      ['/device:CPU:0'] * num_datashards)
  with self.test_session() as session:
    xs = tf.split(x, num_datashards)
    sharded_output = m.bottom_sharded(xs, data_parallelism)
    output = tf.concat(sharded_output, 0)
    session.run(tf.global_variables_initializer())
    res = session.run(output)
  self.assertEqual(res.shape, (batch_size, length, 1, hidden_size))

def shakeshake_cifar10():
  """Parameters for CIFAR-10."""
  tf.logging.warning("shakeshake_cifar10 hparams have not been verified to "
                     "achieve good performance.")
  hparams = common_hparams.basic_params1()
  # This leads to effective batch size 128 when number of GPUs is 1
  hparams.batch_size = 4096 * 8
  hparams.hidden_size = 16
  hparams.dropout = 0
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 2.0
  hparams.num_hidden_layers = 26
  hparams.kernel_height = -1  # Unused
  hparams.kernel_width = -1  # Unused
  hparams.learning_rate_decay_scheme = "cosine"
  # Model should be run for 700000 steps with batch size 128 (~1800 epochs)
  hparams.learning_rate_cosine_cycle_steps = 700000
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 3000
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  # TODO(rshin): Adjust so that effective value becomes ~1e-4
  hparams.weight_decay = 3.0
  hparams.optimizer = "Momentum"
  hparams.optimizer_momentum_momentum = 0.9
  hparams.add_hparam("base_filters", 16)
  hparams.add_hparam("shakeshake_type", "batch")
  return hparams

def next_frame_base():
  """Common HParams for next_frame models."""
  hparams = common_hparams.basic_params1()
  # Loss cutoff.
  hparams.add_hparam("video_modality_loss_cutoff", 0.01)
  # Additional resizing of the frames before feeding them to the model.
  hparams.add_hparam("preprocess_resize_frames", None)
  # How many data points to shuffle. Ideally should be part of the problem,
  # not the model!
  hparams.add_hparam("shuffle_buffer_size", 128)
  # Tiny mode. For faster tests.
  hparams.add_hparam("tiny_mode", False)
  # In case a model supports a smaller/faster version.
  hparams.add_hparam("small_mode", False)
  # In case a model has a stochastic version.
  hparams.add_hparam("stochastic_model", False)
  # Internal loss for recurrent models.
  hparams.add_hparam("internal_loss", True)
  # Choose from: concat, multiplicative, multi_additive
  hparams.add_hparam("action_injection", "multi_additive")
  # Scheduled sampling method. Choose between
  # ground_truth_only, prediction_only, prob, count, prob_inverse_exp.
  hparams.add_hparam("scheduled_sampling_mode", "prediction_only")
  hparams.add_hparam("scheduled_sampling_decay_steps", 10000)
  hparams.add_hparam("scheduled_sampling_max_prob", 1.0)
  hparams.add_hparam("scheduled_sampling_k", 900.0)
  return hparams

def my_very_own_hparams():
  # Start with the base set
  hp = common_hparams.basic_params1()
  # Modify existing hparams
  hp.num_hidden_layers = 2
  # Add new hparams
  hp.add_hparam("filter_size", 2048)
  return hp

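# A minimal sketch (not part of the snippet above) of how such an hparams set
# is typically registered and selected in tensor2tensor. The registry
# decorator and the --hparams_set / --hparams flags are the standard
# mechanism; the function name below is a hypothetical example.
from tensor2tensor.layers import common_hparams
from tensor2tensor.utils import registry


@registry.register_hparams
def my_registered_hparams():
  """Same pattern as above, registered so t2t-trainer can find it by name."""
  hp = common_hparams.basic_params1()
  hp.num_hidden_layers = 2
  hp.add_hparam("filter_size", 2048)
  return hp

# Selected on the command line with, e.g.:
#   t2t-trainer --hparams_set=my_registered_hparams \
#     --hparams='num_hidden_layers=3,filter_size=1024' ...
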
def mtf_transformer2_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.add_hparam("d_model", 1024)
  hparams.batch_size = 4
  hparams.max_length = 1024
  hparams.label_smoothing = 0.0
  # a small positive value - this seems important for stability when training
  # with bfloat16 activations.
  hparams.add_hparam("z_loss", 1e-4)

  # These hyperparameters are used in default_layer_stack().
  # They may not be respected if hparams uses a different layer stack function.
  hparams.num_hidden_layers = 6
  hparams.add_hparam("d_ff", 2048)
  hparams.add_hparam("d_kv", 128)
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.layer_prepostprocess_dropout = 0.0

  # round up vocab sizes to be a multiple of this value
  hparams.vocab_divisor = 128

  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "rsqrt_decay*linear_decay"
  hparams.learning_rate_warmup_steps = 10000
  hparams.add_hparam("master_dtype", "bfloat16")
  hparams.add_hparam("slice_dtype", "float32")
  hparams.activation_dtype = "bfloat16"

  # 8-way model-parallelism
  hparams.add_hparam("mesh_shape", "model:8")
  hparams.add_hparam("layout", "batch:batch;vocab:model;d_ff:model;heads:model")

  # If nonzero, we split the batch across two tensor-dimensions named
  # "outer_batch" and "inner_batch", allowing for splitting across two mesh
  # dimensions. This is necessary for hierarchical mixture of experts.
  # The two tensor dimensions have sizes hparams.outer_batch_size and
  # hparams.batch_size // hparams.outer_batch_size.
  hparams.add_hparam("outer_batch_size", 0)

  hparams.shared_embedding_and_softmax_weights = False
  # length for training or decoding - defaults to max_length
  hparams.add_hparam("length", 0)

  # These parameters make the Transformer model compatible with mtf.
  # Do not override these.
  hparams.no_data_parallelism = True
  hparams.use_fixed_batch_size = True
  hparams.add_hparam("mtf_mode", True)
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.modality = {
      "inputs": modalities.IdentitySymbolModality,
      "targets": modalities.IdentitySymbolModality,
  }
  return hparams

def lstm2():
  """Hparams for minimal example, copied from T2T LSTM hparams."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 1024
  hparams.hidden_size = 128
  hparams.num_hidden_layers = 2
  # uncomment this line to fix things
  # hparams.initializer = "uniform_unit_scaling"
  return hparams

def glow_hparams():
  """Glow Hparams."""
  hparams = common_hparams.basic_params1()
  hparams.add_hparam("n_levels", 3)
  hparams.add_hparam("n_bits_x", 8)
  hparams.add_hparam("depth", 32)
  hparams.add_hparam("affine_coupling_width", 512)
  hparams.add_hparam("learn_prior", True)
  return hparams

def resnet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.add_hparam("layer_sizes", [3, 4, 6, 3])
  hparams.add_hparam("use_nchw", True)
  hparams.add_hparam("num_filters", [64, 128, 256, 512])
  hparams.add_hparam("strides", [1, 2, 2, 2])
  hparams.tpu_batch_size_per_shard = 48
  return hparams

def lstm_literature_base():
  """Set of base hyperparameters for LSTM from Jozefowicz et al."""
  hparams = common_hparams.basic_params1()
  hparams.clip_grad_norm = 1.0
  hparams.label_smoothing = 0.0
  hparams.batch_size = 2048
  hparams.optimizer = "Adagrad"
  hparams.learning_rate = 0.2
  return hparams

def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(True)
  # According to noam, ("n", "da") seems better for harder-to-learn models
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "conv_hidden_relu")
  # Other attention types params
  hparams.add_hparam("attention_loc_block_length", 256)
  hparams.add_hparam("attention_red_factor", 3)
  hparams.add_hparam("attention_red_type", "conv")
  hparams.add_hparam("attention_red_nonlinearity", "none")
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("nbr_decoder_problems", 1)
  hparams.add_hparam("proximity_bias", int(False))
  # Decoder layers type. If set, the num_decoder_layers parameter will be
  # ignored and the number of decoder layers will be deduced from the string.
  # See top file comment for an example of usage.
  hparams.add_hparam("layer_types", "")
  # Default attention type (ex: a, loc, red,...) and feed-forward type (ex: fc,
  # sep, moe,...)
  hparams.add_hparam("default_att", "a")
  hparams.add_hparam("default_ff", "fc")
  return hparams

def mtf_transformer_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.no_data_parallelism = True
  hparams.use_fixed_batch_size = True
  hparams.add_hparam("mtf_mode", True)
  hparams.batch_size = 64
  hparams.max_length = 256
  hparams.add_hparam("d_model", 512)
  hparams.add_hparam("d_kv", 128)
  hparams.label_smoothing = 0.1
  # 8-way model-parallelism
  hparams.add_hparam("mesh_shape", "model:8")
  hparams.add_hparam("layout", "batch:batch;vocab:model;d_ff:model;heads:model")
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("d_ff", 2048)
  hparams.add_hparam("num_encoder_layers", 6)
  hparams.add_hparam("num_decoder_layers", 6)
  hparams.add_hparam("attention_dropout", 0.1)
  hparams.add_hparam("relu_dropout", 0.1)
  hparams.layer_prepostprocess_dropout = 0.1

  # round up vocab sizes to be a multiple of this value
  hparams.vocab_divisor = 128

  # mixture of experts hparams
  hparams.add_hparam("feedforward_layer", "dense_relu_dense")
  hparams.add_hparam("moe_overhead_train", 1.0)
  hparams.add_hparam("moe_overhead_eval", 2.0)
  hparams.moe_num_experts = 16
  hparams.moe_loss_coef = 1e-3

  # Use targets_embedding_var * rsqrt(d_model) as softmax_var
  hparams.shared_embedding_and_softmax_weights = True
  # Reuse targets_embedding_var as inputs_embedding_var
  hparams.shared_embedding = True
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "linear_warmup*rsqrt_decay*linear_decay"
  hparams.learning_rate_warmup_steps = 10000
  hparams.activation_dtype = "float32"

  # These parameters make the Transformer model compatible with MtfTransformer.
  # Do not override these, as mtf_transformer does not support other options.
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.target_modality = "symbol:identity"
  hparams.input_modalities = "inputs:symbol:identity"

  # Parameters for computing the maximum decode length in beam search.
  # Maximum decode length is:
  #   min(max_length,
  #       decode_length_multiplier * input_length + decode_length_constant)
  hparams.add_hparam("decode_length_multiplier", 1.5)
  hparams.add_hparam("decode_length_constant", 10.0)
  return hparams

def long_answer_base():
  """Set of hyperparameters.

  Returns:
    a hparams object
  """
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 8192
  hparams.dropout = 0.0
  hparams.batching_mantissa_bits = 3
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 1000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 4
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(True)
  hparams.sampling_method = "random"
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  # comma-separated list of layer numbers.
  # At each of these layers, we replace the ffn with a mixture of experts.
  hparams.add_hparam("moe_layers", "2")
  # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
  # If moe_n2 is an integer, then use a hierarchical MoE
  # consisting of moe_n1 groups of moe_n2 experts each.
  hparams.add_hparam("moe_n1", 64)
  hparams.add_hparam("moe_n2", 0)
  hparams.add_hparam("moe_hidden_size", 2048)
  hparams.add_hparam("moe_loss_coef", 1e-2)
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("residual_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("block_length", 512)
  hparams.add_hparam("answer_length_prob_train", 0.5)
  hparams.add_hparam("answer_length_infer", 1000)
  # We cannot handle long sequences at this point, so drop them during eval.
  # This affects evaluation metrics.
  # TODO(noam): find a different workaround
  hparams.eval_drop_long_sequences = int(True)
  return hparams

def lstm_attention():
  """hparams for LSTM with attention."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 1024
  hparams.hidden_size = 128
  hparams.num_hidden_layers = 2

  # Attention
  hparams.add_hparam("attn_vec_size", hparams.hidden_size)
  return hparams

def lstm_seq2seq():
  """hparams for LSTM."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 1024
  hparams.hidden_size = 128
  hparams.num_hidden_layers = 2
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  return hparams

def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(True)
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "conv_hidden_relu")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("residual_dropout", 0.1)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("nbr_decoder_problems", 1)
  hparams.add_hparam("proximity_bias", int(False))
  # FLAGS RELATED TO MIXTURE-OF-EXPERTS
  # comma-separated list of layer numbers.
  # At each of these layers, we replace the ffn with a mixture of experts.
  hparams.add_hparam("moe_layers_encoder", "2")
  hparams.add_hparam("moe_layers_decoder", "2")
  # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
  # If moe_n2 is an integer, then use a hierarchical MoE
  # consisting of moe_n1 groups of moe_n2 experts each.
  hparams.add_hparam("moe_n1", 32)
  hparams.add_hparam("moe_n2", 0)
  hparams.add_hparam("moe_hidden_size", 2048)
  hparams.add_hparam("moe_loss_coef", 1e-2)
  return hparams

def resnet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.add_hparam("layer_sizes", [3, 4, 6, 3])
  hparams.add_hparam("use_nchw", True)
  hparams.add_hparam("num_filters", [64, 128, 256, 512])
  hparams.add_hparam("strides", [1, 2, 2, 2])
  # Can run with a batch size of 128 with Problem ImageImagenet224
  hparams.tpu_batch_size_per_shard = 128
  return hparams

def mtf_resnet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.no_data_parallelism = True
  hparams.use_fixed_batch_size = True
  hparams.batch_size = 32
  hparams.max_length = 3072
  hparams.hidden_size = 256
  hparams.label_smoothing = 0.0
  # 8-way model-parallelism
  hparams.add_hparam("mesh_shape", "batch:8")
  hparams.add_hparam("layout", "batch:batch")
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("filter_size", 1024)
  hparams.add_hparam("num_layers", 6)
  hparams.add_hparam("attention_key_size", 256)
  hparams.add_hparam("attention_value_size", 256)
  # Share weights between input and target embeddings
  hparams.shared_embedding = True

  # mixture of experts hparams
  hparams.add_hparam("ffn_layer", "dense_relu_dense")
  hparams.add_hparam("moe_overhead_train", 1.0)
  hparams.add_hparam("moe_overhead_eval", 2.0)
  hparams.moe_num_experts = 16
  hparams.moe_loss_coef = 1e-3

  hparams.shared_embedding_and_softmax_weights = True
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "rsqrt_decay"
  hparams.learning_rate_warmup_steps = 10000
  hparams.add_hparam("d_kv", 32)

  # Image related hparams
  hparams.add_hparam("img_len", 32)
  hparams.add_hparam("num_channels", 3)
  hparams.add_hparam("row_blocks", 1)
  hparams.add_hparam("col_blocks", 1)
  hparams.add_hparam("rows_size", 32)
  hparams.add_hparam("cols_size", 32)

  # Model-specific parameters
  hparams.add_hparam("layer_sizes", [3, 4, 6, 3])
  hparams.add_hparam("filter_sizes", [64, 64, 128, 256, 512])
  hparams.add_hparam("is_cifar", False)

  # Variable init
  hparams.initializer = "normal_unit_scaling"
  hparams.initializer_gain = 2.

  # TODO(nikip): Change optimization scheme?
  hparams.learning_rate = 0.4
  return hparams

def attention_lm_moe_base():
  """Set of hyperparameters.

  suitable for 1 gpu.
  on lm1b_32k:
    ~229M params
    0.9 steps/sec on [GeForce GTX TITAN X]

  Returns:
    a hparams object
  """
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 4
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(False)
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  hparams.moe_num_experts = 32
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("moe_layers", "2")  # comma separated list of layer numbers
  # moe params. local attention moe.
  hparams.add_hparam("attention_type", AttentionType.MULTIHEAD)
  hparams.add_hparam("attention_num_experts", 16)
  # Key, query and value dimensions for the attention
  hparams.add_hparam("attention_kq_size", 128)
  hparams.add_hparam("attention_v_size", 256)
  # Loss coef for load balancing
  hparams.add_hparam("attention_load_balance", 2e-2)
  hparams.add_hparam("diet_experts", int(False))
  hparams.add_hparam("memory_efficient_ffn", int(False))
  return hparams

def lstm_seq2seq():
  """hparams for LSTM."""
  hparams = common_hparams.basic_params1()
  hparams.daisy_chain_variables = False
  hparams.batch_size = 1024
  hparams.hidden_size = 128
  hparams.num_hidden_layers = 2
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  return hparams

def transformer_layerbylayer_default():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  # hparams.batch_size = 4096
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  # hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.clip_grad_norm = 5.  # clip gradients to norm 5.
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  # hparams.shared_embedding_and_softmax_weights = int(True)
  hparams.shared_embedding_and_softmax_weights = int(False)
  # Add new ones like this.
  hparams.add_hparam("filter_size", 2048)
  # Layer-related flags. If zero, these fall back on hparams.num_hidden_layers.
  hparams.add_hparam("num_encoder_layers", 0)
  hparams.add_hparam("num_decoder_layers", 0)
  # Attention-related flags.
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "conv_hidden_relu")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("nbr_decoder_problems", 1)
  hparams.add_hparam("proximity_bias", int(False))
  hparams.add_hparam("use_pad_remover", int(True))
  hparams.add_hparam("self_attention_type", "dot_product")
  hparams.add_hparam("max_relative_position", 0)
  # Layerbylayer defaults
  hparams.add_hparam("target_root_attention", "pop")
  hparams.add_hparam("use_loss_mask", int(True))
  hparams.add_hparam("target_root_input", "each")  # 'each', 'first', 'last'
  return hparams

def vanilla_gan():
  """Basic parameters for a vanilla_gan."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 32
  hparams.label_smoothing = 0.0
  hparams.add_hparam("hidden_dim", 128)
  hparams.add_hparam("random_sample_size", 100)
  hparams.add_hparam("height", 28)
  hparams.add_hparam("width", 28)
  hparams.add_hparam("epsilon", 1e-4)
  return hparams

def basic_fc_small():
  """Small fully connected model."""
  hparams = common_hparams.basic_params1()
  hparams.learning_rate = 0.1
  hparams.batch_size = 128
  hparams.hidden_size = 256
  hparams.num_hidden_layers = 2
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  hparams.dropout = 0.0
  return hparams

def gene_expression_conv_base():
  """Hparams for GeneExpressionConv model."""
  hparams = common_hparams.basic_params1()
  hparams.add_hparam("num_conv_layers", 4)
  hparams.add_hparam("num_dconv_layers", 7)
  hparams.add_hparam("pooling_windows", [2, 4, 4, 4])

  # TODO(rsepassi): Correct the values of these hyperparameters
  hparams.hidden_size = 128
  hparams.kernel_width = 128
  hparams.add_hparam("stride", 1)
  return hparams

def hparams_set_up(problem_name, data_dir, hparam_set=None,
                   hparams_override=None):
  if hparam_set:
    hparams = trainer_lib.create_hparams(
        hparam_set, hparams_overrides_str=hparams_override)
  else:
    hparams = common_hparams.basic_params1()

  hparams.data_dir = data_dir
  hparams_lib.add_problem_hparams(hparams, problem_name)
  return hparams, hparams.problem

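# A minimal usage sketch for hparams_set_up() above; the problem name, data
# directory, and override string are hypothetical examples, not values taken
# from the original code.
hparams, problem = hparams_set_up(
    problem_name="translate_ende_wmt32k",  # any registered t2t problem
    data_dir="/tmp/t2t_data",
    hparam_set="transformer_base",
    hparams_override="batch_size=1024,learning_rate_warmup_steps=8000")
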
def vanilla_gan():
  """Basic parameters for a vanilla_gan."""
  hparams = common_hparams.basic_params1()
  hparams.label_smoothing = 0.0
  hparams.hidden_size = 128
  hparams.batch_size = 64
  hparams.add_hparam("z_size", 64)
  hparams.add_hparam("c_dim", 1)
  hparams.add_hparam("height", 28)
  hparams.add_hparam("width", 28)
  hparams.add_hparam("discriminator_batchnorm", int(True))
  return hparams

def attention_lm_moe_base():
  """Set of hyperparameters.

  suitable for 1 gpu.
  on lm1b_32k:
    ~229M params
    0.9 steps/sec on [GeForce GTX TITAN X]

  Returns:
    a hparams object
  """
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 1024
  hparams.batch_size = 8192
  hparams.max_length = 256
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 4
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = int(False)
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  # comma-separated list of layer numbers.
  # At each of these layers, we replace the ffn with a mixture of experts.
  hparams.add_hparam("moe_layers", "2")
  # If moe_n2 is None, then use a flat MoE with moe_n1 experts.
  # If moe_n2 is an integer, then use a hierarchical MoE
  # consisting of moe_n1 groups of moe_n2 experts each.
  hparams.add_hparam("moe_n1", 32)
  hparams.add_hparam("moe_n2", 0)
  hparams.add_hparam("moe_hidden_size", 2048)
  hparams.add_hparam("moe_loss_coef", 1e-2)
  # attention-related flags
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("residual_dropout", 0.1)
  hparams.add_hparam("pos", "timing")  # timing, none
  return hparams

def mtf_toy_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.no_data_parallelism = True
  hparams.use_fixed_batch_size = True
  hparams.add_hparam("mtf_mode", True)
  hparams.batch_size = 64
  set_adafactor_optimizer(hparams)
  hparams.add_hparam("io_size", 32)
  hparams.hidden_size = 32
  hparams.add_hparam("mesh_shape", "4.2")
  hparams.add_hparam("layout", "batch:0;hidden:1")
  return hparams

def glow_hparams():
  """Glow Hparams."""
  hparams = common_hparams.basic_params1()
  hparams.clip_grad_norm = None
  hparams.weight_decay = 0.0
  hparams.learning_rate_constant = 3e-4
  hparams.batch_size = 32
  hparams.add_hparam("n_levels", 3)
  hparams.add_hparam("n_bits_x", 8)
  hparams.add_hparam("depth", 32)
  hparams.add_hparam("affine_coupling_width", 512)
  hparams.add_hparam("learn_prior", True)
  return hparams

def lstm_attention_my():
  """hparams for LSTM with attention."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 512
  hparams.hidden_size = 128
  hparams.num_hidden_layers = 2
  hparams.max_length = 100
  hparams.dropout = 0.8
  hparams.learning_rate = 0.001

  # Attention
  hparams.add_hparam("attn_vec_size", hparams.hidden_size)
  return hparams

def continuous_autoencoder_basic():
  """Basic autoencoder model."""
  hparams = common_hparams.basic_params1()
  hparams.optimizer = "adam"
  hparams.learning_rate_constant = 0.0002
  hparams.learning_rate_warmup_steps = 500
  hparams.learning_rate_schedule = "constant * linear_warmup"
  hparams.label_smoothing = 0.0
  hparams.batch_size = 128
  hparams.hidden_size = 64
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  hparams.kernel_height = 4
  hparams.kernel_width = 4
  hparams.dropout = 0.05
  hparams.add_hparam("max_hidden_size", 1024)
  hparams.add_hparam("bottleneck_bits", 128)
  hparams.add_hparam("bottleneck_shared_bits", 0)
  hparams.add_hparam("bottleneck_shared_bits_start_warmup", 0)
  hparams.add_hparam("bottleneck_shared_bits_stop_warmup", 0)
  hparams.add_hparam("bottleneck_noise", 0.1)
  hparams.add_hparam("bottleneck_warmup_steps", 2000)
  hparams.add_hparam("sample_height", 32)
  hparams.add_hparam("sample_width", 32)
  hparams.add_hparam("bottleneck_l2_factor", 0.05)
  hparams.add_hparam("gumbel_temperature", 0.5)
  hparams.add_hparam("gumbel_noise_factor", 0.5)
  hparams.add_hparam("vq_temperature", 0.001)
  hparams.add_hparam("gan_loss_factor", 0.0)

  # hparams related to the PSF
  # Should we use the PSF at the encoder
  hparams.add_hparam("encode_psf", True)
  # Should we apply the PSF at the decoder
  hparams.add_hparam("apply_psf", True)
  # Zero padding factor for convolution
  hparams.add_hparam("psf_convolution_pad_factor", 0.)

  # hparams related to output apodization for Fourier purposes
  # Number of pixels at the border affected by the apodization window
  hparams.add_hparam("output_apodization", 8)
  # Factor to penalize non zero borders
  hparams.add_hparam("apodization_loss", 1.0)

  # hparams related to output activation
  hparams.add_hparam("output_activation", 'softplus')  # either none or softplus

  # hparams related to additional regularization of the output
  # Factor to apply to a loss penalizing the TV of the unconvolved image
  hparams.add_hparam("total_variation_loss", 0.001)

  # hparams related to the likelihood
  hparams.add_hparam("likelihood_type", "Fourier")  # Pixel or Fourier
  # Value of noise RMS, used for diagonal likelihood
  hparams.add_hparam("noise_rms", 0.03)
  return hparams

def transformer_symshard_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 256
  hparams.batch_size = 2048
  hparams.max_length = 0
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.layer_prepostprocess_dropout = 0.2
  hparams.add_hparam("attention_dropout", 0.1)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("relu_dropout_broadcast_dims", "1")
  hparams.layer_prepostprocess_dropout = 0.1
  hparams.layer_prepostprocess_dropout_broadcast_dims = "1"  # length
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "rsqrt_decay"
  hparams.learning_rate_warmup_steps = 10000
  hparams.initializer_gain = 1.0
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  # TODO(noam): use this to control sharing. We now share always
  hparams.shared_embedding_and_softmax_weights = True
  # we only want one data shard.
  hparams.no_data_parallelism = True
  # bypass the symbol modality so that we can use model parallelism.
  hparams.bottom = {
      "inputs": modalities.identity_bottom,
      "targets": modalities.identity_bottom,
  }
  hparams.top = {
      "targets": modalities.identity_top,
  }
  hparams.add_hparam("filter_size", 1280)
  hparams.add_hparam("mix_fraction", 0.5)
  # attention-related flags
  hparams.add_hparam("multihead_attention_num_heads", 4)
  hparams.add_hparam("multihead_attention_key_channels", 0)
  hparams.add_hparam("multihead_attention_value_channels", 0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam(
      "encoder_layers", ("n,att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d")
  hparams.add_hparam(
      "decoder_layers",
      ("n,att,m,d,a," "n,enc-att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d")
  # Number of model shards - each one has separate parameters.
  # Changing this number invalidates checkpoints.
  hparams.add_hparam("num_model_shards", 8)
  return hparams

def vqa_attention_base():
  """VQA attention baseline hparams."""
  hparams = common_hparams.basic_params1()
  hparams.batch_size = 2
  hparams.use_fixed_batch_size = True
  hparams.optimizer = "Adam"
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.999
  hparams.optimizer_adam_epsilon = 1e-8
  hparams.weight_decay = 0
  hparams.clip_grad_norm = 0.
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 2.
  hparams.learning_rate = 0.5
  hparams.learning_rate_schedule = "legacy"
  hparams.learning_rate_warmup_steps = 0
  hparams.learning_rate_decay_scheme = "exp"
  hparams.learning_rate_decay_rate = 0.5
  hparams.learning_rate_decay_steps = 50000

  # not used hparams
  hparams.label_smoothing = 0.
  hparams.multiply_embedding_mode = ""

  hparams.dropout = 0.5
  hparams.norm_type = "layer"
  hparams.layer_postprocess_sequence = "nd"
  hparams.layer_prepostprocess_dropout = 0.5

  # add new hparams
  # preprocess
  hparams.add_hparam("resize_side", 512)
  hparams.add_hparam("height", 448)
  hparams.add_hparam("width", 448)
  hparams.add_hparam("distort", True)

  hparams.add_hparam("train_resnet", False)
  hparams.add_hparam("rnn_type", "lstm")
  hparams.add_hparam("num_rnn_layers", 1)
  hparams.add_hparam("max_question_length", 15)
  # lstm hidden size
  hparams.hidden_size = 512

  hparams.add_hparam("attn_dim", 512)
  hparams.add_hparam("num_glimps", 2)

  hparams.add_hparam("num_mlp_layers", 1)
  hparams.add_hparam("mlp_dim", 1024)
  return hparams

def transformer_base_v1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 256
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_schedule = "legacy"
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  hparams.shared_embedding_and_softmax_weights = True
  hparams.symbol_modality_num_shards = 16

  # Add new ones like this.
  hparams.add_hparam("filter_size", 2048)
  # Layer-related flags. If zero, these fall back on hparams.num_hidden_layers.
  hparams.add_hparam("num_encoder_layers", 0)
  hparams.add_hparam("num_decoder_layers", 0)
  # Attention-related flags.
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "dense_relu_dense")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("attention_dropout_broadcast_dims", "")
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("relu_dropout_broadcast_dims", "")
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("nbr_decoder_problems", 1)
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("use_pad_remover", True)
  hparams.add_hparam("self_attention_type", "dot_product")
  hparams.add_hparam("max_relative_position", 0)
  return hparams

def super_lm_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  hparams.moe_hidden_sizes = "512"
  hparams.batch_size = 16384
  hparams.max_length = 0
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.layer_prepostprocess_dropout = 0.0
  hparams.symbol_dropout = 0.1
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 8000
  hparams.initializer_gain = 1.0
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.shared_embedding_and_softmax_weights = False
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"
  # we only want one data shard.
  hparams.no_data_parallelism = True
  # bypass the symbol modality so that we can use model parallelism.
  hparams.bottom = {
      "inputs": modalities.identity_bottom,
      "targets": modalities.identity_bottom,
  }
  hparams.top = {
      "targets": modalities.identity_top,
  }
  hparams.add_hparam("filter_size", 512)
  hparams.add_hparam("mix_fraction", 0.5)
  # attention-related flags
  hparams.add_hparam("multihead_attention_num_heads", 4)
  hparams.add_hparam("multihead_attention_key_channels", 0)
  hparams.add_hparam("multihead_attention_value_channels", 0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("layers",
                     ("n,att,m,d,a," "n,ffn,m,d,a,") * 4 + "n,ffn,d")
  # Number of model shards - each one has separate parameters.
  # Changing this number invalidates checkpoints.
  hparams.add_hparam("num_model_shards", 8)
  hparams.add_hparam("diet_experts", False)
  return hparams

def transformer_base_v1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 256
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_schedule = "linear_warmup_rsqrt_decay"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  hparams.shared_embedding_and_softmax_weights = True
  hparams.symbol_modality_num_shards = 16

  # Add new ones like this.
  hparams.add_hparam("filter_size", 2048)
  # Layer-related flags. If zero, these fall back on hparams.num_hidden_layers.
  hparams.add_hparam("num_encoder_layers", 0)
  hparams.add_hparam("num_decoder_layers", 0)
  # Attention-related flags.
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "dense_relu_dense")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("attention_dropout_broadcast_dims", "")
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("relu_dropout_broadcast_dims", "")
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("nbr_decoder_problems", 1)
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("use_pad_remover", True)
  hparams.add_hparam("self_attention_type", "dot_product")
  hparams.add_hparam("max_relative_position", 0)
  return hparams

def glow_hparams():
  """Glow Hparams."""
  hparams = common_hparams.basic_params1()
  hparams.clip_grad_norm = None
  hparams.weight_decay = 0.0
  hparams.learning_rate_constant = 3e-4
  hparams.batch_size = 32
  # can be prev_level, prev_step or normal.
  # see: glow_ops.merge_level_and_latent_dist
  hparams.add_hparam("level_prior_scale", "prev_level")
  hparams.add_hparam("n_levels", 3)
  hparams.add_hparam("n_bits_x", 8)
  hparams.add_hparam("depth", 32)
  hparams.add_hparam("affine_coupling_width", 512)
  hparams.add_hparam("top_prior", "single_conv")
  return hparams

def transformer_symshard_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 256
  hparams.batch_size = 2048
  hparams.max_length = 0
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.layer_prepostprocess_dropout = 0.2
  hparams.add_hparam("attention_dropout", 0.1)
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("relu_dropout_broadcast_dims", "1")
  hparams.layer_prepostprocess_dropout = 0.1
  hparams.layer_prepostprocess_dropout_broadcast_dims = "1"  # length
  hparams.label_smoothing = 0.1
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "rsqrt_decay"
  hparams.learning_rate_warmup_steps = 10000
  hparams.initializer_gain = 1.0
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  # TODO(noam): use this to control sharing. We now share always
  hparams.shared_embedding_and_softmax_weights = True
  # we only want one data shard.
  hparams.no_data_parallelism = True
  # bypass the symbol modality so that we can use model parallelism.
  hparams.modality = {
      "inputs": modalities.IdentitySymbolModality,
      "targets": modalities.IdentitySymbolModality,
  }
  hparams.add_hparam("filter_size", 1280)
  hparams.add_hparam("mix_fraction", 0.5)
  # attention-related flags
  hparams.add_hparam("multihead_attention_num_heads", 4)
  hparams.add_hparam("multihead_attention_key_channels", 0)
  hparams.add_hparam("multihead_attention_value_channels", 0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam(
      "encoder_layers", ("n,att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d")
  hparams.add_hparam(
      "decoder_layers",
      ("n,att,m,d,a," "n,enc-att,m,d,a," "n,ffn,m,d,a,") * 6 + "n,d")
  # Number of model shards - each one has separate parameters.
  # Changing this number invalidates checkpoints.
  hparams.add_hparam("num_model_shards", 8)
  return hparams

def mtf_image_transformer_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.no_data_parallelism = True
  hparams.use_fixed_batch_size = True
  hparams.batch_size = 1
  hparams.max_length = 3072
  hparams.hidden_size = 256
  hparams.label_smoothing = 0.0
  # 8-way model-parallelism
  hparams.add_hparam("mesh_shape", "batch:8")
  hparams.add_hparam("layout", "batch:batch")
  hparams.add_hparam("mtf_mode", True)
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("filter_size", 1024)
  hparams.add_hparam("num_encoder_layers", 0)
  hparams.add_hparam("num_decoder_layers", 6)
  hparams.add_hparam("attention_key_size", 256)
  hparams.add_hparam("attention_value_size", 256)
  # Share weights between input and target embeddings
  hparams.shared_embedding = True

  # mixture of experts hparams
  hparams.add_hparam("ffn_layer", "dense_relu_dense")
  hparams.add_hparam("moe_overhead_train", 1.0)
  hparams.add_hparam("moe_overhead_eval", 2.0)
  hparams.moe_num_experts = 16
  hparams.moe_loss_coef = 1e-3

  hparams.shared_embedding_and_softmax_weights = True
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "rsqrt_decay"
  hparams.learning_rate_warmup_steps = 10000
  hparams.add_hparam("d_kv", 64)
  hparams.add_hparam("d_ff", 2048)

  # Image related hparams
  hparams.add_hparam("img_len", 32)
  hparams.add_hparam("num_channels", 3)
  hparams.add_hparam("unconditional", True)

  # Local Attention related params
  hparams.add_hparam("block_length", 128)
  hparams.add_hparam("block_height", 16)
  hparams.add_hparam("block_width", 16)
  hparams.add_hparam("attention_type", "local1d")
  return hparams

def revnet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.add_hparam('num_channels_first', [64, 128, 256, 416])
  hparams.add_hparam('num_channels_second', [256, 512, 1024, 1664])
  hparams.add_hparam('num_layers_per_block', [1, 1, 10, 1])
  hparams.add_hparam('first_batch_norm', [False, True, True, True])
  hparams.add_hparam('strides', [1, 2, 2, 2])
  hparams.add_hparam('num_channels_init_block', 32)
  hparams.add_hparam('dim', '2d')

  hparams.optimizer = 'Momentum'
  hparams.learning_rate = 0.01
  hparams.weight_decay = 1e-4
  # Can run with a batch size of 128 with Problem ImageImagenet224
  hparams.tpu_batch_size_per_shard = 128
  return hparams

def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = True
  # According to noam, ("n", "da") seems better for harder-to-learn models
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"

  # Hparams used by transformer_prepare_decoder() function
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("causal_decoder_self_attention", True)

  hparams = common_attention.add_standard_attention_hparams(hparams)

  # Decoder layers type. If set, the num_decoder_layers parameter will be
  # ignored and the number of decoder layers will be deduced from the string.
  # See top file comment for an example of usage.
  hparams.add_hparam("layer_types", "")
  # Default attention type (ex: a, loc, red,...) and feed-forward type (ex: fc,
  # sep, moe,...)
  hparams.add_hparam("default_att", "a")
  hparams.add_hparam("default_ff", "fc")
  return hparams

def multimodel_base():
  """Base parameters for MultiModel."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  hparams.batch_size = 2048
  hparams.num_hidden_layers = 4
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.dropout = 0.1
  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
  hparams.add_hparam("large_kernel_size", 15)
  hparams.add_hparam("attention_dropout", 0.1)
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("moe_layers", "2")
  hparams.moe_num_experts = 30
  return hparams

def mtf_resnet_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.no_data_parallelism = True
  hparams.use_fixed_batch_size = True
  hparams.batch_size = 32
  hparams.max_length = 3072
  hparams.hidden_size = 256
  hparams.label_smoothing = 0.0
  # 8-way model-parallelism
  hparams.add_hparam("mesh_shape", "batch:8")
  hparams.add_hparam("layout", "batch:batch")
  hparams.add_hparam("filter_size", 1024)
  hparams.add_hparam("num_layers", 6)
  # Share weights between input and target embeddings
  hparams.shared_embedding = True

  hparams.shared_embedding_and_softmax_weights = True
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_schedule = "rsqrt_decay"
  hparams.learning_rate_warmup_steps = 10000
  hparams.add_hparam("d_kv", 32)

  # Image related hparams
  hparams.add_hparam("img_len", 32)
  hparams.add_hparam("num_channels", 3)
  hparams.add_hparam("row_blocks", 1)
  hparams.add_hparam("col_blocks", 1)
  hparams.add_hparam("rows_size", 32)
  hparams.add_hparam("cols_size", 32)

  # Model-specific parameters
  hparams.add_hparam("layer_sizes", [3, 4, 6, 3])
  hparams.add_hparam("filter_sizes", [64, 64, 128, 256, 512])
  hparams.add_hparam("is_cifar", False)

  # Variable init
  hparams.initializer = "normal_unit_scaling"
  hparams.initializer_gain = 2.

  # TODO(nikip): Change optimization scheme?
  hparams.learning_rate = 0.1
  return hparams

def sliced_gan():
  """Basic parameters for a sliced_gan."""
  hparams = common_hparams.basic_params1()
  hparams.optimizer = "Adam"
  hparams.learning_rate_constant = 0.0002
  hparams.learning_rate_warmup_steps = 500
  hparams.learning_rate_schedule = "constant * linear_warmup"
  hparams.label_smoothing = 0.0
  hparams.batch_size = 128
  hparams.hidden_size = 128
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 1e-6
  hparams.kernel_height = 4
  hparams.kernel_width = 4
  hparams.bottleneck_bits = 128
  hparams.add_hparam("discriminator_batchnorm", True)
  hparams.add_hparam("num_sliced_vecs", 4096)
  return hparams

def autoencoder_basic():
  """Basic autoencoder model."""
  hparams = common_hparams.basic_params1()
  hparams.optimizer = "Adam"
  hparams.learning_rate_constant = 0.0002
  hparams.learning_rate_warmup_steps = 500
  hparams.learning_rate_schedule = "constant * linear_warmup"
  hparams.label_smoothing = 0.0
  hparams.batch_size = 128
  hparams.hidden_size = 64
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.initializer_gain = 1.0
  hparams.weight_decay = 0.0
  hparams.kernel_height = 4
  hparams.kernel_width = 4
  hparams.dropout = 0.05
  hparams.add_hparam("max_hidden_size", 1024)
  hparams.add_hparam("bottleneck_bits", 128)
  hparams.add_hparam("bottleneck_shared_bits", 0)
  hparams.add_hparam("bottleneck_shared_bits_start_warmup", 0)
  hparams.add_hparam("bottleneck_shared_bits_stop_warmup", 0)
  hparams.add_hparam("bottleneck_noise", 0.1)
  hparams.add_hparam("bottleneck_warmup_steps", 2000)
  hparams.add_hparam("sample_height", 32)
  hparams.add_hparam("sample_width", 32)
  hparams.add_hparam("discriminator_batchnorm", True)
  hparams.add_hparam("num_sliced_vecs", 20000)
  hparams.add_hparam("sliced_do_tanh", int(True))
  hparams.add_hparam("discriminator_size", 256)
  hparams.add_hparam("discriminator_kernel_size", 6)
  hparams.add_hparam("discriminator_strides", 4)
  hparams.add_hparam("discriminator_pure_mean", int(False))
  hparams.add_hparam("code_loss_factor", 1.0)
  hparams.add_hparam("gan_codes_warmup_steps", 16000)
  hparams.add_hparam("gan_loss_factor", 0.0)
  hparams.add_hparam("bottleneck_l2_factor", 0.05)
  hparams.add_hparam("gumbel_temperature", 0.5)
  hparams.add_hparam("gumbel_noise_factor", 0.5)
  hparams.add_hparam("vq_temperature", 0.001)
  hparams.add_hparam("use_vq_loss", int(False))
  hparams.add_hparam("discriminator", "double")
  return hparams

def super_lm_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.hidden_size = 512
  hparams.moe_hidden_sizes = "512"
  hparams.batch_size = 16384
  hparams.max_length = 0
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.layer_prepostprocess_dropout = 0.0
  hparams.symbol_dropout = 0.1
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.label_smoothing = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer = "Adafactor"
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 8000
  hparams.initializer_gain = 1.0
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.shared_embedding_and_softmax_weights = False
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"
  # we only want one data shard.
  hparams.no_data_parallelism = True
  # bypass the symbol modality so that we can use model parallelism.
  hparams.target_modality = "symbol:identity"
  hparams.add_hparam("filter_size", 512)
  hparams.add_hparam("mix_fraction", 0.5)
  # attention-related flags
  hparams.add_hparam("multihead_attention_num_heads", 4)
  hparams.add_hparam("multihead_attention_key_channels", 0)
  hparams.add_hparam("multihead_attention_value_channels", 0)
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam(
      "layers", ("n,att,m,d,a," "n,ffn,m,d,a,") * 4 + "n,ffn,d")
  # Number of model shards - each one has separate parameters.
  # Changing this number invalidates checkpoints.
  hparams.add_hparam("num_model_shards", 8)
  hparams.add_hparam("diet_experts", False)
  return hparams

def testSymbolModalityInputs(self):
  batch_size = 10
  num_datashards = 5
  length = 5
  vocab_size = 5000
  hidden_size = 9
  model_hparams = common_hparams.basic_params1()
  model_hparams.hidden_size = hidden_size
  model_hparams.mode = tf.estimator.ModeKeys.TRAIN
  x = -1 + np.random.random_integers(
      vocab_size, size=(batch_size, length, 1, 1))
  m = modalities.SymbolModality(model_hparams, vocab_size)
  data_parallelism = expert_utils.Parallelism(
      ["/device:CPU:0"] * num_datashards)
  xs = tf.split(x, num_datashards)
  sharded_output = m.bottom_sharded(xs, data_parallelism)
  output = tf.concat(sharded_output, 0)
  self.evaluate(tf.global_variables_initializer())
  res = self.evaluate(output)
  self.assertEqual(res.shape, (batch_size, length, 1, hidden_size))

def ppo_base_v1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.learning_rate = 1e-4
  hparams.add_hparam("init_mean_factor", 0.1)
  hparams.add_hparam("init_logstd", 0.1)
  hparams.add_hparam("policy_layers", (100, 100))
  hparams.add_hparam("value_layers", (100, 100))
  hparams.add_hparam("num_agents", 30)
  hparams.add_hparam("clipping_coef", 0.2)
  hparams.add_hparam("gae_gamma", 0.99)
  hparams.add_hparam("gae_lambda", 0.95)
  hparams.add_hparam("entropy_loss_coef", 0.01)
  hparams.add_hparam("value_loss_coef", 1)
  hparams.add_hparam("optimization_epochs", 15)
  hparams.add_hparam("epoch_length", 200)
  hparams.add_hparam("epochs_num", 2000)
  hparams.add_hparam("eval_every_epochs", 10)
  hparams.add_hparam("num_eval_agents", 3)
  hparams.add_hparam("video_during_eval", True)
  return hparams

def gene_expression_conv_base():
  """Hparams for GeneExpressionConv model."""
  hparams = common_hparams.basic_params1()

  batch_size = 10
  output_length = 2048
  inputs_per_output = 128
  chunk_size = 4
  input_length = output_length * inputs_per_output // chunk_size
  hparams.batch_size = input_length * batch_size

  hparams.dropout = 0.1
  hparams.add_hparam("num_conv_layers", 4)
  hparams.add_hparam("num_dconv_layers", 7)
  # The product of these pooling windows should match
  # input_length/target_length.
  hparams.add_hparam("pooling_windows", [2, 2, 2, 4])

  hparams.hidden_size = 256
  hparams.kernel_width = 20
  hparams.add_hparam("stride", 1)
  return hparams

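# A quick sanity check of the sizes above (not part of the original snippet):
# the comment says the product of the pooling windows should match
# input_length / target_length, and indeed
#   input_length / output_length = inputs_per_output / chunk_size = 128 / 4 = 32
#   2 * 2 * 2 * 4 = 32
# so pooling_windows = [2, 2, 2, 4] is consistent with the values chosen above.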