def dummy_config(model_arch):
  """Creates a dummy model config that can be used by all tests."""
  config = base_config.get_config()
  config.model_arch = model_arch.name
  config.d_emb = 2
  config.d_model = 2
  config.d_ff = 2
  config.max_seq_length = 4
  config.num_heads = 1
  config.num_layers = 2
  config.vocab_size = 16
  config.pad_id = 0
  config.train_batch_size = 3
  config.eval_batch_size = 2
  config.use_fft = True
  config.num_experts = 2
  config.num_moe_layers = 0
  config.num_attention_layers = 0
  config.max_group_size = 2
  config.auxiliary_loss_factor = 0.01
  config.router_z_loss_factor = 0.01
  config.dispatch_algorithm = DispatchAlgorithm.MASK_TOKENS_CHOOSE
  config.dtype = jnp.float32
  return config
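# Hedged illustration (not part of the source tests): a minimal sketch of how a
# unit test might consume dummy_config above, assuming ModelArchitecture and
# DispatchAlgorithm are in scope as in the surrounding snippets.
def test_dummy_config_is_tiny(self):
  config = dummy_config(ModelArchitecture.LINEAR)
  # Every dimension is kept tiny so unit tests construct models quickly.
  self.assertEqual(config.d_model, 2)
  self.assertEqual(config.num_layers, 2)
  self.assertEqual(config.dispatch_algorithm,
                   DispatchAlgorithm.MASK_TOKENS_CHOOSE)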
def test_validate_incorrect_configs(self):
  config = default_config.get_config()
  config.train_batch_size = 6
  config.gradient_accum_steps = 4
  with self.assertRaisesRegex(
      ValueError,
      "training batch size must be divisible by gradient_accum_steps"):
    train_utils.validate_config(config)
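# Hedged sketch of the divisibility rule exercised by the test above; the real
# train_utils.validate_config may perform additional checks, and this helper
# name is hypothetical.
def _check_gradient_accumulation(config):
  # In the test above, 6 % 4 != 0, so validation fails; train_batch_size = 8
  # with gradient_accum_steps = 4 would pass.
  if config.train_batch_size % config.gradient_accum_steps != 0:
    raise ValueError(
        "training batch size must be divisible by gradient_accum_steps")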
def get_config(): """Config for pre-training.""" config = base_config.get_config() # Determines which model to use. config.model_arch: str = ModelArchitecture.LINEAR.name config.mode: TrainingMode = TrainingMode.PRETRAINING # Total batch size for training. config.train_batch_size: int = 64 # Total batch size for eval. config.eval_batch_size: int = 64 # The base learning rate for Adam. config.learning_rate: float = 1e-4 # Number of training steps. config.num_train_steps: int = int(1e6) # Number of warm-up steps. We generally find that that larger models need more # warm-up steps. config.num_warmup_steps: int = int(1e4) # How often to save the model checkpoint. config.save_checkpoints_steps: int = 20000 # Frequency fo eval during training, e.g. every 2000 steps. config.eval_frequency: int = 2000 # Maximum number of eval steps. config.max_num_eval_steps: int = 1000 # Do not start from a pre-trained checkpoint. config.init_checkpoint_dir: str = "" # Maximum number of masked LM predictions per sequence. config.max_predictions_per_seq: int = 80 # Proportion of tokens for masked LM predictions. Total number of selected # tokens will be at most config.max_predictions_per_seq. config.masking_rate: float = 0.15 # Proportion of masked tokens to replace with ['MASK']. config.mask_token_proportion: float = 0.8 # Proportion of masked tokens to replace with a random token. config.random_token_proportion: float = 0.1 # Remaining 1 - config.mask_token_proportion - config.random_token_proportion # fraction of selected tokens are left as is. # Measure the step speed. config.measure_step_speed: bool = False # Dummy attribute for repeated runs. config.trial: int = 0 return config
def get_config(): """Config for fine-tuning (classification).""" config = base_config.get_config() # Determines which model to use. config.model_arch: str = ModelArchitecture.LINEAR.name config.mode: TrainingMode = TrainingMode.CLASSIFICATION # Available fine-tuning tasks are "glue/DS_g", "super_glue/DS_sg", # where "DS_g" is one of the following: # [cola, sst2, mrpc, qqp, stsb, mnli, qnli, rte], # and "DS_sg" is one of the following: # [boolq, cb, copa, multirc, record, rte, wic]. config.dataset_name: str = "glue/rte" # How often to save the model checkpoint. config.save_checkpoints_steps: int = 1000 # Training metrics will be computed (1 / eval_proportion) times during # training at regularly spaced intervals, regardless of dataset size. config.eval_proportion: float = 0.1 # Total batch size for training. config.train_batch_size: int = 64 # Total batch size for eval (and predictions). config.eval_batch_size: int = 64 # The base learning rate for Adam. config.learning_rate: float = 1e-5 # Total number of training epochs to perform. config.num_train_epochs: int = 5 # Proportion of training to perform linear learning rate warmup for. # E.g., 0.1 = 10% of training steps. config.warmup_proportion: float = 0.1 # Maximum number of eval steps on validation split. Actual number of step may # be less for small eval datasets. config.max_num_eval_steps: int = int(1e5) # For fine-tuning Mixture of Experts models, it is often beneficial to have a # larger dropout rate for the individual experts. config.expert_dropout_rate: float = 0.2 # Initial checkpoint directory or filepath (usually from a pre-trained model). config.init_checkpoint_dir: str = "" # Dummy attribute for repeated runs. config.trial: int = 0 return config
def frozen_config(sharded_params=False):
  """Creates a dummy model config that can be used by all tests."""
  config = default_config.get_config()
  config.model_arch = default_config.ModelArchitecture.LINEAR.name
  config.num_attention_layers = 0
  config.d_emb = 4
  config.d_model = 4
  config.d_ff = 4
  config.max_seq_length = 8
  config.num_layers = 1
  config.vocab_size = 16
  config.train_batch_size = 4
  config.dtype = jnp.float32
  config.pad_id = 3
  # MoE layers contain sharded parameters.
  config.num_moe_layers = 1 if sharded_params else 0
  config.num_experts = 1 if sharded_params else 0
  config.auxiliary_loss_factor = 0.01
  config.router_z_loss_factor = 0.01
  return ml_collections.FrozenConfigDict(config)
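# Hedged usage sketch (not part of the source tests): frozen_config above
# returns an immutable ml_collections.FrozenConfigDict, and sharded_params
# toggles the single MoE layer on. The test name is hypothetical.
def test_frozen_config_sharded_params(self):
  self.assertEqual(frozen_config(sharded_params=False).num_moe_layers, 0)
  config = frozen_config(sharded_params=True)
  self.assertEqual(config.num_moe_layers, 1)
  # FrozenConfigDict rejects mutation after construction.
  with self.assertRaises(AttributeError):
    config.num_moe_layers = 2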
def test_validate_correct_config(self):
  config = default_config.get_config()
  train_utils.validate_config(config)