class ProjectAggregateCombinerConfig(BaseCombinerConfig):
    """Parameters for project aggregate combiner."""

    projection_size: int = schema_utils.PositiveInteger(
        default=128,
        description="All combiner inputs are projected to this size before being aggregated.",
    )
    fc_layers: Optional[List[Dict[str, Any]]] = schema_utils.DictList(
        description=(
            "Full specification of the fully connected layers after the aggregation. "
            "It should be a list of dicts, each dict representing one layer."
        )
    )
    num_fc_layers: int = schema_utils.NonNegativeInteger(
        default=2, description="Number of fully connected layers after aggregation."
    )
    output_size: int = schema_utils.PositiveInteger(
        default=128, description="Output size of each layer of the stack of fully connected layers."
    )
    use_bias: bool = schema_utils.Boolean(default=True, description="Whether the layers use a bias vector.")
    weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(
        default="xavier_uniform",
        description="Initializer to use for the weights of the projection and for the fully connected layers.",
    )
    bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(
        default="zeros",
        description="Initializer to use for the bias of the projection and for the fully connected layers.",
    )
    norm: Optional[str] = schema_utils.StringOptions(
        ["batch", "layer"],
        default="layer",
        description="Normalization to apply to each projection and fully connected layer.",
    )
    norm_params: Optional[dict] = schema_utils.Dict(
        description="Parameters of the normalization to apply to each projection and fully connected layer."
    )
    activation: str = schema_utils.ActivationOptions(
        default="relu", description="Activation to apply to each fully connected layer."
    )
    dropout: float = schema_utils.FloatRange(
        default=0.0, min=0, max=1, description="Dropout rate to apply to each fully connected layer."
    )
    residual: bool = schema_utils.Boolean(
        default=True,
        description="Whether to add residual skip connections between the fully connected layers in the stack.",
    )
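
# Illustrative sketch (not part of the schema): how the parameters above might be
# supplied in a model config dict. The "type" key and its "project_aggregate" value
# are assumptions about the registry name under which this combiner is exposed; the
# remaining keys come directly from the fields of ProjectAggregateCombinerConfig.
_EXAMPLE_PROJECT_AGGREGATE_COMBINER = {
    "type": "project_aggregate",  # assumed registry name
    "projection_size": 64,
    "num_fc_layers": 3,
    "output_size": 128,
    "norm": "layer",
    "dropout": 0.1,
    "residual": True,
}
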
class ConcatCombinerConfig(BaseCombinerConfig):
    """Parameters for concat combiner."""

    fc_layers: Optional[List[Dict[str, Any]]] = schema_utils.DictList(description="")
    num_fc_layers: int = schema_utils.NonNegativeInteger(default=0, description="")
    output_size: int = schema_utils.PositiveInteger(
        default=256, description="Output size of a fully connected layer."
    )
    use_bias: bool = schema_utils.Boolean(default=True, description="Whether the layer uses a bias vector.")
    weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(default="xavier_uniform", description="")
    bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(default="zeros", description="")
    norm: Optional[str] = schema_utils.StringOptions(["batch", "layer"], description="")
    norm_params: Optional[dict] = schema_utils.Dict(description="")
    activation: str = schema_utils.ActivationOptions(default="relu", description="")
    dropout: float = schema_utils.FloatRange(default=0.0, min=0, max=1, description="")
    flatten_inputs: bool = schema_utils.Boolean(
        default=False, description="Whether to flatten input tensors to a vector."
    )
    residual: bool = schema_utils.Boolean(
        default=False,
        description=(
            "Whether to add a residual connection to each fully connected layer block. All fully connected layers "
            "must have the same size."
        ),
    )
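
# Illustrative sketch: a concat combiner entry exercising the fields above. The
# "type": "concat" key is an assumption about the registry name under which this
# config is exposed; all other keys map directly to fields of ConcatCombinerConfig.
_EXAMPLE_CONCAT_COMBINER = {
    "type": "concat",  # assumed registry name
    "num_fc_layers": 2,
    "output_size": 512,
    "flatten_inputs": False,
    "residual": True,  # requires all fully connected layers to have the same size
}
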
class ComparatorCombinerConfig(BaseCombinerConfig):
    """Parameters for comparator combiner."""

    entity_1: List[str]
    """TODO: Document parameters."""

    entity_2: List[str]
    """TODO: Document parameters."""

    fc_layers: Optional[List[Dict[str, Any]]] = schema_utils.DictList(description="")
    num_fc_layers: int = schema_utils.NonNegativeInteger(default=1, description="")
    output_size: int = schema_utils.PositiveInteger(
        default=256, description="Output size of a fully connected layer."
    )
    use_bias: bool = schema_utils.Boolean(default=True, description="Whether the layer uses a bias vector.")
    weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(default="xavier_uniform", description="")
    bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(default="zeros", description="")
    norm: Optional[str] = schema_utils.StringOptions(["batch", "layer"], description="")
    norm_params: Optional[dict] = schema_utils.Dict(description="")
    activation: str = schema_utils.ActivationOptions(default="relu", description="")
    dropout: float = schema_utils.FloatRange(
        default=0.0, min=0, max=1, description="Dropout rate for the fully connected layers."
    )
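
# Illustrative sketch: a comparator combiner entry. entity_1 and entity_2 are the two
# required list-of-string parameters declared above (still marked TODO in the
# docstrings); the feature names used here are placeholders, and "type": "comparator"
# is an assumed registry name.
_EXAMPLE_COMPARATOR_COMBINER = {
    "type": "comparator",  # assumed registry name
    "entity_1": ["feature_a", "feature_b"],  # placeholder feature names
    "entity_2": ["feature_c"],
    "num_fc_layers": 1,
    "output_size": 256,
}
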
class ECDTrainerConfig(BaseTrainerConfig):
    """Dataclass that configures most of the hyperparameters used for ECD model training."""

    type: str = schema_utils.StringOptions(
        ["trainer", "ray_legacy_trainer"],
        default="trainer",
        description=(
            "Trainer to use for training the model. Must be one of ['trainer', 'ray_legacy_trainer'] - "
            "corresponds to name in `ludwig.trainers.registry.(ray_)trainers_registry` (default: 'trainer')"
        ),
        allow_none=False,
    )
    optimizer: BaseOptimizerConfig = OptimizerDataclassField(
        default={"type": "adam"}, description="Parameter values for selected torch optimizer."
    )
    epochs: int = schema_utils.PositiveInteger(
        default=100,
        description="Number of epochs the algorithm is intended to be run over.",
        parameter_metadata=TRAINER_METADATA["epochs"],
    )
    train_steps: int = schema_utils.PositiveInteger(
        default=None,
        allow_none=True,
        description=(
            "Maximum number of training steps the algorithm is intended to be run over. "
            "If unset, then `epochs` is used to determine training length."
        ),
        parameter_metadata=TRAINER_METADATA["train_steps"],
    )
    regularization_lambda: float = schema_utils.FloatRange(
        default=0.0,
        min=0,
        description="Strength of the $L2$ regularization.",
        parameter_metadata=TRAINER_METADATA["regularization_lambda"],
    )
    regularization_type: Optional[str] = schema_utils.RegularizerOptions(
        default="l2", description="Type of regularization."
    )
    should_shuffle: bool = schema_utils.Boolean(
        default=True,
        description="Whether to shuffle batches during training.",
        parameter_metadata=TRAINER_METADATA["should_shuffle"],
    )
    batch_size: Union[int, str] = schema_utils.IntegerOrAutoField(
        default=128,
        default_numeric=128,
        allow_none=False,
        min_exclusive=0,
        description=(
            "The number of training examples utilized in one training step of the model. If 'auto', the "
            "biggest batch size (power of 2) that can fit in memory will be used."
        ),
        parameter_metadata=TRAINER_METADATA["batch_size"],
    )
    steps_per_checkpoint: int = schema_utils.NonNegativeInteger(
        default=0,
        description=(
            "How often the model is checkpointed. Also dictates maximum evaluation frequency. If 0 the model is "
            "checkpointed after every epoch."
        ),
        parameter_metadata=TRAINER_METADATA["steps_per_checkpoint"],
    )
    checkpoints_per_epoch: int = schema_utils.NonNegativeInteger(
        default=0,
        description=(
            "Number of checkpoints per epoch. For example, 2 -> checkpoints are written every half of an epoch. Note "
            "that it is invalid to specify both non-zero `steps_per_checkpoint` and non-zero `checkpoints_per_epoch`."
        ),
        parameter_metadata=TRAINER_METADATA["checkpoints_per_epoch"],
    )
    reduce_learning_rate_on_plateau: float = schema_utils.FloatRange(
        default=0.0,
        min=0.0,
        max=1.0,
        description=(
            "Reduces the learning rate when the algorithm hits a plateau (i.e. the performance on the validation "
            "set does not improve)."
        ),
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_on_plateau"],
    )
    reduce_learning_rate_on_plateau_patience: int = schema_utils.NonNegativeInteger(
        default=5,
        description="How many epochs have to pass before the learning rate reduces.",
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_on_plateau_patience"],
    )
    reduce_learning_rate_on_plateau_rate: float = schema_utils.FloatRange(
        default=0.5,
        min=0.0,
        max=1.0,
        description="Rate at which we reduce the learning rate.",
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_on_plateau_rate"],
    )
    reduce_learning_rate_eval_metric: str = schema_utils.String(
        default=LOSS,
        description="Which metric to listen on for reducing the learning rate.",
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_eval_metric"],
    )
    reduce_learning_rate_eval_split: str = schema_utils.String(
        default=TRAINING,
        description="Which dataset split to listen on for reducing the learning rate.",
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_eval_split"],
    )
    increase_batch_size_on_plateau: int = schema_utils.NonNegativeInteger(
        default=0,
        description="Number to increase the batch size by on a plateau.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_on_plateau"],
    )
    increase_batch_size_on_plateau_patience: int = schema_utils.NonNegativeInteger(
        default=5,
        description="How many epochs to wait for before increasing the batch size.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_on_plateau_patience"],
    )
    increase_batch_size_on_plateau_rate: float = schema_utils.NonNegativeFloat(
        default=2.0,
        description="Rate at which the batch size increases.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_on_plateau_rate"],
    )
    increase_batch_size_on_plateau_max: int = schema_utils.PositiveInteger(
        default=512,
        description="Maximum size of the batch.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_on_plateau_max"],
    )
    increase_batch_size_eval_metric: str = schema_utils.String(
        default=LOSS,
        description="Which metric to listen on for increasing the batch size.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_eval_metric"],
    )
    increase_batch_size_eval_split: str = schema_utils.String(
        default=TRAINING,
        description="Which dataset split to listen on for increasing the batch size.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_eval_split"],
    )
    decay: bool = schema_utils.Boolean(
        default=False,
        description="Turn on exponential decay of the learning rate.",
        parameter_metadata=TRAINER_METADATA["decay"],
    )
    decay_steps: int = schema_utils.PositiveInteger(
        default=10000,
        description="The number of steps to take in the exponential learning rate decay.",
        parameter_metadata=TRAINER_METADATA["decay_steps"],
    )
    decay_rate: float = schema_utils.FloatRange(
        default=0.96,
        min=0.0,
        max=1.0,
        description="Decay per epoch (%): factor by which to decrease the learning rate.",
        parameter_metadata=TRAINER_METADATA["decay_rate"],
    )
    staircase: bool = schema_utils.Boolean(
        default=False,
        description="Decays the learning rate at discrete intervals.",
        parameter_metadata=TRAINER_METADATA["staircase"],
    )
    gradient_clipping: Optional[GradientClippingConfig] = GradientClippingDataclassField(
        description="Parameter values for gradient clipping.",
        default={},
    )
    learning_rate_warmup_epochs: float = schema_utils.NonNegativeFloat(
        default=1.0,
        description="Number of epochs to warm up the learning rate for.",
        parameter_metadata=TRAINER_METADATA["learning_rate_warmup_epochs"],
    )
    learning_rate_scaling: str = schema_utils.StringOptions(
        ["constant", "sqrt", "linear"],
        default="linear",
        description=(
            "Scale by which to increase the learning rate as the number of distributed workers increases. "
            "Traditionally the learning rate is scaled linearly with the number of workers to reflect the proportion "
            "by which the effective batch size is increased. For very large batch sizes, a softer square-root scale "
            "can sometimes lead to better model performance. If the learning rate is hand-tuned for a given number "
            "of workers, setting this value to constant can be used to disable scale-up."
        ),
        parameter_metadata=TRAINER_METADATA["learning_rate_scaling"],
    )
class GBMTrainerConfig(BaseTrainerConfig):
    """Dataclass that configures most of the hyperparameters used for GBM model training."""

    type: str = schema_utils.StringOptions(
        ["lightgbm_trainer"],
        default="lightgbm_trainer",
        description=(
            "Trainer to use for training the model. Must be one of ['lightgbm_trainer'] - "
            "corresponds to name in `ludwig.trainers.registry.(ray_)trainers_registry` "
            "(default: 'lightgbm_trainer')"
        ),
        allow_none=False,
    )

    # LightGBM core parameters (https://lightgbm.readthedocs.io/en/latest/Parameters.html)
    boosting_type: str = schema_utils.StringOptions(
        ["gbdt", "rf", "dart", "goss"],
        default="gbdt",
        description="Type of boosting algorithm to use with GBM trainer.",
    )
    tree_learner: str = schema_utils.StringOptions(
        ["serial", "feature", "data", "voting"],
        default="serial",
        description="Type of tree learner to use with GBM trainer.",
    )
    num_boost_round: int = schema_utils.PositiveInteger(
        default=100, description="Number of boosting rounds to perform with GBM trainer."
    )
    num_leaves: int = schema_utils.PositiveInteger(
        default=31, description="Number of leaves to use in the tree with GBM trainer."
    )

    # LightGBM learning control parameters
    max_depth: int = schema_utils.Integer(
        default=-1,
        description="Maximum depth of a tree in the GBM trainer. A negative value means no limit.",
    )
    min_data_in_leaf: int = schema_utils.PositiveInteger(
        default=20, description="Minimum number of data points in a leaf with GBM trainer."
    )
    min_sum_hessian_in_leaf: float = schema_utils.NonNegativeFloat(
        default=1e-3, description="Minimum sum of Hessians in a leaf with GBM trainer."
    )
    bagging_fraction: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of data to use for bagging with GBM trainer."
    )
    pos_bagging_fraction: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of positive data to use for bagging with GBM trainer."
    )
    neg_bagging_fraction: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of negative data to use for bagging with GBM trainer."
    )
    bagging_freq: int = schema_utils.NonNegativeInteger(
        default=0, description="Frequency of bagging with GBM trainer."
    )
    bagging_seed: int = schema_utils.Integer(default=3, description="Random seed for bagging with GBM trainer.")
    feature_fraction: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of features to use in the GBM trainer."
    )
    feature_fraction_bynode: float = schema_utils.FloatRange(
        default=1.0,
        min=0.0,
        max=1.0,
        description="Fraction of features to use for each tree node with GBM trainer.",
    )
    feature_fraction_seed: int = schema_utils.Integer(
        default=2, description="Random seed for feature fraction with GBM trainer."
    )
    extra_trees: bool = schema_utils.Boolean(
        default=False, description="Whether to use extremely randomized trees in the GBM trainer."
    )
    extra_seed: int = schema_utils.Integer(
        default=6, description="Random seed for extremely randomized trees in the GBM trainer."
    )
    max_delta_step: float = schema_utils.FloatRange(
        default=0.0,
        min=0.0,
        max=1.0,
        description=(
            "Used to limit the max output of tree leaves in the GBM trainer. A negative value means no constraint."
        ),
    )
    lambda_l1: float = schema_utils.NonNegativeFloat(
        default=0.0, description="L1 regularization factor for the GBM trainer."
    )
    lambda_l2: float = schema_utils.NonNegativeFloat(
        default=0.0, description="L2 regularization factor for the GBM trainer."
    )
    linear_lambda: float = schema_utils.NonNegativeFloat(
        default=0.0, description="Linear tree regularization in the GBM trainer."
    )
    min_gain_to_split: float = schema_utils.NonNegativeFloat(
        default=0.0, description="Minimum gain to split a leaf in the GBM trainer."
    )
    drop_rate: float = schema_utils.FloatRange(
        default=0.1,
        min=0.0,
        max=1.0,
        description="Dropout rate for the GBM trainer. Used only with boosting_type 'dart'.",
    )
    max_drop: int = schema_utils.Integer(
        default=50,
        description=(
            "Maximum number of dropped trees during one boosting iteration. "
            "Used only with boosting_type 'dart'. A negative value means no limit."
        ),
    )
    skip_drop: float = schema_utils.FloatRange(
        default=0.5,
        min=0.0,
        max=1.0,
        description=(
            "Probability of skipping the dropout during one boosting iteration. Used only with boosting_type 'dart'."
        ),
    )
    xgboost_dart_mode: bool = schema_utils.Boolean(
        default=False,
        description="Whether to use xgboost dart mode in the GBM trainer. Used only with boosting_type 'dart'.",
    )
    uniform_drop: bool = schema_utils.Boolean(
        default=False,
        description="Whether to use uniform dropout in the GBM trainer. Used only with boosting_type 'dart'.",
    )
    drop_seed: int = schema_utils.Integer(
        default=4,
        description="Random seed to choose dropping models in the GBM trainer. Used only with boosting_type 'dart'.",
    )
    top_rate: float = schema_utils.FloatRange(
        default=0.2,
        min=0.0,
        max=1.0,
        description="The retain ratio of large gradient data in the GBM trainer. Used only with boosting_type 'goss'.",
    )
    other_rate: float = schema_utils.FloatRange(
        default=0.1,
        min=0.0,
        max=1.0,
        description="The retain ratio of small gradient data in the GBM trainer. Used only with boosting_type 'goss'.",
    )
    min_data_per_group: int = schema_utils.PositiveInteger(
        default=100,
        description="Minimum number of data points per categorical group for the GBM trainer.",
    )
    max_cat_threshold: int = schema_utils.PositiveInteger(
        default=32,
        description="Number of split points considered for categorical features for the GBM trainer.",
    )
    cat_l2: float = schema_utils.NonNegativeFloat(
        default=10.0, description="L2 regularization factor for categorical split in the GBM trainer."
    )
    cat_smooth: float = schema_utils.NonNegativeFloat(
        default=10.0, description="Smoothing factor for categorical split in the GBM trainer."
    )
    max_cat_to_onehot: int = schema_utils.PositiveInteger(
        default=4,
        description="Maximum categorical cardinality required before one-hot encoding in the GBM trainer.",
    )
    cegb_tradeoff: float = schema_utils.NonNegativeFloat(
        default=1.0,
        description="Cost-effective gradient boosting multiplier for all penalties in the GBM trainer.",
    )
    cegb_penalty_split: float = schema_utils.NonNegativeFloat(
        default=0.0,
        description="Cost-effective gradient boosting penalty for splitting a node in the GBM trainer.",
    )
    path_smooth: float = schema_utils.NonNegativeFloat(
        default=0.0,
        description="Smoothing factor applied to tree nodes in the GBM trainer.",
    )
    verbose: int = schema_utils.IntegerRange(default=0, min=-1, max=2, description="Verbosity level for GBM trainer.")

    # LightGBM IO parameters
    max_bin: int = schema_utils.PositiveInteger(
        default=255, description="Maximum number of bins to use for discretizing features with GBM trainer."
    )
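
# Illustrative sketch: a trainer section for the GBM path, combining LightGBM core
# and learning-control parameters declared above. The flat-dict shape is an
# assumption about how the schema is consumed; keys and the "lightgbm_trainer" type
# come from GBMTrainerConfig, while the values are arbitrary examples.
_EXAMPLE_GBM_TRAINER = {
    "type": "lightgbm_trainer",
    "boosting_type": "dart",
    "num_boost_round": 200,
    "num_leaves": 63,
    "drop_rate": 0.1,  # only used with boosting_type 'dart'
    "max_drop": 50,  # only used with boosting_type 'dart'
    "feature_fraction": 0.8,
    "lambda_l2": 1.0,
}
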
class TabNetCombinerConfig(BaseCombinerConfig):
    """Parameters for tabnet combiner."""

    size: int = schema_utils.PositiveInteger(default=32, description="`N_a` in the paper.")
    output_size: int = schema_utils.PositiveInteger(
        default=128, description="Output size of a fully connected layer. `N_d` in the paper."
    )
    num_steps: int = schema_utils.NonNegativeInteger(
        default=3,
        description=(
            "Number of steps / repetitions of the attentive transformer and feature transformer computations. "
            "`N_steps` in the paper."
        ),
    )
    num_total_blocks: int = schema_utils.NonNegativeInteger(
        default=4, description="Total number of feature transformer blocks at each step."
    )
    num_shared_blocks: int = schema_utils.NonNegativeInteger(
        default=2, description="Number of shared feature transformer blocks across the steps."
    )
    relaxation_factor: float = schema_utils.FloatRange(
        default=1.5,
        description=(
            "Factor that influences how many times a feature should be used across the steps of computation. A "
            "value of 1 implies each feature should be used once; a higher value allows for multiple usages. "
            "`gamma` in the paper."
        ),
    )
    bn_epsilon: float = schema_utils.FloatRange(
        default=1e-3, description="Epsilon to be added to the batch norm denominator."
    )
    bn_momentum: float = schema_utils.FloatRange(
        default=0.05, description="Momentum of the batch norm. 1 - `m_B` from the TabNet paper."
    )
    bn_virtual_bs: Optional[int] = schema_utils.PositiveInteger(
        default=1024,
        allow_none=True,
        description=(
            "Size of the virtual batch size used by ghost batch norm. If null, regular batch norm is used instead. "
            "`B_v` from the TabNet paper."
        ),
    )
    sparsity: float = schema_utils.FloatRange(
        default=1e-4, description="Multiplier of the sparsity inducing loss. `lambda_sparse` in the paper."
    )
    entmax_mode: str = schema_utils.StringOptions(
        ["entmax15", "sparsemax", "constant", "adaptive"], default="sparsemax", description=""
    )
    entmax_alpha: float = schema_utils.FloatRange(
        default=1.5, min=1, max=2, description=""
    )  # 1 corresponds to softmax, 2 is sparsemax.
    dropout: float = schema_utils.FloatRange(
        default=0.05, min=0, max=1, description="Dropout rate for the transformer block."
    )
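
# Illustrative sketch: a tabnet combiner entry mapping to the fields above, with the
# paper's symbols noted where the descriptions mention them. "type": "tabnet" is an
# assumed registry name; the values are arbitrary examples.
_EXAMPLE_TABNET_COMBINER = {
    "type": "tabnet",  # assumed registry name
    "size": 32,  # N_a
    "output_size": 64,  # N_d
    "num_steps": 5,  # N_steps
    "relaxation_factor": 1.5,  # gamma
    "bn_virtual_bs": 512,  # B_v; null would fall back to regular batch norm
    "sparsity": 1e-4,  # lambda_sparse
}
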
class CommonTransformerConfig:
    """Common transformer parameter values."""

    num_layers: int = schema_utils.PositiveInteger(default=1, description="")
    hidden_size: int = schema_utils.NonNegativeInteger(
        default=256,
        description=(
            "The number of hidden units of the TransformerStack as well as the dimension that each incoming input "
            "feature is projected to before feeding to the TransformerStack."
        ),
    )
    num_heads: int = schema_utils.NonNegativeInteger(
        default=8, description="Number of heads of the self attention in the transformer block."
    )
    transformer_output_size: int = schema_utils.NonNegativeInteger(
        default=256,
        description=(
            "Size of the fully connected layer after self attention in the transformer block. This is usually the "
            "same as `hidden_size` and `embedding_size`."
        ),
    )
    dropout: float = schema_utils.FloatRange(
        default=0.1, min=0, max=1, description="Dropout rate for the transformer block."
    )
    fc_layers: Optional[List[Dict[str, Any]]] = schema_utils.DictList(description="")

    # TODO(#1673): Add conditional logic for fields like this one:
    num_fc_layers: int = schema_utils.NonNegativeInteger(
        default=0,
        description="The number of stacked fully connected layers (only applies if `reduce_output` is not null).",
    )
    output_size: int = schema_utils.PositiveInteger(
        default=256, description="Output size of a fully connected layer."
    )
    use_bias: bool = schema_utils.Boolean(default=True, description="Whether the layer uses a bias vector.")
    weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(default="xavier_uniform", description="")
    bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(default="zeros", description="")
    norm: Optional[str] = schema_utils.StringOptions(["batch", "layer"], description="")
    norm_params: Optional[dict] = schema_utils.Dict(description="")
    fc_activation: str = schema_utils.ActivationOptions(default="relu", description="")
    fc_dropout: float = schema_utils.FloatRange(default=0.0, min=0, max=1, description="")
    fc_residual: bool = schema_utils.Boolean(default=False, description="")
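
# Illustrative sketch: the shared transformer parameters above as they might appear
# inside a transformer-style combiner entry that reuses this mixin. The key names
# come from CommonTransformerConfig fields; the values are arbitrary examples, and
# the surrounding combiner type is deliberately left out since this class is only a
# shared base, not a registered combiner itself.
_EXAMPLE_TRANSFORMER_PARAMS = {
    "num_layers": 2,
    "hidden_size": 256,
    "num_heads": 8,
    "transformer_output_size": 256,
    "dropout": 0.1,
    "num_fc_layers": 1,  # only applies if reduce_output is not null
    "fc_activation": "relu",
}
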