class ProjectAggregateCombinerConfig(BaseCombinerConfig):
    projection_size: int = schema_utils.PositiveInteger(
        default=128,
        description="All combiner inputs are projected to this size before being aggregated.",
    )
    fc_layers: Optional[List[Dict[str, Any]]] = schema_utils.DictList(
        description="Full specification of the fully connected layers after the aggregation. "
        "It should be a list of dicts, each dict representing one layer."
    )
    num_fc_layers: int = schema_utils.NonNegativeInteger(
        default=2, description="Number of fully connected layers after aggregation."
    )
    output_size: int = schema_utils.PositiveInteger(
        default=128, description="Output size of each layer of the stack of fully connected layers."
    )
    use_bias: bool = schema_utils.Boolean(default=True, description="Whether the layers use a bias vector.")
    weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(
        default="xavier_uniform",
        description="Initializer to use for the weights of the projection and for the fully connected layers.",
    )
    bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(
        default="zeros",
        description="Initializer to use for the bias of the projection and for the fully connected layers.",
    )
    norm: Optional[str] = schema_utils.StringOptions(
        ["batch", "layer"],
        default="layer",
        description="Normalization to apply to each projection and fully connected layer.",
    )
    norm_params: Optional[dict] = schema_utils.Dict(
        description="Parameters of the normalization to apply to each projection and fully connected layer."
    )
    activation: str = schema_utils.ActivationOptions(
        default="relu", description="Activation to apply to each fully connected layer."
    )
    dropout: float = schema_utils.FloatRange(
        default=0.0, min=0, max=1, description="Dropout rate to apply to each fully connected layer."
    )
    residual: bool = schema_utils.Boolean(
        default=True,
        description="Whether to add residual skip connections between the fully connected layers in the stack.",
    )

class ConcatCombinerConfig(BaseCombinerConfig):
    """Parameters for concat combiner."""

    fc_layers: Optional[List[Dict[str, Any]]] = schema_utils.DictList(description="")
    num_fc_layers: int = schema_utils.NonNegativeInteger(default=0, description="")
    output_size: int = schema_utils.PositiveInteger(default=256, description="Output size of a fully connected layer.")
    use_bias: bool = schema_utils.Boolean(default=True, description="Whether the layer uses a bias vector.")
    weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(default="xavier_uniform", description="")
    bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(default="zeros", description="")
    norm: Optional[str] = schema_utils.StringOptions(["batch", "layer"], description="")
    norm_params: Optional[dict] = schema_utils.Dict(description="")
    activation: str = schema_utils.ActivationOptions(default="relu", description="")
    dropout: float = schema_utils.FloatRange(default=0.0, min=0, max=1, description="")
    flatten_inputs: bool = schema_utils.Boolean(
        default=False, description="Whether to flatten input tensors to a vector."
    )
    residual: bool = schema_utils.Boolean(
        default=False,
        description=(
            "Whether to add a residual connection to each fully connected layer block. All fully connected layers "
            "must have the same size."
        ),
    )

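# Illustrative sketch (not part of the original module): assuming ConcatCombinerConfig backs the
# `combiner` section of a Ludwig config dict, overriding a few of the fields declared above might
# look like the plain dict below. The variable name is hypothetical.
_example_concat_combiner = {
    "type": "concat",
    "num_fc_layers": 2,
    "output_size": 256,
    "dropout": 0.1,
    "residual": True,
}
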
class BagPreprocessingConfig(schema_utils.BaseMarshmallowConfig):
    tokenizer: str = schema_utils.StringOptions(
        tokenizer_registry.keys(),
        default="space",
        allow_none=False,
        description="Defines how to transform the raw text content of the dataset column to a set of elements. The "
        "default value space splits the string on spaces. Common options include: underscore (splits on "
        "underscore), comma (splits on comma), json (decodes the string into a set or a list through a "
        "JSON parser).",
    )
    missing_value_strategy: str = schema_utils.StringOptions(
        MISSING_VALUE_STRATEGY_OPTIONS,
        default="fill_with_const",
        allow_none=False,
        description="What strategy to follow when there's a missing value in a set column",
    )
    fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description="The value to replace missing values with in case the missing_value_strategy is fill_with_const",
    )
    computed_fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description="The internally computed fill value to replace missing values with in case the "
        "missing_value_strategy is fill_with_mode or fill_with_mean",
        parameter_metadata=PREPROCESSING_METADATA["computed_fill_value"],
    )
    lowercase: bool = schema_utils.Boolean(
        default=False,
        description="If true, converts the string to lowercase before tokenizing.",
    )
    most_common: int = schema_utils.PositiveInteger(
        default=10000,
        allow_none=True,
        description="The maximum number of most common tokens to be considered. If the data contains more than this "
        "amount, the most infrequent tokens will be treated as unknown.",
    )

class CategoryPreprocessingConfig(schema_utils.BaseMarshmallowConfig):
    """CategoryPreprocessingConfig is a dataclass that configures the parameters used for a category input
    feature."""

    missing_value_strategy: str = schema_utils.StringOptions(
        MISSING_VALUE_STRATEGY_OPTIONS,
        default="fill_with_const",
        allow_none=False,
        description="What strategy to follow when there's a missing value in a category column",
    )
    fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description="The value to replace missing values with in case the missing_value_strategy is fill_with_const",
    )
    computed_fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description="The internally computed fill value to replace missing values with in case the "
        "missing_value_strategy is fill_with_mode or fill_with_mean",
        parameter_metadata=PREPROCESSING_METADATA["computed_fill_value"],
    )
    lowercase: bool = schema_utils.Boolean(
        default=False,
        description="Whether the string has to be lowercased before being handled by the tokenizer.",
    )
    most_common: int = schema_utils.PositiveInteger(
        default=10000,
        allow_none=True,
        description="The maximum number of most common tokens to be considered. If the data contains more than this "
        "amount, the most infrequent tokens will be treated as unknown.",
    )

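# Illustrative sketch (not part of the original module): a category input feature's `preprocessing`
# block, assuming it is validated by CategoryPreprocessingConfig. The feature name, fill value, and
# variable name are hypothetical.
_example_category_feature = {
    "name": "occupation",
    "type": "category",
    "preprocessing": {
        "missing_value_strategy": "fill_with_const",
        "fill_value": "<UNK>",
        "most_common": 5000,
    },
}
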
class ComparatorCombinerConfig(BaseCombinerConfig):
    """Parameters for comparator combiner."""

    entity_1: List[str]
    """TODO: Document parameters."""

    entity_2: List[str]
    """TODO: Document parameters."""

    fc_layers: Optional[List[Dict[str, Any]]] = schema_utils.DictList(description="")
    num_fc_layers: int = schema_utils.NonNegativeInteger(default=1, description="")
    output_size: int = schema_utils.PositiveInteger(default=256, description="Output size of a fully connected layer.")
    use_bias: bool = schema_utils.Boolean(default=True, description="Whether the layer uses a bias vector.")
    weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(default="xavier_uniform", description="")
    bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(default="zeros", description="")
    norm: Optional[str] = schema_utils.StringOptions(["batch", "layer"], description="")
    norm_params: Optional[dict] = schema_utils.Dict(description="")
    activation: str = schema_utils.ActivationOptions(default="relu", description="")
    dropout: float = schema_utils.FloatRange(
        default=0.0, min=0, max=1, description="Dropout rate for the fully connected layers."
    )

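# Illustrative sketch (not part of the original module): the comparator combiner compares two
# groups of input features, so a config dict names the features belonging to each entity. The
# feature names and the variable name below are hypothetical.
_example_comparator_combiner = {
    "type": "comparator",
    "entity_1": ["user_feature_1", "user_feature_2"],
    "entity_2": ["item_feature_1", "item_feature_2"],
    "output_size": 256,
}
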
class ECDTrainerConfig(BaseTrainerConfig):
    """Dataclass that configures most of the hyperparameters used for ECD model training."""

    type: str = schema_utils.StringOptions(
        ["trainer", "ray_legacy_trainer"],
        default="trainer",
        description=(
            "Trainer to use for training the model. Must be one of ['trainer', 'ray_legacy_trainer'] - "
            "corresponds to name in `ludwig.trainers.registry.(ray_)trainers_registry` (default: 'trainer')"
        ),
        allow_none=False,
    )
    optimizer: BaseOptimizerConfig = OptimizerDataclassField(
        default={"type": "adam"}, description="Parameter values for selected torch optimizer."
    )
    epochs: int = schema_utils.PositiveInteger(
        default=100,
        description="Number of epochs the algorithm is intended to be run over.",
        parameter_metadata=TRAINER_METADATA["epochs"],
    )
    train_steps: int = schema_utils.PositiveInteger(
        default=None,
        allow_none=True,
        description=(
            "Maximum number of training steps the algorithm is intended to be run over. "
            "If unset, then `epochs` is used to determine training length."
        ),
        parameter_metadata=TRAINER_METADATA["train_steps"],
    )
    regularization_lambda: float = schema_utils.FloatRange(
        default=0.0,
        min=0,
        description="Strength of the $L2$ regularization.",
        parameter_metadata=TRAINER_METADATA["regularization_lambda"],
    )
    regularization_type: Optional[str] = schema_utils.RegularizerOptions(
        default="l2", description="Type of regularization."
    )
    should_shuffle: bool = schema_utils.Boolean(
        default=True,
        description="Whether to shuffle batches during training when true.",
        parameter_metadata=TRAINER_METADATA["should_shuffle"],
    )
    batch_size: Union[int, str] = schema_utils.IntegerOrAutoField(
        default=128,
        default_numeric=128,
        allow_none=False,
        min_exclusive=0,
        description=(
            "The number of training examples utilized in one training step of the model. If 'auto', the "
            "biggest batch size (power of 2) that can fit in memory will be used."
        ),
        parameter_metadata=TRAINER_METADATA["batch_size"],
    )
    steps_per_checkpoint: int = schema_utils.NonNegativeInteger(
        default=0,
        description=(
            "How often the model is checkpointed. Also dictates maximum evaluation frequency. If 0 the model is "
            "checkpointed after every epoch."
        ),
        parameter_metadata=TRAINER_METADATA["steps_per_checkpoint"],
    )
    checkpoints_per_epoch: int = schema_utils.NonNegativeInteger(
        default=0,
        description=(
            "Number of checkpoints per epoch. For example, 2 -> checkpoints are written every half of an epoch. Note "
            "that it is invalid to specify both non-zero `steps_per_checkpoint` and non-zero `checkpoints_per_epoch`."
        ),
        parameter_metadata=TRAINER_METADATA["checkpoints_per_epoch"],
    )
    reduce_learning_rate_on_plateau: float = schema_utils.FloatRange(
        default=0.0,
        min=0.0,
        max=1.0,
        description=(
            "Reduces the learning rate when the algorithm hits a plateau (i.e., the performance on the validation "
            "set does not improve)."
        ),
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_on_plateau"],
    )
    reduce_learning_rate_on_plateau_patience: int = schema_utils.NonNegativeInteger(
        default=5,
        description="How many epochs have to pass before the learning rate reduces.",
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_on_plateau_patience"],
    )
    reduce_learning_rate_on_plateau_rate: float = schema_utils.FloatRange(
        default=0.5,
        min=0.0,
        max=1.0,
        description="Rate at which we reduce the learning rate.",
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_on_plateau_rate"],
    )
    reduce_learning_rate_eval_metric: str = schema_utils.String(
        default=LOSS,
        description="Which metric to listen on for reducing the learning rate.",
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_eval_metric"],
    )
    reduce_learning_rate_eval_split: str = schema_utils.String(
        default=TRAINING,
        description="Which dataset split to listen on for reducing the learning rate.",
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_eval_split"],
    )
    increase_batch_size_on_plateau: int = schema_utils.NonNegativeInteger(
        default=0,
        description="Number to increase the batch size by on a plateau.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_on_plateau"],
    )
    increase_batch_size_on_plateau_patience: int = schema_utils.NonNegativeInteger(
        default=5,
        description="How many epochs to wait for before increasing the batch size.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_on_plateau_patience"],
    )
    increase_batch_size_on_plateau_rate: float = schema_utils.NonNegativeFloat(
        default=2.0,
        description="Rate at which the batch size increases.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_on_plateau_rate"],
    )
    increase_batch_size_on_plateau_max: int = schema_utils.PositiveInteger(
        default=512,
        description="Maximum size of the batch.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_on_plateau_max"],
    )
    increase_batch_size_eval_metric: str = schema_utils.String(
        default=LOSS,
        description="Which metric to listen on for increasing the batch size.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_eval_metric"],
    )
    increase_batch_size_eval_split: str = schema_utils.String(
        default=TRAINING,
        description="Which dataset split to listen on for increasing the batch size.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_eval_split"],
    )
    decay: bool = schema_utils.Boolean(
        default=False,
        description="Turn on exponential decay of the learning rate.",
        parameter_metadata=TRAINER_METADATA["decay"],
    )
    decay_steps: int = schema_utils.PositiveInteger(
        default=10000,
        description="The number of steps to take in the exponential learning rate decay.",
        parameter_metadata=TRAINER_METADATA["decay_steps"],
    )
    decay_rate: float = schema_utils.FloatRange(
        default=0.96,
        min=0.0,
        max=1.0,
        description="Decay per epoch (%): Factor to decrease the learning rate.",
        parameter_metadata=TRAINER_METADATA["decay_rate"],
    )
    staircase: bool = schema_utils.Boolean(
        default=False,
        description="Decays the learning rate at discrete intervals.",
        parameter_metadata=TRAINER_METADATA["staircase"],
    )
    gradient_clipping: Optional[GradientClippingConfig] = GradientClippingDataclassField(
        description="Parameter values for gradient clipping.",
        default={},
    )
    learning_rate_warmup_epochs: float = schema_utils.NonNegativeFloat(
        default=1.0,
        description="Number of epochs to warm up the learning rate for.",
        parameter_metadata=TRAINER_METADATA["learning_rate_warmup_epochs"],
    )
    learning_rate_scaling: str = schema_utils.StringOptions(
        ["constant", "sqrt", "linear"],
        default="linear",
        description=(
            "Scale by which to increase the learning rate as the number of distributed workers increases. "
            "Traditionally the learning rate is scaled linearly with the number of workers to reflect the "
            "proportion by which the effective batch size is increased. For very large batch sizes, a softer "
            "square-root scale can sometimes lead to better model performance. If the learning rate is hand-tuned "
            "for a given number of workers, setting this value to constant can be used to disable scale-up."
        ),
        parameter_metadata=TRAINER_METADATA["learning_rate_scaling"],
    )

class GBMTrainerConfig(BaseTrainerConfig):
    """Dataclass that configures most of the hyperparameters used for GBM model training."""

    type: str = schema_utils.StringOptions(
        ["lightgbm_trainer"],
        default="lightgbm_trainer",
        description=(
            "Trainer to use for training the model. Must be one of ['lightgbm_trainer'] - "
            "corresponds to name in `ludwig.trainers.registry.(ray_)trainers_registry` "
            "(default: 'lightgbm_trainer')"
        ),
        allow_none=False,
    )

    # LightGBM core parameters (https://lightgbm.readthedocs.io/en/latest/Parameters.html)
    boosting_type: str = schema_utils.StringOptions(
        ["gbdt", "rf", "dart", "goss"],
        default="gbdt",
        description="Type of boosting algorithm to use with GBM trainer.",
    )
    tree_learner: str = schema_utils.StringOptions(
        ["serial", "feature", "data", "voting"],
        default="serial",
        description="Type of tree learner to use with GBM trainer.",
    )
    num_boost_round: int = schema_utils.PositiveInteger(
        default=100, description="Number of boosting rounds to perform with GBM trainer."
    )
    num_leaves: int = schema_utils.PositiveInteger(
        default=31, description="Number of leaves to use in the tree with GBM trainer."
    )

    # LightGBM learning control parameters
    max_depth: int = schema_utils.Integer(
        default=-1,
        description="Maximum depth of a tree in the GBM trainer. A negative value means no limit.",
    )
    min_data_in_leaf: int = schema_utils.PositiveInteger(
        default=20, description="Minimum number of data points in a leaf with GBM trainer."
    )
    min_sum_hessian_in_leaf: float = schema_utils.NonNegativeFloat(
        default=1e-3, description="Minimum sum of hessians in a leaf with GBM trainer."
    )
    bagging_fraction: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of data to use for bagging with GBM trainer."
    )
    pos_bagging_fraction: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of positive data to use for bagging with GBM trainer."
    )
    neg_bagging_fraction: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of negative data to use for bagging with GBM trainer."
    )
    bagging_freq: int = schema_utils.NonNegativeInteger(
        default=0, description="Frequency of bagging with GBM trainer."
    )
    bagging_seed: int = schema_utils.Integer(default=3, description="Random seed for bagging with GBM trainer.")
    feature_fraction: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of features to use in the GBM trainer."
    )
    feature_fraction_bynode: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of features to use for each tree node with GBM trainer."
    )
    feature_fraction_seed: int = schema_utils.Integer(
        default=2, description="Random seed for feature fraction with GBM trainer."
    )
    extra_trees: bool = schema_utils.Boolean(
        default=False, description="Whether to use extremely randomized trees in the GBM trainer."
    )
    extra_seed: int = schema_utils.Integer(
        default=6, description="Random seed for extremely randomized trees in the GBM trainer."
    )
    max_delta_step: float = schema_utils.FloatRange(
        default=0.0,
        min=0.0,
        max=1.0,
        description=(
            "Used to limit the max output of tree leaves in the GBM trainer. A negative value means no constraint."
        ),
    )
    lambda_l1: float = schema_utils.NonNegativeFloat(
        default=0.0, description="L1 regularization factor for the GBM trainer."
    )
    lambda_l2: float = schema_utils.NonNegativeFloat(
        default=0.0, description="L2 regularization factor for the GBM trainer."
    )
    linear_lambda: float = schema_utils.NonNegativeFloat(
        default=0.0, description="Linear tree regularization in the GBM trainer."
    )
    min_gain_to_split: float = schema_utils.NonNegativeFloat(
        default=0.0, description="Minimum gain to split a leaf in the GBM trainer."
    )
    drop_rate: float = schema_utils.FloatRange(
        default=0.1,
        min=0.0,
        max=1.0,
        description="Dropout rate for the GBM trainer. Used only with boosting_type 'dart'.",
    )
    max_drop: int = schema_utils.Integer(
        default=50,
        description=(
            "Maximum number of dropped trees during one boosting iteration. "
            "Used only with boosting_type 'dart'. A negative value means no limit."
        ),
    )
    skip_drop: float = schema_utils.FloatRange(
        default=0.5,
        min=0.0,
        max=1.0,
        description=(
            "Probability of skipping the dropout during one boosting iteration. Used only with boosting_type 'dart'."
        ),
    )
    xgboost_dart_mode: bool = schema_utils.Boolean(
        default=False,
        description="Whether to use xgboost dart mode in the GBM trainer. Used only with boosting_type 'dart'.",
    )
    uniform_drop: bool = schema_utils.Boolean(
        default=False,
        description="Whether to use uniform dropout in the GBM trainer. Used only with boosting_type 'dart'.",
    )
    drop_seed: int = schema_utils.Integer(
        default=4,
        description="Random seed to choose dropping models in the GBM trainer. Used only with boosting_type 'dart'.",
    )
    top_rate: float = schema_utils.FloatRange(
        default=0.2,
        min=0.0,
        max=1.0,
        description="The retain ratio of large gradient data in the GBM trainer. Used only with boosting_type 'goss'.",
    )
    other_rate: float = schema_utils.FloatRange(
        default=0.1,
        min=0.0,
        max=1.0,
        description="The retain ratio of small gradient data in the GBM trainer. Used only with boosting_type 'goss'.",
    )
    min_data_per_group: int = schema_utils.PositiveInteger(
        default=100,
        description="Minimum number of data points per categorical group for the GBM trainer.",
    )
    max_cat_threshold: int = schema_utils.PositiveInteger(
        default=32,
        description="Number of split points considered for categorical features for the GBM trainer.",
    )
    cat_l2: float = schema_utils.NonNegativeFloat(
        default=10.0, description="L2 regularization factor for categorical split in the GBM trainer."
    )
    cat_smooth: float = schema_utils.NonNegativeFloat(
        default=10.0, description="Smoothing factor for categorical split in the GBM trainer."
    )
    max_cat_to_onehot: int = schema_utils.PositiveInteger(
        default=4,
        description="Maximum categorical cardinality required before one-hot encoding in the GBM trainer.",
    )
    cegb_tradeoff: float = schema_utils.NonNegativeFloat(
        default=1.0,
        description="Cost-effective gradient boosting multiplier for all penalties in the GBM trainer.",
    )
    cegb_penalty_split: float = schema_utils.NonNegativeFloat(
        default=0.0,
        description="Cost-effective gradient boosting penalty for splitting a node in the GBM trainer.",
    )
    path_smooth: float = schema_utils.NonNegativeFloat(
        default=0.0,
        description="Smoothing factor applied to tree nodes in the GBM trainer.",
    )
    verbose: int = schema_utils.IntegerRange(default=0, min=-1, max=2, description="Verbosity level for GBM trainer.")

    # LightGBM IO parameters
    max_bin: int = schema_utils.PositiveInteger(
        default=255, description="Maximum number of bins to use for discretizing features with GBM trainer."
    )

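# Illustrative sketch (not part of the original module): assuming GBMTrainerConfig backs the
# `trainer` section when training a gradient-boosted tree model, the LightGBM core parameters
# above could be overridden as follows. The variable name is hypothetical.
_example_gbm_trainer = {
    "type": "lightgbm_trainer",
    "boosting_type": "gbdt",
    "num_boost_round": 200,
    "num_leaves": 63,
    "learning_rate": 0.05,
}
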
class BaseTrainerConfig(schema_utils.BaseMarshmallowConfig, ABC):
    """Common trainer parameter values."""

    type: str

    learning_rate: float = schema_utils.FloatOrAutoField(
        default=0.001,
        min=0.0,
        max=1.0,
        default_numeric=0.001,
        allow_none=False,
        description=(
            "Controls how much to change the model in response to the estimated error each time the model weights "
            "are updated. If 'auto', the optimal learning rate is estimated by choosing the learning rate that "
            "produces the smallest non-diverging gradient update."
        ),
        parameter_metadata=TRAINER_METADATA["learning_rate"],
    )
    validation_metric: str = schema_utils.String(
        default=LOSS,
        description=(
            "Metric used on `validation_field`, set by default to the "
            "output feature type's `default_validation_metric`."
        ),
        parameter_metadata=TRAINER_METADATA["validation_metric"],
    )
    # TODO(#1673): Need some more logic here for validating against output features
    validation_field: str = schema_utils.String(
        default=COMBINED,
        description=(
            "Field on which `validation_metric` is computed. Defaults to `combined`, which aggregates the metric "
            "over all output features."
        ),
        parameter_metadata=TRAINER_METADATA["validation_field"],
    )
    eval_batch_size: Union[None, int, str] = schema_utils.IntegerOrAutoField(
        default=None,
        allow_none=True,
        min_exclusive=0,
        description=(
            "Size of batch to pass to the model for evaluation. If it is `0` or `None`, the same value as "
            "`batch_size` is used. This is useful to speed up evaluation with a much bigger batch size than "
            "training, if enough memory is available. If 'auto', the biggest batch size (power of 2) that can fit "
            "in memory will be used."
        ),
        parameter_metadata=TRAINER_METADATA["eval_batch_size"],
    )
    early_stop: int = schema_utils.IntegerRange(
        default=5,
        min=-1,
        description=(
            "Number of consecutive rounds of evaluation without any improvement on the `validation_metric` that "
            "triggers training to stop. Can be set to -1, which disables early stopping entirely."
        ),
        parameter_metadata=TRAINER_METADATA["early_stop"],
    )
    evaluate_training_set: bool = schema_utils.Boolean(
        default=True,
        description="Whether to include the entire training set during evaluation.",
        parameter_metadata=TRAINER_METADATA["evaluate_training_set"],
    )

class CommonTransformerConfig:
    """Common transformer parameter values."""

    num_layers: int = schema_utils.PositiveInteger(default=1, description="")
    hidden_size: int = schema_utils.NonNegativeInteger(
        default=256,
        description=(
            "The number of hidden units of the TransformerStack as well as the dimension that each incoming input "
            "feature is projected to before feeding to the TransformerStack."
        ),
    )
    num_heads: int = schema_utils.NonNegativeInteger(
        default=8, description="Number of heads of the self attention in the transformer block."
    )
    transformer_output_size: int = schema_utils.NonNegativeInteger(
        default=256,
        description=(
            "Size of the fully connected layer after self attention in the transformer block. This is usually the "
            "same as `hidden_size` and `embedding_size`."
        ),
    )
    dropout: float = schema_utils.FloatRange(
        default=0.1, min=0, max=1, description="Dropout rate for the transformer block."
    )
    fc_layers: Optional[List[Dict[str, Any]]] = schema_utils.DictList(description="")
    # TODO(#1673): Add conditional logic for fields like this one:
    num_fc_layers: int = schema_utils.NonNegativeInteger(
        default=0,
        description="The number of stacked fully connected layers (only applies if `reduce_output` is not null).",
    )
    output_size: int = schema_utils.PositiveInteger(default=256, description="Output size of a fully connected layer.")
    use_bias: bool = schema_utils.Boolean(default=True, description="Whether the layer uses a bias vector.")
    weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(default="xavier_uniform", description="")
    bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(default="zeros", description="")
    norm: Optional[str] = schema_utils.StringOptions(["batch", "layer"], description="")
    norm_params: Optional[dict] = schema_utils.Dict(description="")
    fc_activation: str = schema_utils.ActivationOptions(default="relu", description="")
    fc_dropout: float = schema_utils.FloatRange(default=0.0, min=0, max=1, description="")
    fc_residual: bool = schema_utils.Boolean(default=False, description="")

class TextPreprocessingConfig(schema_utils.BaseMarshmallowConfig):
    """TextPreprocessingConfig is a dataclass that configures the parameters used for a text input feature."""

    pretrained_model_name_or_path: str = schema_utils.String(
        default=None,
        allow_none=True,
        description="This can be either the name of a pretrained HuggingFace model or a path where it was "
        "downloaded.",
    )
    tokenizer: str = schema_utils.StringOptions(
        tokenizer_registry.keys(),
        default="space_punct",
        allow_none=False,
        description="Defines how to map from the raw string content of the dataset column to a sequence of elements.",
    )
    vocab_file: str = schema_utils.String(
        default=None,
        allow_none=True,
        description="Filepath string to a UTF-8 encoded file containing the sequence's vocabulary. On each line the "
        "first string until \\t or \\n is considered a word.",
    )
    max_sequence_length: int = schema_utils.PositiveInteger(
        default=256,
        allow_none=False,
        description="The maximum length (number of tokens) of the text. Texts that are longer than this value will "
        "be truncated, while texts that are shorter will be padded.",
    )
    most_common: int = schema_utils.PositiveInteger(
        default=20000,
        allow_none=False,
        description="The maximum number of most common tokens in the vocabulary. If the data contains more than this "
        "amount, the most infrequent symbols will be treated as unknown.",
    )
    padding_symbol: str = schema_utils.String(
        default="<PAD>",
        allow_none=False,
        description="The string used as the padding symbol for sequence features. Ignored for features using "
        "huggingface encoders, which have their own vocabulary.",
    )
    unknown_symbol: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description="The string used as the unknown symbol for sequence features. Ignored for features using "
        "huggingface encoders, which have their own vocabulary.",
    )
    padding: str = schema_utils.StringOptions(
        ["left", "right"],
        default="right",
        allow_none=False,
        description="The direction of the padding. right and left are available options.",
    )
    lowercase: bool = schema_utils.Boolean(
        default=True,
        description="If true, converts the string to lowercase before tokenizing.",
    )
    missing_value_strategy: str = schema_utils.StringOptions(
        MISSING_VALUE_STRATEGY_OPTIONS,
        default="fill_with_const",
        allow_none=False,
        description="What strategy to follow when there's a missing value in a text column",
    )
    fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description="The value to replace missing values with in case the missing_value_strategy is fill_with_const",
    )
    computed_fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description="The internally computed fill value to replace missing values with in case the "
        "missing_value_strategy is fill_with_mode or fill_with_mean",
        parameter_metadata=PREPROCESSING_METADATA["computed_fill_value"],
    )

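# Illustrative sketch (not part of the original module): assuming TextPreprocessingConfig maps to
# the `preprocessing` block of a text input feature, a hand-written override might look like this.
# The feature and variable names are hypothetical.
_example_text_feature = {
    "name": "review",
    "type": "text",
    "preprocessing": {
        "tokenizer": "space_punct",
        "lowercase": True,
        "max_sequence_length": 128,
        "most_common": 10000,
    },
}
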
class AudioPreprocessingConfig(schema_utils.BaseMarshmallowConfig):
    audio_file_length_limit_in_s: float = schema_utils.NonNegativeFloat(
        default=7.5,
        allow_none=False,
        description="Float value that defines the maximum limit of the audio file in seconds. All files longer than "
        "this limit are cut off. All files shorter than this limit are padded with padding_value.",
    )
    missing_value_strategy: str = schema_utils.StringOptions(
        MISSING_VALUE_STRATEGY_OPTIONS,
        default=BACKFILL,
        allow_none=False,
        description="What strategy to follow when there's a missing value in an audio column",
    )
    fill_value: float = schema_utils.NonNegativeFloat(
        default=None,
        allow_none=True,
        description="The value to replace missing values with in case the missing_value_strategy is fill_with_const",
    )
    computed_fill_value: float = schema_utils.NonNegativeFloat(
        default=None,
        allow_none=True,
        description="The internally computed fill value to replace missing values with in case the "
        "missing_value_strategy is fill_with_mode or fill_with_mean",
        parameter_metadata=PREPROCESSING_METADATA["computed_fill_value"],
    )
    in_memory: bool = schema_utils.Boolean(
        default=True,
        description="Defines whether the audio dataset will reside in memory during the training process or will be "
        "dynamically fetched from disk (useful for large datasets). In the latter case a training batch "
        "of input audio will be fetched from disk each training iteration.",
    )
    padding_value: float = schema_utils.NonNegativeFloat(
        default=0.0, allow_none=False, description="Float value that is used for padding."
    )
    norm: str = schema_utils.StringOptions(
        ["per_file"],
        default=None,
        allow_none=True,
        description="Normalization strategy for the audio files. If None, no normalization is performed. If "
        "per_file, z-norm is applied on a 'per file' level.",
    )
    type: str = schema_utils.StringOptions(
        ["fbank", "group_delay", "raw", "stft", "stft_phase"],
        default="fbank",
        description="Defines the type of audio feature to be used.",
    )
    window_length_in_s: float = schema_utils.NonNegativeFloat(
        default=0.04,
        description="Defines the window length used for the short time Fourier transformation. This is only needed "
        "if the audio_feature_type is 'raw'.",
    )
    window_shift_in_s: float = schema_utils.NonNegativeFloat(
        default=0.02,
        description="Defines the window shift used for the short time Fourier transformation (also called "
        "hop_length). This is only needed if the audio_feature_type is 'raw'.",
    )
    num_fft_points: float = schema_utils.NonNegativeFloat(
        default=None,
        description="Defines the number of fft points used for the short time Fourier transformation.",
    )
    window_type: str = schema_utils.StringOptions(
        ["bartlett", "blackman", "hamming", "hann"],
        default="hamming",
        description="Defines the type of window by which the signal is weighted before the short time Fourier "
        "transformation.",
    )
    num_filter_bands: int = schema_utils.PositiveInteger(
        default=80,
        description="Defines the number of filters used in the filterbank. Only needed if audio_feature_type "
        "is 'fbank'.",
    )

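# Illustrative sketch (not part of the original module): an audio input feature's `preprocessing`
# block, assuming it is validated by AudioPreprocessingConfig. The feature and variable names are
# hypothetical.
_example_audio_feature = {
    "name": "recording",
    "type": "audio",
    "preprocessing": {
        "type": "fbank",
        "audio_file_length_limit_in_s": 7.5,
        "window_length_in_s": 0.04,
        "window_shift_in_s": 0.02,
        "num_filter_bands": 80,
    },
}
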
class ImagePreprocessingConfig(schema_utils.BaseMarshmallowConfig):
    missing_value_strategy: str = schema_utils.StringOptions(
        MISSING_VALUE_STRATEGY_OPTIONS,
        default="backfill",
        allow_none=False,
        description="What strategy to follow when there's a missing value in an image column",
    )
    fill_value: float = schema_utils.NonNegativeFloat(
        default=None,
        allow_none=True,
        description="The value to replace missing values with in case the missing_value_strategy is fill_with_const",
    )
    computed_fill_value: float = schema_utils.NonNegativeFloat(
        default=None,
        allow_none=True,
        description="The internally computed fill value to replace missing values with in case the "
        "missing_value_strategy is fill_with_mode or fill_with_mean",
        parameter_metadata=PREPROCESSING_METADATA["computed_fill_value"],
    )
    height: int = schema_utils.PositiveInteger(
        default=None,
        allow_none=True,
        description="The image height in pixels. If this parameter is set, images will be resized to the specified "
        "height using the resize_method parameter. If None, images will be resized to the size of the "
        "first image in the dataset.",
    )
    width: int = schema_utils.PositiveInteger(
        default=None,
        allow_none=True,
        description="The image width in pixels. If this parameter is set, images will be resized to the specified "
        "width using the resize_method parameter. If None, images will be resized to the size of the "
        "first image in the dataset.",
    )
    num_channels: int = schema_utils.PositiveInteger(
        default=None,
        allow_none=True,
        description="Number of channels in the images. If specified, images will be read in the mode specified by "
        "the number of channels. If not specified, the number of channels will be inferred from the "
        "image format of the first valid image in the dataset.",
    )
    resize_method: str = schema_utils.StringOptions(
        ["crop_or_pad", "interpolate"],
        default="interpolate",
        allow_none=False,
        description="The method to use for resizing images.",
    )
    infer_image_num_channels: bool = schema_utils.Boolean(
        default=True,
        description="If true, then the number of channels in the dataset is inferred from a sample of the first "
        "image in the dataset.",
    )
    infer_image_dimensions: bool = schema_utils.Boolean(
        default=True,
        description="If true, then the height and width of images in the dataset will be inferred from a sample of "
        "the first image in the dataset. Each image that doesn't conform to these dimensions will be "
        "resized according to resize_method. If set to false, then the height and width of images in "
        "the dataset will be specified by the user.",
    )
    infer_image_max_height: int = schema_utils.PositiveInteger(
        default=256,
        allow_none=False,
        description="If infer_image_dimensions is set, this is used as the maximum height of the images in "
        "the dataset.",
    )
    infer_image_max_width: int = schema_utils.PositiveInteger(
        default=256,
        allow_none=False,
        description="If infer_image_dimensions is set, this is used as the maximum width of the images in "
        "the dataset.",
    )
    infer_image_sample_size: int = schema_utils.PositiveInteger(
        default=100,
        allow_none=False,
        description="The sample size used for inferring dimensions of images in infer_image_dimensions.",
    )
    scaling: str = schema_utils.StringOptions(
        ["pixel_normalization", "pixel_standardization"],
        default="pixel_normalization",
        allow_none=False,
        description="The scaling strategy for pixel values in the image.",
    )
    in_memory: bool = schema_utils.Boolean(
        default=True,
        description="Defines whether image dataset will reside in memory during the training process or will be "
        "dynamically fetched from disk (useful for large datasets). In the latter case a training batch "
        "of input images will be fetched from disk each training iteration.",
    )
    num_processes: int = schema_utils.PositiveInteger(
        default=1,
        allow_none=False,
        description="Specifies the number of processes to run for preprocessing images.",
    )

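# Illustrative sketch (not part of the original module): an image input feature's `preprocessing`
# block, assuming it is validated by ImagePreprocessingConfig. Fixing height and width makes
# dimension inference unnecessary for conforming images. The feature and variable names are
# hypothetical.
_example_image_feature = {
    "name": "photo",
    "type": "image",
    "preprocessing": {
        "height": 224,
        "width": 224,
        "num_channels": 3,
        "resize_method": "interpolate",
    },
}
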
class SequencePreprocessingConfig(schema_utils.BaseMarshmallowConfig):
    tokenizer: str = schema_utils.String(
        default="space",
        allow_none=False,
        description="Defines how to map from the raw string content of the dataset column to a sequence of elements.",
    )
    vocab_file: str = schema_utils.String(
        default=None,
        allow_none=True,
        description="Filepath string to a UTF-8 encoded file containing the sequence's vocabulary. On each line the "
        "first string until \\t or \\n is considered a word.",
    )
    max_sequence_length: int = schema_utils.PositiveInteger(
        default=256,
        allow_none=False,
        description="The maximum length (number of tokens) of the text. Texts that are longer than this value will "
        "be truncated, while texts that are shorter will be padded.",
    )
    most_common: int = schema_utils.PositiveInteger(
        default=20000,
        allow_none=False,
        description="The maximum number of most common tokens in the vocabulary. If the data contains more than this "
        "amount, the most infrequent symbols will be treated as unknown.",
    )
    padding_symbol: str = schema_utils.String(
        default="<PAD>",
        allow_none=False,
        description="The string used as a padding symbol. This special token is mapped to the integer ID 0 in the "
        "vocabulary.",
    )
    unknown_symbol: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description="The string used as an unknown placeholder. This special token is mapped to the integer ID 1 in "
        "the vocabulary.",
    )
    padding: str = schema_utils.StringOptions(
        ["left", "right"],
        default="right",
        allow_none=False,
        description="The direction of the padding. right and left are available options.",
    )
    lowercase: bool = schema_utils.Boolean(
        default=False,
        description="If true, converts the string to lowercase before tokenizing.",
    )
    missing_value_strategy: str = schema_utils.StringOptions(
        MISSING_VALUE_STRATEGY_OPTIONS,
        default="fill_with_const",
        allow_none=False,
        description="What strategy to follow when there's a missing value in a text column",
    )
    fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description="The value to replace missing values with in case the missing_value_strategy is fill_with_const",
    )
    computed_fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description="The internally computed fill value to replace missing values with in case the "
        "missing_value_strategy is fill_with_mode or fill_with_mean",
        parameter_metadata=PREPROCESSING_METADATA["computed_fill_value"],
    )