Example #1
class ProjectAggregateCombinerConfig(BaseCombinerConfig):
    projection_size: int = schema_utils.PositiveInteger(
        default=128,
        description=
        "All combiner inputs are projected to this size before being aggregated."
    )
    fc_layers: Optional[List[Dict[str, Any]]] = schema_utils.DictList(
        description=
        "Full specification of the fully connected layers after the aggregation. "
        "It should be a list of dicts, each dict representing one layer.")
    num_fc_layers: int = schema_utils.NonNegativeInteger(
        default=2,
        description="Number of fully connected layers after aggregation.")
    output_size: int = schema_utils.PositiveInteger(
        default=128,
        description=
        "Output size of each layer of the stack of fully connected layers.")
    use_bias: bool = schema_utils.Boolean(
        default=True, description="Whether the layers use a bias vector.")
    weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(
        default="xavier_uniform",
        description=
        "Initializer to use for the weights of the projection and for the fully connected layers.",
    )
    bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(
        default="zeros",
        description=
        "Initializer to use for the baias of the projection and for the fully connected layers.",
    )
    norm: Optional[str] = schema_utils.StringOptions(
        ["batch", "layer"],
        default="layer",
        description=
        "Normalization to apply to each projection and fully connected layer.",
    )
    norm_params: Optional[dict] = schema_utils.Dict(
        description=
        "Parameters of the normalization to apply to each projection and fully connected layer."
    )
    activation: str = schema_utils.ActivationOptions(
        default="relu",
        description="Activation to apply to each fully connected layer.")
    dropout: float = schema_utils.FloatRange(
        default=0.0,
        min=0,
        max=1,
        description="Dropout rate to apply to each fully connected layer.")
    residual: bool = schema_utils.Boolean(
        default=True,
        description=
        "Whether to add residual skip connection between the fully connected layers in the stack..",
    )
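A minimal usage sketch (assuming the config behaves like a regular Python dataclass, which marshmallow-based config classes typically do). The field names come from the schema above; the override values are arbitrary:

# Hypothetical override of two defaults; all other fields keep their schema defaults.
config = ProjectAggregateCombinerConfig(
    projection_size=64,   # project all combiner inputs to 64 dimensions
    num_fc_layers=3,      # three fully connected layers after aggregation
)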
Example #2
File: concat.py Project: ludwig-ai/ludwig
class ConcatCombinerConfig(BaseCombinerConfig):
    """Parameters for concat combiner."""

    fc_layers: Optional[List[Dict[str, Any]]] = schema_utils.DictList(
        description="")

    num_fc_layers: int = schema_utils.NonNegativeInteger(default=0,
                                                         description="")

    output_size: int = schema_utils.PositiveInteger(
        default=256, description="Output size of a fully connected layer.")

    use_bias: bool = schema_utils.Boolean(
        default=True, description="Whether the layer uses a bias vector.")

    weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(
        default="xavier_uniform", description="")

    bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(
        default="zeros", description="")

    norm: Optional[str] = schema_utils.StringOptions(["batch", "layer"],
                                                     description="")

    norm_params: Optional[dict] = schema_utils.Dict(description="")

    activation: str = schema_utils.ActivationOptions(default="relu",
                                                     description="")

    dropout: float = schema_utils.FloatRange(default=0.0,
                                             min=0,
                                             max=1,
                                             description="")

    flatten_inputs: bool = schema_utils.Boolean(
        default=False,
        description="Whether to flatten input tensors to a vector.")

    residual: bool = schema_utils.Boolean(
        default=False,
        description=
        ("Whether to add a residual connection to each fully connected layer block. All fully connected layers must"
         " have the same size"),
    )
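For intuition, a rough sketch of what the concat combiner does conceptually (plain PyTorch, not Ludwig's implementation):

import torch

# Two hypothetical encoder outputs for a batch of 4 examples.
a = torch.randn(4, 32)
b = torch.randn(4, 2, 8)

# With flatten_inputs=True, non-vector tensors are flattened to [batch, -1] first.
b_flat = b.reshape(4, -1)                  # shape: [4, 16]
combined = torch.cat([a, b_flat], dim=-1)  # shape: [4, 48]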
Example #3
class BagPreprocessingConfig(schema_utils.BaseMarshmallowConfig):

    tokenizer: str = schema_utils.StringOptions(
        tokenizer_registry.keys(),
        default="space",
        allow_none=False,
        description=
        "Defines how to transform the raw text content of the dataset column to a set of elements. The "
        "default value space splits the string on spaces. Common options include: underscore (splits on "
        "underscore), comma (splits on comma), json (decodes the string into a set or a list through a "
        "JSON parser).",
    )

    missing_value_strategy: str = schema_utils.StringOptions(
        MISSING_VALUE_STRATEGY_OPTIONS,
        default="fill_with_const",
        allow_none=False,
        description=
        "What strategy to follow when there's a missing value in a set column",
    )

    fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description=
        "The value to replace missing values with in case the missing_value_strategy is fill_with_const",
    )

    computed_fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description=
        "The internally computed fill value to replace missing values with in case the "
        "missing_value_strategy is fill_with_mode or fill_with_mean",
        parameter_metadata=PREPROCESSING_METADATA["computed_fill_value"],
    )

    lowercase: bool = schema_utils.Boolean(
        default=False,
        description=
        "If true, converts the string to lowercase before tokenizing.",
    )

    most_common: int = schema_utils.PositiveInteger(
        default=10000,
        allow_none=True,
        description=
        "The maximum number of most common tokens to be considered. If the data contains more than this "
        "amount, the most infrequent tokens will be treated as unknown.",
    )
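To illustrate the tokenizer description above, a small sketch of how a space tokenizer could turn a raw string into a bag of elements (illustrative only, not the tokenizer_registry implementation):

from collections import Counter

raw = "apple banana apple cherry"
tokens = raw.split(" ")   # the "space" tokenizer splits the string on spaces
bag = Counter(tokens)     # a bag keeps element frequencies
# Counter({'apple': 2, 'banana': 1, 'cherry': 1})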
Example #4
class CategoryPreprocessingConfig(schema_utils.BaseMarshmallowConfig):
    """CategoryPreprocessingConfig is a dataclass that configures the parameters used for a category input
    feature."""

    missing_value_strategy: str = schema_utils.StringOptions(
        MISSING_VALUE_STRATEGY_OPTIONS,
        default="fill_with_const",
        allow_none=False,
        description=
        "What strategy to follow when there's a missing value in a category column",
    )

    fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description=
        "The value to replace missing values with in case the missing_value_strategy is fill_with_const",
    )

    computed_fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description=
        "The internally computed fill value to replace missing values with in case the "
        "missing_value_strategy is fill_with_mode or fill_with_mean",
        parameter_metadata=PREPROCESSING_METADATA["computed_fill_value"],
    )

    lowercase: bool = schema_utils.Boolean(
        default=False,
        description=
        "Whether the string has to be lowercased before being handled by the tokenizer.",
    )

    most_common: int = schema_utils.PositiveInteger(
        default=10000,
        allow_none=True,
        description=
        "The maximum number of most common tokens to be considered. if the data contains more than this "
        "amount, the most infrequent tokens will be treated as unknown.",
    )
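A small sketch of the fill_with_const strategy described above (pandas is used only for illustration; Ludwig's actual preprocessing pipeline is not shown):

import pandas as pd

col = pd.Series(["cat", None, "dog"])
# fill_with_const replaces missing values with the configured fill_value,
# here assumed to be the unknown symbol "<UNK>".
filled = col.fillna("<UNK>")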
Example #5
class ComparatorCombinerConfig(BaseCombinerConfig):
    """Parameters for comparator combiner."""

    entity_1: List[str]
    """TODO: Document parameters."""

    entity_2: List[str]
    """TODO: Document parameters."""

    fc_layers: Optional[List[Dict[str, Any]]] = schema_utils.DictList(
        description="")

    num_fc_layers: int = schema_utils.NonNegativeInteger(default=1,
                                                         description="")

    output_size: int = schema_utils.PositiveInteger(
        default=256, description="Output size of a fully connected layer")

    use_bias: bool = schema_utils.Boolean(
        default=True, description="Whether the layer uses a bias vector.")

    weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(
        default="xavier_uniform", description="")

    bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(
        default="zeros", description="")

    norm: Optional[str] = schema_utils.StringOptions(["batch", "layer"],
                                                     description="")

    norm_params: Optional[dict] = schema_utils.Dict(description="")

    activation: str = schema_utils.ActivationOptions(default="relu",
                                                     description="")

    dropout: float = schema_utils.FloatRange(
        default=0.0,
        min=0,
        max=1,
        description="Dropout rate for the transformer block.")
Example #6
class ECDTrainerConfig(BaseTrainerConfig):
    """Dataclass that configures most of the hyperparameters used for ECD model training."""

    type: str = schema_utils.StringOptions(
        ["trainer", "ray_legacy_trainer"],
        default="trainer",
        description=(
            "Trainer to use for training the model. Must be one of ['trainer', 'ray_legacy_trainer'] - "
            "corresponds to name in `ludwig.trainers.registry.(ray_)trainers_registry` (default: 'trainer')"
        ),
        allow_none=False,
    )

    optimizer: BaseOptimizerConfig = OptimizerDataclassField(
        default={"type": "adam"}, description="Parameter values for selected torch optimizer."
    )

    epochs: int = schema_utils.PositiveInteger(
        default=100,
        description="Number of epochs the algorithm is intended to be run over.",
        parameter_metadata=TRAINER_METADATA["epochs"],
    )

    train_steps: int = schema_utils.PositiveInteger(
        default=None,
        allow_none=True,
        description=(
            "Maximum number of training steps the algorithm is intended to be run over. "
            + "If unset, then `epochs` is used to determine training length."
        ),
        parameter_metadata=TRAINER_METADATA["train_steps"],
    )

    regularization_lambda: float = schema_utils.FloatRange(
        default=0.0,
        min=0,
        description="Strength of the $L2$ regularization.",
        parameter_metadata=TRAINER_METADATA["regularization_lambda"],
    )

    regularization_type: Optional[str] = schema_utils.RegularizerOptions(
        default="l2", description="Type of regularization."
    )

    should_shuffle: bool = schema_utils.Boolean(
        default=True,
        description="Whether to shuffle batches during training when true.",
        parameter_metadata=TRAINER_METADATA["should_shuffle"],
    )

    batch_size: Union[int, str] = schema_utils.IntegerOrAutoField(
        default=128,
        default_numeric=128,
        allow_none=False,
        min_exclusive=0,
        description=(
            "The number of training examples utilized in one training step of the model. If ’auto’, the "
            "biggest batch size (power of 2) that can fit in memory will be used."
        ),
        parameter_metadata=TRAINER_METADATA["batch_size"],
    )

    steps_per_checkpoint: int = schema_utils.NonNegativeInteger(
        default=0,
        description=(
            "How often the model is checkpointed. Also dictates maximum evaluation frequency. If 0 the model is "
            "checkpointed after every epoch."
        ),
        parameter_metadata=TRAINER_METADATA["steps_per_checkpoint"],
    )

    checkpoints_per_epoch: int = schema_utils.NonNegativeInteger(
        default=0,
        description=(
            "Number of checkpoints per epoch. For example, 2 -> checkpoints are written every half of an epoch. Note "
            "that it is invalid to specify both non-zero `steps_per_checkpoint` and non-zero `checkpoints_per_epoch`."
        ),
        parameter_metadata=TRAINER_METADATA["checkpoints_per_epoch"],
    )

    reduce_learning_rate_on_plateau: float = schema_utils.FloatRange(
        default=0.0,
        min=0.0,
        max=1.0,
        description=(
            "Reduces the learning rate when the algorithm hits a plateau (i.e. the performance on the validation does "
            "not improve."
        ),
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_on_plateau"],
    )

    reduce_learning_rate_on_plateau_patience: int = schema_utils.NonNegativeInteger(
        default=5,
        description="How many epochs have to pass before the learning rate reduces.",
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_on_plateau_patience"],
    )

    reduce_learning_rate_on_plateau_rate: float = schema_utils.FloatRange(
        default=0.5,
        min=0.0,
        max=1.0,
        description="Rate at which we reduce the learning rate.",
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_on_plateau_rate"],
    )

    reduce_learning_rate_eval_metric: str = schema_utils.String(
        default=LOSS,
        description="Rate at which we reduce the learning rate.",
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_eval_metric"],
    )

    reduce_learning_rate_eval_split: str = schema_utils.String(
        default=TRAINING,
        description="Which dataset split to listen on for reducing the learning rate.",
        parameter_metadata=TRAINER_METADATA["reduce_learning_rate_eval_split"],
    )

    increase_batch_size_on_plateau: int = schema_utils.NonNegativeInteger(
        default=0,
        description="Number to increase the batch size by on a plateau.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_on_plateau"],
    )

    increase_batch_size_on_plateau_patience: int = schema_utils.NonNegativeInteger(
        default=5,
        description="How many epochs to wait for before increasing the batch size.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_on_plateau_patience"],
    )

    increase_batch_size_on_plateau_rate: float = schema_utils.NonNegativeFloat(
        default=2.0,
        description="Rate at which the batch size increases.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_on_plateau_rate"],
    )

    increase_batch_size_on_plateau_max: int = schema_utils.PositiveInteger(
        default=512,
        description="Maximum size of the batch.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_on_plateau_max"],
    )

    increase_batch_size_eval_metric: str = schema_utils.String(
        default=LOSS,
        description="Which metric to listen on for increasing the batch size.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_eval_metric"],
    )

    increase_batch_size_eval_split: str = schema_utils.String(
        default=TRAINING,
        description="Which dataset split to listen on for increasing the batch size.",
        parameter_metadata=TRAINER_METADATA["increase_batch_size_eval_split"],
    )

    decay: bool = schema_utils.Boolean(
        default=False,
        description="Turn on exponential decay of the learning rate.",
        parameter_metadata=TRAINER_METADATA["decay"],
    )

    decay_steps: int = schema_utils.PositiveInteger(
        default=10000,
        description="The number of steps to take in the exponential learning rate decay.",
        parameter_metadata=TRAINER_METADATA["decay_steps"],
    )

    decay_rate: float = schema_utils.FloatRange(
        default=0.96,
        min=0.0,
        max=1.0,
        description="Decay per epoch (%): Factor to decrease the Learning rate.",
        parameter_metadata=TRAINER_METADATA["decay_steps"],
    )

    staircase: bool = schema_utils.Boolean(
        default=False,
        description="Decays the learning rate at discrete intervals.",
        parameter_metadata=TRAINER_METADATA["staircase"],
    )

    gradient_clipping: Optional[GradientClippingConfig] = GradientClippingDataclassField(
        description="Parameter values for gradient clipping.",
        default={},
    )

    learning_rate_warmup_epochs: float = schema_utils.NonNegativeFloat(
        default=1.0,
        description="Number of epochs to warmup the learning rate for.",
        parameter_metadata=TRAINER_METADATA["learning_rate_warmup_epochs"],
    )

    learning_rate_scaling: str = schema_utils.StringOptions(
        ["constant", "sqrt", "linear"],
        default="linear",
        description=(
            "Scale by which to increase the learning rate as the number of distributed workers increases. "
            "Traditionally the learning rate is scaled linearly with the number of workers to reflect the proportion by"
            " which the effective batch size is increased. For very large batch sizes, a softer square-root scale can "
            "sometimes lead to better model performance. If the learning rate is hand-tuned for a given number of "
            "workers, setting this value to constant can be used to disable scale-up."
        ),
        parameter_metadata=TRAINER_METADATA["learning_rate_scaling"],
    )
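A sketch of how the two checkpointing knobs relate (the arithmetic here is assumed for illustration; per the descriptions above, setting both to non-zero values is invalid):

# Hypothetical dataset and batch size.
num_examples, batch_size = 100_000, 128
steps_per_epoch = num_examples // batch_size  # 781 steps per epoch

checkpoints_per_epoch = 2
steps_per_checkpoint = steps_per_epoch // checkpoints_per_epoch  # checkpoint every 390 steps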
Example #7
class GBMTrainerConfig(BaseTrainerConfig):
    """Dataclass that configures most of the hyperparameters used for GBM model training."""

    type: str = schema_utils.StringOptions(
        ["lightgbm_trainer"],
        default="lightgbm_trainer",
        description=(
            "Trainer to use for training the model. Must be one of ['lightgbm_trainer'] - "
            "corresponds to name in `ludwig.trainers.registry.(ray_)trainers_registry` "
            "(default: 'lightgbm_trainer')"
        ),
        allow_none=False,
    )

    # LightGBM core parameters (https://lightgbm.readthedocs.io/en/latest/Parameters.html)
    boosting_type: str = schema_utils.StringOptions(
        ["gbdt", "rf", "dart", "goss"],
        default="gbdt",
        description="Type of boosting algorithm to use with GBM trainer.",
    )

    tree_learner: str = schema_utils.StringOptions(
        ["serial", "feature", "data", "voting"],
        default="serial",
        description="Type of tree learner to use with GBM trainer.",
    )

    num_boost_round: int = schema_utils.PositiveInteger(
        default=100, description="Number of boosting rounds to perform with GBM trainer."
    )

    num_leaves: int = schema_utils.PositiveInteger(
        default=31, description="Number of leaves to use in the tree with GBM trainer."
    )

    # LightGBM Learning Control params
    max_depth: int = schema_utils.Integer(
        default=-1,
        description="Maximum depth of a tree in the GBM trainer. A negative value means no limit.",
    )

    min_data_in_leaf: int = schema_utils.PositiveInteger(
        default=20, description="Minimum number of data points in a leaf with GBM trainer."
    )

    min_sum_hessian_in_leaf: float = schema_utils.NonNegativeFloat(
        default=1e-3, description="Minimum sum of hessians in a leaf with GBM trainer."
    )

    bagging_fraction: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of data to use for bagging with GBM trainer."
    )

    pos_bagging_fraction: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of positive data to use for bagging with GBM trainer."
    )

    neg_bagging_fraction: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of negative data to use for bagging with GBM trainer."
    )

    bagging_freq: int = schema_utils.NonNegativeInteger(default=0, description="Frequency of bagging with GBM trainer.")

    bagging_seed: int = schema_utils.Integer(default=3, description="Random seed for bagging with GBM trainer.")

    feature_fraction: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of features to use in the GBM trainer."
    )

    feature_fraction_bynode: float = schema_utils.FloatRange(
        default=1.0, min=0.0, max=1.0, description="Fraction of features to use for each tree node with GBM trainer."
    )

    feature_fraction_seed: int = schema_utils.Integer(
        default=2, description="Random seed for feature fraction with GBM trainer."
    )

    extra_trees: bool = schema_utils.Boolean(
        default=False, description="Whether to use extremely randomized trees in the GBM trainer."
    )

    extra_seed: int = schema_utils.Integer(
        default=6, description="Random seed for extremely randomized trees in the GBM trainer."
    )

    max_delta_step: float = schema_utils.FloatRange(
        default=0.0,
        min=0.0,
        max=1.0,
        description=(
            "Used to limit the max output of tree leaves in the GBM trainer. A negative value means no constraint."
        ),
    )

    lambda_l1: float = schema_utils.NonNegativeFloat(
        default=0.0, description="L1 regularization factor for the GBM trainer."
    )

    lambda_l2: float = schema_utils.NonNegativeFloat(
        default=0.0, description="L2 regularization factor for the GBM trainer."
    )

    linear_lambda: float = schema_utils.NonNegativeFloat(
        default=0.0, description="Linear tree regularization in the GBM trainer."
    )

    min_gain_to_split: float = schema_utils.NonNegativeFloat(
        default=0.0, description="Minimum gain to split a leaf in the GBM trainer."
    )

    drop_rate: float = schema_utils.FloatRange(
        default=0.1,
        min=0.0,
        max=1.0,
        description="Dropout rate for the GBM trainer. Used only with boosting_type 'dart'.",
    )

    max_drop: int = schema_utils.Integer(
        default=50,
        description=(
            "Maximum number of dropped trees during one boosting iteration. "
            "Used only with boosting_type 'dart'. A negative value means no limit."
        ),
    )

    skip_drop: float = schema_utils.FloatRange(
        default=0.5,
        min=0.0,
        max=1.0,
        description=(
            "Probability of skipping the dropout during one boosting iteration. Used only with boosting_type 'dart'."
        ),
    )

    xgboost_dart_mode: bool = schema_utils.Boolean(
        default=False,
        description="Whether to use xgboost dart mode in the GBM trainer. Used only with boosting_type 'dart'.",
    )

    uniform_drop: bool = schema_utils.Boolean(
        default=False,
        description=("Whether to use uniform dropout in the GBM trainer. Used only with boosting_type 'dart'."),
    )

    drop_seed: int = schema_utils.Integer(
        default=4,
        description="Random seed to choose dropping models in the GBM trainer. Used only with boosting_type 'dart'.",
    )

    top_rate: float = schema_utils.FloatRange(
        default=0.2,
        min=0.0,
        max=1.0,
        description="The retain ratio of large gradient data in the GBM trainer. Used only with boosting_type 'goss'.",
    )

    other_rate: float = schema_utils.FloatRange(
        default=0.1,
        min=0.0,
        max=1.0,
        description="The retain ratio of small gradient data in the GBM trainer. Used only with boosting_type 'goss'.",
    )

    min_data_per_group: int = schema_utils.PositiveInteger(
        default=100,
        description="Minimum number of data points per categorical group for the GBM trainer.",
    )

    max_cat_threshold: int = schema_utils.PositiveInteger(
        default=32,
        description="Number of split points considered for categorical features for the GBM trainer.",
    )

    cat_l2: float = schema_utils.NonNegativeFloat(
        default=10.0, description="L2 regularization factor for categorical split in the GBM trainer."
    )

    cat_smooth: float = schema_utils.NonNegativeFloat(
        default=10.0, description="Smoothing factor for categorical split in the GBM trainer."
    )

    max_cat_to_onehot: int = schema_utils.PositiveInteger(
        default=4,
        description="Maximum categorical cardinality required before one-hot encoding in the GBM trainer.",
    )

    cegb_tradeoff: float = schema_utils.NonNegativeFloat(
        default=1.0,
        description="Cost-effective gradient boosting multiplier for all penalties in the GBM trainer.",
    )

    cegb_penalty_split: float = schema_utils.NonNegativeFloat(
        default=0.0,
        description="Cost-effective gradient boosting penalty for splitting a node in the GBM trainer.",
    )

    path_smooth: float = schema_utils.NonNegativeFloat(
        default=0.0,
        description="Smoothing factor applied to tree nodes in the GBM trainer.",
    )

    verbose: int = schema_utils.IntegerRange(default=0, min=-1, max=2, description="Verbosity level for GBM trainer.")

    # LightGBM IO params
    max_bin: int = schema_utils.PositiveInteger(
        default=255, description="Maximum number of bins to use for discretizing features with GBM trainer."
    )
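These fields mirror LightGBM's core parameters (see the linked docs in the comment above). A minimal sketch of the params dict they could translate into, assuming the field names map 1:1 onto LightGBM parameter names:

# Values shown are the schema defaults from the class above.
params = {
    "boosting_type": "gbdt",
    "tree_learner": "serial",
    "num_leaves": 31,
    "max_depth": -1,          # negative: no depth limit
    "bagging_fraction": 1.0,
    "feature_fraction": 1.0,
    "lambda_l1": 0.0,
    "lambda_l2": 0.0,
}
# A dict like this would be handed to lightgbm.train(params, ...) together with
# num_boost_round=100.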
Example #8
class BaseTrainerConfig(schema_utils.BaseMarshmallowConfig, ABC):
    """Common trainer parameter values."""

    type: str

    learning_rate: float = schema_utils.FloatOrAutoField(
        default=0.001,
        min=0.0,
        max=1.0,
        default_numeric=0.001,
        allow_none=False,
        description=(
            "Controls how much to change the model in response to the estimated error each time the model weights are "
            "updated. If 'auto', the optimal learning rate is estimated by choosing the learning rate that produces "
            "the smallest non-diverging gradient update."
        ),
        parameter_metadata=TRAINER_METADATA["learning_rate"],
    )

    validation_metric: str = schema_utils.String(
        default=LOSS,
        description=(
            "Metric used on `validation_field`, set by default to the "
            "output feature type's `default_validation_metric`."
        ),
        parameter_metadata=TRAINER_METADATA["validation_metric"],
    )

    # TODO(#1673): Need some more logic here for validating against output features
    validation_field: str = schema_utils.String(
        default=COMBINED,
        description="First output feature, by default it is set as the same field of the first output feature.",
        parameter_metadata=TRAINER_METADATA["validation_field"],
    )

    eval_batch_size: Union[None, int, str] = schema_utils.IntegerOrAutoField(
        default=None,
        allow_none=True,
        min_exclusive=0,
        description=(
            "Size of batch to pass to the model for evaluation. If it is `0` or `None`, the same value of `batch_size` "
            "is used. This is useful to speedup evaluation with a much bigger batch size than training, if enough "
            "memory is available. If ’auto’, the biggest batch size (power of 2) that can fit in memory will be used."
        ),
        parameter_metadata=TRAINER_METADATA["eval_batch_size"],
    )

    early_stop: int = schema_utils.IntegerRange(
        default=5,
        min=-1,
        description=(
            "Number of consecutive rounds of evaluation without any improvement on the `validation_metric` that "
            "triggers training to stop. Can be set to -1, which disables early stopping entirely."
        ),
        parameter_metadata=TRAINER_METADATA["early_stop"],
    )

    evaluate_training_set: bool = schema_utils.Boolean(
        default=True,
        description="Whether to include the entire training set during evaluation.",
        parameter_metadata=TRAINER_METADATA["evaluate_training_set"],
    )
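A rough sketch of the early_stop semantics described above (illustrative only; assumes a lower-is-better metric such as loss):

def should_stop(history, early_stop):
    """Return True when the last `early_stop` evaluations failed to improve on the best value."""
    if early_stop < 0 or len(history) <= early_stop:
        return False  # -1 disables early stopping entirely
    best = min(history)
    # Stop when the best value did not occur in the last `early_stop` evaluations.
    return best not in history[-early_stop:]

print(should_stop([0.9, 0.7, 0.72, 0.71, 0.73], early_stop=3))  # True: 3 rounds without improvement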
Example #9
class CommonTransformerConfig:
    """Common transformer parameter values."""

    num_layers: int = schema_utils.PositiveInteger(default=1, description="")

    hidden_size: int = schema_utils.NonNegativeInteger(
        default=256,
        description=
        ("The number of hidden units of the TransformerStack as well as the dimension that each incoming input "
         "feature is projected to before feeding to the TransformerStack"),
    )

    num_heads: int = schema_utils.NonNegativeInteger(
        default=8,
        description=
        "Number of heads of the self attention in the transformer block.")

    transformer_output_size: int = schema_utils.NonNegativeInteger(
        default=256,
        description=
        ("Size of the fully connected layer after self attention in the transformer block. This is usually the same "
         "as `hidden_size` and `embedding_size`."),
    )

    dropout: float = schema_utils.FloatRange(
        default=0.1,
        min=0,
        max=1,
        description="Dropout rate for the transformer block.")

    fc_layers: Optional[List[Dict[str, Any]]] = schema_utils.DictList(
        description="")

    # TODO(#1673): Add conditional logic for fields like this one:
    num_fc_layers: int = schema_utils.NonNegativeInteger(
        default=0,
        description=
        "The number of stacked fully connected layers (only applies if `reduce_output` is not null).",
    )

    output_size: int = schema_utils.PositiveInteger(
        default=256, description="Output size of a fully connected layer.")

    use_bias: bool = schema_utils.Boolean(
        default=True, description="Whether the layer uses a bias vector.")

    weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(
        default="xavier_uniform", description="")

    bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict(
        default="zeros", description="")

    norm: Optional[str] = schema_utils.StringOptions(["batch", "layer"],
                                                     description="")

    norm_params: Optional[dict] = schema_utils.Dict(description="")

    fc_activation: str = schema_utils.ActivationOptions(default="relu",
                                                        description="")

    fc_dropout: float = schema_utils.FloatRange(default=0.0,
                                                min=0,
                                                max=1,
                                                description="")

    fc_residual: bool = schema_utils.Boolean(default=False, description="")
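A small PyTorch sketch of the projection described by hidden_size (not Ludwig's code): each incoming input feature is projected to hidden_size before entering the TransformerStack.

import torch

hidden_size = 256
# Hypothetical encoder output of size 64 for a batch of 4 examples.
x = torch.randn(4, 64)
projection = torch.nn.Linear(64, hidden_size)
h = projection(x)  # shape: [4, 256], one transformer "token" per input feature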
Example #10
class TextPreprocessingConfig(schema_utils.BaseMarshmallowConfig):
    """TextPreprocessingConfig is a dataclass that configures the parameters used for a text input feature."""

    pretrained_model_name_or_path: str = schema_utils.String(
        default=None,
        allow_none=True,
        description=
        "This can be either the name of a pretrained HuggingFace model or a path where it was downloaded",
    )

    tokenizer: str = schema_utils.StringOptions(
        tokenizer_registry.keys(),
        default="space_punct",
        allow_none=False,
        description=
        "Defines how to map from the raw string content of the dataset column to a sequence of elements.",
    )

    vocab_file: str = schema_utils.String(
        default=None,
        allow_none=True,
        description=
        "Filepath string to a UTF-8 encoded file containing the sequence's vocabulary. On each line the "
        "first string until \t or \n is considered a word.",
    )

    max_sequence_length: int = schema_utils.PositiveInteger(
        default=256,
        allow_none=False,
        description=
        "The maximum length (number of tokens) of the text. Texts that are longer than this value will be "
        "truncated, while texts that are shorter will be padded.",
    )

    most_common: int = schema_utils.PositiveInteger(
        default=20000,
        allow_none=False,
        description=
        "The maximum number of most common tokens in the vocabulary. If the data contains more than this "
        "amount, the most infrequent symbols will be treated as unknown.",
    )

    padding_symbol: str = schema_utils.String(
        default="<PAD>",
        allow_none=False,
        description=
        "The string used as the padding symbol for sequence features. Ignored for features using "
        "huggingface encoders, which have their own vocabulary.",
    )

    unknown_symbol: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description=
        "The string used as the unknown symbol for sequence features. Ignored for features using "
        "huggingface encoders, which have their own vocabulary.",
    )

    padding: str = schema_utils.StringOptions(
        ["left", "right"],
        default="right",
        allow_none=False,
        description=
        "the direction of the padding. right and left are available options.",
    )

    lowercase: bool = schema_utils.Boolean(
        default=True,
        description=
        "If true, converts the string to lowercase before tokenizing.",
    )

    missing_value_strategy: str = schema_utils.StringOptions(
        MISSING_VALUE_STRATEGY_OPTIONS,
        default="fill_with_const",
        allow_none=False,
        description=
        "What strategy to follow when there's a missing value in a text column",
    )

    fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description=
        "The value to replace missing values with in case the missing_value_strategy is fill_with_const",
    )

    computed_fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description=
        "The internally computed fill value to replace missing values with in case the "
        "missing_value_strategy is fill_with_mode or fill_with_mean",
        parameter_metadata=PREPROCESSING_METADATA["computed_fill_value"],
    )
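A sketch of the most_common vocabulary cut described above (illustrative; Ludwig's vocabulary building is more involved):

from collections import Counter

token_counts = Counter(["the", "cat", "the", "dog", "the"])
most_common = 2
# Keep only the `most_common` most frequent tokens; anything else maps to unknown_symbol.
vocab = [tok for tok, _ in token_counts.most_common(most_common)]  # e.g. ['the', 'cat']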
Example #11
class AudioPreprocessingConfig(schema_utils.BaseMarshmallowConfig):

    audio_file_length_limit_in_s: float = schema_utils.NonNegativeFloat(
        default=7.5,
        allow_none=False,
        description=
        "Float value that defines the maximum limit of the audio file in seconds. All files longer than "
        "this limit are cut off. All files shorter than this limit are padded with padding_value",
    )

    missing_value_strategy: str = schema_utils.StringOptions(
        MISSING_VALUE_STRATEGY_OPTIONS,
        default=BACKFILL,
        allow_none=False,
        description=
        "What strategy to follow when there's a missing value in an audio column",
    )

    fill_value: float = schema_utils.NonNegativeFloat(
        default=None,
        allow_none=True,
        description=
        "The value to replace missing values with in case the missing_value_strategy is fill_with_const",
    )

    computed_fill_value: float = schema_utils.NonNegativeFloat(
        default=None,
        allow_none=True,
        description=
        "The internally computed fill value to replace missing values with in case the "
        "missing_value_strategy is fill_with_mode or fill_with_mean",
        parameter_metadata=PREPROCESSING_METADATA["computed_fill_value"],
    )

    in_memory: bool = schema_utils.Boolean(
        default=True,
        description=
        "Defines whether the audio dataset will reside in memory during the training process or will be "
        "dynamically fetched from disk (useful for large datasets). In the latter case a training batch "
        "of input audio will be fetched from disk each training iteration.",
    )

    padding_value: float = schema_utils.NonNegativeFloat(
        default=0.0,
        allow_none=False,
        description="Float value that is used for padding.")

    norm: str = schema_utils.StringOptions(
        ["per_file"],
        default=None,
        allow_none=True,
        description=
        "Normalization strategy for the audio files. If None, no normalization is performed. If "
        "per_file, z-norm is applied on a 'per file' level",
    )

    type: str = schema_utils.StringOptions(
        ["fbank", "group_delay", "raw", "stft", "stft_phase"],
        default="fbank",
        description="Defines the type of audio feature to be used.",
    )

    window_length_in_s: float = schema_utils.NonNegativeFloat(
        default=0.04,
        description=
        "Defines the window length used for the short time Fourier transformation. This is only needed if "
        "the audio_feature_type is 'raw'.",
    )

    window_shift_in_s: float = schema_utils.NonNegativeFloat(
        default=0.02,
        description=
        "Defines the window shift used for the short time Fourier transformation (also called "
        "hop_length). This is only needed if the audio_feature_type is 'raw'. ",
    )

    num_fft_points: float = schema_utils.NonNegativeFloat(
        default=None,
        allow_none=True,
        description=
        "Defines the number of FFT points used for the short time Fourier transformation."
    )

    window_type: str = schema_utils.StringOptions(
        ["bartlett", "blackman", "hamming", "hann"],
        default="hamming",
        description=
        "Defines the type window the signal is weighted before the short time Fourier transformation.",
    )

    num_filter_bands: int = schema_utils.PositiveInteger(
        default=80,
        description=
        "Defines the number of filters used in the filterbank. Only needed if audio_feature_type "
        "is 'fbank'",
    )
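For intuition, converting the window parameters above from seconds to samples (the sample rate is an assumption for the example, it is not part of this config):

sample_rate = 16_000  # assumption: a common speech sample rate
window_length = int(0.04 * sample_rate)  # window_length_in_s -> 640 samples per frame
window_shift = int(0.02 * sample_rate)   # window_shift_in_s (hop length) -> 320 samples
frames_per_second = sample_rate // window_shift  # 50 feature frames per second of audio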
Example #12
class ImagePreprocessingConfig(schema_utils.BaseMarshmallowConfig):

    missing_value_strategy: str = schema_utils.StringOptions(
        MISSING_VALUE_STRATEGY_OPTIONS,
        default="backfill",
        allow_none=False,
        description=
        "What strategy to follow when there's a missing value in an image column",
    )

    fill_value: float = schema_utils.NonNegativeFloat(
        default=None,
        allow_none=True,
        description=
        "The maximum number of most common tokens to be considered. If the data contains more than this "
        "amount, the most infrequent tokens will be treated as unknown.",
    )

    computed_fill_value: float = schema_utils.NonNegativeFloat(
        default=None,
        allow_none=True,
        description=
        "The internally computed fill value to replace missing values with in case the "
        "missing_value_strategy is fill_with_mode or fill_with_mean",
        parameter_metadata=PREPROCESSING_METADATA["computed_fill_value"],
    )

    height: int = schema_utils.PositiveInteger(
        default=None,
        allow_none=True,
        description=
        "The image height in pixels. If this parameter is set, images will be resized to the specified "
        "height using the resize_method parameter. If None, images will be resized to the size of the "
        "first image in the dataset.",
    )

    width: int = schema_utils.PositiveInteger(
        default=None,
        allow_none=True,
        description=
        "The image width in pixels. If this parameter is set, images will be resized to the specified "
        "width using the resize_method parameter. If None, images will be resized to the size of the "
        "first image in the dataset.",
    )

    num_channels: int = schema_utils.PositiveInteger(
        default=None,
        allow_none=True,
        description=
        "Number of channels in the images. If specified, images will be read in the mode specified by the "
        "number of channels. If not specified, the number of channels will be inferred from the image "
        "format of the first valid image in the dataset.",
    )

    resize_method: str = schema_utils.StringOptions(
        ["crop_or_pad", "interpolate"],
        default="interpolate",
        allow_none=False,
        description="The method to use for resizing images.",
    )

    infer_image_num_channels: bool = schema_utils.Boolean(
        default=True,
        description=
        "If true, then the number of channels in the dataset is inferred from a sample of the first image "
        "in the dataset.",
    )

    infer_image_dimensions: bool = schema_utils.Boolean(
        default=True,
        description=
        "If true, then the height and width of images in the dataset will be inferred from a sample of "
        "the first image in the dataset. Each image that doesn't conform to these dimensions will be "
        "resized according to resize_method. If set to false, then the height and width of images in the "
        "dataset will be specified by the user.",
    )

    infer_image_max_height: int = schema_utils.PositiveInteger(
        default=256,
        allow_none=False,
        description=
        "If infer_image_dimensions is set, this is used as the maximum height of the images in "
        "the dataset.",
    )

    infer_image_max_width: int = schema_utils.PositiveInteger(
        default=256,
        allow_none=False,
        description=
        "If infer_image_dimensions is set, this is used as the maximum width of the images in "
        "the dataset.",
    )

    infer_image_sample_size: int = schema_utils.PositiveInteger(
        default=100,
        allow_none=False,
        description=
        "The sample size used for inferring dimensions of images in infer_image_dimensions.",
    )

    scaling: str = schema_utils.StringOptions(
        ["pixel_normalization", "pixel_standardization"],
        default="pixel_normalization",
        allow_none=False,
        description="The scaling strategy for pixel values in the image.",
    )

    in_memory: bool = schema_utils.Boolean(
        default=True,
        description=
        "Defines whether image dataset will reside in memory during the training process or will be "
        "dynamically fetched from disk (useful for large datasets). In the latter case a training batch "
        "of input images will be fetched from disk each training iteration.",
    )

    num_processes: int = schema_utils.PositiveInteger(
        default=1,
        allow_none=False,
        description=
        "Specifies the number of processes to run for preprocessing images.",
    )
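A rough sketch of the two scaling strategies (the exact computation, e.g. per-channel versus per-image statistics, is an assumption):

import numpy as np

img = np.random.randint(0, 256, size=(32, 32, 3)).astype("float32")

scaled = img / 255.0                           # pixel_normalization: map values to [0, 1]
standardized = (img - img.mean()) / img.std()  # pixel_standardization: zero mean, unit variance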
Example #13
class SequencePreprocessingConfig(schema_utils.BaseMarshmallowConfig):

    tokenizer: str = schema_utils.String(
        default="space",
        allow_none=False,
        description=
        "Defines how to map from the raw string content of the dataset column to a sequence of elements.",
    )

    vocab_file: str = schema_utils.String(
        default=None,
        allow_none=True,
        description=
        "Filepath string to a UTF-8 encoded file containing the sequence's vocabulary. On each line the "
        "first string until \t or \n is considered a word.",
    )

    max_sequence_length: int = schema_utils.PositiveInteger(
        default=256,
        allow_none=False,
        description=
        "The maximum length (number of tokens) of the text. Texts that are longer than this value will be "
        "truncated, while texts that are shorter will be padded.",
    )

    most_common: int = schema_utils.PositiveInteger(
        default=20000,
        allow_none=False,
        description=
        "The maximum number of most common tokens in the vocabulary. If the data contains more than this "
        "amount, the most infrequent symbols will be treated as unknown.",
    )

    padding_symbol: str = schema_utils.String(
        default="<PAD>",
        allow_none=False,
        description=
        "The string used as a padding symbol. This special token is mapped to the integer ID 0 in the "
        "vocabulary.",
    )

    unknown_symbol: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description=
        "The string used as an unknown placeholder. This special token is mapped to the integer ID 1 in "
        "the vocabulary.",
    )

    padding: str = schema_utils.StringOptions(
        ["left", "right"],
        default="right",
        allow_none=False,
        description=
        "the direction of the padding. right and left are available options.",
    )

    lowercase: bool = schema_utils.Boolean(
        default=False,
        description=
        "If true, converts the string to lowercase before tokenizing.",
    )

    missing_value_strategy: str = schema_utils.StringOptions(
        MISSING_VALUE_STRATEGY_OPTIONS,
        default="fill_with_const",
        allow_none=False,
        description=
        "What strategy to follow when there's a missing value in a text column",
    )

    fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description=
        "The value to replace missing values with in case the missing_value_strategy is fill_with_const",
    )

    computed_fill_value: str = schema_utils.String(
        default=strings_utils.UNKNOWN_SYMBOL,
        allow_none=False,
        description=
        "The internally computed fill value to replace missing values with in case the "
        "missing_value_strategy is fill_with_mode or fill_with_mean",
        parameter_metadata=PREPROCESSING_METADATA["computed_fill_value"],
    )
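A sketch of the padding and truncation behavior described by max_sequence_length and padding (illustrative only; per the descriptions above, the padding symbol maps to integer ID 0):

def pad_or_truncate(ids, max_length, pad_id=0, direction="right"):
    # Sequences longer than max_length are truncated; shorter ones are padded.
    ids = ids[:max_length]
    padding = [pad_id] * (max_length - len(ids))
    return ids + padding if direction == "right" else padding + ids

print(pad_or_truncate([5, 9, 2], max_length=6))                   # [5, 9, 2, 0, 0, 0]
print(pad_or_truncate([5, 9, 2], max_length=6, direction="left")) # [0, 0, 0, 5, 9, 2]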