class EvaluationTokenizer(object): """A generic evaluation-time tokenizer, which leverages built-in tokenizers in sacreBLEU (https://github.com/mjpost/sacrebleu). It additionally provides lowercasing, punctuation removal and character tokenization, which are applied after sacreBLEU tokenization. Args: tokenizer_type (str): the type of sacreBLEU tokenizer to apply. lowercase (bool): lowercase the text. punctuation_removal (bool): remove punctuation (based on unicode category) from text. character_tokenization (bool): tokenize the text to characters. """ SPACE = chr(32) SPACE_ESCAPE = chr(9601) ALL_TOKENIZER_TYPES = ChoiceEnum(["none", "13a", "intl", "zh", "ja-mecab"]) def __init__( self, tokenizer_type: str = "13a", lowercase: bool = False, punctuation_removal: bool = False, character_tokenization: bool = False, ): from sacrebleu.tokenizers import TOKENIZERS assert tokenizer_type in TOKENIZERS, f"{tokenizer_type}, {TOKENIZERS}" self.lowercase = lowercase self.punctuation_removal = punctuation_removal self.character_tokenization = character_tokenization self.tokenizer = TOKENIZERS[tokenizer_type] @classmethod def remove_punctuation(cls, sent: str): """Remove punctuation based on Unicode category.""" return cls.SPACE.join( t for t in sent.split(cls.SPACE) if not all(unicodedata.category(c)[0] == "P" for c in t)) def tokenize(self, sent: str): tokenized = self.tokenizer()(sent) if self.punctuation_removal: tokenized = self.remove_punctuation(tokenized) if self.character_tokenization: tokenized = self.SPACE.join( list(tokenized.replace(self.SPACE, self.SPACE_ESCAPE))) if self.lowercase: tokenized = tokenized.lower() return tokenized
class LanguageModelingConfig(FairseqDataclass): data: Optional[str] = field( default=None, metadata={"help": "path to data directory"} ) sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field( default="none", metadata={ "help": 'If omitted or "none", fills each sample with tokens-per-sample ' 'tokens. If set to "complete", splits samples only at the end ' "of sentence, but may include multiple sentences per sample. " '"complete_doc" is similar but respects doc boundaries. ' 'If set to "eos", includes only one sentence per sample.' }, ) tokens_per_sample: int = field( default=1024, metadata={"help": "max number of tokens per sample for LM dataset"}, ) output_dictionary_size: int = field( default=-1, metadata={"help": "limit the size of output dictionary"} ) self_target: bool = field(default=False, metadata={"help": "include self target"}) future_target: bool = field( default=False, metadata={"help": "include future target"} ) past_target: bool = field(default=False, metadata={"help": "include past target"}) add_bos_token: bool = field( default=False, metadata={"help": "prepend beginning of sentence token (<s>)"} ) max_target_positions: Optional[int] = field( default=None, metadata={"help": "max number of tokens in the target sequence"} ) shorten_method: SHORTEN_METHOD_CHOICES = field( default="none", metadata={ "help": "if not none, shorten sequences that exceed --tokens-per-sample" }, ) shorten_data_split_list: str = field( default="", metadata={ "help": "comma-separated list of dataset splits to apply shortening to, " 'e.g., "train,valid" (default: all dataset splits)' }, ) # TODO common vars below add to parent seed: int = II("common.seed") dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II( "dataset.dataset_impl" ) data_buffer_size: int = II("dataset.data_buffer_size") tpu: bool = II("common.tpu") use_plasma_view: bool = II("common.use_plasma_view") plasma_path: str = II("common.plasma_path")
import torch.nn.functional as F from fairseq_stchde.dataclass import ChoiceEnum, FairseqDataclass from fairseq_stchde.models import BaseFairseqModel, register_model from fairseq_stchde.modules import ( Fp32GroupNorm, Fp32LayerNorm, GumbelVectorQuantizer, KmeansVectorQuantizer, TransposeLast, ) from fairseq_stchde.tasks import FairseqTask from fairseq_stchde.utils import buffered_arange logger = logging.getLogger(__name__) AGGREGATOR_CHOICES = ChoiceEnum(["cnn", "gru"]) PROJECT_FEATURES_CHOICES = ChoiceEnum(["none", "same", "new"]) ACTIVATION_CHOICES = ChoiceEnum(["relu", "gelu"]) VQ_TYPE_CHOICES = ChoiceEnum(["none", "gumbel", "kmeans"]) @dataclass class Wav2VecConfig(FairseqDataclass): prediction_steps: int = field( default=12, metadata={"help": "number of steps ahead to predict"}) sample_distance: Optional[int] = field( default=None, metadata={ "help": "sample distance from target. does not work properly with cross-sampling" },
class TranslationConfig(FairseqDataclass): data: Optional[str] = field( default=None, metadata={ "help": "colon separated path to data directories list, will be iterated upon during epochs " "in round-robin manner; however, valid and test data are always in the first directory " "to avoid the need for repeating them in all directories" }, ) source_lang: Optional[str] = field( default=None, metadata={ "help": "source language", "argparse_alias": "-s", }, ) target_lang: Optional[str] = field( default=None, metadata={ "help": "target language", "argparse_alias": "-t", }, ) load_alignments: bool = field( default=False, metadata={"help": "load the binarized alignments"}) left_pad_source: bool = field( default=True, metadata={"help": "pad the source on the left"}) left_pad_target: bool = field( default=False, metadata={"help": "pad the target on the left"}) max_source_positions: int = field( default=1024, metadata={"help": "max number of tokens in the source sequence"}) max_target_positions: int = field( default=1024, metadata={"help": "max number of tokens in the target sequence"}) upsample_primary: int = field( default=-1, metadata={"help": "the amount of upsample primary dataset"}) truncate_source: bool = field( default=False, metadata={"help": "truncate source to max-source-positions"}) num_batch_buckets: int = field( default=0, metadata={ "help": "if >0, then bucket source and target lengths into " "N buckets and pad accordingly; this is useful on TPUs to minimize the number of compilations" }, ) train_subset: str = II("dataset.train_subset") dataset_impl: Optional[ChoiceEnum( get_available_dataset_impl())] = II("dataset.dataset_impl") required_seq_len_multiple: int = II("dataset.required_seq_len_multiple") # options for reporting BLEU during validation eval_bleu: bool = field(default=False, metadata={"help": "evaluation with BLEU scores"}) eval_bleu_args: Optional[str] = field( default="{}", metadata={ "help": 'generation args for BLUE scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string' }, ) eval_bleu_detok: str = field( default="space", metadata={ "help": "detokenize before computing BLEU (e.g., 'moses'); required if using --eval-bleu; " "use 'space' to disable detokenization; see fairseq_stchde.data.encoders for other options" }, ) eval_bleu_detok_args: Optional[str] = field( default="{}", metadata={ "help": "args for building the tokenizer, if needed, as JSON string" }, ) eval_tokenized_bleu: bool = field( default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"}) eval_bleu_remove_bpe: Optional[str] = field( default=None, metadata={ "help": "remove BPE before computing BLEU", "argparse_const": "@@ ", }, ) eval_bleu_print_samples: bool = field( default=False, metadata={"help": "print sample generations during validation"})
class TransformerLanguageModelConfig(FairseqDataclass): activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( default="relu", metadata={"help": "activation function to use"}) dropout: float = field(default=0.1, metadata={"help": "dropout probability"}) attention_dropout: float = field( default=0.0, metadata={"help": "dropout probability for attention weights"}) activation_dropout: float = field( default=0.0, metadata={"help": "dropout probability after activation in FFN."}) relu_dropout: float = field( default=0.0, metadata={"help": "dropout probability after activation in FFN."}) decoder_embed_dim: int = field( default=512, metadata={"help": "decoder embedding dimension"}) decoder_output_dim: int = field( default=512, metadata={"help": "decoder output dimension"}) decoder_input_dim: int = field( default=512, metadata={"help": "decoder input dimension"}) decoder_ffn_embed_dim: int = field( default=2048, metadata={"help": "decoder embedding dimension for FFN"}) decoder_layers: int = field(default=6, metadata={"help": "num decoder layers"}) decoder_attention_heads: int = field( default=8, metadata={"help": "num decoder attention heads"}) decoder_normalize_before: bool = field( default=False, metadata={"help": "apply layernorm before each decoder block"}) no_decoder_final_norm: bool = field( default=False, metadata={ "help": "don't add an extra layernorm after the last decoder block" }, ) adaptive_softmax_cutoff: Optional[str] = field( default=None, metadata={ "help": "comma separated list of adaptive softmax cutoff points. " "Must be used with adaptive_loss criterion" }, ) adaptive_softmax_dropout: float = field( default=0, metadata={ "help": "sets adaptive softmax dropout for the tail projections" }, ) adaptive_softmax_factor: float = field( default=4, metadata={"help": "adaptive input factor"}) no_token_positional_embeddings: bool = field( default=False, metadata={ "help": "if set, disables positional embeddings (outside self attention)" }, ) share_decoder_input_output_embed: bool = field( default=False, metadata={"help": "share decoder input and output embeddings"}) character_embeddings: bool = field( default=False, metadata={ "help": "if set, uses character embedding convolutions to produce token embeddings" }, ) character_filters: str = field( default= "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]", metadata={"help": "size of character embeddings"}, ) character_embedding_dim: int = field( default=4, metadata={"help": "size of character embeddings"}) char_embedder_highway_layers: int = field( default=2, metadata={ "help": "number of highway layers for character token embeddder" }, ) adaptive_input: bool = field( default=False, metadata={"help": "if set, uses adaptive input"}) adaptive_input_factor: float = field( default=4, metadata={"help": "adaptive input factor"}) adaptive_input_cutoff: Optional[str] = field( default=None, metadata={ "help": "comma separated list of adaptive input cutoff points." }, ) tie_adaptive_weights: bool = field( default=False, metadata={ "help": "if set, ties the weights of adaptive softmax and adaptive input" }, ) tie_adaptive_proj: bool = field( default=False, metadata={ "help": "if set, ties the projection weights of adaptive softmax and adaptive input" }, ) decoder_learned_pos: bool = field( default=False, metadata={"help": "use learned positional embeddings in the decoder"}, ) layernorm_embedding: bool = field( default=False, metadata={"help": "add layernorm to embedding"}) no_scale_embedding: bool = field( default=False, metadata={"help": "if True, dont scale embeddings"}) checkpoint_activations: bool = field( default=False, metadata={"help": "checkpoint activations at each layer"}) offload_activations: bool = field( default=False, metadata={ "help": "move checkpointed activations to CPU after they are used." }, ) # config for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) decoder_layerdrop: float = field( default=0.0, metadata={"help": "LayerDrop probability for decoder"}) decoder_layers_to_keep: Optional[str] = field( default=None, metadata={ "help": "which layers to *keep* when pruning as a comma-separated list" }, ) # config for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) quant_noise_pq: float = field( default=0.0, metadata={"help": "iterative PQ quantization noise at training time"}, ) quant_noise_pq_block_size: int = field( default=8, metadata={"help": "block size of quantization noise at training time"}, ) quant_noise_scalar: float = field( default=0.0, metadata={ "help": "scalar quantization noise and scalar quantization at training time" }, ) # config for Fully Sharded Data Parallel (FSDP) training min_params_to_wrap: int = field( default=DEFAULT_MIN_PARAMS_TO_WRAP, metadata={ "help": ("minimum number of params for a layer to be wrapped with FSDP() when " "training with --ddp-backend=fully_sharded. Smaller values will " "improve memory efficiency, but may make torch.distributed " "communication less efficient due to smaller input sizes. This option " "is set to 0 (i.e., always wrap) when --checkpoint-activations or " "--offload-activations are passed.") }) # options from other parts of the config add_bos_token: bool = II("task.add_bos_token") tokens_per_sample: int = II("task.tokens_per_sample") max_target_positions: Optional[int] = II("task.max_target_positions") tpu: bool = II("common.tpu")
class Wav2Vec2Config(FairseqDataclass): extractor_mode: EXTRACTOR_MODE_CHOICES = field( default="default", metadata={ "help": "mode for feature extractor. default has a single group norm with d " "groups in the first conv block, whereas layer_norm has layer norms in " "every block (meant to use with normalize=True)" }, ) encoder_layers: int = field( default=12, metadata={"help": "num encoder layers in the transformer"}) encoder_embed_dim: int = field( default=768, metadata={"help": "encoder embedding dimension"}) encoder_ffn_embed_dim: int = field( default=3072, metadata={"help": "encoder embedding dimension for FFN"}) encoder_attention_heads: int = field( default=12, metadata={"help": "num encoder attention heads"}) activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( default="gelu", metadata={"help": "activation function to use"}) # dropouts dropout: float = field( default=0.1, metadata={"help": "dropout probability for the transformer"}) attention_dropout: float = field( default=0.1, metadata={"help": "dropout probability for attention weights"}) activation_dropout: float = field( default=0.0, metadata={"help": "dropout probability after activation in FFN"}) encoder_layerdrop: float = field( default=0.0, metadata={"help": "probability of dropping a tarnsformer layer"}) dropout_input: float = field( default=0.0, metadata={"help": "dropout to apply to the input (after feat extr)"}, ) dropout_features: float = field( default=0.0, metadata={ "help": "dropout to apply to the features (after feat extr)" }, ) final_dim: int = field( default=0, metadata={ "help": "project final representations and targets to this many dimensions." "set to encoder_embed_dim is <= 0" }, ) layer_norm_first: bool = field( default=False, metadata={"help": "apply layernorm first in the transformer"}) conv_feature_layers: str = field( default= "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]", metadata={ "help": "string describing convolutional feature extraction layers in form of a python list that contains " "[(dim, kernel_size, stride), ...]" }, ) conv_bias: bool = field(default=False, metadata={"help": "include bias in conv encoder"}) logit_temp: float = field( default=0.1, metadata={"help": "temperature to divide logits by"}) quantize_targets: bool = field(default=False, metadata={"help": "use quantized targets"}) quantize_input: bool = field(default=False, metadata={"help": "use quantized inputs"}) same_quantizer: bool = field( default=False, metadata={"help": "use same quantizer for inputs and targets"}) target_glu: bool = field( default=False, metadata={"help": "adds projection + glu to targets"}) feature_grad_mult: float = field( default=1.0, metadata={"help": "multiply feature extractor var grads by this"}) latent_vars: int = field( default=320, metadata={ "help": "number of latent variables V in each group of the codebook" }, ) latent_groups: int = field( default=2, metadata={ "help": "number of groups G of latent variables in the codebook" }, ) latent_dim: int = field( default=0, metadata={ "help": "if > 0, uses this dimensionality for latent variables. " "otherwise uses final_dim / latent_groups" }, ) # masking mask_length: int = field(default=10, metadata={"help": "mask length"}) mask_prob: float = field( default=0.65, metadata={"help": "probability of replacing a token with mask"}) mask_selection: MASKING_DISTRIBUTION_CHOICES = field( default="static", metadata={"help": "how to choose mask length"}) mask_other: float = field( default=0, metadata={ "help": "secondary mask argument (used for more complex distributions), " "see help in compute_mask_indices" }, ) no_mask_overlap: bool = field( default=False, metadata={"help": "whether to allow masks to overlap"}) mask_min_space: int = field( default=1, metadata={ "help": "min space between spans (if no overlap is enabled)" }, ) # channel masking mask_channel_length: int = field( default=10, metadata={"help": "length of the mask for features (channels)"}) mask_channel_prob: float = field( default=0.0, metadata={"help": "probability of replacing a feature with 0"}) mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( default="static", metadata={"help": "how to choose mask length for channel masking"}, ) mask_channel_other: float = field( default=0, metadata={ "help": "secondary mask argument (used for more complex distributions), " "see help in compute_mask_indicesh" }, ) no_mask_channel_overlap: bool = field( default=False, metadata={"help": "whether to allow channel masks to overlap"}) mask_channel_min_space: int = field( default=1, metadata={ "help": "min space between spans (if no overlap is enabled)" }, ) # negative selection num_negatives: int = field( default=100, metadata={"help": "number of negative examples from the same sample"}, ) negatives_from_everywhere: bool = field( default=False, metadata={ "help": "sample negatives from everywhere, not just masked states" }, ) cross_sample_negatives: int = field( default=0, metadata={"help": "number of negative examples from the any sample"}) codebook_negatives: int = field( default=0, metadata={"help": "number of negative examples codebook"}) # positional embeddings conv_pos: int = field( default=128, metadata={ "help": "number of filters for convolutional positional embeddings" }, ) conv_pos_groups: int = field( default=16, metadata={ "help": "number of groups for convolutional positional embedding" }, ) latent_temp: Tuple[float, float, float] = field( default=(2, 0.5, 0.999995), metadata={ "help": "temperature for latent variable sampling. " "can be tuple of 3 values (start, end, decay)" }, )
from fairseq_stchde.dataclass import ChoiceEnum, FairseqDataclass from fairseq_stchde.models import BaseFairseqModel, register_model from fairseq_stchde.modules import ( Fp32GroupNorm, Fp32LayerNorm, GradMultiply, GumbelVectorQuantizer, LayerNorm, MultiheadAttention, SamePad, TransposeLast, ) from fairseq_stchde.modules.transformer_sentence_encoder import init_bert_params from fairseq_stchde.utils import buffered_arange EXTRACTOR_MODE_CHOICES = ChoiceEnum(["default", "layer_norm"]) MASKING_DISTRIBUTION_CHOICES = ChoiceEnum( ["static", "uniform", "normal", "poisson"]) @dataclass class Wav2Vec2Config(FairseqDataclass): extractor_mode: EXTRACTOR_MODE_CHOICES = field( default="default", metadata={ "help": "mode for feature extractor. default has a single group norm with d " "groups in the first conv block, whereas layer_norm has layer norms in " "every block (meant to use with normalize=True)" }, )
# Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from dataclasses import dataclass, field import torch from fairseq_stchde import utils from fairseq_stchde.data import LanguagePairDataset from fairseq_stchde.dataclass import ChoiceEnum from fairseq_stchde.tasks import register_task from fairseq_stchde.tasks.translation import TranslationConfig, TranslationTask, load_langpair_dataset from fairseq_stchde.utils import new_arange NOISE_CHOICES = ChoiceEnum(["random_delete", "random_mask", "no_noise", "full_mask"]) @dataclass class TranslationLevenshteinConfig(TranslationConfig): noise: NOISE_CHOICES = field( default="random_delete", metadata={ "help": "type of noise" }, ) @register_task("translation_lev", dataclass=TranslationLevenshteinConfig) class TranslationLevenshteinTask(TranslationTask): """ Translation (Sequence Generation) task for Levenshtein Transformer See `"Levenshtein Transformer" <https://arxiv.org/abs/1905.11006>`_.
NumelDataset, PadDataset, PrependTokenDataset, StripTokenDataset, TokenBlockDataset, TruncatedDictionary, data_utils, ) from fairseq_stchde.data.indexed_dataset import get_available_dataset_impl from fairseq_stchde.data.shorten_dataset import maybe_shorten_dataset from fairseq_stchde.dataclass import ChoiceEnum, FairseqDataclass from fairseq_stchde.tasks import LegacyFairseqTask, register_task from omegaconf import II SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"]) SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"]) logger = logging.getLogger(__name__) @dataclass class LanguageModelingConfig(FairseqDataclass): data: Optional[str] = field( default=None, metadata={"help": "path to data directory"} ) sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field( default="none", metadata={ "help": 'If omitted or "none", fills each sample with tokens-per-sample ' 'tokens. If set to "complete", splits samples only at the end ' "of sentence, but may include multiple sentences per sample. "