def __init__(self, args, dictionary, embed_tokens, encoder_module_list=None):
    """Transformer encoder that can optionally run under fairscale's Pipe
    for pipeline model parallelism.

    Args:
        args: parsed arguments; reads ``encoder_layers``,
            ``pipeline_encoder_balance``, ``pipeline_encoder_devices``,
            ``pipeline_chunks`` and ``pipeline_checkpoint``
        dictionary: vocabulary passed to the parent constructor
        embed_tokens: token embedding module, or an ``nn.ModuleList`` of them
        encoder_module_list: pre-partitioned encoder modules; when given,
            pipeline mode is enabled and the modules are wrapped in a Pipe
    """
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    # fairscale is an optional dependency; fail early with a clear message.
    try:
        from fairscale.nn import Pipe
    except ImportError:
        raise ImportError("Please install fairscale with: pip install fairscale")
    self.use_pipeline = encoder_module_list is not None
    if not self.use_pipeline:
        # Plain (non-pipelined) path: embedding, a stack of encoder layers,
        # and a final layer norm sized from the embedding dimension.
        self.embedding_layer = TransformerEncoderEmbedding(args, embed_tokens)
        self.encoder_layers = nn.Sequential(
            *[TransformerEncoderLayer(args) for i in range(args.encoder_layers)]
        )
        if isinstance(embed_tokens, nn.ModuleList):
            # Several embedding tables: the layer norm covers their
            # concatenated dimension.
            emb_dim = sum(e.embedding_dim for e in embed_tokens)
        else:
            emb_dim = embed_tokens.embedding_dim
        self.final_layer_norm = TransformerEncoderLayerNorm(args, emb_dim)
    else:
        # Pipeline path: split the given modules across devices according to
        # the --pipeline-encoder-balance / --pipeline-encoder-devices args.
        encoder_balance = utils.eval_str_list(
            args.pipeline_encoder_balance, type=int
        )
        encoder_devices = utils.eval_str_list(
            args.pipeline_encoder_devices, type=int
        )
        assert sum(encoder_balance) == len(encoder_module_list), (
            f"Sum of encoder_balance={encoder_balance} is not equal "
            + f"to num_encoder_modules={len(encoder_module_list)}"
        )
        self.model = Pipe(
            module=nn.Sequential(*encoder_module_list),
            balance=encoder_balance,
            devices=encoder_devices,
            chunks=args.pipeline_chunks,
            checkpoint=args.pipeline_checkpoint,
        )
def build_model(cls, args, task):
    """Build a new pipeline-parallel model instance.

    Delegates encoder/decoder construction to ``cls.build_model_base`` and
    configures the pipeline split from the ``--pipeline-*`` arguments.
    """
    encoder, decoder = cls.build_model_base(args, task)
    # Instantiate via ``cls`` rather than hard-coding
    # PipelineParallelTransformerModel, so subclasses that inherit this
    # classmethod get instances of their own type.
    return cls(
        encoder=encoder,
        decoder=decoder,
        balance=utils.eval_str_list(args.pipeline_balance, type=int),
        devices=utils.eval_str_list(args.pipeline_devices, type=int),
        chunks=args.pipeline_chunks,
        checkpoint=args.pipeline_checkpoint,
    )
def get_decoder(lang):
    """Return the RNN decoder for language ``lang``, building and caching
    it on first use.

    Decoders are memoized in the enclosing ``lang_decoders`` dict, so each
    language gets exactly one decoder instance; a shared embedding table is
    used when ``shared_decoder_embed_tokens`` is set.
    """
    if lang not in lang_decoders:
        if shared_decoder_embed_tokens is not None:
            decoder_embed_tokens = shared_decoder_embed_tokens
        else:
            decoder_embed_tokens = build_embedding(
                task.dicts[lang], cfg.decoder_embed_dim, cfg.decoder_embed_path)
        lang_decoders[lang] = RNNDecoder(
            dictionary=task.dicts[lang],
            embed_dim=cfg.decoder_embed_dim,
            hidden_size=cfg.decoder_hidden_size,
            out_embed_dim=cfg.decoder_out_embed_dim,
            num_layers=cfg.decoder_layers,
            attention_type=cfg.attention_type,
            # Negative per-decoder dropout values mean "fall back to the
            # global --dropout".
            dropout_in=(cfg.decoder_dropout_in
                        if cfg.decoder_dropout_in >= 0 else cfg.dropout),
            dropout_out=(cfg.decoder_dropout_out
                         if cfg.decoder_dropout_out >= 0 else cfg.dropout),
            rnn_type=cfg.rnn_type,
            encoder_output_units=cfg.encoder_hidden_size,
            pretrained_embed=decoder_embed_tokens,
            share_input_output_embed=cfg.share_decoder_input_output_embed,
            # Adaptive softmax only applies under the adaptive_loss criterion.
            adaptive_softmax_cutoff=(utils.eval_str_list(
                cfg.adaptive_softmax_cutoff, type=int)
                if cfg.criterion == "adaptive_loss" else None),
            max_target_positions=cfg.max_target_positions,
            residuals=False,
        )
    return lang_decoders[lang]
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off TransformerModel.add_args(parser) parser.add_argument("--encoder-conv-channels", type=str, metavar="EXPR", help="list of encoder convolution\'s out channels") parser.add_argument("--encoder-conv-kernel-sizes", type=str, metavar="EXPR", help="list of encoder convolution\'s kernel sizes") parser.add_argument("--encoder-conv-strides", type=str, metavar="EXPR", help="list of encoder convolution\'s strides") parser.add_argument("--encoder-transformer-context", type=str, metavar="EXPR", help="left/right context for time-restricted self-attention; " "can be None or a tuple of two non-negative integers/None") parser.add_argument("--decoder-input-dim", type=int, metavar="N", help="decoder input dimension (extra linear layer " "if different from decoder embed dim)") # Scheduled sampling options parser.add_argument("--scheduled-sampling-probs", type=lambda p: utils.eval_str_list(p), metavar="P_1,P_2,...,P_N", default=[1.0], help="scheduled sampling probabilities of sampling the truth " "labels for N epochs starting from --start-schedule-sampling-epoch; " "all later epochs using P_N") parser.add_argument("--start-scheduled-sampling-epoch", type=int, metavar="N", default=1, help="start scheduled sampling from the specified epoch")
def add_optimization_args(parser): group = parser.add_argument_group("Optimization") # fmt: off group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N', help='force stop training at specified epoch') group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N', help='force stop training at specified update') group.add_argument( '--stop-time-hours', default=0, type=float, metavar='N', help='force stop training after specified cumulative time (if >0)') group.add_argument('--clip-norm', default=0.0, type=float, metavar='NORM', help='clip threshold of gradients') group.add_argument( '--sentence-avg', action='store_true', help='normalize gradients by the number of sentences in a batch' ' (default is to normalize by number of tokens)') group.add_argument( '--update-freq', default='1', metavar='N1,N2,...,N_K', type=lambda uf: eval_str_list(uf, type=int), help='update parameters every N_i batches, when in epoch i') group.add_argument( '--lr', '--learning-rate', default='0.25', type=eval_str_list, metavar='LR_1,LR_2,...,LR_N', help='learning rate for the first N epochs; all epochs >N using LR_N' ' (note: this may be interpreted differently depending on --lr-scheduler)' ) group.add_argument( '--min-lr', default=-1, type=float, metavar='LR', help='stop training when the learning rate reaches this minimum') group.add_argument( '--use-bmuf', default=False, action='store_true', help= 'specify global optimizer for syncing models on different GPUs/shards') # fmt: on return group
def build_model(cls, args, task):
    """Build a new model instance."""
    # make sure all arguments are present in older models
    base_architecture(args)
    if getattr(args, "max_target_positions", None) is not None:
        max_target_positions = args.max_target_positions
    else:
        # Language-model style configs carry tokens_per_sample instead.
        max_target_positions = getattr(args, "tokens_per_sample",
                                       DEFAULT_MAX_TARGET_POSITIONS)

    def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
        # Load word vectors from a text file and copy the overlapping
        # entries into a freshly-initialized Embedding.
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        embed_dict = utils.parse_embedding(embed_path)
        utils.print_embed_overlap(embed_dict, dictionary)
        return utils.load_embedding(embed_dict, dictionary, embed_tokens)

    pretrained_decoder_embed = None
    if args.decoder_embed_path:
        pretrained_decoder_embed = load_pretrained_embedding_from_file(
            args.decoder_embed_path, task.target_dictionary,
            args.decoder_embed_dim)
    if args.share_decoder_input_output_embed:
        # double check all parameters combinations are valid
        if task.source_dictionary != task.target_dictionary:
            raise ValueError(
                "--share-decoder-input-output-embeddings requires a joint dictionary"
            )
        if args.decoder_embed_dim != args.decoder_out_embed_dim:
            raise ValueError(
                "--share-decoder-input-output-embeddings requires "
                "--decoder-embed-dim to match --decoder-out-embed-dim")
    decoder = LSTMDecoder(
        # NOTE(review): the decoder uses task.dictionary while the
        # pretrained embedding above is loaded against
        # task.target_dictionary — confirm these are the same object for
        # this task type.
        dictionary=task.dictionary,
        embed_dim=args.decoder_embed_dim,
        hidden_size=args.decoder_hidden_size,
        out_embed_dim=args.decoder_out_embed_dim,
        num_layers=args.decoder_layers,
        dropout_in=args.decoder_dropout_in,
        dropout_out=args.decoder_dropout_out,
        attention=False,  # decoder-only language model doesn't support attention
        encoder_output_units=0,
        pretrained_embed=pretrained_decoder_embed,
        share_input_output_embed=args.share_decoder_input_output_embed,
        # Adaptive softmax only applies under the adaptive_loss criterion.
        adaptive_softmax_cutoff=(utils.eval_str_list(
            args.adaptive_softmax_cutoff, type=int)
            if args.criterion == "adaptive_loss" else None),
        max_target_positions=max_target_positions,
        residuals=args.residuals,
    )
    return cls(decoder)
def build_model(cls, args, task):
    """Build a new convolutional language model instance."""
    # Backfill any architecture defaults missing from older configs.
    base_lm_architecture(args)

    # Older configs may carry max_target_positions instead of
    # tokens_per_sample; mirror it over so the decoder sees a value.
    if hasattr(args, "max_target_positions") and not hasattr(
            args, "tokens_per_sample"):
        args.tokens_per_sample = args.max_target_positions

    # Adaptive softmax is only active under the adaptive_loss criterion.
    if args.criterion == "adaptive_loss":
        cutoff = utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
    else:
        cutoff = None

    decoder = FConvDecoder(
        dictionary=task.target_dictionary,
        embed_dim=args.decoder_embed_dim,
        convolutions=eval(args.decoder_layers),
        out_embed_dim=args.decoder_embed_dim,
        attention=eval(args.decoder_attention),
        dropout=args.dropout,
        max_positions=args.tokens_per_sample,
        share_embed=False,
        positional_embeddings=False,
        adaptive_softmax_cutoff=cutoff,
        adaptive_softmax_dropout=args.adaptive_softmax_dropout,
    )
    return FConvLanguageModel(decoder)
def build_decoder(cls, args, tgt_dict, embed_tokens):
    """Build the GRU decoder for this model.

    Args:
        args: parsed command-line arguments
        tgt_dict: target-side dictionary
        embed_tokens: token embedding module for the decoder

    Returns:
        GRUDecoder with cross-attention disabled when --no-cross-attention
        is set.
    """
    # Dead, commented-out TransformerDecoder construction removed: this
    # model builds a GRU decoder.
    return GRUDecoder(
        args=args,
        dictionary=tgt_dict,
        embed_dim=args.decoder_embed_dim,
        hidden_size=args.decoder_hidden_size,
        out_embed_dim=args.decoder_out_embed_dim,
        num_layers=args.decoder_layers,
        dropout_in=args.decoder_dropout_in,
        dropout_out=args.decoder_dropout_out,
        attention=not getattr(args, "no_cross_attention", False),
        encoder_output_units=getattr(args, "encoder_embed_dim", None),
        pretrained_embed=embed_tokens,
        share_input_output_embed=args.share_decoder_input_output_embed,
        # Adaptive softmax only applies under the adaptive_loss criterion.
        adaptive_softmax_cutoff=(utils.eval_str_list(
            args.adaptive_softmax_cutoff, type=int)
            if args.criterion == "adaptive_loss" else None),
        max_target_positions=args.max_target_positions,
        residuals=False,
    )
def build_model(cls, args, task):
    """Build a new model instance."""
    # make sure all arguments are present in older models
    base_lm_architecture(args)
    if getattr(args, "max_target_positions", None) is not None:
        max_target_positions = args.max_target_positions
    else:
        max_target_positions = getattr(args, "tokens_per_sample",
                                       DEFAULT_MAX_TARGET_POSITIONS)

    def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
        # Load word vectors from a text file and copy the overlapping
        # entries into a freshly-initialized Embedding.
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        embed_dict = utils.parse_embedding(embed_path)
        utils.print_embed_overlap(embed_dict, dictionary)
        return utils.load_embedding(embed_dict, dictionary, embed_tokens)

    # Pick the vocabulary: a word-level LM uses the task's word dictionary,
    # a speech-recognition task its target dictionary, anything else the
    # source dictionary.
    if args.is_wordlm and hasattr(task, "word_dictionary"):
        dictionary = task.word_dictionary
    elif isinstance(task, SpeechRecognitionEspressoTask):
        dictionary = task.target_dictionary
    else:
        dictionary = task.source_dictionary

    # separate decoder input embeddings
    pretrained_decoder_embed = None
    if args.decoder_embed_path:
        pretrained_decoder_embed = load_pretrained_embedding_from_file(
            args.decoder_embed_path, dictionary, args.decoder_embed_dim)
    # one last double check of parameter combinations
    if args.share_embed and (args.decoder_embed_dim != args.decoder_out_embed_dim):
        raise ValueError(
            "--share-embed requires "
            "--decoder-embed-dim to match --decoder-out-embed-dim")
    if args.decoder_freeze_embed:
        # NOTE(review): raises AttributeError when --decoder-freeze-embed
        # is set without --decoder-embed-path (pretrained_decoder_embed is
        # still None here) — confirm that is intended.
        pretrained_decoder_embed.weight.requires_grad = False
    decoder = SpeechLSTMDecoder(
        dictionary=dictionary,
        embed_dim=args.decoder_embed_dim,
        hidden_size=args.decoder_hidden_size,
        out_embed_dim=args.decoder_out_embed_dim,
        num_layers=args.decoder_layers,
        dropout_in=args.decoder_dropout_in,
        dropout_out=args.decoder_dropout_out,
        attn_type=None,  # no encoder here (encoder_output_units=0), so no attention
        encoder_output_units=0,
        pretrained_embed=pretrained_decoder_embed,
        share_input_output_embed=args.share_embed,
        # Adaptive softmax only applies under the adaptive_loss criterion.
        adaptive_softmax_cutoff=(utils.eval_str_list(
            args.adaptive_softmax_cutoff, type=int)
            if args.criterion == "adaptive_loss"
            else None),
        max_target_positions=max_target_positions,
    )
    return cls(decoder, args)
def build_output_projection(self, args, dictionary, embed_tokens):
    """Create the vocabulary output projection (or adaptive softmax) and
    insert any BASE layers requested via --base-layers."""
    vocab_size = len(dictionary)
    if args.adaptive_softmax_cutoff is not None:
        # Adaptive softmax replaces the dense output projection entirely.
        self.adaptive_softmax = AdaptiveSoftmax(
            vocab_size,
            self.output_embed_dim,
            utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif self.share_input_output_embed:
        # Tie the output projection to the input embedding matrix.
        embed_weight = self.embed_tokens.weight
        projection = nn.Linear(
            embed_weight.shape[1], embed_weight.shape[0], bias=False
        )
        projection.weight = embed_weight
        self.output_projection = projection
    else:
        projection = nn.Linear(self.output_embed_dim, vocab_size, bias=False)
        nn.init.normal_(
            projection.weight, mean=0, std=self.output_embed_dim ** -0.5
        )
        self.output_projection = projection
    # Spread the requested BASE layers evenly through the decoder stack.
    num_base_layers = getattr(args, "base_layers", 0)
    for layer_idx in range(num_base_layers):
        insert_at = ((layer_idx + 1) * args.decoder_layers) // (num_base_layers + 1)
        self.layers.insert(insert_at, BaseLayer(args))
def _pipeline_parallel_pre_init(cfg: DistributedTrainingConfig):
    """Validate and normalize the pipeline-parallel configuration.

    Parses the ``pipeline_*_balance`` / ``pipeline_*_devices`` strings into
    int lists in place on ``cfg`` and returns the tuple
    ``(num_pipeline_devices, num_pipelines_per_node)``.
    """
    from fairseq import utils

    # At least one balance option and one devices option must be present.
    if all(
        b is None
        for b in (
            cfg.pipeline_balance,
            cfg.pipeline_encoder_balance,
            cfg.pipeline_decoder_balance,
        )
    ):
        raise ValueError(
            "--pipeline-balance is currently required for pipeline model parallelism"
        )
    if all(
        d is None
        for d in (
            cfg.pipeline_devices,
            cfg.pipeline_encoder_devices,
            cfg.pipeline_decoder_devices,
        )
    ):
        raise ValueError(
            "--pipeline-devices is currently required for pipeline model parallelism"
        )

    cfg.pipeline_balance = utils.eval_str_list(cfg.pipeline_balance, type=int)
    if cfg.pipeline_devices is not None:
        # One device list covering the whole model.
        cfg.pipeline_devices = utils.eval_str_list(cfg.pipeline_devices, type=int)
        num_pipeline_devices = len(set(cfg.pipeline_devices))
    else:
        # Separate encoder/decoder device lists.
        cfg.pipeline_encoder_devices = utils.eval_str_list(
            cfg.pipeline_encoder_devices, type=int
        )
        cfg.pipeline_decoder_devices = utils.eval_str_list(
            cfg.pipeline_decoder_devices, type=int
        )
        num_pipeline_devices = len(
            set(cfg.pipeline_encoder_devices + cfg.pipeline_decoder_devices)
        )

    gpus_per_node = torch.cuda.device_count()
    assert (
        gpus_per_node >= num_pipeline_devices
        and gpus_per_node % num_pipeline_devices == 0
    ), (
        "the number of unique device IDs in --pipeline-devices must evenly divide "
        "the number of GPUs per node (multi-node pipelining is not yet supported)"
    )
    return num_pipeline_devices, gpus_per_node // num_pipeline_devices
def __init__(
    self,
    args,
    dictionary,
    embed_tokens,
    no_encoder_attn=False,
    decoder_module_list=None,
):
    """Transformer decoder that can optionally run under fairscale's Pipe.

    When ``decoder_module_list`` is None the module list (embedding,
    decoder layers, output layer) is built here; pipeline mode is then
    selected by the presence of ``args.pipeline_decoder_balance``.
    """
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    # fairscale is an optional dependency; fail early with a clear message.
    try:
        from fairscale.nn import Pipe
    except ImportError:
        raise ImportError(
            "Please install fairscale with: pip install fairscale")
    if decoder_module_list is None:
        # Build the full stack: embedding, N decoder layers, output layer.
        embedding_layer = TransformerDecoderEmbedding(args, embed_tokens)
        layers = [
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ]
        decoder_output_layer = TransformerDecoderOutputLayer(
            args, embed_tokens, dictionary)
        decoder_module_list = [embedding_layer
                               ] + layers + [decoder_output_layer]
    # Pipeline mode is driven by whether a decoder balance was configured.
    self.use_pipeline = getattr(args, "pipeline_decoder_balance", None) is not None
    if self.use_pipeline:
        decoder_balance = utils.eval_str_list(
            args.pipeline_decoder_balance, type=int)
        decoder_devices = utils.eval_str_list(
            args.pipeline_decoder_devices, type=int)
        assert sum(decoder_balance) == len(decoder_module_list), (
            f"Sum of decoder_balance={decoder_balance} is not equal "
            + f"to num_decoder_modules={len(decoder_module_list)}")
        self.model = Pipe(
            module=nn.Sequential(*decoder_module_list),
            balance=decoder_balance,
            devices=decoder_devices,
            chunks=args.pipeline_chunks,
            checkpoint=args.pipeline_checkpoint,
        )
    else:
        # Non-pipeline path: keep the pieces as plain submodules.
        self.embedding_layer = decoder_module_list[0]
        self.decoder_layers = nn.Sequential(*decoder_module_list[1:-1])
        self.decoder_output_layer = decoder_module_list[-1]
def __init__(
    self,
    args,
    dictionary,
    embed_tokens,
    no_encoder_attn=False,
    decoder_module_list=None,
):
    """Transformer decoder with optional pipeline parallelism.

    Pipeline mode is selected by passing a pre-partitioned
    ``decoder_module_list``; otherwise the standard embedding / layers /
    output-layer stack is built locally.
    """
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    # Resolves which Pipe implementation to use; presumably sets
    # TORCH_PIPE / Pipe / partition_model used below — see import_pipe.
    import_pipe()
    self.use_pipeline = decoder_module_list is not None
    if not self.use_pipeline:
        self.embedding_layer = TransformerDecoderEmbedding(args, embed_tokens)
        self.decoder_layers = nn.Sequential(*[
            TransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])
        self.decoder_output_layer = TransformerDecoderOutputLayer(
            args, embed_tokens, dictionary
        )
    else:
        decoder_balance = utils.eval_str_list(
            args.pipeline_decoder_balance, type=int
        )
        decoder_devices = utils.eval_str_list(
            args.pipeline_decoder_devices, type=int
        )
        assert sum(decoder_balance) == len(decoder_module_list), (
            f"Sum of decoder_balance={decoder_balance} is not equal "
            + f"to num_decoder_modules={len(decoder_module_list)}"
        )
        if TORCH_PIPE:
            # This Pipe flavor takes an already-partitioned model, so the
            # split is done up front by partition_model.
            self.model = Pipe(
                module=partition_model(nn.Sequential(*decoder_module_list),
                                       decoder_balance, decoder_devices),
                chunks=args.pipeline_chunks,
                checkpoint=args.pipeline_checkpoint,
            )
        else:
            # fairscale-style Pipe partitions from balance/devices itself.
            self.model = Pipe(
                module=nn.Sequential(*decoder_module_list),
                balance=decoder_balance,
                devices=decoder_devices,
                chunks=args.pipeline_chunks,
                checkpoint=args.pipeline_checkpoint,
            )
def build_model(cls, args, task): """Build a new model instance.""" # make sure all arguments are present in older models base_lm_architecture(args) if getattr(args, "max_source_positions", None) is None: args.max_source_positions = args.tokens_per_sample if getattr(args, "max_target_positions", None) is None: args.max_target_positions = args.tokens_per_sample if args.character_embeddings: embed_tokens = CharacterTokenEmbedder( task.dictionary, eval(args.character_filters), args.character_embedding_dim, args.decoder_embed_dim, args.char_embedder_highway_layers, ) elif args.adaptive_input: embed_tokens = AdaptiveInput( len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim, args.adaptive_input_factor, args.decoder_embed_dim, utils.eval_str_list(args.adaptive_input_cutoff, type=int), ) else: embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad()) if args.tie_adaptive_weights: assert args.adaptive_input assert args.adaptive_input_factor == args.adaptive_softmax_factor assert (args.adaptive_softmax_cutoff == args.adaptive_input_cutoff ), "{} != {}".format(args.adaptive_softmax_cutoff, args.adaptive_input_cutoff) assert args.decoder_input_dim == args.decoder_output_dim decoder = LightConvDecoder( args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False, ) return LightConvLanguageModel(decoder)
def add_distributed_training_args(parser, default_world_size=None):
    """Add distributed-training arguments and return the argument group.

    Args:
        parser: argparse parser to extend
        default_world_size: default for --distributed-world-size; when None,
            defaults to the number of visible GPUs (at least 1)
    """
    group = parser.add_argument_group("Distributed training")
    # fmt: off
    if default_world_size is None:
        default_world_size = max(1, torch.cuda.device_count())
    group.add_argument('--distributed-world-size', type=int, metavar='N',
                       default=default_world_size,
                       help='total number of GPUs across all nodes (default: all visible GPUs)')
    group.add_argument('--distributed-rank', default=0, type=int,
                       help='rank of the current worker')
    group.add_argument('--distributed-backend', default='nccl', type=str,
                       help='distributed backend')
    group.add_argument('--distributed-init-method', default=None, type=str,
                       help='typically tcp://hostname:port that will be used to '
                            'establish initial connection')
    group.add_argument('--distributed-port', default=-1, type=int,
                       help='port number (not required if using --distributed-init-method)')
    group.add_argument('--device-id', '--local_rank', default=0, type=int,
                       help='which GPU to use (usually configured automatically)')
    group.add_argument('--distributed-no-spawn', action='store_true',
                       help='do not spawn multiple processes even if multiple GPUs are visible')
    group.add_argument('--distributed-num-procs', default=None, type=int,
                       help='number of processes to spawn (usually configured automatically)')
    # "c10d" is PyTorch's DDP implementation and provides the fastest
    # training. "no_c10d" is a more robust, but slightly slower DDP
    # implementation. Try this if you get warning messages about
    # inconsistent gradients between workers, or if some of your model
    # parameters are not always used.
    group.add_argument('--ddp-backend', default='c10d', type=str,
                       choices=['c10d', 'no_c10d'],
                       help='DistributedDataParallel backend')
    group.add_argument('--bucket-cap-mb', default=25, type=int, metavar='MB',
                       help='bucket size for reduction')
    group.add_argument('--fix-batches-to-gpus', action='store_true',
                       help='don\'t shuffle batches between GPUs; this reduces overall '
                            'randomness and may affect precision but avoids the cost of '
                            're-reading the data')
    # Fixed help text: this flag *enables* find_unused_parameters in DDP
    # (previously said "disable" and was missing the closing parenthesis).
    group.add_argument('--find-unused-parameters', default=False, action='store_true',
                       help='enable unused parameter detection (not applicable to '
                            'no_c10d ddp-backend)')
    group.add_argument('--fast-stat-sync', default=False, action='store_true',
                       help='[deprecated] this is now defined per Criterion')
    group.add_argument('--broadcast-buffers', default=False, action='store_true',
                       help='Copy non-trainable parameters between GPUs, such as '
                            'batchnorm population statistics')
    # Fixed help text: previously duplicated the --ddp-backend description.
    group.add_argument('--distributed-wrapper', default='DDP', type=str,
                       choices=['DDP', 'SlowMo'],
                       help='distributed wrapper to use (DDP or SlowMo)')
    # Arguments for SlowMo - used when --distributed-wrapper=SlowMo
    group.add_argument('--slowmo-momentum', default=None, type=float,
                       help='SlowMo momentum term; by default use 0.0 for 16 GPUs, '
                            '0.2 for 32 GPUs; 0.5 for 64 GPUs, 0.6 for > 64 GPUs')
    group.add_argument('--slowmo-algorithm', default='LocalSGD',
                       choices=['LocalSGD', 'SGP'],
                       help='whether to use LocalSGD or SGP')
    group.add_argument('--localsgd-frequency', default=3, type=int,
                       help='Local SGD allreduce frequency')
    group.add_argument('--nprocs-per-node', type=int, metavar='N',
                       default=max(1, torch.cuda.device_count()),
                       help='number of GPUs in each node. An allreduce operation across GPUs in '
                            'a node is very fast. Hence, we do allreduce across GPUs in a node, '
                            'and gossip across different nodes')
    # Pipeline Parallel Arguments
    group.add_argument('--pipeline-model-parallel', default=False, action='store_true',
                       help='if set, use pipeline model parallelism across GPUs')
    group.add_argument('--pipeline-balance', metavar='N1,N2,...,N_K',
                       type=lambda x: eval_str_list(x, type=int),
                       help='partition the model into N_K pieces, where each piece '
                            'contains N_i layers. The sum(args.pipeline_balance) '
                            'should equal the total number of layers in the model')
    group.add_argument('--pipeline-devices', metavar='N1,N2,...,N_K',
                       type=lambda x: eval_str_list(x, type=int),
                       help='a list of device indices indicating which device to place '
                            'each of the N_K partitions. The length of this list should '
                            'equal the length of the --pipeline-balance argument')
    group.add_argument('--pipeline-chunks', type=int, metavar='N',
                       help='microbatch count for pipeline model parallelism')
    group.add_argument('--pipeline-checkpoint', type=str, metavar='STR',
                       choices=['always', 'never', 'except_last'],
                       default='never',
                       help='checkpointing mode for pipeline model parallelism')
    # ZeRO sharding of OptimizerState (os), gradients (g) and parameters (p)
    group.add_argument('--zero-sharding', default='none', type=str,
                       choices=['none', 'os'],
                       help='ZeRO sharding')
    # fmt: on
    return group
def build_model(cls, args, task):
    """Build a new model instance."""
    max_source_positions = getattr(
        args, "max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS
    )
    max_target_positions = getattr(
        args, "max_target_positions", DEFAULT_MAX_TARGET_POSITIONS
    )

    def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
        # Load word vectors from a text file and copy the overlapping
        # entries into a freshly-initialized Embedding.
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        embed_dict = utils.parse_embedding(embed_path)
        utils.print_embed_overlap(embed_dict, dictionary)
        return utils.load_embedding(embed_dict, dictionary, embed_tokens)

    # separate decoder input embeddings
    pretrained_decoder_embed = None
    if args.decoder_embed_path:
        pretrained_decoder_embed = load_pretrained_embedding_from_file(
            args.decoder_embed_path,
            task.target_dictionary,
            args.decoder_embed_dim,
        )
    # one last double check of parameter combinations
    if args.share_decoder_input_output_embed and (
        args.decoder_embed_dim != args.decoder_out_embed_dim
    ):
        raise ValueError(
            "--share-decoder-input-output-embed requires "
            "--decoder-embed-dim to match --decoder-out-embed-dim"
        )
    if args.decoder_freeze_embed:
        # NOTE(review): raises AttributeError when --decoder-freeze-embed
        # is set without --decoder-embed-path (pretrained_decoder_embed is
        # still None here) — confirm that is intended.
        pretrained_decoder_embed.weight.requires_grad = False

    # Parse the (possibly nested) conv-frontend hyperparameter strings.
    out_channels = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_channels, type=int
    )
    kernel_sizes = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_kernel_sizes, type=int
    )
    strides = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_strides, type=int
    )
    logger.info(
        "input feature dimension: {}, channels: {}".format(
            task.feat_dim, task.feat_in_channels
        )
    )
    assert task.feat_dim % task.feat_in_channels == 0
    conv_layers = (
        ConvBNReLU(
            out_channels,
            kernel_sizes,
            strides,
            in_channels=task.feat_in_channels,
        )
        if out_channels is not None
        else None
    )
    # Work out the RNN input size after the conv frontend downsamples the
    # feature axis (ceil-divide by each stride component), then multiply by
    # the final channel count.
    rnn_encoder_input_size = task.feat_dim // task.feat_in_channels
    if conv_layers is not None:
        for stride in strides:
            if isinstance(stride, (list, tuple)):
                assert len(stride) > 0
                # For a 2-D stride, index 1 is used when present —
                # presumably the feature-axis component; TODO confirm.
                s = stride[1] if len(stride) > 1 else stride[0]
            else:
                assert isinstance(stride, int)
                s = stride
            rnn_encoder_input_size = (rnn_encoder_input_size + s - 1) // s
        rnn_encoder_input_size *= out_channels[-1]
    else:
        rnn_encoder_input_size = task.feat_dim
    if args.encoder_multilayer_rnn_as_single_module and args.encoder_rnn_residual:
        # Residual connections are incompatible with the fused-RNN option.
        args.encoder_rnn_residual = False
        logger.info(
            "--encoder-rnn-residual is set to False when --encoder-multilayer-rnn-as-single-module=True"
        )
    scheduled_sampling_rate_scheduler = ScheduledSamplingRateScheduler(
        args.scheduled_sampling_probs,
        args.start_scheduled_sampling_epoch,
    )
    encoder = SpeechLSTMEncoder(
        pre_encoder=conv_layers,
        input_size=rnn_encoder_input_size,
        hidden_size=args.encoder_rnn_hidden_size,
        num_layers=args.encoder_rnn_layers,
        dropout_in=args.encoder_rnn_dropout_in,
        dropout_out=args.encoder_rnn_dropout_out,
        bidirectional=args.encoder_rnn_bidirectional,
        residual=args.encoder_rnn_residual,
        src_bucketed=(getattr(task.cfg, "num_batch_buckets", 0) > 0),
        max_source_positions=max_source_positions,
        multilayer_rnn_as_single_module=args.encoder_multilayer_rnn_as_single_module,
    )
    decoder = SpeechLSTMDecoder(
        dictionary=task.target_dictionary,
        embed_dim=args.decoder_embed_dim,
        hidden_size=args.decoder_hidden_size,
        out_embed_dim=args.decoder_out_embed_dim,
        num_layers=args.decoder_layers,
        dropout_in=args.decoder_dropout_in,
        dropout_out=args.decoder_dropout_out,
        encoder_output_units=encoder.output_units,
        attn_type=args.attention_type,
        attn_dim=args.attention_dim,
        need_attn=args.need_attention,
        residual=args.decoder_rnn_residual,
        pretrained_embed=pretrained_decoder_embed,
        share_input_output_embed=args.share_decoder_input_output_embed,
        # Adaptive softmax only applies under the adaptive_loss criterion.
        adaptive_softmax_cutoff=(
            utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
            if args.criterion_name == "adaptive_loss"
            else None
        ),
        max_target_positions=max_target_positions,
        scheduled_sampling_rate_scheduler=scheduled_sampling_rate_scheduler,
    )
    # Optionally attach a frozen, pretrained language model.
    pretrained_lm = None
    if args.pretrained_lm_checkpoint:
        logger.info(
            "loading pretrained LM from {}".format(args.pretrained_lm_checkpoint)
        )
        pretrained_lm = checkpoint_utils.load_model_ensemble(
            args.pretrained_lm_checkpoint, task=task
        )[0][0]
        pretrained_lm.make_generation_fast_()
        # freeze pretrained model
        for param in pretrained_lm.parameters():
            param.requires_grad = False
    return cls(encoder, decoder, pretrained_lm)
def build_model(cls, cfg: RNNModelConfig, task):
    """Build a new model instance."""
    if cfg.encoder_layers != cfg.decoder_layers:
        raise ValueError("--encoder-layers must match --decoder-layers")
    max_source_positions = getattr(
        cfg, "max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS
    )
    max_target_positions = getattr(
        cfg, "max_target_positions", DEFAULT_MAX_TARGET_POSITIONS
    )

    def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
        # Load word vectors from a text file and copy the overlapping
        # entries into a freshly-initialized Embedding.
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        embed_tokens = torch.nn.Embedding(num_embeddings, embed_dim, padding_idx)
        embed_dict = utils.parse_embedding(embed_path)
        utils.print_embed_overlap(embed_dict, dictionary)
        return utils.load_embedding(embed_dict, dictionary, embed_tokens)

    if cfg.encoder_embed_path:
        pretrained_encoder_embed = load_pretrained_embedding_from_file(
            cfg.encoder_embed_path, task.source_dictionary, cfg.encoder_embed_dim
        )
    else:
        num_embeddings = len(task.source_dictionary)
        pretrained_encoder_embed = torch.nn.Embedding(
            num_embeddings, cfg.encoder_embed_dim, task.source_dictionary.pad()
        )
    if cfg.share_all_embeddings:
        # double check all parameters combinations are valid
        if task.source_dictionary != task.target_dictionary:
            raise ValueError("--share-all-embeddings requires a joint dictionary")
        if cfg.decoder_embed_path and (
            cfg.decoder_embed_path != cfg.encoder_embed_path
        ):
            raise ValueError(
                "--share-all-embed not compatible with --decoder-embed-path"
            )
        if cfg.encoder_embed_dim != cfg.decoder_embed_dim:
            raise ValueError(
                "--share-all-embeddings requires --encoder-embed-dim to "
                "match --decoder-embed-dim"
            )
        pretrained_decoder_embed = pretrained_encoder_embed
        # Sharing all embeddings implies tying decoder input/output embeds.
        cfg.share_decoder_input_output_embed = True
    else:
        # separate decoder input embeddings
        pretrained_decoder_embed = None
        if cfg.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                cfg.decoder_embed_path,
                task.target_dictionary,
                cfg.decoder_embed_dim,
            )
    # one last double check of parameter combinations
    if cfg.share_decoder_input_output_embed and (
        cfg.decoder_embed_dim != cfg.decoder_out_embed_dim
    ):
        raise ValueError(
            "--share-decoder-input-output-embeddings requires "
            "--decoder-embed-dim to match --decoder-out-embed-dim"
        )
    if cfg.encoder_freeze_embed:
        pretrained_encoder_embed.weight.requires_grad = False
    if cfg.decoder_freeze_embed:
        # NOTE(review): pretrained_decoder_embed may still be None here
        # (no shared embeddings and no --decoder-embed-path), which would
        # raise AttributeError — confirm intended.
        pretrained_decoder_embed.weight.requires_grad = False
    encoder = RNNEncoder(
        rnn_type=cfg.rnn_type,
        dictionary=task.source_dictionary,
        embed_dim=cfg.encoder_embed_dim,
        hidden_size=cfg.encoder_hidden_size,
        num_layers=cfg.encoder_layers,
        # Negative per-side dropout values fall back to the global --dropout.
        dropout_in=(cfg.encoder_dropout_in
                    if cfg.encoder_dropout_in >= 0 else cfg.dropout),
        dropout_out=(cfg.encoder_dropout_out
                     if cfg.encoder_dropout_out >= 0 else cfg.dropout),
        bidirectional=cfg.encoder_bidirectional,
        pretrained_embed=pretrained_encoder_embed,
        max_source_positions=max_source_positions,
    )
    # attention_type == "none" disables attention entirely.
    uses_attention = getattr(cfg, 'attention_type', "none") != "none"
    attention_type = getattr(cfg, 'attention_type', "luong-dot") if uses_attention else None
    decoder = RNNDecoder(
        rnn_type=cfg.rnn_type,
        dictionary=task.target_dictionary,
        embed_dim=cfg.decoder_embed_dim,
        hidden_size=cfg.decoder_hidden_size,
        out_embed_dim=cfg.decoder_out_embed_dim,
        num_layers=cfg.decoder_layers,
        dropout_in=(cfg.decoder_dropout_in
                    if cfg.decoder_dropout_in >= 0 else cfg.dropout),
        dropout_out=(cfg.decoder_dropout_out
                     if cfg.decoder_dropout_out >= 0 else cfg.dropout),
        attention=uses_attention,
        attention_type=attention_type,
        encoder_output_units=encoder.output_units,
        pretrained_embed=pretrained_decoder_embed,
        share_input_output_embed=cfg.share_decoder_input_output_embed,
        # Adaptive softmax only applies under the adaptive_loss criterion.
        adaptive_softmax_cutoff=(
            utils.eval_str_list(cfg.adaptive_softmax_cutoff, type=int)
            if cfg.criterion == "adaptive_loss"
            else None
        ),
        max_target_positions=max_target_positions,
        residuals=False,
    )
    return cls(encoder, decoder)
def infer_init_method(args, force_distributed=False):
    """Infer and set ``args.distributed_init_method`` (plus rank/world-size/
    device fields) from the environment.

    Tries, in order: explicit torch.distributed.launch env vars, SLURM
    auto-detection (when --distributed-port is set), then a single-node
    TCP fallback.  When pipeline model parallelism is enabled, also
    normalizes the pipeline balance/device lists and rescales rank and
    world size from per-GPU to per-pipeline counting.

    Mutates ``args`` in place; returns early if an init method is already
    set or TPU mode is active.
    """
    if args.distributed_init_method is not None or getattr(args, 'tpu', False):
        return

    if args.pipeline_model_parallel:
        # Either the combined or the encoder/decoder-split form of each
        # option must be present.
        balance_exists = args.pipeline_balance is not None or \
            args.pipeline_encoder_balance is not None or \
            args.pipeline_decoder_balance is not None
        devices_exist = args.pipeline_devices is not None or \
            args.pipeline_encoder_devices is not None or \
            args.pipeline_decoder_devices is not None
        if not balance_exists:
            raise ValueError(
                '--pipeline-balance is currently required for pipeline model parallelism'
            )
        if not devices_exist:
            raise ValueError(
                '--pipeline-devices is currently required for pipeline model parallelism'
            )

        args.pipeline_balance = utils.eval_str_list(args.pipeline_balance, type=int)
        if args.pipeline_devices is not None:
            args.pipeline_devices = utils.eval_str_list(args.pipeline_devices, type=int)
            num_pipeline_devices = len(set(args.pipeline_devices))
        else:
            args.pipeline_encoder_devices = utils.eval_str_list(
                args.pipeline_encoder_devices, type=int)
            args.pipeline_decoder_devices = utils.eval_str_list(
                args.pipeline_decoder_devices, type=int)
            # device IDs may overlap between encoder and decoder; count uniques
            num_pipeline_devices = len(
                set(args.pipeline_encoder_devices + args.pipeline_decoder_devices))
        gpus_per_node = torch.cuda.device_count()
        assert gpus_per_node >= num_pipeline_devices and gpus_per_node % num_pipeline_devices == 0, (
            'the number of unique device IDs in --pipeline-devices must evenly divide '
            'the number of GPUs per node (multi-node pipelining is not yet supported)'
        )
        num_pipelines_per_node = gpus_per_node // num_pipeline_devices

    # support torch.distributed.launch
    if all(key in os.environ
           for key in ['MASTER_ADDR', 'MASTER_PORT', 'WORLD_SIZE', 'RANK']):
        args.distributed_init_method = 'env://'
        args.distributed_world_size = int(os.environ['WORLD_SIZE'])
        args.distributed_rank = int(os.environ['RANK'])
        # processes are created by torch.distributed.launch
        args.distributed_no_spawn = True

    # we can determine the init method automatically for Slurm
    elif args.distributed_port > 0:
        node_list = os.environ.get('SLURM_STEP_NODELIST')
        if node_list is None:
            node_list = os.environ.get('SLURM_JOB_NODELIST')
        if node_list is not None:
            try:
                # Resolve the first hostname in the allocation as the master.
                hostnames = subprocess.check_output(
                    ['scontrol', 'show', 'hostnames', node_list])
                args.distributed_init_method = 'tcp://{host}:{port}'.format(
                    host=hostnames.split()[0].decode('utf-8'),
                    port=args.distributed_port,
                )
                nnodes = int(os.environ.get('SLURM_NNODES'))
                ntasks_per_node = os.environ.get('SLURM_NTASKS_PER_NODE')
                if ntasks_per_node is not None:
                    ntasks_per_node = int(ntasks_per_node)
                else:
                    # derive tasks-per-node from the totals when not exported
                    ntasks = int(os.environ.get('SLURM_NTASKS'))
                    nnodes = int(os.environ.get('SLURM_NNODES'))
                    assert ntasks % nnodes == 0
                    ntasks_per_node = int(ntasks / nnodes)
                if ntasks_per_node == 1:
                    # one task per node: that task spawns one process per GPU
                    gpus_per_node = torch.cuda.device_count()
                    node_id = int(os.environ.get('SLURM_NODEID'))
                    args.distributed_rank = node_id * gpus_per_node
                    args.distributed_world_size = nnodes * gpus_per_node
                elif args.pipeline_model_parallel:
                    assert ntasks_per_node == num_pipelines_per_node, (
                        'SLURM --ntasks-per-node must match number of pipelines per '
                        'node (={})'.format(num_pipelines_per_node))
                    args.distributed_no_spawn = True
                    # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on
                    # the first node, [2, 3] on the second node, etc. This
                    # matches torch.distributed.launch.
                    node_id = int(os.environ.get('SLURM_NODEID'))
                    local_id = int(os.environ.get('SLURM_LOCALID'))
                    args.distributed_rank = node_id * num_pipelines_per_node + local_id
                    # In the above example, device_id will always be in [0, 1],
                    # which also matches torch.distributed.launch.
                    args.device_id = local_id
                    # We also want to set distributed_world_size to be the total
                    # number of pipelines across all nodes.
                    args.distributed_world_size = nnodes * num_pipelines_per_node
                else:
                    assert ntasks_per_node == args.distributed_world_size // nnodes
                    args.distributed_no_spawn = True
                    args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
                    args.device_id = int(os.environ.get('SLURM_LOCALID'))
            except subprocess.CalledProcessError as e:  # scontrol failed
                raise e
            except FileNotFoundError:  # Slurm is not installed
                pass

    elif args.distributed_world_size > 1 or force_distributed:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)

    if args.pipeline_model_parallel:
        if not args.distributed_no_spawn:
            # When distributed_no_spawn is False, we expect distributed_rank and
            # distributed_world_size to be based on the total number of GPUs, so
            # we need to correct them to be based on the number of pipelines.
            assert args.distributed_world_size % num_pipeline_devices == 0
            args.distributed_world_size = args.distributed_world_size // num_pipeline_devices
            # In the case of 4-way MP on nodes with 8 GPUs, we want
            # distributed_rank to be the starting GPU index for each pipeline
            # i.e., 0, 2, ...
            assert args.distributed_rank % gpus_per_node == 0
            assert args.distributed_rank % num_pipeline_devices == 0
            args.distributed_rank = args.distributed_rank // num_pipeline_devices
            # launch one process per pipeline
            args.distributed_num_procs = num_pipelines_per_node

        # if we have 4-way MP on a node with 8 GPUs, we want device_ids to be 0
        # and 4, indicating the starting device IDs for each pipeline
        args.device_id *= num_pipeline_devices

        if args.device_id > 0:
            # if there's multiple pipelines on a node (e.g., 4-way MP on an 8
            # GPU node), we need to adjust pipeline_devices accordingly
            logger.debug("setting CUDA device={} on rank {}".format(
                args.device_id, args.distributed_rank))
            torch.cuda.set_device(args.device_id)
            args.pipeline_devices = [
                args.device_id + d for d in args.pipeline_devices
            ]
            logger.info(
                "setting pipeline_devices={} on rank {}".format(
                    args.pipeline_devices, args.distributed_rank),
            )
    elif not args.distributed_no_spawn:
        args.distributed_num_procs = min(
            torch.cuda.device_count(),
            args.distributed_world_size,
        )
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # --- dropout options ---
    parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
                        help='dropout probability')
    parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
                        help='dropout probability for attention weights')
    parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
                        help='dropout probability after ReLU in FFN')
    parser.add_argument('--input-dropout', type=float, metavar='D',
                        help='dropout probability of the inputs')
    # --- decoder architecture ---
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension')
    parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                        help='decoder output dimension')
    parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                        help='decoder input dimension')
    parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension for FFN')
    parser.add_argument('--decoder-layers', type=int, metavar='N',
                        help='num decoder layers')
    parser.add_argument(
        '--decoder-attention-heads', type=int, metavar='N',
        help='num decoder attention heads or LightConv/DynamicConv heads')
    parser.add_argument('--decoder-normalize-before', default=False,
                        action='store_true',
                        help='apply layernorm before each decoder block')
    # --- adaptive softmax / adaptive input ---
    parser.add_argument(
        '--adaptive-softmax-cutoff', metavar='EXPR',
        help='comma separated list of adaptive softmax cutoff points. '
             'Must be used with adaptive_loss criterion')
    parser.add_argument(
        '--adaptive-softmax-dropout', type=float, metavar='D',
        help='sets adaptive softmax dropout for the tail projections')
    parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                        help='adaptive input factor')
    parser.add_argument(
        '--no-token-positional-embeddings', default=False, action='store_true',
        help='if set, disables positional embeddings (outside self attention)')
    parser.add_argument('--share-decoder-input-output-embed', default=False,
                        action='store_true',
                        help='share decoder input and output embeddings')
    # --- character-level embeddings ---
    parser.add_argument(
        '--character-embeddings', default=False, action='store_true',
        help='if set, uses character embedding convolutions to produce token embeddings'
    )
    parser.add_argument(
        '--character-filters', type=str, metavar='LIST',
        default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
        help='size of character embeddings')
    parser.add_argument('--character-embedding-dim', type=int, metavar='N',
                        default=4, help='size of character embeddings')
    parser.add_argument(
        '--char-embedder-highway-layers', type=int, metavar='N', default=2,
        help='number of highway layers for character token embeddder')
    parser.add_argument('--adaptive-input', default=False, action='store_true',
                        help='if set, uses adaptive input')
    parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                        help='adaptive input factor')
    parser.add_argument(
        '--adaptive-input-cutoff', metavar='EXPR',
        help='comma separated list of adaptive input cutoff points.')
    parser.add_argument(
        '--tie-adaptive-weights', action='store_true',
        help='if set, ties the weights of adaptive softmax and adaptive input')
    parser.add_argument(
        '--tie-adaptive-proj', action='store_true',
        help='if set, ties the projection weights of adaptive softmax and adaptive input'
    )
    parser.add_argument(
        '--decoder-learned-pos', action='store_true',
        help='use learned positional embeddings in the decoder')
    """LightConv and DynamicConv arguments"""
    parser.add_argument(
        '--decoder-kernel-size-list',
        type=lambda x: utils.eval_str_list(x, int),
        help='list of kernel size (default: "[3,7,15,31,31,31]")')
    parser.add_argument('--decoder-glu', type=utils.eval_bool,
                        help='glu after in proj')
    parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
                        choices=['dynamic', 'lightweight'],
                        help='type of convolution')
    # NOTE(review): --weight-softmax intentionally has no help text upstream
    parser.add_argument('--weight-softmax', default=True, type=utils.eval_bool)
    parser.add_argument('--weight-dropout', type=float, metavar='D',
                        help='dropout probability for conv weights')
def infer_init_method(cfg: DistributedTrainingConfig, force_distributed=False):
    """Infer and set ``cfg.distributed_init_method`` (plus rank/world-size/
    device fields) from the environment.

    Tries, in order: explicit torch.distributed.launch env vars, SLURM
    auto-detection (when --distributed-port is set), then a single-node
    TCP fallback.  When pipeline model parallelism is enabled, also
    normalizes the pipeline balance/device lists and rescales rank and
    world size from per-GPU to per-pipeline counting.

    Mutates ``cfg`` in place (using ``open_dict`` where fields may not be
    declared); returns early if an init method is already set or TPU mode
    is active.
    """
    if cfg.distributed_init_method is not None or cfg.tpu:
        return

    if cfg.pipeline_model_parallel:
        # Either the combined or the encoder/decoder-split form of each
        # option must be present.
        balance_exists = (cfg.pipeline_balance is not None
                          or cfg.pipeline_encoder_balance is not None
                          or cfg.pipeline_decoder_balance is not None)
        devices_exist = (cfg.pipeline_devices is not None
                         or cfg.pipeline_encoder_devices is not None
                         or cfg.pipeline_decoder_devices is not None)
        if not balance_exists:
            raise ValueError(
                "--pipeline-balance is currently required for pipeline model parallelism"
            )
        if not devices_exist:
            raise ValueError(
                "--pipeline-devices is currently required for pipeline model parallelism"
            )

        cfg.pipeline_balance = utils.eval_str_list(cfg.pipeline_balance, type=int)
        if cfg.pipeline_devices is not None:
            cfg.pipeline_devices = utils.eval_str_list(cfg.pipeline_devices, type=int)
            num_pipeline_devices = len(set(cfg.pipeline_devices))
        else:
            cfg.pipeline_encoder_devices = utils.eval_str_list(
                cfg.pipeline_encoder_devices, type=int)
            cfg.pipeline_decoder_devices = utils.eval_str_list(
                cfg.pipeline_decoder_devices, type=int)
            # device IDs may overlap between encoder and decoder; count uniques
            num_pipeline_devices = len(
                set(cfg.pipeline_encoder_devices + cfg.pipeline_decoder_devices))
        gpus_per_node = torch.cuda.device_count()
        assert (
            gpus_per_node >= num_pipeline_devices
            and gpus_per_node % num_pipeline_devices == 0
        ), ("the number of unique device IDs in --pipeline-devices must evenly divide "
            "the number of GPUs per node (multi-node pipelining is not yet supported)"
            )
        num_pipelines_per_node = gpus_per_node // num_pipeline_devices

    # support torch.distributed.launch
    if all(key in os.environ
           for key in ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK"]):
        cfg.distributed_init_method = "env://"
        cfg.distributed_world_size = int(os.environ["WORLD_SIZE"])
        cfg.distributed_rank = int(os.environ["RANK"])
        # processes are created by torch.distributed.launch
        cfg.distributed_no_spawn = True

    # we can determine the init method automatically for Slurm
    elif cfg.distributed_port > 0:
        node_list = os.environ.get("SLURM_STEP_NODELIST")
        if node_list is None:
            node_list = os.environ.get("SLURM_JOB_NODELIST")
        if node_list is not None:
            try:
                # Resolve the first hostname in the allocation as the master.
                hostnames = subprocess.check_output(
                    ["scontrol", "show", "hostnames", node_list])
                cfg.distributed_init_method = "tcp://{host}:{port}".format(
                    host=hostnames.split()[0].decode("utf-8"),
                    port=cfg.distributed_port,
                )
                nnodes = int(os.environ.get("SLURM_NNODES"))
                ntasks_per_node = os.environ.get("SLURM_NTASKS_PER_NODE")
                if ntasks_per_node is not None:
                    ntasks_per_node = int(ntasks_per_node)
                else:
                    # derive tasks-per-node from the totals when not exported
                    ntasks = int(os.environ.get("SLURM_NTASKS"))
                    nnodes = int(os.environ.get("SLURM_NNODES"))
                    assert ntasks % nnodes == 0
                    ntasks_per_node = int(ntasks / nnodes)
                if ntasks_per_node == 1:
                    # one task per node: that task spawns one process per GPU
                    gpus_per_node = torch.cuda.device_count()
                    node_id = int(os.environ.get("SLURM_NODEID"))
                    cfg.distributed_rank = node_id * gpus_per_node
                    cfg.distributed_world_size = nnodes * gpus_per_node
                elif cfg.pipeline_model_parallel:
                    assert ntasks_per_node == num_pipelines_per_node, (
                        "SLURM --ntasks-per-node must match number of pipelines per "
                        "node (={})".format(num_pipelines_per_node))
                    cfg.distributed_no_spawn = True
                    # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on
                    # the first node, [2, 3] on the second node, etc. This
                    # matches torch.distributed.launch.
                    node_id = int(os.environ.get("SLURM_NODEID"))
                    local_id = int(os.environ.get("SLURM_LOCALID"))
                    cfg.distributed_rank = node_id * num_pipelines_per_node + local_id
                    # In the above example, device_id will always be in [0, 1],
                    # which also matches torch.distributed.launch.
                    cfg.device_id = local_id
                    # We also want to set distributed_world_size to be the total
                    # number of pipelines across all nodes.
                    cfg.distributed_world_size = nnodes * num_pipelines_per_node
                else:
                    assert ntasks_per_node == cfg.distributed_world_size // nnodes
                    cfg.distributed_no_spawn = True
                    cfg.distributed_rank = int(os.environ.get("SLURM_PROCID"))
                    cfg.device_id = int(os.environ.get("SLURM_LOCALID"))
            except subprocess.CalledProcessError as e:  # scontrol failed
                raise e
            except FileNotFoundError:  # Slurm is not installed
                pass

    elif cfg.distributed_world_size > 1 or force_distributed:
        # fallback for single node with multiple GPUs
        assert (
            cfg.distributed_world_size <= torch.cuda.device_count()
        ), f"world size is {cfg.distributed_world_size} but have {torch.cuda.device_count()} available devices"
        port = random.randint(10000, 20000)
        cfg.distributed_init_method = "tcp://localhost:{port}".format(
            port=port)

    if cfg.pipeline_model_parallel:
        if not cfg.distributed_no_spawn:
            # When distributed_no_spawn is False, we expect distributed_rank and
            # distributed_world_size to be based on the total number of GPUs, so
            # we need to correct them to be based on the number of pipelines.
            assert cfg.distributed_world_size % num_pipeline_devices == 0
            cfg.distributed_world_size = (cfg.distributed_world_size
                                          // num_pipeline_devices)
            # In the case of 4-way MP on nodes with 8 GPUs, we want
            # distributed_rank to be the starting GPU index for each pipeline
            # i.e., 0, 2, ...
            assert cfg.distributed_rank % gpus_per_node == 0
            assert cfg.distributed_rank % num_pipeline_devices == 0
            with open_dict(cfg):
                cfg.distributed_rank = cfg.distributed_rank // num_pipeline_devices
                # launch one process per pipeline
                cfg.distributed_num_procs = num_pipelines_per_node

        # if we have 4-way MP on a node with 8 GPUs, we want device_ids to be 0
        # and 4, indicating the starting device IDs for each pipeline
        cfg.device_id *= num_pipeline_devices

        if cfg.device_id > 0:
            # if there's multiple pipelines on a node (e.g., 4-way MP on an 8
            # GPU node), we need to adjust pipeline_devices accordingly
            logger.debug("setting CUDA device={} on rank {}".format(
                cfg.device_id, cfg.distributed_rank))
            torch.cuda.set_device(cfg.device_id)
            with open_dict(cfg):
                cfg.pipeline_devices = [
                    cfg.device_id + d for d in cfg.pipeline_devices
                ]
            logger.info("setting pipeline_devices={} on rank {}".format(
                cfg.pipeline_devices, cfg.distributed_rank))
    elif not cfg.distributed_no_spawn:
        with open_dict(cfg):
            cfg.distributed_num_procs = min(torch.cuda.device_count(),
                                            cfg.distributed_world_size)
def __init__( self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True ): super().__init__(dictionary) self.dropout_module = FairseqDropout( args.dropout, module_name=self.__class__.__name__ ) self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim self.project_in_dim = ( Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None ) self.embed_positions = ( PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) self.layers = nn.ModuleList([]) self.layers.extend( [ LightConvDecoderLayer( args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i] ) for i in range(args.decoder_layers) ] ) self.adaptive_softmax = None self.project_out_dim = ( Linear(embed_dim, output_embed_dim, bias=False) if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None ) if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, utils.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), output_embed_dim) ) nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5) self.register_buffer("version", torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim)
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # --- dropout options ---
    parser.add_argument(
        "--dropout",
        default=0.1,
        type=float,
        metavar="D",
        help="dropout probability",
    )
    parser.add_argument(
        "--attention-dropout",
        default=0.0,
        type=float,
        metavar="D",
        help="dropout probability for attention weights",
    )
    parser.add_argument(
        "--relu-dropout",
        default=0.0,
        type=float,
        metavar="D",
        help="dropout probability after ReLU in FFN",
    )
    parser.add_argument(
        "--input-dropout",
        type=float,
        metavar="D",
        help="dropout probability of the inputs",
    )
    # --- decoder architecture ---
    parser.add_argument(
        "--decoder-embed-dim",
        type=int,
        metavar="N",
        help="decoder embedding dimension",
    )
    parser.add_argument(
        "--decoder-output-dim",
        type=int,
        metavar="N",
        help="decoder output dimension",
    )
    parser.add_argument(
        "--decoder-input-dim", type=int, metavar="N", help="decoder input dimension"
    )
    parser.add_argument(
        "--decoder-ffn-embed-dim",
        type=int,
        metavar="N",
        help="decoder embedding dimension for FFN",
    )
    parser.add_argument(
        "--decoder-layers", type=int, metavar="N", help="num decoder layers"
    )
    parser.add_argument(
        "--decoder-attention-heads",
        type=int,
        metavar="N",
        help="num decoder attention heads or LightConv/DynamicConv heads",
    )
    parser.add_argument(
        "--decoder-normalize-before",
        default=False,
        action="store_true",
        help="apply layernorm before each decoder block",
    )
    # --- adaptive softmax / adaptive input ---
    parser.add_argument(
        "--adaptive-softmax-cutoff",
        metavar="EXPR",
        help="comma separated list of adaptive softmax cutoff points. "
        "Must be used with adaptive_loss criterion",
    )
    parser.add_argument(
        "--adaptive-softmax-dropout",
        type=float,
        metavar="D",
        help="sets adaptive softmax dropout for the tail projections",
    )
    parser.add_argument(
        "--adaptive-softmax-factor",
        type=float,
        metavar="N",
        help="adaptive input factor",
    )
    parser.add_argument(
        "--no-token-positional-embeddings",
        default=False,
        action="store_true",
        help="if set, disables positional embeddings (outside self attention)",
    )
    parser.add_argument(
        "--share-decoder-input-output-embed",
        default=False,
        action="store_true",
        help="share decoder input and output embeddings",
    )
    # --- character-level embeddings ---
    parser.add_argument(
        "--character-embeddings",
        default=False,
        action="store_true",
        help="if set, uses character embedding convolutions to produce token embeddings",
    )
    parser.add_argument(
        "--character-filters",
        type=str,
        metavar="LIST",
        default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
        help="size of character embeddings",
    )
    parser.add_argument(
        "--character-embedding-dim",
        type=int,
        metavar="N",
        default=4,
        help="size of character embeddings",
    )
    parser.add_argument(
        "--char-embedder-highway-layers",
        type=int,
        metavar="N",
        default=2,
        help="number of highway layers for character token embeddder",
    )
    parser.add_argument(
        "--adaptive-input",
        default=False,
        action="store_true",
        help="if set, uses adaptive input",
    )
    parser.add_argument(
        "--adaptive-input-factor",
        type=float,
        metavar="N",
        help="adaptive input factor",
    )
    parser.add_argument(
        "--adaptive-input-cutoff",
        metavar="EXPR",
        help="comma separated list of adaptive input cutoff points.",
    )
    parser.add_argument(
        "--tie-adaptive-weights",
        action="store_true",
        help="if set, ties the weights of adaptive softmax and adaptive input",
    )
    parser.add_argument(
        "--tie-adaptive-proj",
        action="store_true",
        help="if set, ties the projection weights of adaptive softmax and adaptive input",
    )
    parser.add_argument(
        "--decoder-learned-pos",
        action="store_true",
        help="use learned positional embeddings in the decoder",
    )
    """LightConv and DynamicConv arguments"""
    parser.add_argument(
        "--decoder-kernel-size-list",
        type=lambda x: utils.eval_str_list(x, int),
        help='list of kernel size (default: "[3,7,15,31,31,31]")',
    )
    parser.add_argument(
        "--decoder-glu", type=utils.eval_bool, help="glu after in proj"
    )
    parser.add_argument(
        "--decoder-conv-type",
        default="dynamic",
        type=str,
        choices=["dynamic", "lightweight"],
        help="type of convolution",
    )
    # NOTE(review): --weight-softmax intentionally has no help text upstream
    parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool)
    parser.add_argument(
        "--weight-dropout",
        type=float,
        metavar="D",
        help="dropout probability for conv weights",
    )
def build_model(cls, args, task):
    """Build a new model instance.

    Constructs (optionally pretrained) encoder/decoder embeddings, validates
    the embedding-sharing flag combinations, then assembles an LSTMEncoder
    and LSTMDecoder.

    Raises:
        ValueError: on inconsistent layer counts, invalid embedding-sharing
            combinations, or --decoder-freeze-embed without a decoder embedding.
    """
    # make sure that all args are properly defaulted (in case there are any new ones)
    base_architecture(args)

    if args.encoder_layers != args.decoder_layers:
        raise ValueError("--encoder-layers must match --decoder-layers")

    max_source_positions = getattr(
        args, "max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS
    )
    max_target_positions = getattr(
        args, "max_target_positions", DEFAULT_MAX_TARGET_POSITIONS
    )

    def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
        # Build an embedding sized to the dictionary, then overwrite the rows
        # that appear in the pretrained embedding file.
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        embed_dict = utils.parse_embedding(embed_path)
        utils.print_embed_overlap(embed_dict, dictionary)
        return utils.load_embedding(embed_dict, dictionary, embed_tokens)

    if args.encoder_embed_path:
        pretrained_encoder_embed = load_pretrained_embedding_from_file(
            args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim
        )
    else:
        num_embeddings = len(task.source_dictionary)
        pretrained_encoder_embed = Embedding(
            num_embeddings, args.encoder_embed_dim, task.source_dictionary.pad()
        )

    if args.share_all_embeddings:
        # double check all parameters combinations are valid
        if task.source_dictionary != task.target_dictionary:
            raise ValueError("--share-all-embeddings requires a joint dictionary")
        if args.decoder_embed_path and (
            args.decoder_embed_path != args.encoder_embed_path
        ):
            raise ValueError(
                "--share-all-embed not compatible with --decoder-embed-path"
            )
        if args.encoder_embed_dim != args.decoder_embed_dim:
            raise ValueError(
                "--share-all-embeddings requires --encoder-embed-dim to "
                "match --decoder-embed-dim"
            )
        pretrained_decoder_embed = pretrained_encoder_embed
        # sharing all embeddings implies tied decoder input/output embeddings
        args.share_decoder_input_output_embed = True
    else:
        # separate decoder input embeddings
        pretrained_decoder_embed = None
        if args.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                args.decoder_embed_path,
                task.target_dictionary,
                args.decoder_embed_dim,
            )
    # one last double check of parameter combinations
    if args.share_decoder_input_output_embed and (
        args.decoder_embed_dim != args.decoder_out_embed_dim
    ):
        raise ValueError(
            "--share-decoder-input-output-embeddings requires "
            "--decoder-embed-dim to match --decoder-out-embed-dim"
        )

    if args.encoder_freeze_embed:
        pretrained_encoder_embed.weight.requires_grad = False
    if args.decoder_freeze_embed:
        if pretrained_decoder_embed is None:
            # Previously this crashed with AttributeError ('NoneType' has no
            # attribute 'weight'): without --share-all-embeddings or
            # --decoder-embed-path there is no embedding here to freeze
            # (the decoder builds its own internally).
            raise ValueError(
                "--decoder-freeze-embed requires a decoder embedding "
                "(set --decoder-embed-path or --share-all-embeddings)"
            )
        pretrained_decoder_embed.weight.requires_grad = False

    encoder = LSTMEncoder(
        dictionary=task.source_dictionary,
        embed_dim=args.encoder_embed_dim,
        hidden_size=args.encoder_hidden_size,
        num_layers=args.encoder_layers,
        dropout_in=args.encoder_dropout_in,
        dropout_out=args.encoder_dropout_out,
        bidirectional=args.encoder_bidirectional,
        pretrained_embed=pretrained_encoder_embed,
        max_source_positions=max_source_positions,
    )
    decoder = LSTMDecoder(
        dictionary=task.target_dictionary,
        embed_dim=args.decoder_embed_dim,
        hidden_size=args.decoder_hidden_size,
        out_embed_dim=args.decoder_out_embed_dim,
        num_layers=args.decoder_layers,
        dropout_in=args.decoder_dropout_in,
        dropout_out=args.decoder_dropout_out,
        attention=utils.eval_bool(args.decoder_attention),
        encoder_output_units=encoder.output_units,
        pretrained_embed=pretrained_decoder_embed,
        share_input_output_embed=args.share_decoder_input_output_embed,
        adaptive_softmax_cutoff=(
            utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
            if args.criterion == "adaptive_loss"
            else None
        ),
        max_target_positions=max_target_positions,
        residuals=False,
    )
    return cls(encoder, decoder)
def build_model(cls, args, task, dictionary=None):
    """Build a new model instance.

    Builds a decoder-only LSTM language model.  If ``args.lm_path`` is set,
    the freshly built decoder is discarded and replaced by one restored from
    the given checkpoint instead.

    Args:
        args: parsed model arguments
        task: the task providing dictionaries
        dictionary: optional override for the decoder's output dictionary
    """
    # make sure all arguments are present in older models
    base_architecture(args)

    # prefer an explicit max_target_positions; fall back to tokens_per_sample
    if getattr(args, 'max_target_positions', None) is not None:
        max_target_positions = args.max_target_positions
    else:
        max_target_positions = getattr(args, 'tokens_per_sample', DEFAULT_MAX_TARGET_POSITIONS)

    def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
        # Build an embedding sized to the dictionary, then overwrite the rows
        # that appear in the pretrained embedding file.
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        embed_dict = utils.parse_embedding(embed_path)
        utils.print_embed_overlap(embed_dict, dictionary)
        return utils.load_embedding(embed_dict, dictionary, embed_tokens)

    pretrained_decoder_embed = None
    if args.decoder_embed_path:
        pretrained_decoder_embed = load_pretrained_embedding_from_file(
            args.decoder_embed_path,
            task.target_dictionary,
            args.decoder_embed_dim
        )

    if args.share_decoder_input_output_embed:
        # double check all parameters combinations are valid
        if task.source_dictionary != task.target_dictionary:
            raise ValueError('--share-decoder-input-output-embeddings requires a joint dictionary')

        if args.decoder_embed_dim != args.decoder_out_embed_dim:
            raise ValueError(
                '--share-decoder-input-output-embeddings requires '
                '--decoder-embed-dim to match --decoder-out-embed-dim'
            )

    decoder = LSTMDecoder(
        dictionary=dictionary if dictionary else task.dictionary,
        embed_dim=args.decoder_embed_dim,
        hidden_size=args.decoder_hidden_size,
        out_embed_dim=args.decoder_out_embed_dim,
        num_layers=args.decoder_layers,
        dropout_in=args.decoder_dropout_in,
        dropout_out=args.decoder_dropout_out,
        attention=False,  # decoder-only language model doesn't support attention
        encoder_output_units=0,
        pretrained_embed=pretrained_decoder_embed,
        share_input_output_embed=args.share_decoder_input_output_embed,
        adaptive_softmax_cutoff=(
            utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
            if args.criterion == 'adaptive_loss' else None
        ),
        max_target_positions=max_target_positions,
        residuals=args.residuals
    )

    if getattr(args, "lm_path", None):
        # Replace the decoder built above with one restored from a
        # pretrained LM checkpoint.
        # NOTE(review): `task` and `decoder` are deliberately rebound here to
        # the LM's own task/model; subsequent lines use the LM task — easy to
        # miss when reading.
        print('load LSTM_LM from {}'.format(args.lm_path))
        state = checkpoint_utils.load_checkpoint_to_cpu(args.lm_path)
        lm_args = state["args"]
        lm_args.data = args.data
        # guard against recursively nested LM checkpoints
        assert getattr(lm_args, "lm_path", None) is None

        task = tasks.setup_task(lm_args)

        decoder = task.build_model(lm_args)

        print('restore LSTM_LM from {}'.format(args.lm_path))
        decoder.load_state_dict(state["model"], strict=True)
        decoder.dim_output = len(task.dictionary)

    return cls(decoder)
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # --- dropout options ---
    parser.add_argument(
        "--dropout", type=float, metavar="D", help="dropout probability"
    )
    parser.add_argument(
        "--attention-dropout",
        type=float,
        metavar="D",
        help="dropout probability for attention weights",
    )
    parser.add_argument(
        "--relu-dropout",
        type=float,
        metavar="D",
        help="dropout probability after ReLU in FFN",
    )
    parser.add_argument(
        "--input-dropout",
        type=float,
        metavar="D",
        help="dropout probability of the inputs",
    )
    # --- encoder architecture ---
    parser.add_argument(
        "--encoder-embed-path",
        type=str,
        metavar="STR",
        help="path to pre-trained encoder embedding",
    )
    parser.add_argument(
        "--encoder-embed-dim",
        type=int,
        metavar="N",
        help="encoder embedding dimension",
    )
    parser.add_argument(
        "--encoder-conv-dim",
        type=int,
        metavar="N",
        help="encoder embedding dimension",
    )
    parser.add_argument(
        "--encoder-ffn-embed-dim",
        type=int,
        metavar="N",
        help="encoder embedding dimension for FFN",
    )
    parser.add_argument(
        "--encoder-layers", type=int, metavar="N", help="num encoder layers"
    )
    parser.add_argument(
        "--encoder-attention-heads",
        type=int,
        metavar="N",
        help="num encoder attention heads or LightConv/DynamicConv heads",
    )
    parser.add_argument(
        "--encoder-normalize-before",
        action="store_true",
        help="apply layernorm before each encoder block",
    )
    parser.add_argument(
        "--encoder-learned-pos",
        action="store_true",
        help="use learned positional embeddings in the encoder",
    )
    # --- decoder architecture ---
    parser.add_argument(
        "--decoder-embed-path",
        type=str,
        metavar="STR",
        help="path to pre-trained decoder embedding",
    )
    parser.add_argument(
        "--decoder-embed-dim",
        type=int,
        metavar="N",
        help="decoder embedding dimension",
    )
    parser.add_argument(
        "--decoder-conv-dim",
        type=int,
        metavar="N",
        help="decoder embedding dimension",
    )
    parser.add_argument(
        "--decoder-ffn-embed-dim",
        type=int,
        metavar="N",
        help="decoder embedding dimension for FFN",
    )
    parser.add_argument(
        "--decoder-layers", type=int, metavar="N", help="num decoder layers"
    )
    parser.add_argument(
        "--decoder-attention-heads",
        type=int,
        metavar="N",
        help="num decoder attention heads or LightConv/DynamicConv heads",
    )
    parser.add_argument(
        "--decoder-learned-pos",
        action="store_true",
        help="use learned positional embeddings in the decoder",
    )
    parser.add_argument(
        "--decoder-normalize-before",
        action="store_true",
        help="apply layernorm before each decoder block",
    )
    # --- embedding sharing ---
    parser.add_argument(
        "--share-decoder-input-output-embed",
        action="store_true",
        help="share decoder input and output embeddings",
    )
    parser.add_argument(
        "--share-all-embeddings",
        action="store_true",
        help="share encoder, decoder and output embeddings"
        " (requires shared dictionary and embed dim)",
    )
    # --- adaptive softmax ---
    # (fixed: a stray trailing comma after this call previously made the
    # statement a no-op one-element tuple)
    parser.add_argument(
        "--adaptive-softmax-cutoff",
        metavar="EXPR",
        help="comma separated list of adaptive softmax cutoff points. "
        "Must be used with adaptive_loss criterion",
    )
    parser.add_argument(
        "--adaptive-softmax-dropout",
        type=float,
        metavar="D",
        help="sets adaptive softmax dropout for the tail projections",
    )
    """LightConv and DynamicConv arguments"""
    parser.add_argument(
        "--encoder-kernel-size-list",
        type=lambda x: utils.eval_str_list(x, int),
        help='list of kernel size (default: "[3,7,15,31,31,31,31]")',
    )
    parser.add_argument(
        "--decoder-kernel-size-list",
        type=lambda x: utils.eval_str_list(x, int),
        help='list of kernel size (default: "[3,7,15,31,31,31]")',
    )
    parser.add_argument(
        "--encoder-glu", type=utils.eval_bool, help="glu after in proj"
    )
    parser.add_argument(
        "--decoder-glu", type=utils.eval_bool, help="glu after in proj"
    )
    parser.add_argument(
        "--encoder-conv-type",
        default="dynamic",
        type=str,
        choices=["dynamic", "lightweight"],
        help="type of convolution",
    )
    parser.add_argument(
        "--decoder-conv-type",
        default="dynamic",
        type=str,
        choices=["dynamic", "lightweight"],
        help="type of convolution",
    )
    # NOTE(review): --weight-softmax intentionally has no help text upstream
    parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool)
    parser.add_argument(
        "--weight-dropout",
        type=float,
        metavar="D",
        help="dropout probability for conv weights",
    )
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument("--dropout", type=float, metavar="D", help="dropout probability") parser.add_argument("--encoder-conv-channels", type=str, metavar="EXPR", help="list of encoder convolution\'s out channels") parser.add_argument("--encoder-conv-kernel-sizes", type=str, metavar="EXPR", help="list of encoder convolution\'s kernel sizes") parser.add_argument("--encoder-conv-strides", type=str, metavar="EXPR", help="list of encoder convolution\'s strides") parser.add_argument("--encoder-rnn-hidden-size", type=int, metavar="N", help="encoder rnn\'s hidden size") parser.add_argument("--encoder-rnn-layers", type=int, metavar="N", help="number of rnn encoder layers") parser.add_argument("--encoder-rnn-bidirectional", type=lambda x: utils.eval_bool(x), help="make all rnn layers of encoder bidirectional") parser.add_argument("--encoder-rnn-residual", type=lambda x: utils.eval_bool(x), help="create residual connections for rnn encoder " "layers (starting from the 2nd layer), i.e., the actual " "output of such layer is the sum of its input and output") parser.add_argument("--decoder-embed-dim", type=int, metavar="N", help="decoder embedding dimension") parser.add_argument("--decoder-embed-path", type=str, metavar="STR", help="path to pre-trained decoder embedding") parser.add_argument("--decoder-freeze-embed", action="store_true", help="freeze decoder embeddings") parser.add_argument("--decoder-hidden-size", type=int, metavar="N", help="decoder hidden size") parser.add_argument("--decoder-layers", type=int, metavar="N", help="number of decoder layers") parser.add_argument("--decoder-out-embed-dim", type=int, metavar="N", help="decoder output embedding dimension") parser.add_argument("--decoder-rnn-residual", type=lambda x: utils.eval_bool(x), help="create residual connections for rnn decoder " "layers (starting from the 2nd layer), i.e., the actual " "output of such layer is the sum of its 
input and output") parser.add_argument("--attention-type", type=str, metavar="STR", choices=["bahdanau", "luong"], help="attention type") parser.add_argument("--attention-dim", type=int, metavar="N", help="attention dimension") parser.add_argument("--need-attention", action="store_true", help="need to return attention tensor for the caller") parser.add_argument("--adaptive-softmax-cutoff", metavar="EXPR", help="comma separated list of adaptive softmax cutoff points. " "Must be used with adaptive_loss criterion") parser.add_argument("--share-decoder-input-output-embed", type=lambda x: utils.eval_bool(x), help="share decoder input and output embeddings") parser.add_argument("--pretrained-lm-checkpoint", type=str, metavar="STR", help="path to load checkpoint from pretrained language model(LM), " "which will be present and kept fixed during training.") # Granular dropout settings (if not specified these default to --dropout) parser.add_argument("--encoder-rnn-dropout-in", type=float, metavar="D", help="dropout probability for encoder rnn\'s input") parser.add_argument("--encoder-rnn-dropout-out", type=float, metavar="D", help="dropout probability for encoder rnn\'s output") parser.add_argument("--decoder-dropout-in", type=float, metavar="D", help="dropout probability for decoder input embedding") parser.add_argument("--decoder-dropout-out", type=float, metavar="D", help="dropout probability for decoder output") # Scheduled sampling options parser.add_argument("--scheduled-sampling-probs", type=lambda p: utils.eval_str_list(p), metavar="P_1,P_2,...,P_N", default=[1.0], help="scheduled sampling probabilities of sampling the truth " "labels for N epochs starting from --start-schedule-sampling-epoch; " "all later epochs using P_N") parser.add_argument("--start-scheduled-sampling-epoch", type=int, metavar="N", default=1, help="start scheduled sampling from the specified epoch")
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    """Build a transformer-style decoder.

    Args:
        args: parsed argument namespace carrying the decoder
            hyperparameters (dims, layer count, layerdrop, quant-noise,
            adaptive-softmax settings, ...).
        dictionary: target-side vocabulary; its length sizes the output
            projection / adaptive softmax.
        embed_tokens: token embedding module; provides ``embedding_dim``
            and ``padding_idx`` and may be tied to the output projection.
        no_encoder_attn (bool): forwarded to ``build_decoder_layer``;
            presumably disables encoder-decoder attention — confirm there.
    """
    self.args = args
    super().__init__(dictionary)
    # Checkpoint format version, persisted in the state_dict.
    self.register_buffer("version", torch.Tensor([3]))
    # Empty placeholder; NOTE(review): presumably a cached causal mask
    # grown on demand in forward — confirm against the forward pass.
    self._future_mask = torch.empty(0)
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__)
    self.decoder_layerdrop = args.decoder_layerdrop
    # only_drop_topk > 0 switches layerdrop to PartLayerDropModuleList below.
    self.only_drop_topk = args.only_drop_topk
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = args.decoder_output_dim

    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    # Embeddings are scaled by sqrt(embed_dim) unless explicitly disabled.
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(
        embed_dim)

    # Quantization noise (iterative product quantization) on a linear
    # transform of the embeddings; skipped when adaptive input is used.
    if not args.adaptive_input and args.quant_noise_pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        self.quant_noise = None

    # Project token embeddings up/down only when dims differ.
    self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                           if embed_dim != input_embed_dim else None)
    self.embed_positions = (PositionalEmbedding(
        args.max_target_positions,
        embed_dim,
        self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None)

    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    self.cross_self_attention = getattr(args, "cross_self_attention", False)

    # Layer container: plain ModuleList without layerdrop; with layerdrop,
    # either the standard LayerDropModuleList or the top-k-restricted
    # variant when only_drop_topk is set.
    if self.decoder_layerdrop > 0.0:
        if self.only_drop_topk > 0:
            self.layers = PartLayerDropModuleList(
                p=self.decoder_layerdrop,
                top_k=self.only_drop_topk,
                layer_num=args.decoder_layers)
        else:
            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_decoder_layer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.num_layers = len(self.layers)

    # Final layer norm only for pre-norm decoders, unless suppressed.
    if args.decoder_normalize_before and not getattr(
            args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None

    # Output-side dim projection; skipped when weights are tied adaptively.
    self.project_out_dim = (Linear(
        embed_dim, self.output_embed_dim, bias=False)
                            if embed_dim != self.output_embed_dim
                            and not args.tie_adaptive_weights else None)

    # Output head: exactly one of adaptive_softmax / output_projection is
    # populated; the other stays None.
    self.adaptive_softmax = None
    self.output_projection = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif self.share_input_output_embed:
        # Tie the output projection to the input embedding matrix.
        self.output_projection = nn.Linear(
            self.embed_tokens.weight.shape[1],
            self.embed_tokens.weight.shape[0],
            bias=False,
        )
        self.output_projection.weight = self.embed_tokens.weight
    else:
        self.output_projection = nn.Linear(
            self.output_embed_dim, len(dictionary), bias=False)
        nn.init.normal_(self.output_projection.weight,
                        mean=0, std=self.output_embed_dim**-0.5)
def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument('--relu-dropout', type=float, metavar='D', help='dropout probability after ReLU in FFN') parser.add_argument('--input-dropout', type=float, metavar='D', help='dropout probability of the inputs') parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding') parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--encoder-conv-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers') parser.add_argument( '--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads or LightConv/DynamicConv heads') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument( '--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-conv-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument( '--decoder-attention-heads', type=int, metavar='N', 
help='num decoder attention heads or LightConv/DynamicConv heads') parser.add_argument( '--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings' ' (requires shared dictionary and embed dim)') parser.add_argument( '--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. ' 'Must be used with adaptive_loss criterion'), parser.add_argument( '--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections') """LightConv and DynamicConv arguments""" parser.add_argument( '--encoder-kernel-size-list', type=lambda x: utils.eval_str_list(x, int), help='list of kernel size (default: "[3,7,15,31,31,31,31]")') parser.add_argument( '--decoder-kernel-size-list', type=lambda x: utils.eval_str_list(x, int), help='list of kernel size (default: "[3,7,15,31,31,31]")') parser.add_argument('--encoder-glu', type=utils.eval_bool, help='glu after in proj') parser.add_argument('--decoder-glu', type=utils.eval_bool, help='glu after in proj') parser.add_argument('--encoder-conv-type', default='dynamic', type=str, choices=['dynamic', 'lightweight'], help='type of convolution') parser.add_argument('--decoder-conv-type', default='dynamic', type=str, choices=['dynamic', 'lightweight'], help='type of convolution') parser.add_argument('--weight-softmax', default=True, type=utils.eval_bool) parser.add_argument('--weight-dropout', type=float, metavar='D', help='dropout probability for conv weights')