def add_args(parser):
    parser.add_argument(
        "data",
        help="colon separated path to data directories list, "
        "will be iterated upon during epochs in round-robin manner",
        action=FileContentsAction,
    )
    parser.add_argument(
        "--langs",
        default=None,
        type=csv_str_list,
        help="a comma separated list of languages which can appear in lang-pairs; "
        "note that the ordering determines language token IDs",
    )
    parser.add_argument(
        "--lang-dict",
        default=None,
        type=str,
        help="an external file which contains a list of "
        "languages which can appear in lang-pairs; "
        "note that the ordering determines language token IDs; "
        "--langs and --lang-dict are two exclusive options",
    )
    parser.add_argument(
        "--source-dict",
        default=None,
        type=str,
        help="path to source dictionary; if specified it will override "
        "per language dictionary loading",
    )
    parser.add_argument(
        "--target-dict",
        default=None,
        type=str,
        help="path to target dictionary; if specified it will override "
        "per language dictionary loading",
    )
    parser.add_argument(
        "--lang-tok-style",
        default=LangTokStyle.multilingual.value,
        type=str,
        choices=[LangTokStyle.multilingual.value, LangTokStyle.mbart.value],
        help="language token styles",
    )
    parser.add_argument(
        "--load-alignments",
        action="store_true",
        help="load the binarized alignments",
    )
    parser.add_argument(
        "--left-pad-source",
        default="True",
        type=str,
        metavar="BOOL",
        help="pad the source on the left",
    )
    parser.add_argument(
        "--left-pad-target",
        default="False",
        type=str,
        metavar="BOOL",
        help="pad the target on the left",
    )
    parser.add_argument(
        "--max-source-positions",
        default=1024,
        type=int,
        metavar="N",
        help="max number of tokens in the source sequence",
    )
    parser.add_argument(
        "--max-target-positions",
        default=1024,
        type=int,
        metavar="N",
        help="max number of tokens in the target sequence",
    )
    parser.add_argument(
        "--upsample-primary",
        default=1,
        type=int,
        help="amount to upsample primary dataset",
    )
    parser.add_argument(
        "--truncate-source",
        action="store_true",
        default=False,
        help="truncate source to max-source-positions",
    )
    parser.add_argument(
        "--encoder-langtok",
        default=None,
        type=str,
        choices=[EncoderLangtok.src.value, EncoderLangtok.tgt.value],
        metavar="SRCTGT",
        help="prepend to the beginning of source sentence the source or target "
        "language token (src/tgt)",
    )
    parser.add_argument(
        "--decoder-langtok",
        action="store_true",
        help="prepend to the beginning of target sentence the target language token",
    )
    parser.add_argument(
        "--lang-tok-replacing-bos-eos", action="store_true", default=False
    )
    parser.add_argument(
        "--enable-lang-ids",
        default=False,
        action="store_true",
        help="whether to include language IDs in samples",
    )
    parser.add_argument(
        "--enable-reservsed-directions-shared-datasets",
        default=False,
        action="store_true",
        help="whether to allow datasets to be used in reversed directions",
    )
    parser.add_argument(
        "--extra-data",
        help='a dictionary of data name to its path, '
        'e.g. {"mined": path_to_mined_data, "denoised": path_to_denoised_data}',
        type=lambda uf: eval_str_dict(uf, type=str),
        default=None,
    )
    parser.add_argument(
        "--extra-lang-pairs",
        help='a dictionary of data name to the language pairs they serve, '
        'e.g. {"mined": comma-separated-lang-pairs, "denoised": comma-separated-lang-pairs}',
        type=lambda uf: eval_str_dict(uf, type=str),
        default=None,
    )
    parser.add_argument(
        "--fixed-dictionary",
        help="fixed dictionary to use with model path",
        default=None,
        type=str,
    )
    parser.add_argument(
        "--langtoks-specs",
        help='a list of comma separated data types for which a set of language tokens '
        'will be specialized, e.g. "main,dae,mined". A set of language tokens is added '
        'to the vocab to distinguish languages in different training data types. '
        'If not specified, default language tokens per language will be added',
        default=LangTokSpec.main.value,
        type=csv_str_list,
    )
    parser.add_argument(
        "--langtoks",
        help='a dictionary of how to add language tokens, '
        'e.g. {"mined": (None, "tgt"), "mono_dae": ("src.dae", "tgt"), '
        '"main": ("src", "tgt")}, or {"mined": ("src.mined", "tgt")}',
        default=None,
        type=lambda uf: eval_str_dict(uf, type=str),
    )
    parser.add_argument(
        "--sampling-weights-from-file",
        help='a file containing a python dictionary of how to sample data sets, '
        'e.g. {"main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, '
        '"mono_dae:es_XX-es_XX": 0.3, "main:en_XX-fr_XX": 0.8}',
        default=None,
        type=str,
    )
    parser.add_argument(
        "--sampling-weights",
        help='a dictionary of how to sample data sets, '
        'e.g. {"main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, '
        '"mono_dae:es_XX-es_XX": 0.3, "main:en_XX-fr_XX": 0.8}',
        default=None,
        type=lambda uf: eval_str_dict(uf, type=str),
    )
    parser.add_argument(
        "--virtual-epoch-size",
        default=None,
        type=int,
        help="virtual epoch size to speed up data loading",
    )
    parser.add_argument(
        "--virtual-data-size",
        default=None,
        type=int,
        help="virtual data size of the whole joint dataset to speed "
        "up data loading and have specific dynamic sampling strategy interval",
    )
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    relaxed_attention_weight=0.0,
    q_noise=0.0,
    qn_block_size=8,
    # TODO: pass in config rather than string.
    # config defined in xformers.components.attention.AttentionConfig
    xformers_att_config: Optional[str] = None,
    xformers_blocksparse_layout: Optional[torch.Tensor] = None,  # This should be part of the config
    xformers_blocksparse_blocksize: Optional[int] = 16,  # This should be part of the config
    positional_embedding=None,
):
    super().__init__()

    xformers_att_config = utils.eval_str_dict(xformers_att_config)
    self.use_xformers = xformers_att_config is not None
    if self.use_xformers and not _xformers_available:
        raise ImportError("\n\n Please install xFormers.")

    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

    self.num_heads = num_heads
    self.dropout_module = FairseqDropout(
        dropout, module_name=self.__class__.__name__
    )

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim**-0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention
    self.relaxed_attention_weight = relaxed_attention_weight

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn
    self.beam_size = 1

    self.positional_embedding = positional_embedding
    if (
        self.positional_embedding is not None
        and not self.positional_embedding.learnable
    ):
        self.pos_bias_u = nn.Parameter(torch.Tensor(embed_dim))
        self.pos_bias_v = nn.Parameter(torch.Tensor(embed_dim))
        self.pos_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=False), q_noise, qn_block_size
        )
    else:
        self.pos_bias_u = self.pos_bias_v = self.pos_proj = None

    self.reset_parameters()

    if self.use_xformers:
        xformers_att_config["dropout"] = xformers_att_config.get("dropout", dropout)
        xformers_att_config["num_heads"] = xformers_att_config.get(
            "num_heads", num_heads
        )

        if xformers_blocksparse_layout is not None:
            # Could be part of a single config passed only once
            xformers_att_config["block_size"] = xformers_blocksparse_blocksize
            xformers_att_config["layout"] = xformers_blocksparse_layout
            xformers_att_config["name"] = "blocksparse"

        self.attention = build_attention(xformers_att_config)

    self.onnx_trace = False
    self.skip_embed_dim_check = False
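

# Usage sketch (illustrative, not from the original source): instantiating the
# attention module for self-attention. The class name (assumed here to be
# MultiheadAttention, following fairseq) and the (query, key, value) forward
# signature with (seq_len, batch, embed_dim) inputs are assumptions based on
# fairseq's convention; they are not shown in this excerpt.
def _example_self_attention():
    import torch

    attn = MultiheadAttention(
        embed_dim=512,
        num_heads=8,
        dropout=0.1,
        self_attention=True,  # query/key/value share the same input size
    )
    x = torch.randn(16, 2, 512)  # (seq_len, batch, embed_dim)
    out, attn_weights = attn(query=x, key=x, value=x)
    return out.shape  # expected: torch.Size([16, 2, 512])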