def transformer_lm_gpt3_175(args):
    """GPT-3 175B-parameter preset: 96 layers, 12288-dim embeddings, 96 heads."""
    for key, value in (
        ("decoder_layers", 96),
        ("decoder_embed_dim", 12288),
        ("decoder_attention_heads", 96),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    base_gpt3_architecture(args)
def transformer_lm_gpt3_6_7(args):
    """GPT-3 6.7B-parameter preset: 32 layers, 4096-dim embeddings, 32 heads."""
    for key, value in (
        ("decoder_layers", 32),
        ("decoder_embed_dim", 4096),
        ("decoder_attention_heads", 32),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    base_gpt3_architecture(args)
def transformer_lm_gpt3_13(args):
    """GPT-3 13B-parameter preset: 40 layers, 5120-dim embeddings, 40 heads."""
    for key, value in (
        ("decoder_layers", 40),
        ("decoder_embed_dim", 5120),
        ("decoder_attention_heads", 40),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    base_gpt3_architecture(args)
def transformer_lm_gpt3_large(args):
    """GPT-3 'large' (760M-parameter) preset: 24 layers, 1536 dims, 16 heads."""
    for key, value in (
        ("decoder_layers", 24),
        ("decoder_embed_dim", 1536),
        ("decoder_attention_heads", 16),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    base_gpt3_architecture(args)
def transformer_lm_gpt3_xl(args):
    """GPT-3 'XL' (1.3B-parameter) preset: 24 layers, 2048 dims, 32 heads."""
    for key, value in (
        ("decoder_layers", 24),
        ("decoder_embed_dim", 2048),
        ("decoder_attention_heads", 32),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    base_gpt3_architecture(args)
def transformer_lm_gpt3_small(args):
    """GPT-3 'small' (125M-parameter) preset: 12 layers, 768 dims, 12 heads."""
    for key, value in (
        ("decoder_layers", 12),
        ("decoder_embed_dim", 768),
        ("decoder_attention_heads", 12),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    base_gpt3_architecture(args)
def transformer_lm_baevski_gbw(args):
    """Baevski & Auli GBW preset layered on top of ``transformer_lm_big``."""
    for key, value in (
        ("decoder_embed_dim", 512),
        ("dropout", 0.1),
        ("attention_dropout", 0.1),
        ("no_decoder_final_norm", True),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    transformer_lm_big(args)
def transformer_lm_big(args):
    """Big transformer LM preset: 12 layers, 1024/4096 dims, 16 heads."""
    for key, value in (
        ("decoder_layers", 12),
        ("decoder_embed_dim", 1024),
        ("decoder_ffn_embed_dim", 4096),
        ("decoder_attention_heads", 16),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    base_lm_architecture(args)
def xlm_architecture(args):
    """XLM-sized encoder preset: 16 layers, 1280-dim embeddings, 16 heads."""
    for key, value in (
        ("encoder_layers", 16),
        ("encoder_embed_dim", 1280),
        ("encoder_ffn_embed_dim", 1280 * 4),
        ("encoder_attention_heads", 16),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    base_architecture(args)
def roberta_large_architecture(args):
    """RoBERTa-large encoder preset: 24 layers, 1024/4096 dims, 16 heads."""
    for key, value in (
        ("encoder_layers", 24),
        ("encoder_embed_dim", 1024),
        ("encoder_ffn_embed_dim", 4096),
        ("encoder_attention_heads", 16),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    base_architecture(args)
def base_gpt3_architecture(args):
    """Shared GPT-3 defaults, applied before ``base_lm_architecture``.

    Forces input/output projection dims to match the embedding dim and
    ties the decoder input/output embeddings.
    """
    embed_dim = args.decoder_embed_dim
    args.decoder_input_dim = embed_dim
    args.decoder_output_dim = embed_dim
    # FFN defaults to 4x the embedding dim unless explicitly overridden
    args.decoder_ffn_embed_dim = safe_getattr(
        args, "decoder_ffn_embed_dim", embed_dim * 4
    )
    # GPT-3 used learned positional embeddings, rather than sinusoidal
    args.decoder_learned_pos = safe_getattr(args, "decoder_learned_pos", True)
    args.dropout = safe_getattr(args, "dropout", 0.0)
    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.0)
    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
    # always tied for the GPT-3 family, regardless of caller settings
    args.share_decoder_input_output_embed = True
    base_lm_architecture(args)
def build_model(cls, args, task):
    """Build a new model instance.

    Chooses the token-embedding module from ``args`` (character CNN,
    adaptive input, or a plain embedding), validates adaptive
    weight-tying constraints, and wraps everything in a decoder-only
    ``TransformerDecoder``.
    """
    # layer pruning: keeping a subset of layers overrides decoder_layers
    if args.decoder_layers_to_keep:
        args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
    if safe_getattr(args, "max_target_positions", None) is None:
        args.max_target_positions = safe_getattr(
            args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
        )
    if args.character_embeddings:
        # NOTE(review): eval() on args.character_filters executes arbitrary
        # code from the config string — only safe with trusted configs.
        embed_tokens = CharacterTokenEmbedder(
            task.source_dictionary,
            eval(args.character_filters),
            args.character_embedding_dim,
            args.decoder_embed_dim,
            args.char_embedder_highway_layers,
        )
    elif args.adaptive_input:
        embed_tokens = AdaptiveInput(
            len(task.source_dictionary),
            task.source_dictionary.pad(),
            args.decoder_input_dim,
            args.adaptive_input_factor,
            args.decoder_embed_dim,
            options.eval_str_list(args.adaptive_input_cutoff, type=int),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        embed_tokens = cls.build_embedding(
            args, task.source_dictionary, args.decoder_input_dim
        )
    if args.tie_adaptive_weights:
        # tying adaptive softmax to adaptive input requires matching
        # factors, cutoffs, and projection dims
        assert args.adaptive_input
        assert args.adaptive_input_factor == args.adaptive_softmax_factor
        assert (
            args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
        ), "{} != {}".format(
            args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
        )
        assert args.decoder_input_dim == args.decoder_output_dim
    # decoder-only language model: no encoder attention
    decoder = TransformerDecoder(
        args, task.target_dictionary, embed_tokens, no_encoder_attn=True
    )
    return cls(decoder)
def __setattr__(self, name, value):
    """Route prefixed names (e.g. ``decoder_embed_dim``) to the matching
    sub-config attribute; everything else is set normally."""
    parsed = re.match(_NAME_PARSER, name)
    if parsed is None:
        super().__setattr__(name, value)
    else:
        # group 1 is the sub-config name, group 2 the field on it
        target = safe_getattr(self, parsed[1])
        setattr(target, parsed[2], value)
def _copy_keys(args, cls, prefix, seen):
    """
    copy the prefixed keys (decoder_embed_dim) to the DC fields: decoder.embed_dim
    """
    cfg = cls()
    for fld in fields(cls):
        # try the prefixed spelling first (e.g. "decoder_embed_dim"), then
        # the bare field name; when both exist the bare name wins, matching
        # the original lookup order
        for candidate in (f"{prefix}_{fld.name}", fld.name):
            if safe_hasattr(args, candidate):
                seen.add(candidate)
                setattr(cfg, fld.name, safe_getattr(args, candidate))
    return cfg
def from_namespace(cls, args):
    """Build a structured config from a flat argparse-style namespace.

    Returns ``args`` unchanged if it is already an instance of ``cls``;
    returns ``None`` for ``None``. Otherwise copies known fields into a
    fresh ``cls()``, mapping prefixed flat keys onto the decoder/encoder/
    quant_noise sub-configs, and finally copies any leftover namespace
    entries verbatim for backward compatibility.
    """
    if args is None:
        return None
    if not isinstance(args, cls):
        seen = set()
        config = cls()
        # currently, we can go generically from DC fields to args hierarchically
        # but we can't easily deconstruct a flat namespace to a hierarchical
        # DC. Mostly because we could have a sub-dc called `decoder-foo` that should not
        # go to the sub struct called `decoder`. There are ways to go around this, but let's keep it simple
        # for now.
        for fld in fields(cls):
            # concretely, the transformer_config knows what sub-dc it has, so we go through all the dc fields
            # and if it's one that has a sub-dc, we build that sub-dc with `_copy_keys()`
            if fld.name == "decoder":
                if safe_hasattr(args, "decoder"):
                    # in some cases, the args we receive is already structured (as DictConfigs), so let's just build the correct DC
                    seen.add("decoder")
                    config.decoder = DecoderConfig(**args.decoder)
                else:
                    config.decoder = cls._copy_keys(
                        args, DecoderConfig, "decoder", seen)
            elif fld.name == "encoder":
                # same but for encoder
                if safe_hasattr(args, "encoder"):
                    seen.add("encoder")
                    config.encoder = EncDecBaseConfig(**args.encoder)
                else:
                    config.encoder = cls._copy_keys(
                        args, EncDecBaseConfig, "encoder", seen)
            elif fld.name == "quant_noise":
                # same but for quant_noise
                if safe_hasattr(args, "quant_noise"):
                    seen.add("quant_noise")
                    config.quant_noise = QuantNoiseConfig(
                        **args.quant_noise)
                else:
                    config.quant_noise = cls._copy_keys(
                        args, QuantNoiseConfig, "quant_noise", seen)
            elif safe_hasattr(args, fld.name):
                # if it's not a structure field, it's just a normal field, copy it over
                seen.add(fld.name)
                setattr(config, fld.name, safe_getattr(args, fld.name))
        # we got all the fields defined in the dataclass, but
        # the argparse namespace might have extra args for two reasons:
        #  - we are in a legacy class so all the args are not declared in the dataclass. Ideally once everyone has defined a dataclass for their model, we won't need this
        #  - some places expect args to be there but never define them
        args_dict = (
            args._asdict()
            if safe_hasattr(args, "_asdict")
            else vars(args)
            if safe_hasattr(args, "__dict__")
            else {}
        )  # namedtuple doesn't have __dict__ :-/
        for key, value in args_dict.items():
            if key not in seen:
                setattr(config, key, value)
        return config
    else:
        return args
def transformer_lm_gpt2_big(args):
    """GPT-2 'big' preset: 48 layers, 1600/6400 dims, 25 heads, gelu."""
    for key, value in (
        ("decoder_embed_dim", 1600),
        ("decoder_ffn_embed_dim", 6400),
        ("decoder_layers", 48),
        ("decoder_attention_heads", 25),
        ("dropout", 0.1),
        ("attention_dropout", 0.1),
        ("activation_fn", "gelu"),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    base_lm_architecture(args)
def transformer_lm_gpt2_medium(args):
    """GPT-2 'medium' preset: 36 layers, 1280/5120 dims, 20 heads, gelu."""
    for key, value in (
        ("decoder_embed_dim", 1280),
        ("decoder_ffn_embed_dim", 5120),
        ("decoder_layers", 36),
        ("decoder_attention_heads", 20),
        ("dropout", 0.1),
        ("attention_dropout", 0.1),
        ("activation_fn", "gelu"),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    base_lm_architecture(args)
def build_self_attention_selection(
    self,
    embed_dim,
    args,
    self_attn_head_selector=None,
    add_bias_kv=False,
    add_zero_attn=False,
):
    """Construct the head-selecting self-attention module for this layer."""
    # plain self-attention unless cross_self_attention is enabled in args
    is_self_attention = not safe_getattr(args, "cross_self_attention")
    return MultiheadAttentionSelection(
        embed_dim,
        args.total_decoder_attention_heads,
        args.decoder_attention_heads,
        dropout=args.attention_dropout,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
        self_attention=is_self_attention,
        q_noise=self.quant_noise,
        qn_block_size=self.quant_noise_block_size,
        layer_idx=self.layer_idx,
        attn_head_selector=self_attn_head_selector,
    )
def __getattr__(self, name):
    """Resolve prefixed names (e.g. ``decoder_embed_dim``) by delegating to
    the matching sub-config; any other missing name raises AttributeError."""
    parsed = re.match(_NAME_PARSER, name)
    if parsed is None:
        raise AttributeError(f"invalid argument {name}.")
    # group 1 is the sub-config name, group 2 the field on it
    sub_config = safe_getattr(self, parsed[1])
    return safe_getattr(sub_config, parsed[2])
def base_architecture(args):
    """RoBERTa-base encoder defaults: 12 layers, 768/3072 dims, 12 heads,
    with the BERT-style structural choices (learned positions, embedding
    layernorm, no embedding scaling, gelu)."""
    # NOTE: max_source_positions is deliberately seeded from the
    # "max_positions" key, so it is handled outside the loop below
    args.max_source_positions = safe_getattr(args, "max_positions", 512)
    for key, value in (
        ("encoder_layers", 12),
        ("encoder_embed_dim", 768),
        ("encoder_ffn_embed_dim", 3072),
        ("encoder_attention_heads", 12),
        ("dropout", 0.1),
        ("attention_dropout", 0.1),
        ("activation_dropout", 0.0),
        ("pooler_dropout", 0.0),
        ("no_token_positional_embeddings", False),
        # BERT has a few structural differences compared to the original
        # Transformer
        ("encoder_learned_pos", True),
        ("layernorm_embedding", True),
        ("no_scale_embedding", True),
        ("activation_fn", "gelu"),
        ("encoder_normalize_before", False),
        ("pooler_activation_fn", "tanh"),
        ("untie_weights_roberta", False),
        # Adaptive input config
        ("adaptive_input", False),
        # LayerDrop config
        ("encoder_layerdrop", 0.0),
        ("encoder_layers_to_keep", None),
        # Quantization noise config
        ("quant_noise_pq", 0),
        ("quant_noise_pq_block_size", 8),
        ("quant_noise_scalar", 0),
        # R4F config
        ("spectral_norm_classification_head", False),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
def transformer_lm_baevski_wiki103(args):
    """Baevski & Auli WikiText-103 preset: 16 layers, tied adaptive
    input/softmax with 20000,60000 cutoffs, layered on transformer_lm_big."""
    for key, value in (
        ("decoder_layers", 16),
        ("decoder_attention_heads", 8),
        ("dropout", 0.3),
        ("adaptive_input", True),
        ("tie_adaptive_weights", True),
        ("adaptive_input_cutoff", "20000,60000"),
        ("adaptive_softmax_cutoff", "20000,60000"),
        ("adaptive_softmax_dropout", 0.2),
        ("attention_dropout", 0.1),
        ("activation_dropout", 0.1),
        ("no_decoder_final_norm", True),
        ("tie_adaptive_proj", True),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    transformer_lm_big(args)
def base_lm_architecture(args):
    """Baseline transformer LM defaults shared by every LM preset.

    Also translates legacy checkpoint flags (``no_tie_adaptive_proj``,
    ``decoder_final_norm``) into their current equivalents before filling
    in defaults.
    """
    # backward compatibility for older model checkpoints
    if safe_hasattr(args, "no_tie_adaptive_proj"):
        # previous models defined --no-tie-adaptive-proj, so use the existence
        # of that option to determine if this is an "old" model checkpoint
        args.no_decoder_final_norm = True  # old models always set this to True
        if args.no_tie_adaptive_proj is False:
            args.tie_adaptive_proj = True
    if safe_hasattr(args, "decoder_final_norm"):
        args.no_decoder_final_norm = not args.decoder_final_norm

    for key, value in (
        ("dropout", 0.1),
        ("attention_dropout", 0.0),
        ("decoder_embed_dim", 512),
        ("decoder_ffn_embed_dim", 2048),
        ("decoder_layers", 6),
        ("decoder_attention_heads", 8),
        ("adaptive_softmax_cutoff", None),
        ("adaptive_softmax_dropout", 0),
        ("adaptive_softmax_factor", 4),
        ("decoder_learned_pos", False),
        ("activation_fn", "relu"),
        ("decoder_layerdrop", 0),
        ("decoder_layers_to_keep", None),
        ("quant_noise_pq", 0),
        ("quant_noise_pq_block_size", 8),
        ("quant_noise_scalar", 0),
        ("base_layers", 0),
        ("base_sublayers", 1),
        ("base_shuffle", False),
        ("add_bos_token", False),
        ("no_token_positional_embeddings", False),
        ("share_decoder_input_output_embed", False),
        ("character_embeddings", False),
        ("no_decoder_final_norm", False),
        ("adaptive_input", False),
        ("adaptive_input_factor", 4),
        ("adaptive_input_cutoff", None),
        ("tie_adaptive_weights", False),
        ("tie_adaptive_proj", False),
        ("no_scale_embedding", False),
        ("layernorm_embedding", False),
        ("checkpoint_activations", False),
        ("offload_activations", False),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))

    # these two default to the (now-resolved) embedding dimension
    args.decoder_output_dim = safe_getattr(
        args, "decoder_output_dim", args.decoder_embed_dim
    )
    args.decoder_input_dim = safe_getattr(
        args, "decoder_input_dim", args.decoder_embed_dim
    )
    # Model training is not stable without this
    args.decoder_normalize_before = True
    # offloading activations implies checkpointing them
    if args.offload_activations:
        args.checkpoint_activations = True
def roberta_prenorm_architecture(args):
    """Pre-norm RoBERTa variant: normalize before each encoder sublayer and
    skip the embedding layernorm, then apply the base defaults."""
    for key, value in (
        ("layernorm_embedding", False),
        ("encoder_normalize_before", True),
    ):
        # only fill in values the caller did not already set
        setattr(args, key, safe_getattr(args, key, value))
    base_architecture(args)
def __init__(
    self, cfg, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
):
    """Build one transformer decoder layer from a structured config.

    Args:
        cfg: structured config with ``decoder`` and ``quant_noise``
            sub-configs plus top-level dropout/activation options.
        no_encoder_attn: when True, skip the encoder-attention sublayer
            (decoder-only / language-model use).
        add_bias_kv: forwarded to the self-attention builder.
        add_zero_attn: forwarded to the self-attention builder.
    """
    super().__init__()
    self.embed_dim = cfg.decoder.embed_dim
    self.dropout_module = FairseqDropout(
        cfg.dropout, module_name=self.__class__.__name__
    )
    self.quant_noise = cfg.quant_noise.pq
    self.quant_noise_block_size = cfg.quant_noise.pq_block_size
    self.cross_self_attention = cfg.cross_self_attention
    self.self_attn = self.build_self_attention(
        self.embed_dim,
        cfg,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )
    # optional layernorm on attention output (enabled via cfg.scale_attn)
    self.attn_ln = (
        LayerNorm(self.embed_dim)
        if utils.safe_getattr(cfg, "scale_attn", False)
        else None
    )
    self.nh = self.self_attn.num_heads
    self.head_dim = self.self_attn.head_dim
    # optional learned per-head scaling (enabled via cfg.scale_heads)
    scale_heads = utils.safe_getattr(cfg, "scale_heads", False)
    self.c_attn = (
        nn.Parameter(torch.ones((self.nh,)), requires_grad=True)
        if scale_heads
        else None
    )
    self.activation_fn = utils.get_activation_fn(activation=cfg.activation_fn)
    activation_dropout_p = cfg.activation_dropout
    if activation_dropout_p == 0:
        # for backwards compatibility with models that use cfg.relu_dropout
        activation_dropout_p = cfg.relu_dropout or 0
    self.activation_dropout_module = FairseqDropout(
        float(activation_dropout_p), module_name=self.__class__.__name__
    )
    self.normalize_before = cfg.decoder.normalize_before
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
    if no_encoder_attn:
        # decoder-only configuration: no cross-attention sublayer
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = self.build_encoder_attention(self.embed_dim, cfg)
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
    # optional layernorm between the two FFN projections (cfg.scale_fc)
    self.ffn_layernorm = (
        LayerNorm(cfg.decoder.ffn_embed_dim)
        if utils.safe_getattr(cfg, "scale_fc", False)
        else None
    )
    # optional learned residual scaling (cfg.scale_resids)
    self.w_resid = (
        nn.Parameter(
            torch.ones(
                self.embed_dim,
            ),
            requires_grad=True,
        )
        if utils.safe_getattr(cfg, "scale_resids", False)
        else None
    )
    self.fc1 = self.build_fc1(
        self.embed_dim,
        cfg.decoder.ffn_embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )
    self.fc2 = self.build_fc2(
        cfg.decoder.ffn_embed_dim,
        self.embed_dim,
        self.quant_noise,
        self.quant_noise_block_size,
    )
    self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
    self.need_attn = True
    self.onnx_trace = False