def __init__(self, opt, dicts, positional_encoder, language_embeddings=None, ignore_source=False):

    self.death_rate = opt.death_rate
    self.n_heads = opt.n_heads
    self.checkpointing = opt.checkpointing
    self.absolute_position_encoding = opt.absolute_position_encoding
    self.late_emb_scale = opt.decoder_late_emb_scale
    self.learnable_position_encoding = opt.learnable_position_encoding
    self.max_pos_length = opt.max_pos_length
    self.reversible = opt.tgt_reversible

    # build_modules will be called from the inherited constructor
    super(RelativeTransformerDecoder, self).__init__(opt, dicts, positional_encoder,
                                                     language_embeddings, ignore_source,
                                                     allocate_positions=False)

    if self.learnable_position_encoding:
        self.positional_encoder = None
    else:
        if not self.absolute_position_encoding:
            # or using the pre-set sinusoidal table (relative positions)
            self.positional_encoder = SinusoidalPositionalEmbedding(opt.model_size)
        else:
            self.positional_encoder = FastSinusoidalPositionalEncoding(opt.model_size)

    self.d_head = self.model_size // self.n_heads

    if opt.rezero or opt.post_norm:
        self.postprocess_layer = Identity()
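# A minimal sketch of the sin/cos table that a module like SinusoidalPositionalEmbedding
# typically builds (an assumption for illustration: the actual module in this repo may
# differ in signature and caching). It follows the standard construction from
# "Attention Is All You Need" and accepts negative offsets, as relative attention needs.
import math

import torch
import torch.nn as nn


class SinusoidalTableSketch(nn.Module):
    """Hypothetical stand-in: maps (possibly negative) positions to sin/cos encodings."""

    def __init__(self, model_size):
        super().__init__()
        # inverse frequencies for each pair of (sin, cos) channels
        inv_freq = torch.exp(
            torch.arange(0, model_size, 2, dtype=torch.float32)
            * (-math.log(10000.0) / model_size)
        )
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, positions):
        # positions: integer tensor of any shape; output has one extra dim of model_size
        angles = positions.float().unsqueeze(-1) * self.inv_freq
        return torch.cat([angles.sin(), angles.cos()], dim=-1)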
def __init__(self, opt, dicts, positional_encoder, language_embeddings=None, ignore_source=False):

    self.death_rate = opt.death_rate
    self.max_memory_size = opt.max_memory_size
    self.stream_context = opt.stream_context
    self.extra_context_size = opt.extra_context_size
    self.n_heads = opt.n_heads
    self.fast_self_attn = opt.fast_self_attention
    self.mpw = opt.multilingual_partitioned_weights
    self.learnable_position_encoding = opt.learnable_position_encoding
    self.max_pos_length = opt.max_pos_length

    # build_modules will be called from the inherited constructor
    super().__init__(opt, dicts, positional_encoder, language_embeddings,
                     ignore_source, allocate_positions=False)

    if self.learnable_position_encoding:
        self.positional_encoder = None
    else:
        self.positional_encoder = SinusoidalPositionalEmbedding(opt.model_size)

    self.d_head = self.model_size // self.n_heads

    # Parameters for the position biases - deprecated, kept for backward compatibility.
    # self.r_w_bias = nn.Parameter(torch.Tensor(self.n_heads, self.d_head))
    # self.r_r_bias = nn.Parameter(torch.Tensor(self.n_heads, self.d_head))

    self.mln = opt.multilingual_layer_norm

    if not opt.rezero:
        self.postprocess_layer = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                                   multilingual=self.mln,
                                                   n_languages=opt.n_languages)
    else:
        self.postprocess_layer = Identity()
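# The deprecated r_w_bias / r_r_bias above are the content and position biases from
# Transformer-XL-style relative attention. A hedged sketch of how such biases enter the
# attention score (names and shapes are illustrative, not this repo's actual layer):
import torch


def rel_attn_score_sketch(q, k, r, r_w_bias, r_r_bias):
    # q, k: [len_q, n_heads, d_head] and [len_k, n_heads, d_head] content projections;
    # r: [len_r, n_heads, d_head] projected relative position encodings;
    # r_w_bias, r_r_bias: [n_heads, d_head], broadcast over the length dimension.
    content_score = torch.einsum('ihd,jhd->hij', q + r_w_bias, k)
    position_score = torch.einsum('ihd,jhd->hij', q + r_r_bias, r)
    # the real layer also applies a relative shift so each query row aligns
    # with its own relative offsets, then scales and softmaxes the result
    return content_score + position_score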
def __init__(self, opt, dicts, positional_encoder, encoder_type='text', language_embeddings=None):

    self.death_rate = opt.death_rate
    self.learnable_position_encoding = opt.learnable_position_encoding
    self.layer_modules = list()
    self.asynchronous = opt.asynchronous
    self.max_memory_size = opt.max_memory_size
    self.extra_context_size = opt.extra_context_size
    self.experimental = opt.experimental
    self.unidirectional = opt.unidirectional
    self.reversible = opt.src_reversible
    self.n_heads = opt.n_heads
    self.fast_self_attn = opt.fast_self_attention
    self.checkpointing = opt.checkpointing
    self.mpw = opt.multilingual_partitioned_weights
    self.multilingual_linear_projection = opt.multilingual_linear_projection
    self.mln = opt.multilingual_layer_norm
    self.no_input_scale = opt.no_input_scale
    self.max_pos_length = opt.max_pos_length

    # TODO: multilingual linear transformation

    # build_modules will be called from the inherited constructor
    super().__init__(opt, dicts, positional_encoder, encoder_type, language_embeddings)

    # learnable position encoding
    if self.learnable_position_encoding:
        self.positional_encoder = None
    else:
        # or using the pre-set sinusoidal table
        self.positional_encoder = SinusoidalPositionalEmbedding(opt.model_size)

    self.d_head = self.model_size // self.n_heads

    if self.multilingual_linear_projection:
        self.linear_proj = nn.Parameter(torch.Tensor(opt.n_languages, self.model_size, self.model_size))
        # Xavier/Glorot-style normal initialization
        std_ = math.sqrt(2.0 / (self.model_size + self.model_size))
        torch.nn.init.normal_(self.linear_proj, 0.0, std_)

    if not opt.rezero:
        self.postprocess_layer = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                                   multilingual=self.mln,
                                                   n_languages=opt.n_languages)
    else:
        self.postprocess_layer = Identity()
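# A hedged sketch of how the per-language projection allocated above might be applied in
# the forward pass (an assumption for illustration; the function name and the use of the
# transposed weight, nn.Linear-style, are not taken from this repo):
import torch


def apply_multilingual_projection_sketch(context, linear_proj, lang_id):
    # context: [seq_len, batch, model_size] encoder states;
    # linear_proj: [n_languages, model_size, model_size]; lang_id: int.
    weight = linear_proj[lang_id]             # select this language's matrix
    return torch.matmul(context, weight.t())  # project every position/batch entry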
def __init__(self, opt, dicts, positional_encoder, encoder_type='text', language_embeddings=None):

    self.death_rate = opt.death_rate
    self.learnable_position_encoding = opt.learnable_position_encoding
    self.layer_modules = list()
    self.unidirectional = opt.unidirectional
    self.n_heads = opt.n_heads
    self.n_languages = opt.n_languages
    self.checkpointing = opt.checkpointing
    self.absolute_position_encoding = opt.absolute_position_encoding
    self.early_emb_scale = opt.encoder_early_emb_scale
    self.max_pos_length = opt.max_pos_length
    self.reversible = opt.src_reversible

    # build_modules will be called from the inherited constructor
    super(RelativeTransformerEncoder, self).__init__(opt, dicts, positional_encoder,
                                                     encoder_type, language_embeddings)

    if not self.early_emb_scale and (self.use_language_embedding or self.absolute_position_encoding):
        print("[INFO] Embeddings will be scaled after the language embedding and position encoding are added."
              "\n[INFO] For multilingual models it is advisable to use -encoder_early_emb_scale.")

    # learnable position encoding
    if self.learnable_position_encoding:
        self.positional_encoder = None
    else:
        if not self.absolute_position_encoding:
            # or using the pre-set sinusoidal table (relative positions)
            self.positional_encoder = SinusoidalPositionalEmbedding(opt.model_size)
        else:
            self.positional_encoder = FastSinusoidalPositionalEncoding(opt.model_size)

    if opt.rezero or opt.post_norm:
        self.postprocess_layer = Identity()

    self.d_head = self.model_size // self.n_heads
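# A hedged sketch of the distinction behind -encoder_early_emb_scale, which the [INFO]
# message above warns about (illustrative only, not this repo's embedding code): "early"
# scales only the token embedding by sqrt(model_size) before language embeddings and
# absolute position encodings are added; "late" scales the whole sum afterwards, which
# also amplifies those added vectors.
import math


def embed_sketch(token_emb, lang_emb, pos_enc, model_size, early_scale=True):
    scale = math.sqrt(model_size)
    if early_scale:
        # added vectors keep their original magnitude
        return token_emb * scale + lang_emb + pos_enc
    # late scaling multiplies lang_emb and pos_enc by sqrt(model_size) as well
    return (token_emb + lang_emb + pos_enc) * scale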
def preprocessing(rezero, *args, **kwargs):
    if rezero:
        return Identity()
    else:
        return PrePostProcessing(*args, **kwargs)
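# With this factory, the repeated rezero branches in the constructors above collapse to
# a single call, e.g. (argument list mirrors the PrePostProcessing calls shown earlier):
#
#   self.postprocess_layer = preprocessing(opt.rezero, opt.model_size, opt.dropout,
#                                          sequence='n', multilingual=self.mln,
#                                          n_languages=opt.n_languages)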