def __init__(self, args):
    super().__init__()
    self.embedding_dim = args.decoder_embed_dim
    self.self_attn1 = MultiheadAttention(
        self.embedding_dim,
        args.decoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.self_attn2 = MultiheadAttention(
        self.embedding_dim,
        args.decoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.normalize_before = args.decoder_normalize_before
    self.self_attn_layer_norm_1 = LayerNorm(self.embedding_dim)
    self.self_attn_layer_norm_2 = LayerNorm(self.embedding_dim)
    self.self_attn_layer_norm = LayerNorm(self.embedding_dim * 2)
    self.fc1 = Linear(self.embedding_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embedding_dim)
    self.final_layer_norm = LayerNorm(self.embedding_dim)
    self.need_attn = True
    self.onnx_trace = False
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    # Lite: build the parent decoder with a single layer
    lite_args = deepcopy(args)
    lite_args.decoder_layers = 1
    super().__init__(
        lite_args,
        dictionary,
        embed_tokens,
        no_encoder_attn,
    )
    # Always do encoder attention in NAT
    self.bottom_nat = NATransformerDecoder(
        args,
        dictionary,
        embed_tokens,
        no_encoder_attn=False,
    )
    self.bos = dictionary.bos()
    self.unk = dictionary.unk()
    self.eos = dictionary.eos()
    self.pad = dictionary.pad()
    if self.args.project_nat:
        self.project_nat = Linear(self.output_embed_dim, self.output_embed_dim, bias=True)
    if self.args.project_at:
        self.project_at = Linear(self.output_embed_dim, self.output_embed_dim, bias=True)
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.self_attn = BidirectionalMultiheadSelfAttention(
        self.embed_dim,
        args.decoder_attention_heads,
        dropout=args.attention_dropout,
        mask_curr_state=not args.unmask_curr_state,
    )
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu'))
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.decoder_normalize_before
    self.fwd_layer_norm = LayerNorm(self.embed_dim, export=args.char_inputs)
    self.bwd_layer_norm = LayerNorm(self.embed_dim, export=args.char_inputs)
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim, export=args.char_inputs)
def build_model(cls, args, task):
    mode = {
        e.split('=')[0]: e.split('=')[1] if len(e.split('=')) > 1 else None
        for e in args.user_mode.split(',')
    }
    if 'gated' in mode:
        tmodel = GatedTransformerModel.build_model(args, task)
    elif any([m in mode for m in ['decomposable', 'sep_lm', 'sep_lm1']]):
        tmodel = DecomposableTransformerModel.build_model(args, task)
    elif any([m in mode for m in ['attn_endorse', 'dbg_log_endorsement']]):
        tmodel = SimpleTransformerModel.build_model(
            args, task, DecoderModelLayer=UserTransformerDecoderLayer)
    else:
        tmodel = SimpleTransformerModel.build_model(args, task)
    model = DistantTransformerModel(tmodel)
    model.args = args
    model.user_mode = mode
    model.sampler_grad = SequenceGeneratorGrad(
        model.model.decoder.dictionary, beam_size=1, max_len_b=60)
    model.sampler = SequenceGenerator(
        model.model.decoder.dictionary, beam_size=1, max_len_b=60)
    model.decoder = ProxyDecoder(tmodel, model.user_mode, args, task,
                                 model.sampler_grad, model.sampler)
    model.encoder = ProxyEncoder(tmodel, model.user_mode, args, task,
                                 model.sampler_grad, model.sampler)
    tmodel.encoder.user_mode = mode
    tmodel.decoder.user_mode = mode
    if any([
            m in mode for m in [
                'diff_lm', 'pretrain_lm', 'sep_lm', 'max_lm_margin',
                'sep_lm2', 'sep_lm3'
            ]
    ]):
        model.lm = TransformerDecoder(args,
                                      tmodel.decoder.dictionary,
                                      tmodel.decoder.embed_tokens,
                                      no_encoder_attn=True)
        model.decoder.lm = model.lm
    if 'sep_lm3' in mode:
        tmodel.decoder.gate_fc1 = Linear(
            len(tmodel.decoder.dictionary) * 2,
            len(tmodel.decoder.dictionary))
        tmodel.decoder.gate_fc2 = Linear(len(tmodel.decoder.dictionary), 1)
    if any([m in mode for m in ['endorsement', 'rl_edm', 'beam_endorse']]):
        model.edm = EndorsementDetectorModel.build_model(args, task)
        model.decoder.edm = model.encoder.edm = model.edm
        model.encoder.edm.decoder.user_mode = model.encoder.edm.encoder.user_mode = mode
    if any([m in mode for m in ['self_align']]):
        model.self_edm = EndorsementDetectorModel.build_model(args, task)
        model.decoder.self_edm = model.encoder.self_edm = model.self_edm
        model.encoder.self_edm.decoder.user_mode = model.encoder.self_edm.encoder.user_mode = mode
    return model
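# Illustration of the user_mode parsing in build_model above: comma-separated
# flags, each optionally carrying an "=value" suffix, become a dict whose
# values default to None. The argument string below is made up for demonstration.
example_user_mode = "gated,sep_lm3=1,dbg_log_endorsement"
example_mode = {
    e.split('=')[0]: e.split('=')[1] if len(e.split('=')) > 1 else None
    for e in example_user_mode.split(',')
}
assert example_mode == {'gated': None, 'sep_lm3': '1', 'dbg_log_endorsement': None}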
def __init__(self, model, user_mode, args, task, sampler_grad, sampler):
    super(ProxyDecoder, self).__init__(model, user_mode, args, task)
    self.sampler_grad = sampler_grad
    self.sampler = sampler
    if self.has_mode('sep_lm2'):
        self.gate_fc1 = Linear(
            len(self.model.decoder.dictionary) * 2,
            len(self.model.decoder.dictionary))
        self.gate_fc2 = Linear(len(self.model.decoder.dictionary),
                               len(self.model.decoder.dictionary))
def __init__(self, args, domain_adv):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.domain_adv = domain_adv
    self.label = dict()
    for i, domain in enumerate(args.domains):
        self.label[domain] = i
    self.fc1 = Linear(self.embed_dim, self.embed_dim, bias=False)
    self.fc2 = Linear(self.embed_dim, 1, bias=False)
    self.fc3 = Linear(self.embed_dim, len(args.domains), bias=False)
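# Hedged usage sketch (assumptions, not the original forward pass): the layer
# shapes above suggest fc1/fc2 compute attention-style pooling weights over
# encoder states and fc3 maps the pooled vector to per-domain logits. The
# helper name and tensors below are made up for illustration only.
import torch
import torch.nn.functional as F

def example_domain_logits(disc, encoder_out):
    # encoder_out: (seq_len, batch, embed_dim), as in fairseq-style encoders
    scores = disc.fc2(torch.tanh(disc.fc1(encoder_out)))  # (seq_len, batch, 1)
    weights = F.softmax(scores, dim=0)                    # pooling weights over time
    pooled = (weights * encoder_out).sum(dim=0)           # (batch, embed_dim)
    return disc.fc3(pooled)                               # (batch, num_domains)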
def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.cross_self_attention = getattr(args, "cross_self_attention", False)
    self.self_attn = MultiheadAttention(
        embed_dim=self.embed_dim,
        num_heads=args.decoder_attention_heads,
        dropout=args.attention_dropout,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
        self_attention=not self.cross_self_attention,
    )
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, "activation_fn", "relu"))
    self.activation_dropout = getattr(args, "activation_dropout", 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, "relu_dropout", 0)
    self.normalize_before = args.decoder_normalize_before
    # use LayerNorm rather than FusedLayerNorm for exporting;
    # char_inputs can be used to determine this.
    # TODO remove this once we update apex with the fix
    export = getattr(args, "char_inputs", False)
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            kdim=getattr(args, "encoder_embed_dim", None),
            vdim=getattr(args, "encoder_embed_dim", None),
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
    self.need_attn = True
    self.onnx_trace = False
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.self_attn = MultiheadAttention(
        self.embed_dim,
        args.encoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList(
        [LayerNorm(self.embed_dim) for _ in range(2)])
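# Hedged construction sketch: the encoder layer above only reads a handful of
# hyperparameters from `args`, so a minimal namespace like the one below is
# enough to instantiate it in isolation. The attribute values are illustrative
# assumptions, not defaults taken from the original configuration, and the
# class name in the commented call is assumed from context.
from argparse import Namespace

example_encoder_args = Namespace(
    encoder_embed_dim=512,
    encoder_attention_heads=8,
    encoder_ffn_embed_dim=2048,
    attention_dropout=0.1,
    dropout=0.1,
    relu_dropout=0.0,
    encoder_normalize_before=False,
)
# layer = TransformerEncoderLayer(example_encoder_args)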
def __init__(
    self,
    args,
    conv_layers_before=None,
    input_size=83,
    transformer_context=None,
    num_targets=None,
    chunk_width=None,
    chunk_left_context=0,
    training_stage=True,
):
    super().__init__(
        args,
        conv_layers_before=conv_layers_before,
        input_size=input_size,
        transformer_context=transformer_context,
    )
    receptive_field_radius = (
        sum(conv.padding[0] for conv in conv_layers_before.convolutions)
        if conv_layers_before is not None else 0
    )
    assert chunk_width is None or chunk_width > 0
    assert (conv_layers_before is None and chunk_left_context >= 0) or \
        (conv_layers_before is not None and chunk_left_context >= receptive_field_radius)
    self.out_chunk_begin = self.output_lengths(chunk_left_context + 1) - 1
    self.out_chunk_end = (
        self.output_lengths(chunk_left_context + chunk_width)
        if chunk_width is not None else None
    )
    self.training_stage = training_stage
    # only for encoder-only model
    self.fc_out = (
        Linear(args.encoder_embed_dim, num_targets, dropout=self.dropout_module.p)
        if num_targets is not None else None
    )
def __init__(
    self,
    args,
    dictionary,
    embed_tokens,
    embed_other_list,
    no_encoder_attn,
    channel_sizes,
):
    super().__init__(args, dictionary, embed_tokens,
                     no_encoder_attn=no_encoder_attn)
    # embed each channel and project if dimensions do not match
    self.embed_other_list = torch.nn.ModuleList(embed_other_list)
    self.proj_other_list = torch.nn.ModuleList()
    dim = embed_tokens.embedding_dim
    for embed_other in embed_other_list:
        other_dim = 1 if embed_other is None else embed_other.embedding_dim
        self.proj_other_list.append(
            nn.Linear(other_dim, dim) if other_dim != dim else None)
    # transformer output to prediction
    self.channel_sizes = channel_sizes
    self.project_out_dim = Linear(embed_tokens.embedding_dim,
                                  sum(channel_sizes),
                                  bias=False)
def __init__(self, args):
    super().__init__(args)
    self.spk_emb_proj = None
    if args.target_speaker_embed:
        self.spk_emb_proj = Linear(
            args.encoder_embed_dim + args.speaker_embed_dim,
            args.encoder_embed_dim)
def __init__(self, args, conv_layers_before=None, input_size=83, transformer_context=None):
    self.args = args
    super(TransformerEncoder, self).__init__(None)  # no src dictionary
    self.register_buffer("version", torch.Tensor([3]))
    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__)
    self.encoder_layerdrop = args.encoder_layerdrop
    embed_dim = args.encoder_embed_dim
    self.max_source_positions = args.max_source_positions
    self.conv_layers_before = conv_layers_before
    self.fc0 = Linear(input_size, embed_dim) if input_size != embed_dim else None
    self.embed_positions = (PositionalEmbedding(
        self.output_lengths(self.max_source_positions),
        embed_dim,
        0,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None)
    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
    if not args.adaptive_input and args.quant_noise_pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        self.quant_noise = None
    if self.encoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_encoder_layer(args) for _ in range(args.encoder_layers)
    ])
    self.num_layers = len(self.layers)
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    self.transformer_context = transformer_context
def __init__(self, args, no_encoder_attn=False):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.self_attn = MultiheadAttention(
        self.embed_dim,
        args.decoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.normalize_before = args.decoder_normalize_before
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    if no_encoder_attn:
        self.source_encoder_attn = None
        self.mask_encoder_attn = None
        self.encoder_attn_layer_norm = None
        self.concat_dense = None
    else:
        self.source_encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.mask_encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
        self.concat_dense = Linear(2 * self.embed_dim, self.embed_dim, bias=True)
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = True
    self.onnx_trace = False
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.self_attn = MultiheadAttention(
        self.embed_dim,
        args.decoder_attention_heads,
        dropout=args.attention_dropout,
        add_bias_kv=not args.no_bias_kv,
        add_zero_attn=args.no_bias_kv,
    )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.normalize_before = args.decoder_normalize_before
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.self_attn = BidirectionalMultiheadSelfAttention(
        self.embed_dim,
        (args.decoder_attention_heads * 2)
        if args.double_final_heads else args.decoder_attention_heads,
        dropout=args.attention_dropout,
        concat_final_q=args.concat_final_q,
    )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.normalize_before = args.decoder_normalize_before
    self.fwd_layer_norm = LayerNorm(self.embed_dim)
    self.bwd_layer_norm = LayerNorm(self.embed_dim)
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    # use the TransformerDecoder's __init__
    super(LevenshteinTransformerDecoder, self).__init__(
        args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
    )
    self.dictionary = dictionary
    self.bos = dictionary.bos()
    self.unk = dictionary.unk()
    self.eos = dictionary.eos()
    self.pool_out = Linear(self.output_embed_dim * 2, self.output_embed_dim)
    self.label_tau = getattr(args, "label_tau", None)
def __init__(self, args, dictionary, embed_tokens, char_model, no_encoder_attn=False):
    super().__init__(args, dictionary, embed_tokens,
                     no_encoder_attn=no_encoder_attn)
    self.char_model = char_model
    self.project_in_combine_dim = Linear(args.input_dim,
                                         args.decoder_embed_dim,
                                         bias=False)
def __init__(self, num_embeddings, embed_dim, padding_idx, num_stacked=1):
    super().__init__(num_embeddings, embed_dim, padding_idx)
    # follow transformer.Embedding
    nn.init.normal_(self.weight, mean=0, std=embed_dim**-0.5)
    nn.init.constant_(self.weight[padding_idx], 0)
    self.offset = (
        4  # skip <bos>, <pad>, <eos>, <unk>, specific to fairseq dictionary
    )
    self.vocab_size = num_embeddings - self.offset
    self.num_stacked = num_stacked
    if self.num_stacked > 1:
        self.project_in_dim = Linear(embed_dim * num_stacked, embed_dim, bias=False)
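# Hedged sketch (assumption, not the original forward pass): with num_stacked > 1
# the embedding above keeps a projection from embed_dim * num_stacked back to
# embed_dim, which suggests the embeddings of the stacked inputs are concatenated
# along the feature axis and then projected. Names and shapes below are
# illustrative only.
def example_stacked_lookup(emb, tokens):
    # tokens: (batch, seq_len, num_stacked) integer ids
    batch, seq_len, num_stacked = tokens.size()
    looked_up = emb(tokens.view(batch, seq_len * num_stacked))  # embed every id
    stacked = looked_up.view(batch, seq_len, num_stacked * emb.embedding_dim)
    return emb.project_in_dim(stacked) if num_stacked > 1 else stacked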
def __init__(
    self,
    args,
    dictionary,
    embed_tokens,
    no_encoder_attn=False,
    output_projection=None,
):
    super().__init__(args, dictionary, embed_tokens, no_encoder_attn,
                     output_projection)
    self.n_frames_per_step = args.n_frames_per_step
    self.out_proj_n_frames = (Linear(
        self.output_embed_dim,
        self.output_embed_dim * self.n_frames_per_step,
        bias=False,
    ) if self.n_frames_per_step > 1 else None)
def __init__(self, args, conv_layers_before=None, input_size=83):
    super(TransformerEncoder, self).__init__(None)  # no src dictionary
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout
    self.conv_layers_before = conv_layers_before
    self.fc0 = Linear(input_size, args.encoder_embed_dim) \
        if input_size != args.encoder_embed_dim else None
    self.max_source_positions = args.max_source_positions
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args) for _ in range(args.encoder_layers)
    ])
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(args.encoder_embed_dim)
    else:
        self.layer_norm = None
def __init__(self, args, dictionary, embed_tokens, add_topic_pre, add_topic_post):
    super().__init__(args, dictionary, embed_tokens)
    self.add_topic_pre, self.add_topic_post = add_topic_pre, add_topic_post
    with open("/cache/code_dir/ETM/checkpoint", 'rb') as f:
        sys.modules["etm"] = etm
        m = torch.load(f)
        m = m.cuda()
    self.topic_embedding = m.rho.weight
    with open('/cache/code_dir/ETM/vocab.pkl', 'rb') as f:
        self.vo = pickle.load(f)
    self.vocab = []
    f = open('/cache/data_dir/dict.txt')
    for row in f.readlines():
        self.vocab.append(row.split(" ")[0])
    f.close()
    self.t = Linear(300, 512)
def __init__(self, args, dictionary, embed_tokens, left_pad=False):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.unk_idx = dictionary.unk()
    self.eos_idx = dictionary.eos()
    self.max_target_positions = args.max_target_positions
    self.output_dim = args.decoder_embed_dim
    self.self_target = args.self_target
    self.future_target = args.future_target
    self.past_target = args.past_target
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.input_dropout = torch.tensor(
        args.input_dropout) if args.input_dropout > 0 else None
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions,
        embed_dim,
        self.padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.forward_layers = nn.ModuleList([
        TransformerDecoderLayer(args) for _ in range(args.decoder_layers)
    ])
    self.backward_layers = nn.ModuleList([
        TransformerDecoderLayer(args) for _ in range(args.decoder_layers)
    ]) if not args.single_tower else self.forward_layers
    self.single_tower = args.single_tower
    self.full_attn_layer = None
    self.full_linear_layer = None
    if self.self_target:
        if args.linear_final_layer:
            self.full_linear_layer = Linear(embed_dim * 2, embed_dim,
                                            args.linear_final_layer_bias)
        else:
            self.full_attn_layer = BidirectionalTransformerDecoderLayer(args)
    self.load_softmax = not getattr(args, 'remove_head', False)
    self.embed_out = None
    self.adaptive_softmax = None
    if self.load_softmax:
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                args.decoder_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=embed_dim**-0.5)
    else:
        self.share_input_output_embed = False
def __init__(
    self,
    cfg,
    dictionary,
    embed_tokens,
    no_encoder_attn=False,
    output_projection=None,
    scheduled_sampling_rate_scheduler=None,
):
    is_no_token_positional_embeddings_changed = False
    if (not cfg.no_token_positional_embeddings
            and cfg.decoder.relative_positional_embeddings):
        cfg.no_token_positional_embeddings = True
        is_no_token_positional_embeddings_changed = True
        logger.info(
            "disabled decoder's absolute positional embeddings as "
            "decoder_relative_positional_embeddings is True."
        )
    self.cfg = cfg
    super(TransformerDecoderBase, self).__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)
    self.dropout_module = FairseqDropout(
        cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__))
    self.decoder_layerdrop = cfg.decoder.layerdrop
    self.share_input_output_embed = cfg.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = cfg.decoder.embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = cfg.decoder.output_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = cfg.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim)
    if not cfg.adaptive_input and cfg.quant_noise.pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            cfg.quant_noise.pq,
            cfg.quant_noise.pq_block_size,
        )
    else:
        self.quant_noise = None
    self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                           if embed_dim != input_embed_dim else None)
    self.embed_positions = (PositionalEmbedding(
        self.max_target_positions,
        embed_dim,
        self.padding_idx,
        learned=cfg.decoder.learned_pos,
    ) if not cfg.no_token_positional_embeddings else None)
    if cfg.layernorm_embedding:
        self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export)
    else:
        self.layernorm_embedding = None
    self.cross_self_attention = cfg.cross_self_attention
    if cfg.decoder.relative_positional_embeddings:
        if cfg.decoder.learned_pos:
            rel_pos_embed_list = [
                RelativePositionalEmbedding(
                    cfg.decoder.embed_dim,
                    padding_idx=None,
                    max_size=cfg.max_target_positions,
                    learned=True,
                ) for _ in range(cfg.decoder.layers)
            ]
        else:
            rel_pos_embed = RelativePositionalEmbedding(
                cfg.decoder.embed_dim,
                padding_idx=None,
                max_size=None,
                learned=False,
            )
            # single instance referenced across layers
            rel_pos_embed_list = [rel_pos_embed] * cfg.decoder.layers
    else:
        rel_pos_embed_list = [None] * cfg.decoder.layers
    if self.decoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_decoder_layer(
            cfg, no_encoder_attn, positional_embedding=rel_pos_embed_list[i])
        for i in range(cfg.decoder.layers)
    ])
    self.num_layers = len(self.layers)
    if cfg.decoder.normalize_before and not cfg.no_decoder_final_norm:
        self.layer_norm = LayerNorm(embed_dim, export=cfg.export)
    else:
        self.layer_norm = None
    self.project_out_dim = (Linear(embed_dim, self.output_embed_dim, bias=False)
                            if embed_dim != self.output_embed_dim
                            and not cfg.tie_adaptive_weights else None)
    self.adaptive_softmax = None
    self.output_projection = output_projection
    if self.output_projection is None:
        self.build_output_projection(cfg, dictionary, embed_tokens)
    if is_no_token_positional_embeddings_changed:
        cfg.no_token_positional_embeddings = not cfg.no_token_positional_embeddings
    self.scheduled_sampling_rate_scheduler = scheduled_sampling_rate_scheduler
    for layer in self.layers:
        if isinstance(
                layer,
                TransformerWithRelativePositionalEmbeddingDecoderLayerBase):
            layer.need_attn = False  # make validation fast
def __init__(self, args, dictionary, embed_tokens, no_encoder_decoder_attn=False):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.output_embed_dim = args.decoder_output_dim
    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
    self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                           if embed_dim != input_embed_dim else None)
    self.embed_positions = (PositionalEmbedding(
        args.max_target_positions,
        embed_dim,
        padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None)
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayerPhase2(args, no_encoder_decoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.adaptive_softmax = None
    self.project_out_dim = (Linear(embed_dim, self.output_embed_dim, bias=False)
                            if embed_dim != self.output_embed_dim
                            and not args.tie_adaptive_weights else None)
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0,
                        std=self.output_embed_dim**-0.5)
    if args.decoder_normalize_before and not getattr(
            args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([3]))
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    embed_dim = embed_tokens.embedding_dim
    self.embed_tokens = embed_tokens
    self.lstm_units = args.decoder_lstm_units
    self.num_layers = args.decoder_layers
    self.initial_input_dim = embed_dim
    self.encoder_output_dim = args.encoder_embed_dim
    if args.decoder_reduced_attention_dim is None:
        self.attention_dim = self.encoder_output_dim
    else:
        self.attention_dim = args.decoder_reduced_attention_dim
    self.input_dim = self.lstm_units + self.attention_dim
    self.num_attention_heads = args.decoder_attention_heads
    self.bottleneck_dim = args.decoder_out_embed_dim
    self.initial_rnn_layer = nn.LSTM(
        input_size=self.initial_input_dim, hidden_size=self.lstm_units
    )
    self.initial_layernorm = LayerNorm(self.lstm_units)
    self.proj_encoder_layer = None
    if self.attention_dim != self.encoder_output_dim:
        self.proj_encoder_layer = Linear(
            self.encoder_output_dim, self.attention_dim
        )
    self.proj_layer = None
    if self.lstm_units != self.attention_dim:
        self.proj_layer = Linear(self.lstm_units, self.attention_dim)
    self.attention = MultiheadAttention(
        self.attention_dim,
        self.num_attention_heads,
        dropout=args.attention_dropout,
        encoder_decoder_attention=True,
    )
    self.extra_rnn_layers = nn.ModuleList([])
    self.extra_layernorms = nn.ModuleList([])
    for _ in range(self.num_layers - 1):
        self.extra_rnn_layers.append(
            nn.LSTM(input_size=self.input_dim, hidden_size=self.lstm_units)
        )
        self.extra_layernorms.append(LayerNorm(self.lstm_units))
    self.bottleneck_layer = None
    if self.bottleneck_dim is not None:
        self.out_embed_dim = self.bottleneck_dim
        self.bottleneck_layer = Linear(self.input_dim, self.out_embed_dim)
    else:
        self.out_embed_dim = self.input_dim
    if not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.out_embed_dim ** -0.5)
    else:
        assert self.bottleneck_dim == args.decoder_embed_dim, (
            self.bottleneck_dim, args.decoder_embed_dim)
def __init__(self, embed_dim, num_classes):
    super().__init__()
    self.proj = Linear(2 * embed_dim, num_classes)
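# Hedged usage sketch: the head above projects a concatenation of two
# embed_dim-sized vectors (e.g. representations of two sentences) to class
# logits. The helper below is illustrative only; the pooling strategy and
# names are assumptions, not part of the original module.
import torch

def example_pair_logits(head, first_vec, second_vec):
    # first_vec, second_vec: (batch, embed_dim) pooled representations
    pair = torch.cat([first_vec, second_vec], dim=-1)  # (batch, 2 * embed_dim)
    return head.proj(pair)                             # (batch, num_classes)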
def __init__(self, args, dictionary, embed_tokens, classification_head=None):
    super().__init__(dictionary)
    self.onnx_trace = False
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed
    self.embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions
    self.self_target = args.self_target
    self.future_target = args.future_target
    self.past_target = args.past_target
    self.char_inputs = args.char_inputs
    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(self.embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions,
        self.embed_dim,
        self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.forward_layers = nn.ModuleList([
        TransformerDecoderLayer(
            args,
            no_encoder_attn=True,
            add_bias_kv=not args.no_bias_kv,
            add_zero_attn=args.no_bias_kv,
        ) for _ in range(args.decoder_layers)
    ])
    self.backward_layers = nn.ModuleList([
        TransformerDecoderLayer(
            args,
            no_encoder_attn=True,
            add_bias_kv=not args.no_bias_kv,
            add_zero_attn=args.no_bias_kv,
        ) for _ in range(args.decoder_layers)
    ])
    self.full_attn_layer = None
    self.full_linear_layer = None
    if self.self_target:
        if args.linear_final_layer:
            self.full_linear_layer = Linear(self.embed_dim * 2, self.embed_dim,
                                            args.linear_final_layer_bias)
        else:
            self.full_attn_layer = BidirectionalTransformerDecoderLayer(args)
    self.load_softmax = not getattr(args, 'remove_head', False)
    self.embed_out = None
    self.adaptive_softmax = None
    self.classification_head = classification_head
    if self.load_softmax:
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                args.decoder_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), self.embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=self.embed_dim**-0.5)
def __init__(
    self,
    cfg,
    return_fc=False,
    pre_encoder=None,
    input_size=83,
    transformer_context=None,
):
    self.cfg = cfg
    super(TransformerEncoderBase, self).__init__(None)  # no src dictionary
    self.register_buffer("version", torch.Tensor([3]))
    self.dropout_module = FairseqDropout(
        cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__)
    )
    self.encoder_layerdrop = cfg.encoder.layerdrop
    self.return_fc = return_fc
    embed_dim = cfg.encoder.embed_dim
    self.max_source_positions = cfg.max_source_positions
    self.pre_encoder = pre_encoder
    self.fc0 = Linear(input_size, embed_dim) if input_size != embed_dim else None
    self.embed_scale = (
        1.0
        if cfg.no_scale_embedding
        or self.fc0 is not None  # always disable scaling if fc0 is present
        else math.sqrt(embed_dim)
    )
    if (
        not cfg.no_token_positional_embeddings
        and cfg.encoder.relative_positional_embeddings
    ):
        logger.info(
            "disabled encoder's absolute positional embeddings as "
            "encoder_relative_positional_embeddings is True."
        )
    self.embed_positions = (
        PositionalEmbedding(
            self.output_lengths(self.max_source_positions),
            embed_dim,
            0,
            learned=cfg.encoder.learned_pos,
        )
        if not cfg.no_token_positional_embeddings
        and not cfg.encoder.relative_positional_embeddings
        else None
    )
    if cfg.layernorm_embedding:
        self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export)
    else:
        self.layernorm_embedding = None
    if not cfg.adaptive_input and cfg.quant_noise.pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            cfg.quant_noise.pq,
            cfg.quant_noise.pq_block_size,
        )
    else:
        self.quant_noise = None
    if cfg.encoder.relative_positional_embeddings:
        if cfg.encoder.learned_pos:
            rel_pos_embed_list = [
                RelativePositionalEmbedding(
                    cfg.encoder.embed_dim,
                    padding_idx=None,
                    max_size=self.output_lengths(cfg.max_source_positions),
                    learned=True,
                )
                for _ in range(cfg.encoder.layers)
            ]
        else:
            rel_pos_embed = RelativePositionalEmbedding(
                cfg.encoder.embed_dim,
                padding_idx=None,
                max_size=None,
                learned=False,
            )
            # single instance referenced across layers
            rel_pos_embed_list = [rel_pos_embed] * cfg.encoder.layers
    else:
        rel_pos_embed_list = [None] * cfg.encoder.layers
    if self.encoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_encoder_layer(cfg, positional_embedding=rel_pos_embed_list[i])
        for i in range(cfg.encoder.layers)
    ])
    self.num_layers = len(self.layers)
    if cfg.encoder.normalize_before and cfg.encoder.layer_type != "conformer":
        self.layer_norm = LayerNorm(embed_dim, export=cfg.export)
    else:
        self.layer_norm = None
    self.transformer_context = transformer_context