def __init__(self, cfg, encoder, decoder):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
    check_type(self.encoder, FairseqEncoder)
    check_type(self.decoder, FairseqDecoder)

    # project encoder and decoder outputs into a shared joint space
    self.proj_encoder = Linear(cfg.encoder.embed_dim, cfg.joint_dim)
    self.laynorm_proj_encoder = LayerNorm(cfg.joint_dim, export=cfg.export)
    self.proj_decoder = Linear(cfg.decoder.hidden_size, cfg.joint_dim)
    self.laynorm_proj_decoder = LayerNorm(cfg.joint_dim, export=cfg.export)

    assert hasattr(self.decoder, "embed_tokens")
    if cfg.share_decoder_input_output_embed:
        assert cfg.joint_dim == cfg.decoder.embed_dim, (
            "joint_dim and decoder.embed_dim must be the same "
            "if the two embeddings are to be shared"
        )
        self.fc_out = nn.Linear(
            self.decoder.embed_tokens.embedding_dim,
            self.decoder.embed_tokens.num_embeddings,
            bias=False,
        )
        self.fc_out.weight = self.decoder.embed_tokens.weight
    else:
        self.fc_out = nn.Linear(
            cfg.joint_dim, self.decoder.embed_tokens.num_embeddings, bias=False
        )
        nn.init.normal_(self.fc_out.weight, mean=0, std=cfg.joint_dim ** -0.5)
        self.fc_out = nn.utils.weight_norm(self.fc_out, name="weight")

    self.cfg = cfg
    self.num_updates = 0
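# A hypothetical sketch (not in the original source) of how a transducer-style
# joint network built from the layers above is commonly evaluated: encoder and
# decoder states are projected into joint_dim, layer-normalized, combined
# additively over the time (T) and label (U) axes, and mapped to the
# vocabulary. The method name `forward_joint` and the tensor shapes are
# assumptions, not the author's API.
def forward_joint(self, encoder_out, decoder_out):
    # encoder_out: B x T x encoder.embed_dim
    # decoder_out: B x U x decoder.hidden_size
    enc = self.laynorm_proj_encoder(self.proj_encoder(encoder_out))
    dec = self.laynorm_proj_decoder(self.proj_decoder(decoder_out))
    # broadcast-add to form the joint lattice: B x T x U x joint_dim
    joint = torch.tanh(enc.unsqueeze(2) + dec.unsqueeze(1))
    return self.fc_out(joint)  # B x T x U x vocab_size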
def __init__(self, dictionary, lang_dictionary, embedding, lang_embedding,
             embed_dim=512, hidden_size=512, out_embed_dim=512, num_layers=1,
             dropout_in=0.1, dropout_out=0.1, attention=True,
             encoder_output_units=1024, pretrained_embed=None,
             share_input_output_embed=False, adaptive_softmax_cutoff=None,
             lang_embedding_size=32):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.lang_embeddings_size = lang_embedding_size
    self.lang_dictionary = lang_dictionary
    self.embed_langs = lang_embedding
    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    self.embed_tokens = embedding

    self.encoder_output_units = encoder_output_units
    self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
    self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)

    self.layers = nn.ModuleList([
        LSTMCell(
            # the first layer consumes the previous hidden state, the token
            # embedding, the language embedding and the encoder context
            input_size=hidden_size + embed_dim + self.lang_embeddings_size + self.encoder_output_units
            if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    self.attention = None
    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)
    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, hidden_size,
                                                adaptive_softmax_cutoff,
                                                dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, encoder_output_units=0,
    attn_type=None, attn_dim=0, need_attn=False, residual=False,
    pretrained_embed=None, share_input_output_embed=False,
    adaptive_softmax_cutoff=None,
):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    if attn_type is None or attn_type.lower() == 'none':
        # no attention, no encoder output needed (language model case)
        need_attn = False
        encoder_output_units = 0
    self.need_attn = need_attn
    self.residual = residual

    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.encoder_output_units = encoder_output_units

    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=encoder_output_units + (embed_dim if layer == 0 else hidden_size),
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])

    if attn_type is None or attn_type.lower() == 'none':
        self.attention = None
    elif attn_type.lower() == 'bahdanau':
        self.attention = speech_attention.BahdanauAttention(
            hidden_size, encoder_output_units, attn_dim,
        )
    elif attn_type.lower() == 'luong':
        self.attention = speech_attention.LuongAttention(
            hidden_size, encoder_output_units,
        )
    else:
        raise ValueError('unrecognized attention type.')

    if hidden_size + encoder_output_units != out_embed_dim:
        self.additional_fc = Linear(hidden_size + encoder_output_units, out_embed_dim)

    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, hidden_size,
                                                adaptive_softmax_cutoff,
                                                dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
def __init__(
    self,
    dictionary: Dictionary,
    embed_dim: int = 512,
    hidden_size: int = 512,
    out_embed_dim: int = 512,
    num_layers: int = 1,
    dropout_in: float = 0.1,
    dropout_out: float = 0.1,
    attention: bool = True,
    encoder_embed_dim: int = 512,
    encoder_output_units: int = 512,
    pretrained_embed: Optional[nn.Embedding] = None,
    share_input_output_embed: bool = False,
    adaptive_softmax_cutoff: Optional[int] = None,
):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True

    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.encoder_output_units = encoder_output_units

    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=hidden_size + embed_dim if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    self.attention = (
        AttentionLayer(hidden_size, encoder_output_units, hidden_size)
        if attention else None
    )
    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)
    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, embed_dim,
                                                adaptive_softmax_cutoff,
                                                dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
def __init__(self, input_embed_dim, source_embed_dim, output_embed_dim, bias=False):
    super().__init__()
    self.input_proj = Linear(input_embed_dim, source_embed_dim, bias=bias)
    self.output_proj = Linear(input_embed_dim + source_embed_dim, output_embed_dim, bias=bias)
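# A minimal sketch (not part of the original source) of how these two
# projections are typically used in a Luong-style dot-product attention
# forward pass, mirroring fairseq's LSTM AttentionLayer. The tensor shapes
# and the `encoder_padding_mask` argument are assumptions; assumes
# `import torch` and `import torch.nn.functional as F`.
def forward(self, input, source_hids, encoder_padding_mask=None):
    # input: bsz x input_embed_dim (current decoder state)
    # source_hids: srclen x bsz x source_embed_dim (encoder outputs)
    x = self.input_proj(input)  # bsz x source_embed_dim
    # dot-product scores against every source position: srclen x bsz
    attn_scores = (source_hids * x.unsqueeze(0)).sum(dim=2)
    if encoder_padding_mask is not None:
        # mask out padded source positions before the softmax
        attn_scores = attn_scores.float().masked_fill_(
            encoder_padding_mask, float("-inf")
        ).type_as(attn_scores)
    attn_scores = F.softmax(attn_scores, dim=0)
    # attention-weighted sum of encoder states: bsz x source_embed_dim
    x = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)
    # combine context with the original input and project to the output size
    x = torch.tanh(self.output_proj(torch.cat((x, input), dim=1)))
    return x, attn_scores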
def __init__(self, dictionary, lang_dictionary, embed_dim=512, hidden_size=512,
             out_embed_dim=512, num_layers=1, dropout_in=0.1, dropout_out=0.1,
             attention=True, encoder_output_units=512, pretrained_embed=None,
             share_input_output_embed=False, adaptive_softmax_cutoff=None,
             lang_embedding_size=32):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.lang_embedding_size = lang_embedding_size
    self.lang_dictionary = lang_dictionary
    self.embed_langs = nn.Embedding(len(lang_dictionary), lang_embedding_size)
    self.need_attn = False
    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)

    self.encoder_output_units = encoder_output_units
    self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
    self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)

    # the first layer consumes the previous hidden state, the token embedding,
    # the language embedding and the encoder context
    input_size = hidden_size + embed_dim + lang_embedding_size + encoder_output_units
    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=input_size if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    self.attention = None
    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)
    if not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
def __init__(
    self, conv_layers_before=None, input_size=83, hidden_size=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, bidirectional=False,
    residual=False, left_pad=False, padding_value=0., num_targets=None,
    chunk_width=20, chunk_left_context=0, training_stage=True,
    max_source_positions=DEFAULT_MAX_SOURCE_POSITIONS,
):
    super().__init__(
        conv_layers_before=conv_layers_before,
        input_size=input_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout_in=dropout_in,
        dropout_out=dropout_out,  # was dropout_in, which silently ignored dropout_out
        bidirectional=bidirectional,
        residual=residual,
        left_pad=left_pad,
        padding_value=padding_value,
        max_source_positions=max_source_positions,
    )
    receptive_field_radius = (
        sum(conv.padding[0] for conv in conv_layers_before.convolutions)
        if conv_layers_before is not None else 0
    )
    assert chunk_width is None or chunk_width > 0
    assert (
        (conv_layers_before is None and chunk_left_context >= 0)
        or (conv_layers_before is not None and chunk_left_context >= receptive_field_radius)
    )
    self.out_chunk_begin = self.output_lengths(chunk_left_context + 1) - 1
    self.out_chunk_end = (
        self.output_lengths(chunk_left_context + chunk_width)
        if chunk_width is not None else None
    )
    self.training_stage = training_stage

    # only for encoder-only model
    self.fc_out = (
        Linear(self.output_units, num_targets, dropout=dropout_out)
        if num_targets is not None else None
    )
def __init__(
    self, input_size, output_size, hidden_sizes=256, kernel_sizes=3,
    strides=1, dilations=3, num_layers=1, dropout_in=0.0, dropout_out=0.0,
    residual=False, chunk_width=None, chunk_left_context=0, training_stage=True,
):
    super().__init__(None)  # no src dictionary
    self.num_layers = num_layers
    # broadcast scalar hyperparameters to per-layer lists
    if isinstance(hidden_sizes, int):
        hidden_sizes = [hidden_sizes] * num_layers
    else:
        assert len(hidden_sizes) == num_layers
    if isinstance(kernel_sizes, int):
        kernel_sizes = [kernel_sizes] * num_layers
    else:
        assert len(kernel_sizes) == num_layers
    if isinstance(strides, int):
        strides = [strides] * num_layers
    else:
        assert len(strides) == num_layers
    if isinstance(dilations, int):
        dilations = [dilations] * num_layers
    else:
        assert len(dilations) == num_layers
    self.dropout_in_module = FairseqDropout(dropout_in, module_name=self.__class__.__name__)
    self.dropout_out_module = FairseqDropout(dropout_out, module_name=self.__class__.__name__)
    self.residual = residual

    self.tdnn = nn.ModuleList([
        TdnnBNReLU(
            in_channels=input_size if layer == 0 else hidden_sizes[layer - 1],
            out_channels=hidden_sizes[layer],
            kernel_size=kernel_sizes[layer],
            stride=strides[layer],
            dilation=dilations[layer],
        )
        for layer in range(num_layers)
    ])

    receptive_field_radius = sum(layer.padding for layer in self.tdnn)
    assert chunk_width is None or (chunk_width > 0 and chunk_left_context >= receptive_field_radius)
    if (
        chunk_width is not None and chunk_width > 0
        and chunk_left_context > receptive_field_radius
    ):
        logger.warning("chunk_{{left,right}}_context can be reduced to {}".format(receptive_field_radius))
    self.out_chunk_begin = self.output_lengths(chunk_left_context + 1) - 1
    self.out_chunk_end = (
        self.output_lengths(chunk_left_context + chunk_width)
        if chunk_width is not None else None
    )
    self.training_stage = training_stage

    self.fc_out = Linear(hidden_sizes[-1], output_size, dropout=self.dropout_out_module.p)
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True,
    encoder_embed_dim=512, encoder_output_units=512, pretrained_embed=None,
    share_input_output_embed=False, adaptive_softmax_cutoff=None,
):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True

    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.encoder_output_units = encoder_output_units
    assert encoder_output_units == hidden_size, \
        'encoder_output_units ({}) != hidden_size ({})'.format(encoder_output_units, hidden_size)
    # TODO another Linear layer if not equal

    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=encoder_output_units + embed_dim if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    # two separate attention heads over the encoder outputs
    self.attention_1 = AttentionLayer(encoder_output_units, hidden_size) if attention else None
    self.attention_2 = AttentionLayer(encoder_output_units, hidden_size) if attention else None
    # self.attention_combine_fc = Linear(2 * hidden_size, hidden_size)
    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)
    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, embed_dim,
                                                adaptive_softmax_cutoff,
                                                dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, encoder_output_units=0,
    attn_type=None, attn_dim=0, need_attn=False, residual=False,
    pretrained_embed=None, share_input_output_embed=False,
    adaptive_softmax_cutoff=None,
    max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
    scheduled_sampling_rate_scheduler=None,
):
    super().__init__(dictionary)
    self.dropout_in_module = FairseqDropout(
        dropout_in * 1.0, module_name=self.__class__.__name__
    )
    self.dropout_out_module = FairseqDropout(
        dropout_out * 1.0, module_name=self.__class__.__name__
    )
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    if attn_type is None or str(attn_type).lower() == "none":
        # no attention, no encoder output needed (language model case)
        need_attn = False
        encoder_output_units = 0
    self.need_attn = need_attn
    self.residual = residual
    self.max_target_positions = max_target_positions
    self.num_layers = num_layers

    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.encoder_output_units = encoder_output_units

    self.layers = nn.ModuleList(
        [
            LSTMCell(
                input_size=encoder_output_units + (embed_dim if layer == 0 else hidden_size),
                hidden_size=hidden_size,
            )
            for layer in range(num_layers)
        ]
    )

    if attn_type is None or str(attn_type).lower() == "none":
        self.attention = None
    elif str(attn_type).lower() == "bahdanau":
        self.attention = speech_attention.BahdanauAttention(
            hidden_size, encoder_output_units, attn_dim,
        )
    elif str(attn_type).lower() == "luong":
        self.attention = speech_attention.LuongAttention(
            hidden_size, encoder_output_units,
        )
    else:
        raise ValueError("unrecognized attention type.")

    if hidden_size + encoder_output_units != out_embed_dim:
        self.additional_fc = Linear(
            hidden_size + encoder_output_units, out_embed_dim
        )

    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings,
            hidden_size,
            adaptive_softmax_cutoff,
            dropout=dropout_out,
        )
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)

    self.scheduled_sampling_rate_scheduler = scheduled_sampling_rate_scheduler
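# A minimal sketch (an assumption, not the original forward) of how a
# scheduled-sampling rate scheduler like the one stored above is typically
# consumed during training: with probability (1 - rate) the decoder feeds
# back its own prediction instead of the ground-truth token (Bengio et al.,
# 2015). The helper name and the `step_update` call are hypothetical.
def _choose_input_tokens(self, prev_output_tokens, prev_predictions, num_updates):
    if self.scheduled_sampling_rate_scheduler is None or not self.training:
        return prev_output_tokens  # pure teacher forcing
    # rate in [0, 1]: probability of keeping the ground-truth token
    rate = self.scheduled_sampling_rate_scheduler.step_update(num_updates)
    keep_gold = torch.rand_like(prev_output_tokens, dtype=torch.float) < rate
    # where keep_gold is False, substitute the model's own previous prediction
    return torch.where(keep_gold, prev_output_tokens, prev_predictions)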
def __init__(self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
             num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True,
             encoder_output_units=512, pretrained_embed=None,
             share_input_output_embed=False, adaptive_softmax_cutoff=None,
             use_scratchpad=False, residual=False):
    super().__init__(dictionary)
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.use_scratchpad = use_scratchpad
    self.residual = residual

    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.encoder_output_units = encoder_output_units
    if encoder_output_units != hidden_size:
        self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
        self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)
    else:
        self.encoder_hidden_proj = self.encoder_cell_proj = None
    self.layers = nn.ModuleList([
        LSTMCell(
            input_size=hidden_size + embed_dim if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    if attention:
        # TODO make bias configurable
        self.attention = AttentionLayer(hidden_size, encoder_output_units,
                                        hidden_size, bias=False)
    else:
        self.attention = None
    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)
    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, embed_dim,
                                                adaptive_softmax_cutoff,
                                                dropout=dropout_out)
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)

    # EDITED
    if self.use_scratchpad:
        self.attentive_writer = AttentiveWriter(hidden_size, encoder_output_units,
                                                encoder_output_units)
def __init__(self, dictionary, embed_tokens, embed_dim=512, num_layers=1,
             dropout_in=0.1, dropout_out=0.1, bidirectional=False,
             left_pad=False, padding_value=0., adaptive_softmax=False,
             adaptive_softmax_cutoff=[], adaptive_softmax_dropout=0.1,
             adaptive_softmax_factor=None):
    super(LSTMTaggerDecoder, self).__init__(dictionary=dictionary)
    # the embedding module may expose its dimension under different names
    if hasattr(embed_tokens, "embedded_dim"):
        self.in_embed_dim = embed_tokens.embedded_dim
    elif hasattr(embed_tokens, "embed_dim"):
        self.in_embed_dim = embed_tokens.embed_dim
    elif hasattr(embed_tokens, "embedding_dim"):
        self.in_embed_dim = embed_tokens.embedding_dim
    else:
        raise ValueError("embed_tokens does not expose an embedding dimension")
    self.output_units = self.embed_dim = embed_dim
    self.out_embed_dim = len(dictionary)
    self.num_layers = num_layers
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.bidirectional = bidirectional
    if self.bidirectional:
        # self.output_units *= 2
        pass
    self.padding_idx = dictionary.pad()
    self.padding_value = padding_value  # was hard-coded to 0., ignoring the argument
    self.left_pad = left_pad
    self.embed_tokens = embed_tokens

    self.fc_in = self.fc_out1 = self.fc_out2 = None
    if self.in_embed_dim != self.embed_dim:
        self.fc_in = Linear(self.in_embed_dim, self.embed_dim)
    if self.output_units != self.embed_dim:
        self.fc_out1 = Linear(self.output_units, self.embed_dim)
    if self.embed_dim != self.out_embed_dim:
        self.fc_out2 = Linear(self.embed_dim, self.out_embed_dim)

    self.lstm = LSTM(
        input_size=embed_dim,
        hidden_size=embed_dim,
        num_layers=num_layers,
        dropout=self.dropout_out if num_layers > 1 else 0.,
        bidirectional=bidirectional,
    )

    self.adaptive_softmax = None
    if adaptive_softmax:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.embed_dim,
            adaptive_softmax_cutoff,
            dropout=adaptive_softmax_dropout,
            adaptive_inputs=None,
            factor=adaptive_softmax_factor,
            tie_proj=False,
        )
def __init__(
    self, dictionary, rnn_type="lstm", embed_dim=512, hidden_size=512,
    out_embed_dim=512, num_layers=1, dropout_in=0.1, dropout_out=0.1,
    attention_type="luong-dot", encoder_output_units=512,
    pretrained_embed=None, share_input_output_embed=False,
    adaptive_softmax_cutoff=None,
    max_target_positions=DEFAULT_MAX_TARGET_POSITIONS, residuals=False,
):
    super().__init__(dictionary)
    self.dropout_in_module = FairseqDropout(
        dropout_in, module_name=self.__class__.__name__
    )
    self.dropout_out_module = FairseqDropout(
        dropout_out, module_name=self.__class__.__name__
    )
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.max_target_positions = max_target_positions
    self.residuals = residuals
    self.num_layers = num_layers

    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = torch.nn.Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.encoder_output_units = encoder_output_units
    if encoder_output_units != hidden_size and encoder_output_units != 0:
        self.encoder_hidden_proj = torch.nn.Linear(encoder_output_units, hidden_size)
        self.encoder_cell_proj = torch.nn.Linear(encoder_output_units, hidden_size)
    else:
        self.encoder_hidden_proj = self.encoder_cell_proj = None

    # input feeding is described in arxiv.org/abs/1508.04025
    input_feed_size = 0 if encoder_output_units == 0 else hidden_size
    # For Bahdanau, we compute the context on the input feed
    bahd_factor = (
        hidden_size
        if attention_type in ["bahdanau-dot", "bahdanau-concat", "bahdanau-general", "bahdanau"]
        else 0
    )

    self.rnn_type = rnn_type
    if rnn_type == "lstm":
        self.layers = LSTM(
            input_size=input_feed_size + embed_dim + bahd_factor,
            hidden_size=hidden_size,
            num_layers=num_layers,
        )
    else:
        self.layers = GRU(
            input_size=input_feed_size + embed_dim + bahd_factor,
            hidden_size=hidden_size,
            num_layers=num_layers,
        )

    if attention_type == "none":
        self.attention_type = "none"
        self.attention = None
    else:
        self.attention_type = attention_type
        self.attention = Attention(self.attention_type, hidden_size)

    if hidden_size != out_embed_dim:
        self.additional_fc = torch.nn.Linear(hidden_size, out_embed_dim)

    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings,
            hidden_size,
            adaptive_softmax_cutoff,
            dropout=dropout_out,
        )
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
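# A minimal sketch (assumed, not the original code) of one step of input
# feeding as described in arxiv.org/abs/1508.04025: the attentional context
# from step t-1 is concatenated to the token embedding at step t, so the
# decoder is aware of past alignment decisions. The helper name, variable
# names, and the Attention call signature are all assumptions.
def _input_feed_step(self, emb_t, input_feed, hidden, encoder_outs):
    # emb_t: bsz x embed_dim, input_feed: bsz x hidden_size (context from t-1)
    rnn_input = torch.cat((emb_t, input_feed), dim=1).unsqueeze(0)
    output, hidden = self.layers(rnn_input, hidden)
    # attend over the encoder states to get the new context vector,
    # which becomes the input feed for the next step
    context, attn_scores = self.attention(output.squeeze(0), encoder_outs)
    return context, hidden, attn_scores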
def __init__(
    self, rnn_type: Union[str, Namespace], dictionary, embed_dim=512,
    hidden_size=512, out_embed_dim=512, num_layers=1, dropout_in=0.1,
    dropout_out=0.1, attention=True, attention_bias=False,  # todo
    encoder_output_units=512, pretrained_embed=None,
    share_input_output_embed=False, adaptive_softmax_cutoff=None,
    max_target_positions=DEFAULT_MAX_TARGET_POSITIONS, residuals=False,
):
    super().__init__(dictionary)
    rnn_type = rnn_type.rnn_type if isinstance(rnn_type, Namespace) else rnn_type
    self.rnn_type = rnn_type.lower().strip()
    # convenience flag for later checks, so the LSTM cell state can be handled separately
    self.is_lstm = self.rnn_type == 'lstm'
    self.dropout_in_module = FairseqDropout(
        dropout_in, module_name=self.__class__.__name__)
    self.dropout_out_module = FairseqDropout(
        dropout_out, module_name=self.__class__.__name__)
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.max_target_positions = max_target_positions
    self.residuals = residuals
    self.num_layers = num_layers

    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = JqEmbedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.encoder_output_units = encoder_output_units
    if encoder_output_units != hidden_size and encoder_output_units != 0:
        self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
        # only LSTMs carry a cell state that needs projecting
        self.encoder_cell_proj = (
            Linear(encoder_output_units, hidden_size) if self.is_lstm else None
        )
    else:
        self.encoder_hidden_proj = self.encoder_cell_proj = None

    # disable input feeding if there is no encoder
    # input feeding is described in arxiv.org/abs/1508.04025
    input_feed_size = 0 if encoder_output_units == 0 else hidden_size
    _, JQRNNCell = get_rnn_cell(self.rnn_type)  # returns the classes; _ is JQRNN
    self.layers = nn.ModuleList([
        JQRNNCell(
            input_size=input_feed_size + embed_dim if layer == 0 else hidden_size,
            hidden_size=hidden_size,
        )
        for layer in range(num_layers)
    ])
    if attention:
        # TODO make bias configurable: Done
        self.attention = AttentionLayer(hidden_size, encoder_output_units,
                                        hidden_size, bias=attention_bias)
    else:
        self.attention = None
    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)
    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings,
            hidden_size,
            adaptive_softmax_cutoff,
            dropout=dropout_out,
        )
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)