def __init__(self,
             embedding_dim: int = 64,
             scale: bool = False,
             vocab_size: int = 0,
             padding_idx: int = 1,
             freeze: bool = False,
             **kwargs):
    """
    Create new embeddings for the vocabulary.
    Use scaling for the Transformer.

    :param embedding_dim: size of the embedding vectors
    :param scale: scale the embeddings (used for the Transformer)
    :param vocab_size: size of the vocabulary
    :param padding_idx: index of the padding token
    :param freeze: freeze the embeddings during training
    """
    super(Embeddings, self).__init__()

    self.embedding_dim = embedding_dim
    self.scale = scale
    self.vocab_size = vocab_size
    self.lut = nn.Embedding(vocab_size, self.embedding_dim,
                            padding_idx=padding_idx)

    if freeze:
        freeze_params(self)
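
# Illustrative sketch (not from the repo): how a lookup-table embedding with a
# padding index behaves. The Embeddings class above wraps nn.Embedding in the
# same way; its forward pass (not shown here) presumably performs this lookup.
# Sizes and token values below are assumptions chosen for the example.
import torch
import torch.nn as nn

lut = nn.Embedding(num_embeddings=100, embedding_dim=64, padding_idx=1)
tokens = torch.tensor([[5, 9, 1, 1]])       # batch of token indices, 1 = <pad>
embedded = lut(tokens)                      # shape: (1, 4, 64)
assert embedded[0, 2].abs().sum() == 0      # the padding row stays all zeros

# Freezing, as freeze_params(self) does, amounts to disabling gradients:
for p in lut.parameters():
    p.requires_grad = False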
def __init__(self,
             num_layers: int = 4,
             num_heads: int = 8,
             hidden_size: int = 512,
             ff_size: int = 2048,
             dropout: float = 0.1,
             emb_dropout: float = 0.1,
             vocab_size: int = 1,
             freeze: bool = False,
             trg_size: int = 97,
             decoder_trg_trg_: bool = True,
             **kwargs):
    """
    Initialize a Transformer decoder.

    :param num_layers: number of Transformer layers
    :param num_heads: number of heads for each layer
    :param hidden_size: hidden size
    :param ff_size: position-wise feed-forward size
    :param dropout: dropout probability (1-keep)
    :param emb_dropout: dropout probability for embeddings
    :param vocab_size: size of the output vocabulary
    :param freeze: set to True to keep all decoder parameters fixed
    :param trg_size: size of the target pose vector (joints + 1 counter dimension)
    :param decoder_trg_trg_: enable target-target (self-)attention in the decoder layers
    :param kwargs:
    """
    super(TransformerDecoder, self).__init__()

    self._hidden_size = hidden_size
    # Dynamic output size depending on the target size
    self._output_size = trg_size

    # create num_layers decoder layers and put them in a list
    self.layers = nn.ModuleList([
        TransformerDecoderLayer(size=hidden_size,
                                ff_size=ff_size,
                                num_heads=num_heads,
                                dropout=dropout,
                                decoder_trg_trg=decoder_trg_trg_)
        for _ in range(num_layers)
    ])

    self.pe = PositionalEncoding(hidden_size, mask_count=True)
    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)

    self.emb_dropout = nn.Dropout(p=emb_dropout)

    # Output layer sized to the joints vector + 1 for the counter (total is trg_size)
    self.output_layer = nn.Linear(hidden_size, trg_size, bias=False)

    if freeze:
        freeze_params(self)
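
# Illustrative sketch (not from the repo): unlike a text decoder that projects
# to a vocabulary for a softmax, this decoder's output layer maps each hidden
# state straight to a continuous frame of trg_size values (joint coordinates
# plus one counter dimension). Shapes below are assumptions matching the
# defaults above; the split into joints/counter mirrors the comment in the code.
import torch
import torch.nn as nn

hidden_size, trg_size = 512, 97
output_layer = nn.Linear(hidden_size, trg_size, bias=False)

decoder_states = torch.randn(2, 30, hidden_size)      # (batch, frames, hidden)
frames = output_layer(decoder_states)                 # (batch, frames, trg_size)
joints, counter = frames[..., :-1], frames[..., -1]   # split off the counter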
def __init__(self,
             embedding_dim: int = 64,
             num_heads: int = 8,
             scale: bool = False,
             scale_factor: float = None,
             norm_type: str = None,
             activation_type: str = None,
             vocab_size: int = 0,
             padding_idx: int = 1,
             freeze: bool = False,
             **kwargs):
    """
    Create new embeddings for the vocabulary.
    Use scaling for the Transformer.

    :param embedding_dim: size of the embedding vectors
    :param num_heads: number of groups used by MaskedNorm
    :param scale: scale the embeddings
    :param scale_factor: custom scaling factor; defaults to sqrt(embedding_dim)
    :param norm_type: type of MaskedNorm applied to the embeddings, if any
    :param activation_type: activation applied after the lookup, if any
    :param vocab_size: size of the vocabulary
    :param padding_idx: index of the padding token
    :param freeze: freeze the embeddings during training
    """
    super().__init__()

    self.embedding_dim = embedding_dim
    self.vocab_size = vocab_size
    self.lut = nn.Embedding(vocab_size, self.embedding_dim,
                            padding_idx=padding_idx)

    self.norm_type = norm_type
    if self.norm_type:
        self.norm = MaskedNorm(norm_type=norm_type,
                               num_groups=num_heads,
                               num_features=embedding_dim)

    self.activation_type = activation_type
    if self.activation_type:
        self.activation = get_activation(activation_type)

    self.scale = scale
    if self.scale:
        if scale_factor:
            self.scale_factor = scale_factor
        else:
            self.scale_factor = math.sqrt(self.embedding_dim)

    if freeze:
        freeze_params(self)
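
# Illustrative sketch (not from the repo): the order of operations this
# embedding layer supports -- lookup, optional normalization, optional
# activation, optional scaling by sqrt(embedding_dim). MaskedNorm and
# get_activation are repo internals, so plain PyTorch modules stand in for
# them here; those stand-ins and the exact forward order are assumptions.
import math
import torch
import torch.nn as nn

embedding_dim, vocab_size = 64, 100
lut = nn.Embedding(vocab_size, embedding_dim, padding_idx=1)
norm = nn.LayerNorm(embedding_dim)        # stand-in for MaskedNorm
activation = nn.ReLU()                    # stand-in for get_activation(...)
scale_factor = math.sqrt(embedding_dim)   # default when scale_factor is None

tokens = torch.tensor([[5, 9, 2, 1]])
x = lut(tokens)
x = norm(x)
x = activation(x)
x = x * scale_factor                      # applied only when scale=True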
def __init__( self, rnn_type: str = "gru", hidden_size: int = 1, emb_size: int = 1, num_layers: int = 1, dropout: float = 0.0, emb_dropout: float = 0.0, bidirectional: bool = True, freeze: bool = False, **kwargs ) -> None: """ Create a new recurrent encoder. :param rnn_type: RNN type: `gru` or `lstm`. :param hidden_size: Size of each RNN. :param emb_size: Size of the word embeddings. :param num_layers: Number of encoder RNN layers. :param dropout: Is applied between RNN layers. :param emb_dropout: Is applied to the RNN input (word embeddings). :param bidirectional: Use a bi-directional RNN. :param freeze: freeze the parameters of the encoder during training :param kwargs: """ super(RecurrentEncoder, self).__init__() self.emb_dropout = torch.nn.Dropout(p=emb_dropout, inplace=False) self.type = rnn_type self.emb_size = emb_size rnn = nn.GRU if rnn_type == "gru" else nn.LSTM self.rnn = rnn( emb_size, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional, dropout=dropout if num_layers > 1 else 0.0, ) self._output_size = 2 * hidden_size if bidirectional else hidden_size if freeze: freeze_params(self)
def __init__(self,
             hidden_size: int = 512,
             ff_size: int = 2048,
             num_layers: int = 8,
             num_heads: int = 4,
             dropout: float = 0.1,
             emb_dropout: float = 0.1,
             freeze: bool = False,
             **kwargs):
    """
    Initializes the Transformer.

    :param hidden_size: hidden size and size of embeddings
    :param ff_size: position-wise feed-forward layer size.
        (Typically 4*hidden_size, as in the defaults here.)
    :param num_layers: number of layers
    :param num_heads: number of heads for multi-headed attention
    :param dropout: dropout probability for Transformer layers
    :param emb_dropout: Is applied to the input (word embeddings).
    :param freeze: freeze the parameters of the encoder during training
    :param kwargs:
    """
    super(TransformerEncoder, self).__init__()

    # build all (num_layers) layers
    self.layers = nn.ModuleList([
        TransformerEncoderLayer(
            size=hidden_size,
            ff_size=ff_size,
            num_heads=num_heads,
            dropout=dropout,
        )
        for _ in range(num_layers)
    ])

    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
    self.pe = PositionalEncoding(hidden_size)
    self.emb_dropout = nn.Dropout(p=emb_dropout)

    self._output_size = hidden_size

    if freeze:
        freeze_params(self)
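
# Illustrative sketch (not from the repo): the shape contract of such an
# encoder -- run the embedded, position-encoded input through the layer stack,
# then a final LayerNorm, so the output keeps hidden_size as its feature
# dimension (hence _output_size = hidden_size). PyTorch's built-in
# nn.TransformerEncoderLayer stands in for the repo's TransformerEncoderLayer;
# that substitution is an assumption, not the repo's implementation.
import torch
import torch.nn as nn

hidden_size, ff_size, num_heads, num_layers = 512, 2048, 4, 2
layers = nn.ModuleList([
    nn.TransformerEncoderLayer(d_model=hidden_size, nhead=num_heads,
                               dim_feedforward=ff_size, batch_first=True)
    for _ in range(num_layers)
])
layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)

x = torch.randn(2, 15, hidden_size)   # already embedded + position-encoded
for layer in layers:
    x = layer(x)
x = layer_norm(x)
print(x.shape)                        # (2, 15, 512) -- output size == hidden_size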
def __init__( self, rnn_type: str = "gru", emb_size: int = 0, hidden_size: int = 0, encoder: Encoder = None, attention: str = "bahdanau", num_layers: int = 1, vocab_size: int = 0, dropout: float = 0.0, emb_dropout: float = 0.0, hidden_dropout: float = 0.0, init_hidden: str = "bridge", input_feeding: bool = True, freeze: bool = False, **kwargs ) -> None: """ Create a recurrent decoder with attention. :param rnn_type: rnn type, valid options: "lstm", "gru" :param emb_size: target embedding size :param hidden_size: size of the RNN :param encoder: encoder connected to this decoder :param attention: type of attention, valid options: "bahdanau", "luong" :param num_layers: number of recurrent layers :param vocab_size: target vocabulary size :param hidden_dropout: Is applied to the input to the attentional layer. :param dropout: Is applied between RNN layers. :param emb_dropout: Is applied to the RNN input (word embeddings). :param init_hidden: If "bridge" (default), the decoder hidden states are initialized from a projection of the last encoder state, if "zeros" they are initialized with zeros, if "last" they are identical to the last encoder state (only if they have the same size) :param input_feeding: Use Luong's input feeding. :param freeze: Freeze the parameters of the decoder during training. :param kwargs: """ super(RecurrentDecoder, self).__init__() self.emb_dropout = torch.nn.Dropout(p=emb_dropout, inplace=False) self.type = rnn_type self.hidden_dropout = torch.nn.Dropout(p=hidden_dropout, inplace=False) self.hidden_size = hidden_size self.emb_size = emb_size rnn = nn.GRU if rnn_type == "gru" else nn.LSTM self.input_feeding = input_feeding if self.input_feeding: # Luong-style # combine embedded prev word +attention vector before feeding to rnn self.rnn_input_size = emb_size + hidden_size else: # just feed prev word embedding self.rnn_input_size = emb_size # the decoder RNN self.rnn = rnn( self.rnn_input_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0.0, ) # combine output with context vector before output layer (Luong-style) self.att_vector_layer = nn.Linear( hidden_size + encoder.output_size, hidden_size, bias=True ) self.output_layer = nn.Linear(hidden_size, vocab_size, bias=False) self._output_size = vocab_size if attention == "bahdanau": self.attention = BahdanauAttention( hidden_size=hidden_size, key_size=encoder.output_size, query_size=hidden_size, ) elif attention == "luong": self.attention = LuongAttention( hidden_size=hidden_size, key_size=encoder.output_size ) else: raise ValueError( "Unknown attention mechanism: %s. " "Valid options: 'bahdanau', 'luong'." % attention ) self.num_layers = num_layers self.hidden_size = hidden_size # to initialize from the final encoder state of last layer self.init_hidden_option = init_hidden if self.init_hidden_option == "bridge": self.bridge_layer = nn.Linear(encoder.output_size, hidden_size, bias=True) elif self.init_hidden_option == "last": if encoder.output_size != self.hidden_size: if encoder.output_size != 2 * self.hidden_size: # bidirectional raise ValueError( "For initializing the decoder state with the " "last encoder state, their sizes have to match " "(encoder: {} vs. decoder: {})".format( encoder.output_size, self.hidden_size ) ) if freeze: freeze_params(self)
def build_model(
    cfg: dict,
    sgn_dim: int,
    gls_vocab: GlossVocabulary,
    txt_vocab: TextVocabulary,
    do_recognition: bool = True,
    do_translation: bool = True,
) -> SignModel:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param sgn_dim: feature dimension of the sign frame representation,
        i.e. 2560 for EfficientNet-7.
    :param gls_vocab: sign gloss vocabulary
    :param txt_vocab: spoken language word vocabulary
    :param do_recognition: flag to build the model with recognition output.
    :param do_translation: flag to build the model with translation decoder.
    :return: built and initialized model
    """
    txt_padding_idx = txt_vocab.stoi[PAD_TOKEN]

    gcn = cfg.get("gcn", False)

    if gcn:
        sgn_embed = GCNEmbeddings(
            **cfg["encoder"]["embeddings"],
            num_heads=cfg["encoder"]["num_heads"],
            input_size=sgn_dim,
        )
    else:
        sgn_embed: SpatialEmbeddings = SpatialEmbeddings(
            **cfg["encoder"]["embeddings"],
            num_heads=cfg["encoder"]["num_heads"],
            input_size=sgn_dim,
        )

    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.0)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert (
            cfg["encoder"]["embeddings"]["embedding_dim"]
            == cfg["encoder"]["hidden_size"]
        ), "for transformer, emb_size must be hidden_size"

        encoder = TransformerEncoder(
            **cfg["encoder"],
            emb_size=sgn_embed.embedding_dim,
            emb_dropout=enc_emb_dropout,
        )
    else:
        encoder = RecurrentEncoder(
            **cfg["encoder"],
            emb_size=sgn_embed.embedding_dim,
            emb_dropout=enc_emb_dropout,
        )

    if do_recognition:
        gloss_output_layer = nn.Linear(encoder.output_size, len(gls_vocab))
        if cfg["encoder"].get("freeze", False):
            freeze_params(gloss_output_layer)
    else:
        gloss_output_layer = None

    # build decoder and word embeddings
    if do_translation:
        txt_embed: Union[Embeddings, None] = Embeddings(
            **cfg["decoder"]["embeddings"],
            num_heads=cfg["decoder"]["num_heads"],
            vocab_size=len(txt_vocab),
            padding_idx=txt_padding_idx,
        )
        dec_dropout = cfg["decoder"].get("dropout", 0.0)
        dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
        if cfg["decoder"].get("type", "recurrent") == "transformer":
            decoder = TransformerDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(txt_vocab),
                emb_size=txt_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
            )
        else:
            decoder = RecurrentDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(txt_vocab),
                emb_size=txt_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
            )
    else:
        txt_embed = None
        decoder = None

    model: SignModel = SignModel(
        encoder=encoder,
        gloss_output_layer=gloss_output_layer,
        decoder=decoder,
        sgn_embed=sgn_embed,
        txt_embed=txt_embed,
        gls_vocab=gls_vocab,
        txt_vocab=txt_vocab,
        do_recognition=do_recognition,
        do_translation=do_translation,
    )

    if do_translation:
        # tie softmax layer with txt embeddings
        if cfg.get("tied_softmax", False):
            # noinspection PyUnresolvedReferences
            if txt_embed.lut.weight.shape == model.decoder.output_layer.weight.shape:
                # (also) share txt embeddings and softmax layer:
                # noinspection PyUnresolvedReferences
                model.decoder.output_layer.weight = txt_embed.lut.weight
            else:
                raise ValueError(
                    "For tied_softmax, the decoder embedding_dim and decoder "
                    "hidden_size must be the same. "
                    "The decoder must be a Transformer."
                )

    # custom initialization of model parameters
    initialize_model(model, cfg, txt_padding_idx)

    return model
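
# Illustrative sketch (not from the repo): what "tied_softmax" does -- make the
# decoder's output projection reuse the target-embedding weight matrix, which
# is only possible when the two matrices have the same shape, i.e. the decoder
# embedding_dim equals the decoder hidden_size. Sizes below are assumptions.
import torch.nn as nn

vocab_size, hidden_size = 1000, 512
txt_lut = nn.Embedding(vocab_size, hidden_size)            # target embeddings
output_layer = nn.Linear(hidden_size, vocab_size, bias=False)

assert txt_lut.weight.shape == output_layer.weight.shape   # both (vocab, hidden)
output_layer.weight = txt_lut.weight                       # share a single parameter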