Example 1
    def __init__(self,
                 hidden_size: int = 512,
                 emb_size: int = 512,
                 kernel_size: int = 5,
                 num_layers: int = 15,
                 dropout: float = 0.1,
                 emb_dropout: float = 0.1,
                 freeze: bool = False,
                 **kwargs):
        """
        Initializes the ConvSeq2Seq Encoder.
        :param hidden_size: hidden size and size of embeddings
        :param ff_size: position-wise feed-forward layer size.
          (Typically this is 2*hidden_size.)
        :param num_layers: number of layers
        :param dropout: dropout probability for Transformer layers
        :param emb_dropout: Is applied to the input (word embeddings).
        :param freeze: freeze the parameters of the encoder during training
        :param kwargs:
        """
        super(ConvSeq2SeqEncoder, self).__init__()

        # build all (num_layers) layers
        self.layers = nn.ModuleList([
            ConvSeq2SeqEncoderLayer(hidden_size=hidden_size,
                                    kernel_size=kernel_size,
                                    dropout=dropout) for _ in range(num_layers)
        ])

        self.absPE = AbsolutePositionalEncoding(emb_size)
        self.emb2hidden = nn.Linear(emb_size, hidden_size)
        self.emb_dropout = nn.Dropout(p=emb_dropout)

        if freeze:
            freeze_params(self)
Example 2
    def __init__(self,
                 embedding_dim: int = 64,
                 scale: bool = False,
                 vocab_size: int = 0,
                 padding_idx: int = 1,
                 freeze: bool = False,
                 **kwargs):
        """
        Create new embeddings for the vocabulary.
        Use scaling for the Transformer.

        :param embedding_dim:
        :param scale:
        :param vocab_size:
        :param padding_idx:
        :param freeze: freeze the embeddings during training
        """
        super(Embeddings, self).__init__()

        self.embedding_dim = embedding_dim
        self.scale = scale
        self.vocab_size = vocab_size
        self.lut = nn.Embedding(vocab_size,
                                self.embedding_dim,
                                padding_idx=padding_idx)

        if freeze:
            freeze_params(self)
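
As a usage sketch (the import path below is a placeholder, not part of the snippet), the embedding table can be constructed and queried like this; the `scale` flag conventionally corresponds to multiplying the lookup by the square root of the embedding dimension in the forward pass:

# Hypothetical construction sketch for the Embeddings class above.
# "embeddings" is a placeholder module name; adjust it to your package.
import math
import torch
from embeddings import Embeddings

emb = Embeddings(embedding_dim=512, scale=True, vocab_size=32000, padding_idx=1)
token_ids = torch.tensor([[4, 17, 256, 1]])   # one padded sequence of indices
vectors = emb.lut(token_ids)                  # shape (1, 4, 512)
if emb.scale:                                 # conventional Transformer scaling
    vectors = vectors * math.sqrt(emb.embedding_dim)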
Example 3
    def __init__(self,
                 num_layers: int = 4,
                 num_heads: int = 8,
                 hidden_size: int = 512,
                 ff_size: int = 2048,
                 dropout: float = 0.1,
                 emb_dropout: float = 0.1,
                 vocab_size: int = 1,
                 freeze: bool = False,
                 self_attn_func: str = "softmax",
                 src_attn_func: str = "softmax",
                 self_attn_alpha: float = 1.5,
                 src_attn_alpha: float = 1.5,
                 gen_func: str = "softmax",
                 gen_alpha: float = 1.5,
                 output_bias: bool = False,
                 **kwargs):
        """
        Initialize a Transformer decoder.

        :param num_layers: number of Transformer layers
        :param num_heads: number of heads for each layer
        :param hidden_size: hidden size
        :param ff_size: position-wise feed-forward size
        :param dropout: dropout probability (1-keep)
        :param emb_dropout: dropout probability for embeddings
        :param vocab_size: size of the output vocabulary
        :param freeze: set to True keep all decoder parameters fixed
        :param kwargs:
        """
        super(TransformerDecoder, self).__init__(hidden_size,
                                                 vocab_size,
                                                 emb_dropout,
                                                 gen_func=gen_func,
                                                 gen_alpha=gen_alpha,
                                                 output_bias=output_bias)

        # create num_layers decoder layers and put them in a list
        self.layers = nn.ModuleList([
            TransformerDecoderLayer(size=hidden_size,
                                    ff_size=ff_size,
                                    num_heads=num_heads,
                                    dropout=dropout,
                                    self_attn_func=self_attn_func,
                                    self_attn_alpha=self_attn_alpha,
                                    src_attn_func=src_attn_func,
                                    src_attn_alpha=src_attn_alpha)
            for _ in range(num_layers)
        ])

        self.pe = PositionalEncoding(hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)

        if freeze:
            freeze_params(self)
Example 4
    def __init__(self,
                 hidden_size: int = 512,
                 ff_size: int = 2048,
                 num_layers: int = 8,
                 num_heads: int = 4,
                 dropout: float = 0.1,
                 emb_dropout: float = 0.1,
                 freeze: bool = False,
                 dont_minus_one=True,
                 shared_layers=None,
                 **kwargs):
        """
        Initializes the Transformer.
        :param hidden_size: hidden size and size of embeddings
        :param ff_size: position-wise feed-forward layer size.
          (Typically this is 2*hidden_size.)
        :param num_layers: number of layers
        :param num_heads: number of heads for multi-headed attention
        :param dropout: dropout probability for Transformer layers
        :param emb_dropout: Is applied to the input (word embeddings).
        :param freeze: freeze the parameters of the encoder during training
        :param kwargs:
        """
        super(TransformerEncoder, self).__init__()

        # build all (num_layers) layers
        if shared_layers is not None:
            # reuse the given layers and add one fresh layer on top
            self.layers = nn.ModuleList(list(shared_layers))
            self.layers.append(
                TransformerEncoderLayer(size=hidden_size,
                                        ff_size=ff_size,
                                        num_heads=num_heads,
                                        dropout=dropout))
        else:
            self.layers = nn.ModuleList([
                TransformerEncoderLayer(size=hidden_size,
                                        ff_size=ff_size,
                                        num_heads=num_heads,
                                        dropout=dropout)
                for _ in range(num_layers if dont_minus_one else num_layers - 1)
            ])

        # 'multi_encoder' in locals() could never be True here, so the original
        # expression always yielded True; read the flag from kwargs instead.
        multi_encoder = kwargs.get("multi_encoder", False)
        self.top_off = not (dont_minus_one and multi_encoder)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.pe = PositionalEncoding(hidden_size)
        self.emb_dropout = nn.Dropout(p=emb_dropout)

        self._output_size = hidden_size

        if freeze:
            freeze_params(self)
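
A small sketch of what the `shared_layers` branch above does (the import path is an assumption): a second encoder built on top of an existing one reuses the same layer modules and appends one fresh layer.

# Hypothetical sketch of layer sharing between two TransformerEncoder
# instances; the import path is a placeholder.
from encoders import TransformerEncoder

enc_a = TransformerEncoder(hidden_size=512, ff_size=2048,
                           num_layers=6, num_heads=8)
enc_b = TransformerEncoder(hidden_size=512, ff_size=2048,
                           num_layers=6, num_heads=8,
                           shared_layers=enc_a.layers)

assert len(enc_b.layers) == len(enc_a.layers) + 1   # one fresh layer on top
assert enc_b.layers[0] is enc_a.layers[0]           # parameters are shared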
Example 5
    def __init__(
            self,
            num_layers: int = 4,
            num_heads: int = 8,
            hidden_size: int = 512,
            ff_size: int = 2048,
            dropout: float = 0.1,
            freeze: bool = False,
            self_attn_func: str = "softmax",
            src_attn_func: str = "softmax",
            self_attn_alpha: float = 1.5,
            src_attn_alpha: float = 1.5,
            merge: str = "serial",  # for multi-encoder models
            gate_func: str = "softmax",
            gate_alpha: float = 1.5,
            **kwargs):
        """
        Initialize a Transformer decoder.

        :param num_layers:
        :param num_heads:
        :param hidden_size: hidden size
        :param ff_size: position-wise feed-forward size
        :param dropout:
        :param emb_dropout: dropout probability for embeddings
        :param freeze: set to True keep all decoder parameters fixed
        :param kwargs: passed to generic Decoder Constructor
        """
        super(TransformerDecoder, self).__init__(hidden_size, **kwargs)

        self.layers = nn.ModuleList([
            self.layer_module(size=hidden_size,
                              ff_size=ff_size,
                              num_heads=num_heads,
                              dropout=dropout,
                              self_attn_func=self_attn_func,
                              self_attn_alpha=self_attn_alpha,
                              src_attn_func=src_attn_func,
                              src_attn_alpha=src_attn_alpha,
                              merge=merge,
                              gate_func=gate_func,
                              gate_alpha=gate_alpha) for _ in range(num_layers)
        ])

        self.pe = PositionalEncoding(hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)

        if freeze:
            freeze_params(self)
Example 6
    def __init__(self,
                 embed_file: str,
                 vocab: Vocabulary,
                 embedding_dim: int = 64,
                 scale: bool = False,
                 vocab_size: int = 0,
                 padding_idx: int = 1,
                 freeze: bool = True,
                 **kwargs):
        """
        Create embeddings that are initialized from a pretrained embedding file.

        :param embed_file: path to the file with the pretrained embeddings
        :param vocab: vocabulary for which the embeddings are loaded
        :param freeze: freeze the embeddings during training
        """
        super(PretrainedEmbeddings, self).__init__(embedding_dim, scale,
                                                   vocab_size, padding_idx,
                                                   freeze, **kwargs)

        # overwrite lut with embeddings from embed_file
        self.load_embeddings_from_file(embed_file, vocab)

        if freeze:
            freeze_params(self)
Example 7
    def __init__(self,
                 num_layers: int = 4,
                 num_heads: int = 8,
                 hidden_size: int = 512,
                 ff_size: int = 2048,
                 dropout: float = 0.1,
                 emb_dropout: float = 0.1,
                 vocab_size: int = 1,
                 freeze: bool = False,
                 **kwargs):
        """
        Initialize a Transformer decoder.

        :param num_layers: number of Transformer layers
        :param num_heads: number of heads for each layer
        :param hidden_size: hidden size
        :param ff_size: position-wise feed-forward size
        :param dropout: dropout probability (1-keep)
        :param emb_dropout: dropout probability for embeddings
        :param vocab_size: size of the output vocabulary
        :param freeze: set to True to keep all decoder parameters fixed
        :param kwargs:
        """
        super().__init__()

        self._hidden_size = hidden_size
        self._output_size = vocab_size

        # create num_layers decoder layers and put them in a list
        self.layers = nn.ModuleList([
            TransformerDecoderLayer(size=hidden_size,
                                    ff_size=ff_size,
                                    num_heads=num_heads,
                                    dropout=dropout) for _ in range(num_layers)
        ])

        self.pe = PositionalEncoding(hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)

        self.emb_dropout = nn.Dropout(p=emb_dropout)

        self.output_layer = nn.Linear(hidden_size, hidden_size, bias=False)
        #self.output_layer = self.layers[-1].feed_forward.pwff_layer[-2]

        if freeze:
            freeze_params(self)
Example 8
    def __init__(self,
                 num_layers: int = 4,
                 num_heads: int = 8,
                 hidden_size: int = 512,
                 ff_size: int = 2048,
                 dropout: float = 0.1,
                 vocab_size: int = 1,
                 freeze: bool = False,
                 **kwargs):
        """
        Initialize a Transformer decoder.

        :param num_layers: number of Transformer layers
        :param num_heads: number of heads for each layer
        :param hidden_size: hidden size
        :param ff_size: position-wise feed-forward size
        :param dropout: dropout probability (1-keep)
        :param vocab_size: size of the output vocabulary
        :param freeze: set to True to keep all decoder parameters fixed
        :param kwargs:
        """
        super(TransformerDecoder, self).__init__()

        # build all (num_layers) layers
        layers = []
        for _ in range(num_layers):
            layer = TransformerDecoderLayer(
                hidden_size,
                MultiHeadedAttention(num_heads, hidden_size, dropout),
                MultiHeadedAttention(num_heads, hidden_size, dropout),
                PositionwiseFeedForward(hidden_size, ff_size, dropout),
                dropout)
            layers.append(layer)

        self.layers = nn.ModuleList(layers)
        self.norm = nn.LayerNorm(hidden_size)
        self.pe = PositionalEncoding(hidden_size, dropout=dropout)

        self._hidden_size = hidden_size
        self._output_size = vocab_size

        self.output_layer = nn.Linear(hidden_size, vocab_size, bias=False)

        if freeze:
            freeze_params(self)
Example 9
    def __init__(self,
                 num_layers: int = 4,
                 hidden_size: int = 512,
                 emb_size: int = 512,
                 kernel_size: int = 5,
                 dropout: float = 0.1,
                 emb_dropout: float = 0.1,
                 vocab_size: int = 1,
                 freeze: bool = False,
                 use_multi_head: bool = False,
                 num_heads: int = 8,
                 **kwargs):
        """
        Initialize a ConvSeq2Seq decoder.

        :param num_layers: number of Transformer layers
        :param hidden_size: hidden size
        :param dropout: dropout probability (1-keep)
        :param emb_dropout: dropout probability for embeddings
        :param vocab_size: size of the output vocabulary
        :param freeze: set to True keep all decoder parameters fixed
        :param kwargs:
        """
        super(ConvSeq2SeqDecoder, self).__init__()

        self._hidden_size = hidden_size
        self._output_size = vocab_size

        # create num_layers decoder layers and put them in a list
        self.layers = nn.ModuleList([ConvSeq2SeqDecoderLayer(
                                        hidden_size=hidden_size,
                                        embedding_size=emb_size,
                                        kernel_size=kernel_size,
                                        use_multi_head=use_multi_head,
                                        num_heads=num_heads,
                                        dropout=dropout) for _ in range(num_layers)])

        self.absPE = AbsolutePositionalEncoding(emb_size)
        self.emb2hidden = nn.Linear(emb_size, hidden_size)
        self.emb_dropout = nn.Dropout(p=emb_dropout)
        self.output_layer = nn.Linear(hidden_size, vocab_size, bias=False)

        if freeze:
            freeze_params(self)
Example 10
    def __init__(self,
                 rnn_type: str = "gru",
                 hidden_size: int = 1,
                 emb_size: int = 1,
                 num_layers: int = 1,
                 dropout: float = 0.,
                 emb_dropout: float = 0.,
                 bidirectional: bool = True,
                 freeze: bool = False,
                 enforce_sorted: bool = True,
                 **kwargs) -> None:
        """
        Create a new recurrent encoder.

        :param rnn_type: RNN type: `gru` or `lstm`.
        :param hidden_size: Size of each RNN.
        :param emb_size: Size of the word embeddings.
        :param num_layers: Number of encoder RNN layers.
        :param dropout:  Is applied between RNN layers.
        :param emb_dropout: Is applied to the RNN input (word embeddings).
        :param bidirectional: Use a bi-directional RNN.
        :param freeze: freeze the parameters of the encoder during training
        :param kwargs:
        """

        super(RecurrentEncoder, self).__init__()
        self._enforce_sorted = enforce_sorted

        self.emb_dropout = nn.Dropout(p=emb_dropout, inplace=False)
        self.emb_size = emb_size

        rnn = nn.GRU if rnn_type == "gru" else nn.LSTM

        self.rnn = rnn(emb_size,
                       hidden_size,
                       num_layers,
                       batch_first=True,
                       bidirectional=bidirectional,
                       dropout=dropout if num_layers > 1 else 0.)

        self._output_size = 2 * hidden_size if bidirectional else hidden_size

        if freeze:
            freeze_params(self)
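
A minimal construction sketch, assuming the class is importable (placeholder path) and that the base encoder exposes `_output_size` through an `output_size` property, as the decoder snippets below rely on:

# Hypothetical construction sketch for the RecurrentEncoder above.
from encoders import RecurrentEncoder   # placeholder import path

encoder = RecurrentEncoder(rnn_type="lstm",
                           hidden_size=256,
                           emb_size=128,
                           num_layers=2,
                           dropout=0.2,
                           emb_dropout=0.1,
                           bidirectional=True,
                           enforce_sorted=False)

# Bidirectional outputs concatenate both directions: 2 * hidden_size = 512.
assert encoder.output_size == 512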
Example 11
    def __init__(self,
                 hidden_size: int = 512,
                 ff_size: int = 2048,
                 num_layers: int = 8,
                 num_heads: int = 4,
                 dropout: float = 0.1,
                 active_layers: list = None,
                 emb_dropout: float = 0.1,
                 freeze: bool = False,
                 **kwargs):
        """
        Initializes the Transformer.
        :param hidden_size: hidden size and size of embeddings
        :param ff_size: position-wise feed-forward layer size.
          (Typically this is 2*hidden_size.)
        :param num_layers: number of layers
        :param num_heads: number of heads for multi-headed attention
        :param dropout: dropout probability for Transformer layers
        :param emb_dropout: Is applied to the input (word embeddings).
        :param freeze: freeze the parameters of the encoder during training
        :param kwargs:
        """
        super().__init__()

        # build all (num_layers) layers or only some
        if active_layers:
            self.layers = nn.ModuleList([
                TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                        num_heads=num_heads, dropout=dropout)
                for _ in active_layers])
        else:
            self.layers = nn.ModuleList([
                TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                        num_heads=num_heads, dropout=dropout)
                for _ in range(num_layers)])

        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.pe = PositionalEncoding(hidden_size)
        self.emb_dropout = nn.Dropout(p=emb_dropout)

        self._output_size = hidden_size

        if freeze:
            freeze_params(self)
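
The effect of `active_layers` is easiest to see by construction. A hedged sketch; the class name (taken from the docstring and layer types) and the import path are assumptions:

# Hypothetical sketch: active_layers overrides num_layers for layer creation.
from encoders import TransformerEncoder   # placeholder import path

enc_full = TransformerEncoder(hidden_size=512, ff_size=2048,
                              num_layers=8, num_heads=4)
assert len(enc_full.layers) == 8          # default: num_layers layers

enc_partial = TransformerEncoder(hidden_size=512, ff_size=2048,
                                 num_layers=8, num_heads=4,
                                 active_layers=[1, 3, 5])
assert len(enc_partial.layers) == 3       # one layer per entry in active_layers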
Example 12
    def __init__(self,
                 hidden_size: int = 512,
                 ff_size: int = 2048,
                 num_layers: int = 8,
                 num_heads: int = 4,
                 dropout: float = 0.1,
                 emb_dropout: float = 0.1,
                 freeze: bool = False,
                 attn_func: str = "softmax",
                 attn_alpha: float = 1.5,
                 pe: bool = True,
                 **kwargs):
        """
        Initializes the Transformer.
        :param hidden_size: hidden size and size of embeddings
        :param ff_size: position-wise feed-forward layer size.
          (Typically this is 2*hidden_size.)
        :param num_layers: number of layers
        :param num_heads: number of heads for multi-headed attention
        :param dropout: dropout probability for Transformer layers
        :param emb_dropout: Is applied to the input (word embeddings).
        :param freeze: freeze the parameters of the encoder during training
        :param kwargs:
        """
        super(TransformerEncoder, self).__init__()

        self.layers = nn.ModuleList([
            TransformerEncoderLayer(size=hidden_size,
                                    ff_size=ff_size,
                                    num_heads=num_heads,
                                    dropout=dropout,
                                    attn_func=attn_func,
                                    attn_alpha=attn_alpha)
            for _ in range(num_layers)
        ])

        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.pe = PositionalEncoding(hidden_size) if pe else None
        self.emb_dropout = nn.Dropout(p=emb_dropout)

        self._output_size = hidden_size

        if freeze:
            freeze_params(self)
Example 13
    def __init__(self,
                 rnn_type: str = "gru",
                 hidden_size: int = 1,
                 emb_size: int = 1,
                 num_layers: int = 1,
                 dropout: float = 0.,
                 bidirectional: bool = True,
                 freeze: bool = False,
                 **kwargs) -> None:
        """
        Create a new recurrent encoder.

        :param rnn_type:
        :param hidden_size:
        :param emb_size:
        :param num_layers:
        :param dropout:
        :param bidirectional:
        :param freeze: freeze the parameters of the encoder during training
        :param kwargs:
        """

        super(RecurrentEncoder, self).__init__()

        self.rnn_input_dropout = torch.nn.Dropout(p=dropout, inplace=False)
        self.type = rnn_type
        self.emb_size = emb_size

        rnn = nn.GRU if rnn_type == "gru" else nn.LSTM

        self.rnn = rnn(emb_size,
                       hidden_size,
                       num_layers,
                       batch_first=True,
                       bidirectional=bidirectional,
                       dropout=dropout if num_layers > 1 else 0.)

        self._output_size = 2 * hidden_size if bidirectional else hidden_size

        if freeze:
            freeze_params(self)
Example 14
    def __init__(self,
                 embedding_dim: int = 64,
                 scale: bool = False,
                 vocab_size: int = 0,
                 padding_idx: int = 1,
                 freeze: bool = False,
                 from_pretrained: bool = False,
                 pretrained_path: str = "",
                 check_embedding=False,
                 **kwargs):
        """
        Create new embeddings for the vocabulary.
        Use scaling for the Transformer.

        :param embedding_dim:
        :param scale:
        :param vocab_size:
        :param padding_idx:
        :param freeze: freeze the embeddings during training
        """
        super(Embeddings, self).__init__()

        self.embedding_dim = embedding_dim
        self.scale = scale
        self.vocab_size = vocab_size
        self.from_pretrained = from_pretrained
        self.check_embedding = check_embedding
        if from_pretrained:
            print("using pretrained model")
            self.weight = torch.load(pretrained_path)
            print(f"loaded embeddings size {self.weight.shape}")
            self.lut = nn.Embedding.from_pretrained(self.weight)
        else:
            self.lut = nn.Embedding(vocab_size,
                                    self.embedding_dim,
                                    padding_idx=padding_idx)
        if freeze:
            freeze_params(self)
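
Since `pretrained_path` is handed straight to `torch.load` and the result to `nn.Embedding.from_pretrained`, the file should hold a `(vocab_size, embedding_dim)` float tensor saved with `torch.save`. A hedged preparation sketch; the import path and file name are placeholders:

# Hypothetical sketch: preparing and loading a pretrained weight file.
import torch
from embeddings import Embeddings   # placeholder import path

weights = torch.randn(32000, 512)   # (vocab_size, embedding_dim)
torch.save(weights, "pretrained_embeddings.pt")

emb = Embeddings(embedding_dim=512,
                 vocab_size=32000,
                 from_pretrained=True,
                 pretrained_path="pretrained_embeddings.pt")
# Note: nn.Embedding.from_pretrained freezes the weights by default.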
Example 15
    def __init__(self,
                 hidden_size: int = 512,
                 ff_size: int = 2048,
                 num_layers: int = 8,
                 num_heads: int = 4,
                 dropout: float = 0.1,
                 freeze: bool = False,
                 **kwargs):
        """
        Initializes the Transformer.
        :param hidden_size: hidden size and size of embeddings
        :param ff_size: position-wise feed-forward layer size.
          (Typically this is 2*hidden_size.)
        :param num_layers: number of layers
        :param num_heads: number of heads for multi-headed attention
        :param dropout: dropout probability
        :param freeze: freeze the parameters of the encoder during training
        :param kwargs:
        """
        super(TransformerEncoder, self).__init__()

        # build all (num_layers) layers
        layers = []
        for _ in range(num_layers):
            layer = TransformerEncoderLayer(
                hidden_size,
                MultiHeadedAttention(num_heads, hidden_size, dropout),
                PositionwiseFeedForward(hidden_size, ff_size, dropout),
                dropout)
            layers.append(layer)

        self.layers = nn.ModuleList(layers)
        self.norm = nn.LayerNorm(hidden_size)
        self.pe = PositionalEncoding(hidden_size, dropout=dropout)
        self._output_size = hidden_size

        if freeze:
            freeze_params(self)
Example 16
    def __init__(self,
                 rnn_type: str = "gru",
                 emb_size: int = 0,
                 hidden_size: int = 0,
                 encoder_output_sizes: dict = None,
                 attention: str = "bahdanau",
                 attn_merge: str = "concat",
                 num_layers: int = 1,
                 dropout: float = 0.,
                 hidden_dropout: float = 0.,
                 init_hidden: str = "bridge",
                 input_feeding: bool = True,
                 freeze: bool = False,
                 attn_func: str = "softmax",
                 attn_alpha: float = 1.5,
                 gate_func: str = "softmax",
                 gate_alpha: float = 1.5,
                 **kwargs) -> None:
        """
        Todo: document the unique challenges of making an RNN decoder that
        attends over multiple encoders
        """

        super(MultiHeadRecurrentDecoder, self).__init__(hidden_size, **kwargs)

        self.hidden_dropout = nn.Dropout(p=hidden_dropout)

        self.head_names = sorted(encoder_output_sizes)

        self.emb_size = emb_size

        rnn = nn.GRU if rnn_type == "gru" else nn.LSTM

        self.input_feeding = input_feeding
        input_size = emb_size + hidden_size if input_feeding else emb_size

        # the decoder RNN
        self.rnn = rnn(input_size,
                       hidden_size,
                       num_layers,
                       batch_first=True,
                       dropout=dropout if num_layers > 1 else 0.)

        # combined output sizes of all encoders
        # this quantity matters for concat attention merging
        # it also matters if you have a bridge for init_hidden
        encoder_output_size = sum(encoder_output_sizes.values())

        assert attention in ["bahdanau", "luong"], \
            "Unknown attention mechanism: %s. Use 'bahdanau' or 'luong'." \
            % attention
        if attention == "bahdanau":
            attn_mechanism = partial(MultiAttention, query_size=hidden_size)
        else:
            attn_mechanism = MultiAttention
        self.attention = attn_mechanism(attn_type=attention,
                                        head_names=self.head_names,
                                        key_sizes=encoder_output_sizes,
                                        hidden_size=hidden_size,
                                        attn_func=attn_func,
                                        attn_alpha=attn_alpha,
                                        attn_merge=attn_merge,
                                        gate_func=gate_func,
                                        gate_alpha=gate_alpha)

        # to initialize from the final encoder state of last layer
        assert init_hidden == "bridge", \
            "only use bridge with multi-encoder models"

        self.bridge_layer = nn.Sequential(
            nn.Linear(encoder_output_size, hidden_size, bias=True), nn.Tanh())

        if freeze:
            freeze_params(self)
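
A hedged construction sketch for the multi-encoder decoder above (import path and head names are placeholders): `encoder_output_sizes` maps each encoder head to its output size; the sorted keys become `head_names` and the summed sizes feed the bridge layer.

# Hypothetical sketch of the encoder_output_sizes argument.
from decoders import MultiHeadRecurrentDecoder   # placeholder import path

encoder_output_sizes = {"src": 512, "img": 1024}   # one entry per encoder head

decoder = MultiHeadRecurrentDecoder(rnn_type="gru",
                                    emb_size=256,
                                    hidden_size=512,
                                    encoder_output_sizes=encoder_output_sizes,
                                    attention="bahdanau",
                                    attn_merge="concat",
                                    init_hidden="bridge")
# decoder.head_names == ["img", "src"]; the bridge input size is 512 + 1024.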
Example 17
    def __init__(self,
                 rnn_type: str = "gru",
                 hidden_size: int = 1,
                 emb_size: int = 1,
                 num_layers: int = 1,
                 dropout: float = 0.,
                 bidirectional: bool = True,
                 freeze: bool = False,
                 activation: str = "relu",
                 last_activation: str = "None",
                 layer_norm: bool = False,
                 emb_norm: bool = False,
                 same_weights: bool = False,
                 **kwargs) -> None:
        """
        Create a new recurrent encoder.

        :param rnn_type:
        :param hidden_size:
        :param emb_size:
        :param num_layers:
        :param dropout:
        :param bidirectional:
        :param freeze: freeze the parameters of the encoder during training
        :param kwargs:
        """

        super(SpeechRecurrentEncoder, self).__init__()

        self.rnn_input_dropout = torch.nn.Dropout(p=dropout, inplace=False)
        self.type = rnn_type
        self.emb_size = emb_size
        self.lila1 = nn.Linear(emb_size, hidden_size)
        self.lila2 = nn.Linear(hidden_size, hidden_size)
        self.same_weights = same_weights
        if not self.same_weights:
            self.lila3 = nn.Linear(hidden_size, hidden_size)
            self.lila4 = nn.Linear(hidden_size, hidden_size)
        self.activation = activation
        self.last_activation = last_activation
        self.conv1 = nn.Sequential(
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2, padding=0))
        self.conv2 = nn.Sequential(
            nn.Conv1d(hidden_size, hidden_size, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2, padding=0))
        self.layer_norm = layer_norm
        self.emb_norm = emb_norm
        if self.layer_norm:
            self.norm1 = nn.LayerNorm(hidden_size)
            self.norm2 = nn.LayerNorm(hidden_size)
            self.norm_out = nn.LayerNorm(2 * hidden_size if bidirectional else hidden_size)
        if self.emb_norm:
            self.norm_emb = nn.LayerNorm(emb_size)

        rnn = nn.GRU if rnn_type == "gru" else nn.LSTM

        self.rnn = rnn(
            hidden_size, hidden_size, num_layers, batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0.)

        self._output_size = 2 * hidden_size if bidirectional else hidden_size

        if freeze:
            freeze_params(self)
Example 18
    def __init__(self,
                 rnn_type: str = "gru",
                 emb_size: int = 0,
                 hidden_size: int = 0,
                 encoder: Encoder = None,
                 attention: str = "bahdanau",
                 num_layers: int = 1,
                 vocab_size: int = 0,
                 dropout: float = 0.,
                 emb_dropout: float = 0.,
                 hidden_dropout: float = 0.,
                 init_hidden: str = "bridge",
                 input_feeding: bool = True,
                 freeze: bool = False,
                 **kwargs) -> None:
        """
        Create a recurrent decoder with attention.

        :param rnn_type: rnn type, valid options: "lstm", "gru"
        :param emb_size: target embedding size
        :param hidden_size: size of the RNN
        :param encoder: encoder connected to this decoder
        :param attention: type of attention, valid options: "bahdanau", "luong"
        :param num_layers: number of recurrent layers
        :param vocab_size: target vocabulary size
        :param hidden_dropout: Is applied to the input to the attentional layer.
        :param dropout: Is applied between RNN layers.
        :param emb_dropout: Is applied to the RNN input (word embeddings).
        :param init_hidden: If "bridge" (default), the decoder hidden states are
            initialized from a projection of the last encoder state,
            if "zeros" they are initialized with zeros,
            if "last" they are identical to the last encoder state
            (only if they have the same size)
        :param input_feeding: Use Luong's input feeding.
        :param freeze: Freeze the parameters of the decoder during training.
        :param kwargs:
        """

        super().__init__()

        self.emb_dropout = torch.nn.Dropout(p=emb_dropout, inplace=False)
        self.type = rnn_type
        self.hidden_dropout = torch.nn.Dropout(p=hidden_dropout, inplace=False)
        self.hidden_size = hidden_size
        self.emb_size = emb_size

        rnn = nn.GRU if rnn_type == "gru" else nn.LSTM

        self.input_feeding = input_feeding
        if self.input_feeding:  # Luong-style
            # combine embedded prev word +attention vector before feeding to rnn
            self.rnn_input_size = emb_size + hidden_size
        else:
            # just feed prev word embedding
            self.rnn_input_size = emb_size

        # the decoder RNN
        self.rnn = rnn(self.rnn_input_size,
                       hidden_size,
                       num_layers,
                       batch_first=True,
                       dropout=dropout if num_layers > 1 else 0.)

        # combine output with context vector before output layer (Luong-style)
        self.att_vector_layer = nn.Linear(hidden_size + encoder.output_size,
                                          hidden_size,
                                          bias=True)

        self.output_layer = nn.Linear(hidden_size, vocab_size, bias=False)
        self._output_size = vocab_size

        if attention == "bahdanau":
            self.attention = BahdanauAttention(hidden_size=hidden_size,
                                               key_size=encoder.output_size,
                                               query_size=hidden_size)
        elif attention == "luong":
            self.attention = LuongAttention(hidden_size=hidden_size,
                                            key_size=encoder.output_size)
        else:
            raise ConfigurationError("Unknown attention mechanism: %s. "
                                     "Valid options: 'bahdanau', 'luong'." %
                                     attention)

        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # to initialize from the final encoder state of last layer
        self.init_hidden_option = init_hidden
        if self.init_hidden_option == "bridge":
            self.bridge_layer = nn.Linear(encoder.output_size,
                                          hidden_size,
                                          bias=True)
        elif self.init_hidden_option == "last":
            if encoder.output_size != self.hidden_size:
                if encoder.output_size != 2 * self.hidden_size:  # bidirectional
                    raise ConfigurationError(
                        "For initializing the decoder state with the "
                        "last encoder state, their sizes have to match "
                        "(encoder: {} vs. decoder:  {})".format(
                            encoder.output_size, self.hidden_size))
        if freeze:
            freeze_params(self)
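
A hedged end-to-end construction sketch, pairing this decoder with the recurrent encoder from an earlier example; the decoder class name and both import paths are assumptions:

# Hypothetical sketch: wiring a recurrent encoder into this attention decoder.
from encoders import RecurrentEncoder   # placeholder import path
from decoders import RecurrentDecoder   # placeholder class name and path

encoder = RecurrentEncoder(rnn_type="gru", hidden_size=256,
                           emb_size=128, bidirectional=True)

# init_hidden="bridge" projects the last encoder state (512 here, because the
# encoder is bidirectional) down to the decoder hidden size of 256.
decoder = RecurrentDecoder(rnn_type="gru",
                           emb_size=128,
                           hidden_size=256,
                           encoder=encoder,
                           attention="luong",
                           num_layers=1,
                           vocab_size=32000,
                           init_hidden="bridge",
                           input_feeding=True)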
Example 19
    def __init__(self,
                 type: str = "gru",
                 emb_size: int = 0,
                 hidden_size: int = 0,
                 encoder: Encoder = None,
                 attention: str = "bahdanau",
                 num_layers: int = 0,
                 vocab_size: int = 0,
                 dropout: float = 0.,
                 hidden_dropout: float = 0.,
                 bridge: bool = False,
                 input_feeding: bool = True,
                 freeze: bool = False,
                 **kwargs):
        """
        Create a recurrent decoder.
        If `bridge` is True, the decoder hidden states are initialized from a
        projection of the encoder states, else they are initialized with zeros.

        :param type:
        :param emb_size:
        :param hidden_size:
        :param encoder:
        :param attention:
        :param num_layers:
        :param vocab_size:
        :param dropout:
        :param hidden_dropout:
        :param bridge:
        :param input_feeding:
        :param freeze: freeze the parameters of the decoder during training
        :param kwargs:
        """

        super(RecurrentDecoder, self).__init__()

        self.rnn_input_dropout = torch.nn.Dropout(p=dropout, inplace=False)
        self.type = type
        self.hidden_dropout = torch.nn.Dropout(p=hidden_dropout, inplace=False)
        self.hidden_size = hidden_size

        rnn = nn.GRU if type == "gru" else nn.LSTM

        self.input_feeding = input_feeding
        if self.input_feeding: # Luong-style
            # combine embedded prev word +attention vector before feeding to rnn
            self.rnn_input_size = emb_size + hidden_size
        else:
            # just feed prev word embedding
            self.rnn_input_size = emb_size

        # the decoder RNN
        self.rnn = rnn(self.rnn_input_size, hidden_size, num_layers,
                       batch_first=True,
                       dropout=dropout if num_layers > 1 else 0.)

        # combine output with context vector before output layer (Luong-style)
        self.att_vector_layer = nn.Linear(
            hidden_size + encoder.output_size, hidden_size, bias=True)

        self.output_layer = nn.Linear(hidden_size, vocab_size, bias=False)
        self.output_size = vocab_size

        if attention == "bahdanau":
            self.attention = BahdanauAttention(hidden_size=hidden_size,
                                               key_size=encoder.output_size,
                                               query_size=hidden_size)
        elif attention == "luong":
            self.attention = LuongAttention(hidden_size=hidden_size,
                                            key_size=encoder.output_size)
        else:
            raise ValueError("Unknown attention mechanism: %s" % attention)

        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # to initialize from the final encoder state of last layer
        self.bridge = bridge
        if self.bridge:
            self.bridge_layer = nn.Linear(
                encoder.output_size, hidden_size, bias=True)

        if freeze:
            freeze_params(self)
Example 20
    def __init__(
            self,
            src_vocab: Vocabulary,
            trg_vocab: Vocabulary,
            embedding_dim: int = 300, # or 30
            scale: bool = False,
            vocab_size: int = 0,
            padding_idx: int = 1,
            freeze: bool = False,
            **kwargs
        ):
        """
        Create new embeddings for the vocabulary.
        Use scaling for the Transformer.
        :param embedding_dim:
        :param scale:
        :param vocab_size:
        :param padding_idx:
        :param freeze: freeze the embeddings during training
        """
        super().__init__()

        self.scale = scale

        # TODO add support for other languages
        # fasttext.util.download_model('de', if_exists='ignore')

        # 30 or 300
        np_embedding = f"ft_deen_{embedding_dim}.np"

        if not os.path.isfile(np_embedding):
            # import fasttext lazily (only when no cached matrix exists)
            # because fasttext has an unmet dependency on GPU nodes
            import fasttext.util

            src_ft = fasttext.load_model(f'cc.de.{embedding_dim}.bin')
            trg_ft = fasttext.load_model(f'cc.en.{embedding_dim}.bin')

            # Create smaller embeddings, to test on reverse
            # fasttext.util.reduce_model(src_ft, 30)
            # src_ft.save_model('cc.en.30.bin')

            self.embedding_dim = src_ft.get_dimension()

            vectors = []

            for i, word in tqdm(enumerate(src_vocab.itos), desc="adding src vecs"):
                vectors.append(src_ft.get_word_vector(word))
            for i, word in tqdm(enumerate(trg_vocab.itos), desc="adding trg vecs"):
                vectors.append(trg_ft.get_word_vector(word))

            embedding_matrix = np.vstack(vectors)

            with open(np_embedding, "wb") as np_file:
                pickle.dump(embedding_matrix, np_file)

            print(f"Saved joint fasttext embedding matrix as np matrix at {np_embedding}")

        else:

            print("Loading saved embedding ...")

            with open(np_embedding, "rb") as np_file:
                embedding_matrix = pickle.load(np_file)

            print("Loaded saved embedding.")

        self.embedding_dim = embedding_matrix.shape[-1]

        self.lut = nn.Embedding(
            len(src_vocab)+len(trg_vocab),
            self.embedding_dim,
            padding_idx=trg_vocab.stoi[PAD_TOKEN]
        )

        self.lut.weight = nn.Parameter(data=torch.from_numpy(embedding_matrix).float())
        assert self.lut.weight is not None
        # assert False, self.lut.weight.shape

        # always freeze pretrained embeddings
        freeze_params(self)
Example 21
    def __init__(self,
                 rnn_type: str = "gru",
                 emb_size: int = 0,
                 hidden_size: int = 0,
                 encoder_output_size: int = 0,
                 attention: str = "bahdanau",
                 num_layers: int = 1,
                 vocab_size: int = 0,
                 dropout: float = 0.,
                 emb_dropout: float = 0.,
                 hidden_dropout: float = 0.,
                 init_hidden: str = "bridge",
                 input_feeding: bool = True,
                 freeze: bool = False,
                 attn_func: str = "softmax",
                 attn_alpha: float = 1.5,
                 gen_func: str = "softmax",
                 gen_alpha: float = 1.5,
                 output_bias: bool = False,
                 multi_source: bool = False,
                 head_names: list = None,
                 attn_merge: str = "gate",
                 gate_func: str = "softmax",
                 gate_alpha: float = 1.5,
                 **kwargs) -> None:
        """
        Create a recurrent decoder with attention.

        :param rnn_type: rnn type, valid options: "lstm", "gru"
        :param emb_size: target embedding size
        :param hidden_size: size of the RNN
        :param encoder_output_size:
        :param attention: type of attention, valid options: "bahdanau", "luong"
        :param num_layers: number of recurrent layers
        :param vocab_size: target vocabulary size
        :param hidden_dropout: applied to the input to the attentional layer.
        :param dropout: Is applied between RNN layers.
        :param emb_dropout: Is applied to the RNN input (word embeddings).
        :param init_hidden: If "bridge" (default), the decoder hidden states
            are initialized from a projection of the last encoder state,
            if "zeros" they are initialized with zeros,
            if "last" they are identical to the last encoder state
            (only if they have the same size)
        :param input_feeding: Use Luong's input feeding.
        :param freeze: Freeze the parameters of the decoder during training.
        :param attn_func: attention function ("softmax" by default)
        :param attn_alpha: alpha value for the attention function
        :param gen_func: function applied over the output vocabulary
        :param gen_alpha: alpha value for the output function
        :param output_bias: whether to use a bias in the output layer
        :param multi_source: attend over multiple encoders
        :param head_names: names of the encoder heads (multi-source only)
        :param attn_merge: how to merge the attention heads (multi-source only)
        :param gate_func: gating function used by the "gate" merge
        :param gate_alpha: alpha value for the gating function
        :param kwargs:
        """

        super(RecurrentDecoder, self).__init__(hidden_size,
                                               vocab_size,
                                               emb_dropout,
                                               gen_func=gen_func,
                                               gen_alpha=gen_alpha,
                                               output_bias=output_bias)

        self.multi_source = multi_source

        self.hidden_dropout = nn.Dropout(p=hidden_dropout)
        self.emb_size = emb_size

        rnn = nn.GRU if rnn_type == "gru" else nn.LSTM

        self.input_feeding = input_feeding
        input_size = emb_size + hidden_size if input_feeding else emb_size

        # the decoder RNN
        self.rnn = rnn(input_size,
                       hidden_size,
                       num_layers,
                       batch_first=True,
                       dropout=dropout if num_layers > 1 else 0.)

        # combine output with context vector before output layer (Luong-style)
        self.att_vector_layer = nn.Linear(hidden_size + encoder_output_size,
                                          hidden_size,
                                          bias=True)

        assert attention in ["bahdanau", "luong"], \
            "Unknown attention mechanism: %s. Use 'bahdanau' or 'luong'." \
            % attention

        if multi_source:
            attn_mechanism = partial(MultiAttention,
                                     attn_type=attention,
                                     head_names=head_names,
                                     attn_merge=attn_merge,
                                     gate_func=gate_func,
                                     gate_alpha=gate_alpha)

        elif attention == "luong":
            attn_mechanism = LuongAttention
        else:
            attn_mechanism = BahdanauAttention

        if attention == "bahdanau":
            attn_mechanism = partial(attn_mechanism, query_size=hidden_size)

        self.attention = attn_mechanism(hidden_size=hidden_size,
                                        key_size=encoder_output_size,
                                        attn_func=attn_func,
                                        attn_alpha=attn_alpha)

        # init_hidden: "bridge", "zero", "last", or a dictionary describing
        # an arbitrary-layered MLP
        assert isinstance(init_hidden, dict) or isinstance(init_hidden, str), \
            '''
            Specify either a shortcut name ("bridge", "zero", "last") or a
            dictionary containing a configuration for the bridge layer.
            '''
        if init_hidden == "zero":
            self.bridge_layer = None  # easy-peasy
        else:
            if init_hidden == "last":
                # not actually clear to me if this is necessary
                assert encoder_output_size in {hidden_size, 2 * hidden_size}, \
                    "Mismatched hidden sizes (enc: {}, dec: {})".format(
                        encoder_output_size, hidden_size
                    )
            if isinstance(init_hidden, str):
                bridge = init_hidden == "bridge"
                # 'bridge' and 'last' are shortcuts to specific special cases
                init_hidden = {
                    "num_layers": 1 if bridge else 0,
                    "activation": "tanh" if bridge else "none",
                    "merge": "cat"
                }

            if init_hidden["merge"] == "cat":
                n_heads = len(head_names) if head_names is not None else 1
                bridge_in_size = encoder_output_size * n_heads  # for cat
            else:
                bridge_in_size = encoder_output_size

            self.bridge_layer = Bridge(bridge_in_size,
                                       hidden_size,
                                       lstm=isinstance(self.rnn, nn.LSTM),
                                       decoder_layers=self.num_layers,
                                       **init_hidden)

        if freeze:
            freeze_params(self)
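
The `init_hidden` argument of this variant accepts either a shortcut string or a dictionary for the `Bridge` module. A hedged sketch of both forms; the import path and the sizes are placeholders:

# Hypothetical sketch of the two forms of init_hidden accepted above.
from decoders import RecurrentDecoder   # placeholder import path

# Shortcut form: "bridge" expands internally to
# {"num_layers": 1, "activation": "tanh", "merge": "cat"}.
dec_bridge = RecurrentDecoder(emb_size=256, hidden_size=512,
                              encoder_output_size=1024, vocab_size=32000,
                              init_hidden="bridge")

# Dictionary form: configure the bridge MLP explicitly.
dec_mlp = RecurrentDecoder(emb_size=256, hidden_size=512,
                           encoder_output_size=1024, vocab_size=32000,
                           init_hidden={"num_layers": 2,
                                        "activation": "tanh",
                                        "merge": "cat"})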
Example 22
    def __init__(self,
                 rnn_type: str = "gru",
                 emb_size: int = 0,
                 hidden_size: int = 0,
                 encoder_output_size: int = 0,
                 attention: str = "bahdanau",
                 num_layers: int = 1,
                 dropout: float = 0.,
                 hidden_dropout: float = 0.,
                 init_hidden: str = "bridge",
                 input_feeding: bool = True,
                 freeze: bool = False,
                 attn_func: str = "softmax",
                 attn_alpha: float = 1.5,
                 **kwargs) -> None:
        """
        Create a recurrent decoder with attention.

        :param rnn_type: rnn type, valid options: "lstm", "gru"
        :param emb_size:
        :param hidden_size:
        :param attention: type of attention, valid options: "bahdanau", "luong"
        :param num_layers:
        :param hidden_dropout: applied to the input to the attentional layer.
        :param dropout: applied between RNN layers.
        :param emb_dropout: applied to the RNN input (word embeddings).
        :param init_hidden: If "bridge" (default), the decoder hidden states
            are initialized from a projection of the last encoder state,
            if "zeros" they are initialized with zeros,
            if "last" they are identical to the last encoder state
            (only if they have the same size)
        :param input_feeding: Use Luong's input feeding.
        :param freeze: Freeze the parameters of the decoder during training.
        :param attn_func: attention function ("softmax" by default)
        :param attn_alpha: alpha value for the attention function
        :param kwargs: passed to generic Decoder constructor
        """

        super(RecurrentDecoder, self).__init__(hidden_size, **kwargs)

        self.hidden_dropout = nn.Dropout(p=hidden_dropout)
        self.emb_size = emb_size

        rnn = nn.GRU if rnn_type == "gru" else nn.LSTM

        self.input_feeding = input_feeding
        input_size = emb_size + hidden_size if input_feeding else emb_size

        # the decoder RNN
        self.rnn = rnn(input_size,
                       hidden_size,
                       num_layers,
                       batch_first=True,
                       dropout=dropout if num_layers > 1 else 0.)

        # combine output with context vector before output layer (Luong-style)
        self.att_vector_layer = nn.Linear(hidden_size + encoder_output_size,
                                          hidden_size,
                                          bias=True)

        assert attention in ["bahdanau", "luong"], \
            "Unknown attention mechanism: %s. Use 'bahdanau' or 'luong'." \
            % attention
        if attention == "bahdanau":
            attn_mechanism = partial(BahdanauAttention, query_size=hidden_size)
        else:
            attn_mechanism = LuongAttention
        self.attention = attn_mechanism(hidden_size=hidden_size,
                                        key_size=encoder_output_size,
                                        attn_func=attn_func,
                                        attn_alpha=attn_alpha)

        # to initialize from the final encoder state of last layer
        assert init_hidden in ["bridge", "zero", "last"]
        self.init_hidden_option = init_hidden
        if init_hidden == "bridge":
            self.bridge_layer = nn.Sequential(
                nn.Linear(encoder_output_size, hidden_size, bias=True),
                nn.Tanh())
        else:
            self.bridge_layer = None
        if init_hidden == "last":
            out_size = encoder_output_size
            assert out_size in (hidden_size, 2 * hidden_size), \
                "Mismatched hidden sizes (encoder: {}, decoder: {})".format(
                    encoder_output_size, hidden_size
                )

        if freeze:
            freeze_params(self)