Example No. 1
    def __init__(self, modules, consts, options):
        super(Model, self).__init__()

        self.has_learnable_w2v = options["has_learnable_w2v"]
        self.is_predicting = options["is_predicting"]
        self.is_bidirectional = options["is_bidirectional"]
        self.beam_decoding = options["beam_decoding"]
        self.cell = options["cell"]
        self.device = options["device"]
        self.copy = options["copy"]
        self.coverage = options["coverage"]
        self.avg_nll = options["avg_nll"]

        self.dim_x = consts["dim_x"]
        self.dim_y = consts["dim_y"]
        self.len_x = consts["len_x"]
        self.len_y = consts["len_y"]
        self.hidden_size = consts["hidden_size"]
        self.dict_size = consts["dict_size"]
        self.pad_token_idx = consts["pad_token_idx"]
        self.ctx_size = self.hidden_size * 2 if self.is_bidirectional else self.hidden_size
        self.num_layers = consts["num_layers"]
        self.d_ff = consts["d_ff"]
        self.num_heads = consts["num_heads"]
        self.dropout = consts["dropout"]
        self.smoothing_factor = consts["label_smoothing"]

        self.tok_embed = nn.Embedding(self.dict_size, self.dim_x,
                                      self.pad_token_idx)
        self.pos_embed = LearnedPositionalEmbedding(self.dim_x,
                                                    device=self.device)

        self.enc_layers = nn.ModuleList()
        for i in range(self.num_layers):
            self.enc_layers.append(
                TransformerLayer(self.dim_x, self.d_ff, self.num_heads,
                                 self.dropout))

        self.dec_layers = nn.ModuleList()
        for i in range(self.num_layers):
            self.dec_layers.append(
                TransformerLayer(self.dim_x,
                                 self.d_ff,
                                 self.num_heads,
                                 self.dropout,
                                 with_external=True))

        self.attn_mask = SelfAttentionMask(device=self.device)

        self.emb_layer_norm = LayerNorm(self.dim_x)

        self.word_prob = WordProbLayer(self.hidden_size, self.dict_size,
                                       self.device, self.copy, self.coverage,
                                       self.dropout)

        self.smoothing = LabelSmoothing(self.device, self.dict_size,
                                        self.pad_token_idx,
                                        self.smoothing_factor)

        self.init_weights()
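
A minimal instantiation sketch for this Model: the dictionary keys below are exactly the ones the constructor reads, but every value, the torch.device choice, and passing modules=None are illustrative assumptions, not settings taken from the original project.

import torch

# Hypothetical configuration; key names mirror the constructor above,
# the concrete values are assumptions for illustration only.
consts = {
    "dim_x": 512, "dim_y": 512, "len_x": 400, "len_y": 100,
    "hidden_size": 512, "dict_size": 50000, "pad_token_idx": 0,
    "num_layers": 6, "d_ff": 2048, "num_heads": 8,
    "dropout": 0.1, "label_smoothing": 0.1,
}
options = {
    "has_learnable_w2v": True, "is_predicting": False,
    "is_bidirectional": True, "beam_decoding": True,
    "cell": "lstm", "device": torch.device("cpu"), "copy": True,
    "coverage": True, "avg_nll": True,
}
# modules is not used in the constructor body shown, so None is passed here.
model = Model(modules=None, consts=consts, options=options)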
Example No. 2
    def __init__(self, vocab_src, vocab_tgt, embed_dim, ff_embed_dim,
                 num_heads, dropout, num_layers):
        super(Ranker, self).__init__()
        self.transformer_src = nn.ModuleList()
        self.transformer_tgt = nn.ModuleList()
        for i in range(num_layers):
            self.transformer_src.append(
                TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
            self.transformer_tgt.append(
                TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
        self.embed_dim = embed_dim
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = SinusoidalPositionalEmbedding(embed_dim)
        self.embed_src = Embedding(vocab_src.size, embed_dim,
                                   vocab_src.padding_idx)
        self.embed_tgt = Embedding(vocab_tgt.size, embed_dim,
                                   vocab_tgt.padding_idx)
        self.absorber_src = Parameter(torch.Tensor(embed_dim))
        self.absorber_tgt = Parameter(torch.Tensor(embed_dim))
        self.attention_src = MultiheadAttention(embed_dim,
                                                1,
                                                dropout,
                                                weights_dropout=False)
        self.attention_tgt = MultiheadAttention(embed_dim,
                                                1,
                                                dropout,
                                                weights_dropout=False)
        self.scorer = nn.Linear(embed_dim, embed_dim)
        self.dropout = dropout
        self.vocab_src = vocab_src
        self.vocab_tgt = vocab_tgt
        self.reset_parameters()
Example No. 3
    def __init__(self, vocab_src, vocab_tgt, embed_dim, ff_embed_dim,
                 num_heads, dropout, num_layers):
        super(Masker, self).__init__()
        self.transformer_src = nn.ModuleList()
        self.transformer_tgt = nn.ModuleList()
        for i in range(num_layers):
            self.transformer_src.append(
                TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
            self.transformer_tgt.append(
                TransformerLayer(embed_dim,
                                 ff_embed_dim,
                                 num_heads,
                                 dropout,
                                 with_external=True))
        self.embed_dim = embed_dim
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = SinusoidalPositionalEmbedding(embed_dim)
        self.embed_src = Embedding(vocab_src.size, embed_dim,
                                   vocab_src.padding_idx)
        self.embed_tgt = Embedding(vocab_tgt.size, embed_dim,
                                   vocab_tgt.padding_idx)
        self.masker = nn.Linear(embed_dim, 1)
        self.dropout = dropout
        self.vocab_src = vocab_src
        self.vocab_tgt = vocab_tgt
        self.reset_parameters()
Example No. 4
    def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, approx):
        super(BERTLM, self).__init__()
        self.vocab = vocab
        self.embed_dim = embed_dim
        self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
        self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
        self.seg_embed = Embedding(2, embed_dim, None)

        self.out_proj_bias = nn.Parameter(torch.Tensor(self.vocab.size))

        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
        self.emb_layer_norm = LayerNorm(embed_dim)
        self.one_more = nn.Linear(embed_dim, embed_dim)
        self.one_more_layer_norm = LayerNorm(embed_dim)
        self.one_more_nxt_snt = nn.Linear(embed_dim, embed_dim) 
        self.nxt_snt_pred = nn.Linear(embed_dim, 1)
        self.dropout = dropout
        self.device = local_rank

        if approx == "none":
            self.approx = None
        elif approx == "adaptive":
            self.approx = nn.AdaptiveLogSoftmaxWithLoss(self.embed_dim, self.vocab.size, [10000, 20000, 200000])
        else:
            raise NotImplementedError("%s has not been implemented"%approx)
        self.reset_parameters()
Example No. 5
    def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads,
                 dropout, layers):
        super(BERTLM, self).__init__()
        self.vocab = vocab
        self.embed_dim = embed_dim
        self.tok_embed = Embedding(self.vocab.size, embed_dim,
                                   self.vocab.padding_idx)
        self.pos_embed = LearnedPositionalEmbedding(embed_dim,
                                                    device=local_rank)
        self.seg_embed = Embedding(2, embed_dim, None)

        self.out_proj_bias = nn.Parameter(torch.Tensor(self.vocab.size))

        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(
                TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
        self.emb_layer_norm = nn.LayerNorm(embed_dim)
        self.one_more = nn.Linear(embed_dim, embed_dim)
        self.one_more_layer_norm = nn.LayerNorm(embed_dim)
        self.one_more_nxt_snt = nn.Linear(embed_dim, embed_dim)
        self.nxt_snt_pred = nn.Linear(embed_dim, 1)
        self.dropout = dropout
        self.device = local_rank
        self.reset_parameters()
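
A hedged instantiation sketch for this BERTLM variant: as far as this constructor is concerned, vocab only needs to expose size and padding_idx, and the BERT-base-like dimensions below are assumptions rather than values from the snippet.

# Stand-in vocabulary with the two attributes the constructor reads (illustrative only).
class DummyVocab:
    size = 30000
    padding_idx = 0

# Dimensions are typical BERT-base numbers, chosen here only as an example.
model = BERTLM(local_rank=0, vocab=DummyVocab(), embed_dim=768,
               ff_embed_dim=3072, num_heads=12, dropout=0.1, layers=12)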
Example No. 6
    def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, smoothing_factor, approx):
        super(BIGLM, self).__init__()
        self.vocab = vocab
        self.embed_dim = embed_dim

        self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
        self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
        
        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout, with_external=True))
        self.emb_layer_norm = LayerNorm(embed_dim)
        self.one_more = nn.Linear(embed_dim, embed_dim)
        self.one_more_layer_norm = LayerNorm(embed_dim)
        self.out_proj = nn.Linear(embed_dim, self.vocab.size)
        
        self.attn_mask = SelfAttentionMask(device=local_rank)
        self.smoothing = LabelSmoothing(local_rank, self.vocab.size, self.vocab.padding_idx, smoothing_factor)
       
        self.dropout = dropout
        self.device = local_rank

        if approx == "none":
            self.approx = None
        elif approx == "adaptive":
            self.approx = nn.AdaptiveLogSoftmaxWithLoss(self.embed_dim, self.vocab.size, [10000, 20000, 200000])
        else:
            raise NotImplementedError("%s has not been implemented"%approx)
        self.reset_parameters()
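
In this version the approx string selects between the plain out_proj head and nn.AdaptiveLogSoftmaxWithLoss. A sketch of both calls follows; the vocabulary and all dimensions are assumed, and note that the hard-coded cutoffs require a vocabulary larger than 200000 in the adaptive case.

# Stand-in vocabulary; the adaptive branch needs vocab.size above the largest
# hard-coded cutoff (200000), otherwise AdaptiveLogSoftmaxWithLoss rejects it.
class BigVocab:
    size = 250000
    padding_idx = 0

# approx="none" keeps self.approx = None, so prediction relies on self.out_proj.
lm_plain = BIGLM(local_rank=0, vocab=BigVocab(), embed_dim=768,
                 ff_embed_dim=3072, num_heads=12, dropout=0.1, layers=12,
                 smoothing_factor=0.1, approx="none")

# approx="adaptive" builds the adaptive softmax with cutoffs [10000, 20000, 200000].
lm_adaptive = BIGLM(local_rank=0, vocab=BigVocab(), embed_dim=768,
                    ff_embed_dim=3072, num_heads=12, dropout=0.1, layers=12,
                    smoothing_factor=0.1, approx="adaptive")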
Example No. 7
    def __init__(self,
                 dim,
                 src_n_vocab,
                 n_encod_layer,
                 tgt_n_vocab,
                 n_decode_layer,
                 max_len=512):
        self.src_emb = EmbeddingWithPositionalEncoding(dim, src_n_vocab,
                                                       max_len)
        self.tgt_emb = EmbeddingWithLearnedPositionalEncoding(
            dim, tgt_n_vocab, max_len)

        enc_layer = TransformerLayer(dim, MultiHeadAttention(6, dim, 0.1),
                                     None, nn.Linear(dim, dim), 0.1)
        self.encoder = Encoder(enc_layer, n_encod_layer)

        dec_layer = TransformerLayer(dim, MultiHeadAttention(6, dim, 0.1),
                                     MultiHeadAttention(6, dim, 0.1),
                                     nn.Linear(dim, dim), 0.1)
        self.decoder = Decoder(dec_layer, n_decode_layer)

        self.encoder_decoder = EncoderDecoder(self.encoder, self.decoder,
                                              self.src_emb, self.tgt_emb)
Example No. 8
    def __init__(self,
                 local_rank,
                 vocab,
                 embed_dim,
                 ff_embed_dim,
                 num_heads,
                 dropout,
                 layers,
                 smoothing_factor,
                 approx=None):
        super(BIGLM, self).__init__()
        self.vocab = vocab
        self.embed_dim = embed_dim

        self.tok_embed = Embedding(self.vocab.size, embed_dim,
                                   self.vocab.padding_idx)
        self.pos_embed = LearnedPositionalEmbedding(embed_dim,
                                                    device=local_rank)

        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(
                TransformerLayer(embed_dim,
                                 ff_embed_dim,
                                 num_heads,
                                 dropout,
                                 with_external=True))
        self.emb_layer_norm = LayerNorm(embed_dim)
        self.one_more = nn.Linear(embed_dim, embed_dim)
        self.one_more_layer_norm = LayerNorm(embed_dim)
        self.out_proj = nn.Linear(embed_dim, self.vocab.size)

        self.attn_mask = SelfAttentionMask(device=local_rank)
        self.smoothing = LabelSmoothing(local_rank, self.vocab.size,
                                        self.vocab.padding_idx,
                                        smoothing_factor)

        self.dropout = dropout
        self.device = local_rank

        self.approx = approx
        self.reset_parameters()
Example No. 9
    def __init__(self,
                 local_rank,
                 input_dim=768,
                 ff_dim=2048,
                 num_heads=8,
                 dropout=0.2,
                 layers=6):
        super(PrefixPredict, self).__init__()
        self.input_dim = input_dim

        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(
                TransformerLayer(input_dim, ff_dim, num_heads, dropout))
        self.one_more = nn.Linear(input_dim, input_dim)
        self.one_more_layer_norm = LayerNorm(input_dim)

        self.attn_mask = SelfAttentionMask(device=local_rank)
        self.loss_fun = ContrativeLoss(device=local_rank)
        self.dropout = dropout
        self.device = local_rank
        self.reset_parameters()
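
Because every size argument has a default here, a minimal instantiation only needs the device rank; the value 0 below is an assumption for illustration.

# Falls back to the defaults above: 768/2048 dims, 8 heads, 6 layers, dropout 0.2.
predictor = PrefixPredict(local_rank=0)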