Example #1
    def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, approx):
        super(BERTLM, self).__init__()
        self.vocab = vocab
        self.embed_dim = embed_dim
        self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
        self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
        self.seg_embed = Embedding(2, embed_dim, None)

        self.out_proj_bias = nn.Parameter(torch.Tensor(self.vocab.size))

        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
        self.emb_layer_norm = LayerNorm(embed_dim)
        self.one_more = nn.Linear(embed_dim, embed_dim)
        self.one_more_layer_norm = LayerNorm(embed_dim)
        self.one_more_nxt_snt = nn.Linear(embed_dim, embed_dim) 
        self.nxt_snt_pred = nn.Linear(embed_dim, 1)
        self.dropout = dropout
        self.device = local_rank

        if approx == "none":
            self.approx = None
        elif approx == "adaptive":
            self.approx = nn.AdaptiveLogSoftmaxWithLoss(self.embed_dim, self.vocab.size, [10000, 20000, 200000])
        else:
            raise NotImplementedError("%s has not been implemented" % approx)
        self.reset_parameters()
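
The `adaptive` branch above uses PyTorch's built-in nn.AdaptiveLogSoftmaxWithLoss. A minimal standalone sketch of that module, with illustrative sizes that are not taken from the project:

import torch
import torch.nn as nn

# The cutoffs split the vocabulary into a small frequent "head" and
# progressively larger, cheaper "tail" clusters; they must be strictly
# increasing and smaller than n_classes.
adaptive = nn.AdaptiveLogSoftmaxWithLoss(in_features=512,
                                         n_classes=250000,
                                         cutoffs=[10000, 20000, 200000])

hidden = torch.randn(8, 512)             # 8 token positions, embed_dim=512
target = torch.randint(0, 250000, (8,))  # gold token ids
out = adaptive(hidden, target)
print(out.output.shape)                  # per-position log-prob of each target
print(out.loss)                          # mean negative log-likelihood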
Example #2
    def __init__(self, vocab_src, vocab_tgt, embed_dim, ff_embed_dim,
                 num_heads, dropout, num_layers):
        super(Ranker, self).__init__()
        self.transformer_src = nn.ModuleList()
        self.transformer_tgt = nn.ModuleList()
        for i in range(num_layers):
            self.transformer_src.append(
                TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
            self.transformer_tgt.append(
                TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
        self.embed_dim = embed_dim
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = SinusoidalPositionalEmbedding(embed_dim)
        self.embed_src = Embedding(vocab_src.size, embed_dim,
                                   vocab_src.padding_idx)
        self.embed_tgt = Embedding(vocab_tgt.size, embed_dim,
                                   vocab_tgt.padding_idx)
        self.absorber_src = Parameter(torch.Tensor(embed_dim))
        self.absorber_tgt = Parameter(torch.Tensor(embed_dim))
        self.attention_src = MultiheadAttention(embed_dim,
                                                1,
                                                dropout,
                                                weights_dropout=False)
        self.attention_tgt = MultiheadAttention(embed_dim,
                                                1,
                                                dropout,
                                                weights_dropout=False)
        self.scorer = nn.Linear(embed_dim, embed_dim)
        self.dropout = dropout
        self.vocab_src = vocab_src
        self.vocab_tgt = vocab_tgt
        self.reset_parameters()
Example #3
File: bert.py Project: liyc7711/BERT
    def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads,
                 dropout, layers):
        super(BERTLM, self).__init__()
        self.vocab = vocab
        self.embed_dim = embed_dim
        self.tok_embed = Embedding(self.vocab.size, embed_dim,
                                   self.vocab.padding_idx)
        self.pos_embed = LearnedPositionalEmbedding(embed_dim,
                                                    device=local_rank)
        self.seg_embed = Embedding(2, embed_dim, None)

        self.out_proj_bias = nn.Parameter(torch.Tensor(self.vocab.size))

        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(
                TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
        self.emb_layer_norm = nn.LayerNorm(embed_dim)
        self.one_more = nn.Linear(embed_dim, embed_dim)
        self.one_more_layer_norm = nn.LayerNorm(embed_dim)
        self.one_more_nxt_snt = nn.Linear(embed_dim, embed_dim)
        self.nxt_snt_pred = nn.Linear(embed_dim, 1)
        self.dropout = dropout
        self.device = local_rank
        self.reset_parameters()
Example #4
    def __init__(self, vocab_src, vocab_tgt, embed_dim, ff_embed_dim,
                 num_heads, dropout, num_layers):
        super(Masker, self).__init__()
        self.transformer_src = nn.ModuleList()
        self.transformer_tgt = nn.ModuleList()
        for i in range(num_layers):
            self.transformer_src.append(
                TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
            self.transformer_tgt.append(
                TransformerLayer(embed_dim,
                                 ff_embed_dim,
                                 num_heads,
                                 dropout,
                                 with_external=True))
        self.embed_dim = embed_dim
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = SinusoidalPositionalEmbedding(embed_dim)
        self.embed_src = Embedding(vocab_src.size, embed_dim,
                                   vocab_src.padding_idx)
        self.embed_tgt = Embedding(vocab_tgt.size, embed_dim,
                                   vocab_tgt.padding_idx)
        self.masker = nn.Linear(embed_dim, 1)
        self.dropout = dropout
        self.vocab_src = vocab_src
        self.vocab_tgt = vocab_tgt
        self.reset_parameters()
Example #5
def create_model(seq_len, vocab_size, pad_id, N, d_model, d_ff, h, dropout):
    inp = Input((seq_len, ))
    embedding = Embedding(vocab_size, d_model, pad_id)(inp)
    encoding = PositionalEncoding(d_model)(inp)
    net = Add()([embedding, encoding])
    net = Dropout(dropout)(net)
    mask = Lambda(lambda t: create_padding_mask(t, pad_id),
                  name="input_mask")(inp)
    net = Encoder(N=N, d_model=d_model, d_ff=d_ff, h=h,
                  dropout=dropout)([net, mask])
    net = Flatten()(net)
    net = Dense(2, activation="softmax")(net)

    model = Model(inp, net)

    # NOTE: keras optimizers cannot be saved with optimizer state
    # need to use an optimizer from `tf.train`
    # NOTE: this seems to be a 1.0 thing, in 2.0 all tf.train optimizers are
    # dropped and the keras versions are the only implementations
    # NOTE: this is not recommended for training, the paper authors describe
    # a variable learning rate schedule, that still needs to be implemented.
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001,
                                       beta1=0.9,
                                       beta2=0.98,
                                       epsilon=1e-9)

    model.compile(optimizer=optimizer,
                  loss="categorical_crossentropy",
                  metrics=["acc"])

    return model
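
A minimal usage sketch for the factory above, assuming TF 1.x and the custom layers it references; the hyperparameters and dummy batch below are illustrative only:

import numpy as np

model = create_model(seq_len=128, vocab_size=30000, pad_id=0,
                     N=2, d_model=128, d_ff=512, h=8, dropout=0.1)

# Fake batch: integer token ids plus one-hot labels for the 2-way softmax head.
x = np.random.randint(1, 30000, size=(32, 128))
y = np.eye(2)[np.random.randint(0, 2, size=32)]
model.fit(x, y, batch_size=8, epochs=1)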
Example #6
    def __init__(self, vocabs, embed_dim, ff_embed_dim, num_heads, dropout,
                 mem_dropout, enc_layers, dec_layers, mem_enc_layers,
                 label_smoothing, use_mem_score):
        super(MemGenerator, self).__init__()
        self.vocabs = vocabs

        self.encoder = MonoEncoder(vocabs['src'], enc_layers, embed_dim,
                                   ff_embed_dim, num_heads, dropout)

        self.tgt_embed = Embedding(vocabs['tgt'].size, embed_dim,
                                   vocabs['tgt'].padding_idx)
        self.tgt_pos_embed = SinusoidalPositionalEmbedding(embed_dim)
        self.decoder = Transformer(dec_layers,
                                   embed_dim,
                                   ff_embed_dim,
                                   num_heads,
                                   dropout,
                                   with_external=True)

        self.mem_encoder = MonoEncoder(vocabs['tgt'], mem_enc_layers,
                                       embed_dim, ff_embed_dim, num_heads,
                                       mem_dropout)

        self.embed_scale = math.sqrt(embed_dim)
        self.self_attn_mask = SelfAttentionMask()
        self.output = CopyTokenDecoder(vocabs, self.tgt_embed, label_smoothing,
                                       embed_dim, ff_embed_dim, dropout)
        self.dropout = dropout
        if use_mem_score:
            self.mem_bias_scale = nn.Parameter(torch.ones(1))
            self.mem_bias_base = nn.Parameter(torch.zeros(1))
        self.use_mem_score = use_mem_score
Example #7
    def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, smoothing_factor, approx):
        super(BIGLM, self).__init__()
        self.vocab = vocab
        self.embed_dim = embed_dim

        self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
        self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
        
        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout, with_external=True))
        self.emb_layer_norm = LayerNorm(embed_dim)
        self.one_more = nn.Linear(embed_dim, embed_dim)
        self.one_more_layer_norm = LayerNorm(embed_dim)
        self.out_proj = nn.Linear(embed_dim, self.vocab.size)
        
        self.attn_mask = SelfAttentionMask(device=local_rank)
        self.smoothing = LabelSmoothing(local_rank, self.vocab.size, self.vocab.padding_idx, smoothing_factor)
       
        self.dropout = dropout
        self.device = local_rank

        if approx == "none":
            self.approx = None
        elif approx == "adaptive":
            self.approx = nn.AdaptiveLogSoftmaxWithLoss(self.embed_dim, self.vocab.size, [10000, 20000, 200000])
        else:
            raise NotImplementedError("%s has not been implemented" % approx)
        self.reset_parameters()
Example #8
def AMREmbedding(vocab,
                 embedding_dim,
                 pretrained_file=None,
                 amr=False,
                 dump_file=None):
    if pretrained_file is None:
        return Embedding(vocab.size, embedding_dim, vocab.padding_idx)

    tokens_to_keep = set()
    for idx in range(vocab.size):
        token = vocab.idx2token(idx)
        if amr:
            token = re.sub(r'-\d\d$', '', token)
        tokens_to_keep.add(token)

    embeddings = {}

    if dump_file is not None:
        fo = open(dump_file, 'w', encoding='utf8')

    with open(pretrained_file, encoding='utf-8') as embeddings_file:
        for line in embeddings_file.readlines():
            fields = line.rstrip().split(' ')
            if len(fields) - 1 != embedding_dim:
                continue

            token = fields[0]
            if token in tokens_to_keep:
                if dump_file is not None:
                    fo.write(line)
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
        print('glove_initiate')
    if dump_file is not None:
        fo.close()

    all_embeddings = np.asarray(list(embeddings.values()))
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_std = float(np.std(all_embeddings))
    # Initialize the matrix randomly first, then fill in the pretrained vectors below.
    embedding_matrix = torch.FloatTensor(vocab.size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)

    for i in range(vocab.size):
        token = vocab.idx2token(i)
        # If this word has no pretrained vector, keep its random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
        else:
            if amr:
                normalized_token = re.sub(r'-\d\d$', '', token)
                if normalized_token in embeddings:
                    embedding_matrix[i] = torch.FloatTensor(
                        embeddings[normalized_token])
    embedding_matrix[vocab.padding_idx].fill_(0.)

    return nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
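
A hedged call sketch for the helper above; `vocab` is assumed to be the project's vocabulary object (exposing .size, .padding_idx and .idx2token), and the GloVe file names are placeholders:

word_embed = AMREmbedding(vocab,
                          embedding_dim=300,
                          pretrained_file='glove.840B.300d.txt',
                          amr=True,
                          dump_file='glove_filtered.txt')
# With pretrained_file=None the helper simply returns a randomly
# initialized Embedding(vocab.size, 300, vocab.padding_idx).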
Example #9
def GraphEmbedding(vocab, embedding_dim, pretrained_file=None, amr=False, dump_file=None):
    if pretrained_file is None:
        return Embedding(vocab.size, embedding_dim, vocab.padding_idx)

    tokens_to_keep = set()
    for idx in range(vocab.size):
        token = vocab.idx2token(idx)
        # TODO: Is there a better way to do this? Currently we have a very specific 'amr' param.
        if amr:
            token = re.sub(r'-\d\d$', '', token)
        tokens_to_keep.add(token)

    embeddings = {}
 
    if dump_file is not None:
        fo = open(dump_file, 'w', encoding='utf8')

    with open(pretrained_file, encoding='utf8') as embeddings_file:
        for line in embeddings_file.readlines():    
            fields = line.rstrip().split(' ')
            if len(fields) - 1 != embedding_dim:
                continue
            token = fields[0]
            if token in tokens_to_keep:
                if dump_file is not None:
                    fo.write(line)
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if dump_file is not None:
        fo.close()

    all_embeddings = np.asarray(list(embeddings.values()))
    print('pretrained', all_embeddings.shape)
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_std = float(np.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    embedding_matrix = torch.FloatTensor(vocab.size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)

    for i in range(vocab.size):
        token = vocab.idx2token(i)

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
        else:
            if amr:
                normalized_token = re.sub(r'-\d\d$', '', token)
                if normalized_token in embeddings:
                    embedding_matrix[i] = torch.FloatTensor(embeddings[normalized_token])
    embedding_matrix[vocab.padding_idx].fill_(0.)

    return nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
Example #10
    def __init__(self,
                 vocabs,
                 char_dim,
                 word_dim,
                 pos_dim,
                 ner_dim,
                 embed_dim,
                 filters,
                 char2word_dim,
                 dropout,
                 pretrained_word_embed=None):
        super(WordEncoder, self).__init__()
        self.char_embed = Embedding(
            vocabs['word_char'].size,
            char_dim,
            padding_idx=vocabs['word_char'].padding_idx)
        self.char2word = CNNEncoder(filters, char_dim, char2word_dim)
        self.lem_embed = Embedding(vocabs['lem'].size,
                                   word_dim,
                                   padding_idx=vocabs['lem'].padding_idx)

        if pos_dim > 0:
            self.pos_embed = Embedding(vocabs['pos'].size,
                                       pos_dim,
                                       padding_idx=vocabs['pos'].padding_idx)
        else:
            self.pos_embed = None
        if ner_dim > 0:
            self.ner_embed = Embedding(vocabs['ner'].size,
                                       ner_dim,
                                       padding_idx=vocabs['ner'].padding_idx)
        else:
            self.ner_embed = None

        tot_dim = word_dim + pos_dim + ner_dim + char2word_dim

        self.pretrained_word_embed = pretrained_word_embed
        if self.pretrained_word_embed is not None:
            tot_dim += self.pretrained_word_embed.embedding_dim

        self.out_proj = nn.Linear(tot_dim, embed_dim)
        self.dropout = dropout
        self.reset_parameters()
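
For reference, the in_features of out_proj above is just the sum of the concatenated feature sizes; a small worked example with illustrative dimensions:

word_dim, pos_dim, ner_dim, char2word_dim = 300, 32, 16, 100
tot_dim = word_dim + pos_dim + ner_dim + char2word_dim   # 448
# With a 300-d pretrained_word_embed, tot_dim becomes 748,
# and out_proj maps 748 -> embed_dim.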
Example #11
    def __init__(self, vocab, layers, embed_dim, ff_embed_dim, num_heads,
                 dropout):
        super(MonoEncoder, self).__init__()
        self.vocab = vocab
        self.src_embed = Embedding(vocab.size, embed_dim, vocab.padding_idx)
        self.src_pos_embed = SinusoidalPositionalEmbedding(embed_dim)
        self.embed_scale = math.sqrt(embed_dim)
        self.transformer = Transformer(layers, embed_dim, ff_embed_dim,
                                       num_heads, dropout)
        self.dropout = dropout
Example #12
    def __init__(self, vocabs, char_dim, concept_dim, embed_dim, filters,
                 char2concept_dim, dropout):
        super(ConceptEncoder, self).__init__()
        self.char_embed = Embedding(
            vocabs['concept_char'].size,
            char_dim,
            padding_idx=vocabs['concept_char'].padding_idx)
        self.concept_embed = Embedding(
            vocabs['concept'].size,
            concept_dim,
            padding_idx=vocabs['concept'].padding_idx)
        self.char2concept = CNNEncoder(filters, char_dim, char2concept_dim)
        self.vocabs = vocabs
        tot_dim = char2concept_dim + concept_dim
        self.out_proj = nn.Linear(tot_dim, embed_dim)
        self.char_dim = char_dim
        self.concept_dim = concept_dim
        self.dropout = dropout

        self.reset_parameters()
Example #13
File: biglm.py Project: xinyu12138/SongNet
    def __init__(self,
                 local_rank,
                 vocab,
                 embed_dim,
                 ff_embed_dim,
                 num_heads,
                 dropout,
                 layers,
                 smoothing_factor,
                 approx=None):
        super(BIGLM, self).__init__()
        self.vocab = vocab
        self.embed_dim = embed_dim

        self.tok_embed = Embedding(self.vocab.size, embed_dim,
                                   self.vocab.padding_idx)
        self.pos_embed = LearnedPositionalEmbedding(embed_dim,
                                                    device=local_rank)

        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(
                TransformerLayer(embed_dim,
                                 ff_embed_dim,
                                 num_heads,
                                 dropout,
                                 with_external=True))
        self.emb_layer_norm = LayerNorm(embed_dim)
        self.one_more = nn.Linear(embed_dim, embed_dim)
        self.one_more_layer_norm = LayerNorm(embed_dim)
        self.out_proj = nn.Linear(embed_dim, self.vocab.size)

        self.attn_mask = SelfAttentionMask(device=local_rank)
        self.smoothing = LabelSmoothing(local_rank, self.vocab.size,
                                        self.vocab.padding_idx,
                                        smoothing_factor)

        self.dropout = dropout
        self.device = local_rank

        self.approx = approx
        self.reset_parameters()
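
An illustrative instantiation of the constructor above; `vocab` stands for the project's Vocab object (with .size and .padding_idx) and every hyperparameter is a placeholder, not a value taken from the repository:

lm = BIGLM(local_rank=0,
           vocab=vocab,
           embed_dim=768,
           ff_embed_dim=3072,
           num_heads=12,
           dropout=0.1,
           layers=12,
           smoothing_factor=0.1,
           approx=None)
lm = lm.cuda(0)  # the device follows local_rank, as in the snippets above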
Example #14
    def __init__(self, vocabs, embed_dim, ff_embed_dim, num_heads, dropout,
                 enc_layers, dec_layers, label_smoothing):
        super(Generator, self).__init__()
        self.vocabs = vocabs

        self.encoder = MonoEncoder(vocabs['src'], enc_layers, embed_dim,
                                   ff_embed_dim, num_heads, dropout)

        self.tgt_embed = Embedding(vocabs['tgt'].size, embed_dim,
                                   vocabs['tgt'].padding_idx)
        self.tgt_pos_embed = SinusoidalPositionalEmbedding(embed_dim)
        self.decoder = Transformer(dec_layers,
                                   embed_dim,
                                   ff_embed_dim,
                                   num_heads,
                                   dropout,
                                   with_external=True)

        self.embed_scale = math.sqrt(embed_dim)
        self.self_attn_mask = SelfAttentionMask()
        self.output = TokenDecoder(vocabs, self.tgt_embed, label_smoothing)
        self.dropout = dropout
Example #15
    def __init__(self, vocabs, retriever, share_encoder, embed_dim,
                 ff_embed_dim, num_heads, dropout, mem_dropout, enc_layers,
                 dec_layers, mem_enc_layers, label_smoothing):
        super(RetrieverGenerator, self).__init__()
        self.vocabs = vocabs

        ####Retriever####
        self.share_encoder = share_encoder
        self.retriever = retriever
        self.encoder = MonoEncoder(vocabs['src'], enc_layers, embed_dim,
                                   ff_embed_dim, num_heads, dropout)
        ####Retriever####

        self.tgt_embed = Embedding(vocabs['tgt'].size, embed_dim,
                                   vocabs['tgt'].padding_idx)
        self.tgt_pos_embed = SinusoidalPositionalEmbedding(embed_dim)
        self.decoder = Transformer(dec_layers,
                                   embed_dim,
                                   ff_embed_dim,
                                   num_heads,
                                   dropout,
                                   with_external=True)

        if share_encoder:
            self.mem_encoder = self.retriever.mem_feat_or_feat_maker.encoder
        else:
            self.mem_encoder = MonoEncoder(vocabs['tgt'], mem_enc_layers,
                                           embed_dim, ff_embed_dim, num_heads,
                                           mem_dropout)

        self.embed_scale = math.sqrt(embed_dim)
        self.self_attn_mask = SelfAttentionMask()
        self.output = CopyTokenDecoder(vocabs, self.tgt_embed, label_smoothing,
                                       embed_dim, ff_embed_dim, dropout)
        self.mem_bias_scale = nn.Parameter(torch.ones(retriever.num_heads))
        self.mem_bias_base = nn.Parameter(torch.zeros(retriever.num_heads))
        self.dropout = dropout