def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, approx):
    super(BERTLM, self).__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
    self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
    self.seg_embed = Embedding(2, embed_dim, None)
    self.out_proj_bias = nn.Parameter(torch.Tensor(self.vocab.size))
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
    self.emb_layer_norm = LayerNorm(embed_dim)
    self.one_more = nn.Linear(embed_dim, embed_dim)
    self.one_more_layer_norm = LayerNorm(embed_dim)
    self.one_more_nxt_snt = nn.Linear(embed_dim, embed_dim)
    self.nxt_snt_pred = nn.Linear(embed_dim, 1)
    self.dropout = dropout
    self.device = local_rank
    if approx == "none":
        self.approx = None
    elif approx == "adaptive":
        self.approx = nn.AdaptiveLogSoftmaxWithLoss(self.embed_dim, self.vocab.size, [10000, 20000, 200000])
    else:
        raise NotImplementedError("%s has not been implemented" % approx)
    self.reset_parameters()
def __init__(self, vocab_src, vocab_tgt, embed_dim, ff_embed_dim, num_heads, dropout, num_layers):
    super(Ranker, self).__init__()
    self.transformer_src = nn.ModuleList()
    self.transformer_tgt = nn.ModuleList()
    for i in range(num_layers):
        self.transformer_src.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
        self.transformer_tgt.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
    self.embed_dim = embed_dim
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = SinusoidalPositionalEmbedding(embed_dim)
    self.embed_src = Embedding(vocab_src.size, embed_dim, vocab_src.padding_idx)
    self.embed_tgt = Embedding(vocab_tgt.size, embed_dim, vocab_tgt.padding_idx)
    self.absorber_src = Parameter(torch.Tensor(embed_dim))
    self.absorber_tgt = Parameter(torch.Tensor(embed_dim))
    self.attention_src = MultiheadAttention(embed_dim, 1, dropout, weights_dropout=False)
    self.attention_tgt = MultiheadAttention(embed_dim, 1, dropout, weights_dropout=False)
    self.scorer = nn.Linear(embed_dim, embed_dim)
    self.dropout = dropout
    self.vocab_src = vocab_src
    self.vocab_tgt = vocab_tgt
    self.reset_parameters()
def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers):
    super(BERTLM, self).__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
    self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
    self.seg_embed = Embedding(2, embed_dim, None)
    self.out_proj_bias = nn.Parameter(torch.Tensor(self.vocab.size))
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
    self.emb_layer_norm = nn.LayerNorm(embed_dim)
    self.one_more = nn.Linear(embed_dim, embed_dim)
    self.one_more_layer_norm = nn.LayerNorm(embed_dim)
    self.one_more_nxt_snt = nn.Linear(embed_dim, embed_dim)
    self.nxt_snt_pred = nn.Linear(embed_dim, 1)
    self.dropout = dropout
    self.device = local_rank
    self.reset_parameters()
def __init__(self, vocab_src, vocab_tgt, embed_dim, ff_embed_dim, num_heads, dropout, num_layers):
    super(Masker, self).__init__()
    self.transformer_src = nn.ModuleList()
    self.transformer_tgt = nn.ModuleList()
    for i in range(num_layers):
        self.transformer_src.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
        self.transformer_tgt.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout, with_external=True))
    self.embed_dim = embed_dim
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = SinusoidalPositionalEmbedding(embed_dim)
    self.embed_src = Embedding(vocab_src.size, embed_dim, vocab_src.padding_idx)
    self.embed_tgt = Embedding(vocab_tgt.size, embed_dim, vocab_tgt.padding_idx)
    self.masker = nn.Linear(embed_dim, 1)
    self.dropout = dropout
    self.vocab_src = vocab_src
    self.vocab_tgt = vocab_tgt
    self.reset_parameters()
def create_model(seq_len, vocab_size, pad_id, N, d_model, d_ff, h, dropout):
    inp = Input((seq_len,))
    embedding = Embedding(vocab_size, d_model, pad_id)(inp)
    encoding = PositionalEncoding(d_model)(inp)
    net = Add()([embedding, encoding])
    net = Dropout(dropout)(net)
    mask = Lambda(lambda t: create_padding_mask(t, pad_id), name="input_mask")(inp)
    net = Encoder(N=N, d_model=d_model, d_ff=d_ff, h=h, dropout=dropout)([net, mask])
    net = Flatten()(net)
    net = Dense(2, activation="softmax")(net)
    model = Model(inp, net)

    # NOTE: Keras optimizers cannot be saved together with their optimizer state;
    # an optimizer from `tf.train` has to be used instead.
    # NOTE: this seems to be a TF 1.x limitation; in 2.0 the tf.train optimizers are
    # dropped and the Keras versions are the only implementations.
    # NOTE: this fixed rate is not recommended for training; the paper authors describe
    # a variable learning-rate schedule that still needs to be implemented.
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.98, epsilon=1e-9)

    model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["acc"])
    return model
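# The NOTE above refers to the variable learning-rate schedule from the Transformer paper
# that still needs to be implemented. Below is a minimal, framework-agnostic sketch of that
# schedule, assuming the standard "Attention Is All You Need" formulation; the default
# d_model and warmup_steps values are illustrative, not taken from this code.
def transformer_lr(step, d_model=512, warmup_steps=4000):
    # lrate = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)
    step = max(step, 1)  # guard against step 0
    return (d_model ** -0.5) * min(step ** -0.5, step * (warmup_steps ** -1.5))
# A function like this could be hooked up through a learning-rate callback or a custom
# training loop; that integration is not part of the original code.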
def __init__(self, vocabs, embed_dim, ff_embed_dim, num_heads, dropout, mem_dropout,
             enc_layers, dec_layers, mem_enc_layers, label_smoothing, use_mem_score):
    super(MemGenerator, self).__init__()
    self.vocabs = vocabs
    self.encoder = MonoEncoder(vocabs['src'], enc_layers, embed_dim, ff_embed_dim, num_heads, dropout)
    self.tgt_embed = Embedding(vocabs['tgt'].size, embed_dim, vocabs['tgt'].padding_idx)
    self.tgt_pos_embed = SinusoidalPositionalEmbedding(embed_dim)
    self.decoder = Transformer(dec_layers, embed_dim, ff_embed_dim, num_heads, dropout, with_external=True)
    self.mem_encoder = MonoEncoder(vocabs['tgt'], mem_enc_layers, embed_dim, ff_embed_dim, num_heads, mem_dropout)
    self.embed_scale = math.sqrt(embed_dim)
    self.self_attn_mask = SelfAttentionMask()
    self.output = CopyTokenDecoder(vocabs, self.tgt_embed, label_smoothing, embed_dim, ff_embed_dim, dropout)
    self.dropout = dropout
    if use_mem_score:
        self.mem_bias_scale = nn.Parameter(torch.ones(1))
        self.mem_bias_base = nn.Parameter(torch.zeros(1))
    self.use_mem_score = use_mem_score
def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout,
             layers, smoothing_factor, approx):
    super(BIGLM, self).__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
    self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout, with_external=True))
    self.emb_layer_norm = LayerNorm(embed_dim)
    self.one_more = nn.Linear(embed_dim, embed_dim)
    self.one_more_layer_norm = LayerNorm(embed_dim)
    self.out_proj = nn.Linear(embed_dim, self.vocab.size)
    self.attn_mask = SelfAttentionMask(device=local_rank)
    self.smoothing = LabelSmoothing(local_rank, self.vocab.size, self.vocab.padding_idx, smoothing_factor)
    self.dropout = dropout
    self.device = local_rank
    if approx == "none":
        self.approx = None
    elif approx == "adaptive":
        self.approx = nn.AdaptiveLogSoftmaxWithLoss(self.embed_dim, self.vocab.size, [10000, 20000, 200000])
    else:
        raise NotImplementedError("%s has not been implemented" % approx)
    self.reset_parameters()
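# A minimal, self-contained sketch of the approx == "adaptive" branch used in the two
# constructors above: nn.AdaptiveLogSoftmaxWithLoss partitions the vocabulary by frequency
# (the cutoffs [10000, 20000, 200000] above) so rare tokens get cheaper projections.
# The sizes below are toy values, not the real vocabulary or hidden size.
import torch
import torch.nn as nn

adaptive = nn.AdaptiveLogSoftmaxWithLoss(in_features=64, n_classes=5000, cutoffs=[100, 1000])
hidden = torch.randn(8, 64)               # (batch, embed_dim) final hidden states
targets = torch.randint(0, 5000, (8,))    # gold token ids
out = adaptive(hidden, targets)
target_log_probs = out.output             # per-example log-probability of the target token
loss = out.loss                           # mean negative log-likelihood over the batch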
def AMREmbedding(vocab, embedding_dim, pretrained_file=None, amr=False, dump_file=None):
    if pretrained_file is None:
        return Embedding(vocab.size, embedding_dim, vocab.padding_idx)

    tokens_to_keep = set()
    for idx in range(vocab.size):
        token = vocab.idx2token(idx)
        if amr:
            token = re.sub(r'-\d\d$', '', token)
        tokens_to_keep.add(token)

    embeddings = {}
    if dump_file is not None:
        fo = open(dump_file, 'w', encoding='utf8')
    with open(pretrained_file, encoding='utf-8') as embeddings_file:
        for line in embeddings_file.readlines():
            fields = line.rstrip().split(' ')
            if len(fields) - 1 != embedding_dim:
                continue
            token = fields[0]
            if token in tokens_to_keep:
                if dump_file is not None:
                    fo.write(line)
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
    print('glove_initiate')
    if dump_file is not None:
        fo.close()

    all_embeddings = np.asarray(list(embeddings.values()))
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_std = float(np.std(all_embeddings))
    # Initialize randomly first, then write in the pretrained vectors.
    embedding_matrix = torch.FloatTensor(vocab.size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)
    for i in range(vocab.size):
        token = vocab.idx2token(i)
        # If the word has no pretrained vector, just keep the randomly initialized row.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
        else:
            if amr:
                normalized_token = re.sub(r'-\d\d$', '', token)
                if normalized_token in embeddings:
                    embedding_matrix[i] = torch.FloatTensor(embeddings[normalized_token])
    embedding_matrix[vocab.padding_idx].fill_(0.)
    return nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
def GraphEmbedding(vocab, embedding_dim, pretrained_file=None, amr=False, dump_file=None):
    if pretrained_file is None:
        return Embedding(vocab.size, embedding_dim, vocab.padding_idx)

    tokens_to_keep = set()
    for idx in range(vocab.size):
        token = vocab.idx2token(idx)
        # TODO: Is there a better way to do this? Currently we have a very specific 'amr' param.
        if amr:
            token = re.sub(r'-\d\d$', '', token)
        tokens_to_keep.add(token)

    embeddings = {}
    if dump_file is not None:
        fo = open(dump_file, 'w', encoding='utf8')
    with open(pretrained_file, encoding='utf8') as embeddings_file:
        for line in embeddings_file.readlines():
            fields = line.rstrip().split(' ')
            if len(fields) - 1 != embedding_dim:
                continue
            token = fields[0]
            if token in tokens_to_keep:
                if dump_file is not None:
                    fo.write(line)
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
    if dump_file is not None:
        fo.close()

    all_embeddings = np.asarray(list(embeddings.values()))
    print('pretrained', all_embeddings.shape)
    embeddings_mean = float(np.mean(all_embeddings))
    embeddings_std = float(np.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    embedding_matrix = torch.FloatTensor(vocab.size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)
    for i in range(vocab.size):
        token = vocab.idx2token(i)
        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
        else:
            if amr:
                normalized_token = re.sub(r'-\d\d$', '', token)
                if normalized_token in embeddings:
                    embedding_matrix[i] = torch.FloatTensor(embeddings[normalized_token])
    embedding_matrix[vocab.padding_idx].fill_(0.)
    return nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
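# A minimal sketch (toy sizes, made-up vectors) of the initialization pattern used by
# AMREmbedding/GraphEmbedding above: draw a random matrix with the pretrained mean/std,
# overwrite the rows that have pretrained vectors, zero the padding row, then wrap the
# matrix in a trainable nn.Embedding via from_pretrained.
import torch
import torch.nn as nn

vocab_size, dim, padding_idx = 4, 3, 0
pretrained = {2: torch.tensor([0.1, 0.2, 0.3])}               # token id -> pretrained vector
matrix = torch.FloatTensor(vocab_size, dim).normal_(0.0, 1.0)  # random init
for idx, vec in pretrained.items():
    matrix[idx] = vec                                          # overwrite known rows
matrix[padding_idx].fill_(0.)                                  # zero the padding row
embed = nn.Embedding.from_pretrained(matrix, freeze=False)     # freeze=False keeps it trainable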
def __init__(self, vocabs, char_dim, word_dim, pos_dim, ner_dim, embed_dim, filters,
             char2word_dim, dropout, pretrained_word_embed=None):
    super(WordEncoder, self).__init__()
    self.char_embed = Embedding(vocabs['word_char'].size, char_dim,
                                padding_idx=vocabs['word_char'].padding_idx)
    self.char2word = CNNEncoder(filters, char_dim, char2word_dim)
    self.lem_embed = Embedding(vocabs['lem'].size, word_dim, padding_idx=vocabs['lem'].padding_idx)
    if pos_dim > 0:
        self.pos_embed = Embedding(vocabs['pos'].size, pos_dim, padding_idx=vocabs['pos'].padding_idx)
    else:
        self.pos_embed = None
    if ner_dim > 0:
        self.ner_embed = Embedding(vocabs['ner'].size, ner_dim, padding_idx=vocabs['ner'].padding_idx)
    else:
        self.ner_embed = None
    tot_dim = word_dim + pos_dim + ner_dim + char2word_dim
    self.pretrained_word_embed = pretrained_word_embed
    if self.pretrained_word_embed is not None:
        tot_dim += self.pretrained_word_embed.embedding_dim
    self.out_proj = nn.Linear(tot_dim, embed_dim)
    self.dropout = dropout
    self.reset_parameters()
def __init__(self, vocab, layers, embed_dim, ff_embed_dim, num_heads, dropout):
    super(MonoEncoder, self).__init__()
    self.vocab = vocab
    self.src_embed = Embedding(vocab.size, embed_dim, vocab.padding_idx)
    self.src_pos_embed = SinusoidalPositionalEmbedding(embed_dim)
    self.embed_scale = math.sqrt(embed_dim)
    self.transformer = Transformer(layers, embed_dim, ff_embed_dim, num_heads, dropout)
    self.dropout = dropout
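# MonoEncoder stores embed_scale = sqrt(embed_dim), presumably applied to the token
# embeddings before position information is added, and relies on SinusoidalPositionalEmbedding
# for positions. That class is not shown here; the sketch below is only the standard sinusoidal
# encoding such a module typically computes (assumes an even `dim`):
# PE[pos, 2i] = sin(pos / 10000**(2i/dim)), PE[pos, 2i+1] = cos(pos / 10000**(2i/dim))
import math
import torch

def sinusoidal_positions(seq_len, dim):
    pos = torch.arange(seq_len, dtype=torch.float).unsqueeze(1)   # (seq_len, 1)
    inv_freq = torch.exp(torch.arange(0, dim, 2, dtype=torch.float) * (-math.log(10000.0) / dim))
    pe = torch.zeros(seq_len, dim)
    pe[:, 0::2] = torch.sin(pos * inv_freq)   # even dimensions
    pe[:, 1::2] = torch.cos(pos * inv_freq)   # odd dimensions
    return pe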
def __init__(self, vocabs, char_dim, concept_dim, embed_dim, filters, char2concept_dim, dropout):
    super(ConceptEncoder, self).__init__()
    self.char_embed = Embedding(vocabs['concept_char'].size, char_dim,
                                padding_idx=vocabs['concept_char'].padding_idx)
    self.concept_embed = Embedding(vocabs['concept'].size, concept_dim,
                                   padding_idx=vocabs['concept'].padding_idx)
    self.char2concept = CNNEncoder(filters, char_dim, char2concept_dim)
    self.vocabs = vocabs
    tot_dim = char2concept_dim + concept_dim
    self.out_proj = nn.Linear(tot_dim, embed_dim)
    self.char_dim = char_dim
    self.concept_dim = concept_dim
    self.dropout = dropout
    self.reset_parameters()
def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout,
             layers, smoothing_factor, approx=None):
    super(BIGLM, self).__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
    self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout, with_external=True))
    self.emb_layer_norm = LayerNorm(embed_dim)
    self.one_more = nn.Linear(embed_dim, embed_dim)
    self.one_more_layer_norm = LayerNorm(embed_dim)
    self.out_proj = nn.Linear(embed_dim, self.vocab.size)
    self.attn_mask = SelfAttentionMask(device=local_rank)
    self.smoothing = LabelSmoothing(local_rank, self.vocab.size, self.vocab.padding_idx, smoothing_factor)
    self.dropout = dropout
    self.device = local_rank
    self.approx = approx
    self.reset_parameters()
def __init__(self, vocabs, embed_dim, ff_embed_dim, num_heads, dropout, enc_layers, dec_layers, label_smoothing):
    super(Generator, self).__init__()
    self.vocabs = vocabs
    self.encoder = MonoEncoder(vocabs['src'], enc_layers, embed_dim, ff_embed_dim, num_heads, dropout)
    self.tgt_embed = Embedding(vocabs['tgt'].size, embed_dim, vocabs['tgt'].padding_idx)
    self.tgt_pos_embed = SinusoidalPositionalEmbedding(embed_dim)
    self.decoder = Transformer(dec_layers, embed_dim, ff_embed_dim, num_heads, dropout, with_external=True)
    self.embed_scale = math.sqrt(embed_dim)
    self.self_attn_mask = SelfAttentionMask()
    self.output = TokenDecoder(vocabs, self.tgt_embed, label_smoothing)
    self.dropout = dropout
def __init__(self, vocabs, retriever, share_encoder, embed_dim, ff_embed_dim, num_heads,
             dropout, mem_dropout, enc_layers, dec_layers, mem_enc_layers, label_smoothing):
    super(RetrieverGenerator, self).__init__()
    self.vocabs = vocabs

    #### Retriever ####
    self.share_encoder = share_encoder
    self.retriever = retriever
    self.encoder = MonoEncoder(vocabs['src'], enc_layers, embed_dim, ff_embed_dim, num_heads, dropout)
    #### Retriever ####

    self.tgt_embed = Embedding(vocabs['tgt'].size, embed_dim, vocabs['tgt'].padding_idx)
    self.tgt_pos_embed = SinusoidalPositionalEmbedding(embed_dim)
    self.decoder = Transformer(dec_layers, embed_dim, ff_embed_dim, num_heads, dropout, with_external=True)
    if share_encoder:
        self.mem_encoder = self.retriever.mem_feat_or_feat_maker.encoder
    else:
        self.mem_encoder = MonoEncoder(vocabs['tgt'], mem_enc_layers, embed_dim, ff_embed_dim, num_heads, mem_dropout)
    self.embed_scale = math.sqrt(embed_dim)
    self.self_attn_mask = SelfAttentionMask()
    self.output = CopyTokenDecoder(vocabs, self.tgt_embed, label_smoothing, embed_dim, ff_embed_dim, dropout)
    self.mem_bias_scale = nn.Parameter(torch.ones(retriever.num_heads))
    self.mem_bias_base = nn.Parameter(torch.zeros(retriever.num_heads))
    self.dropout = dropout