def test_encoder_layer(self):
    embed_dim, n_heads, dropout_rate = 512, 8, 0.5
    encoder_layer = EncoderLayer(embed_dim, embed_dim // 2, n_heads, dropout_rate)
    batch_size, max_seq_len = 5, 10
    x = torch.randn(batch_size, max_seq_len, embed_dim)
    assert encoder_layer(x).shape == x.shape
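# A companion shape check for a stack of encoder layers, sketched under the
# same EncoderLayer signature assumed in test_encoder_layer above
# (embed_dim, d_inner, n_heads, dropout); not taken from the original suite.
def test_encoder_stack(self):
    embed_dim, n_heads, dropout_rate, n_layers = 512, 8, 0.5, 4
    stack = torch.nn.ModuleList(
        [EncoderLayer(embed_dim, embed_dim // 2, n_heads, dropout_rate)
         for _ in range(n_layers)])
    x = torch.randn(5, 10, embed_dim)
    for layer in stack:
        x = layer(x)
    assert x.shape == (5, 10, embed_dim)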
def __init__(self, num_classes=3):
    """Constructor"""
    super(TransformerClassifier, self).__init__()

    tokenizer = CharBPETokenizer(
        '../Tokenize/thyme-tokenizer-vocab.json',
        '../Tokenize/thyme-tokenizer-merges.txt')
    vocab_size = tokenizer.get_vocab_size()

    self.embedding = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=cfg.getint('model', 'emb_dim'))
    self.position = PositionalEncoding(
        embedding_dim=cfg.getint('model', 'emb_dim'))

    trans_encoders = []
    for n in range(cfg.getint('model', 'num_layers')):
        trans_encoders.append(EncoderLayer(
            d_model=cfg.getint('model', 'emb_dim'),
            d_inner=cfg.getint('model', 'feedforw_dim'),
            n_head=cfg.getint('model', 'num_heads'),
            d_k=cfg.getint('model', 'emb_dim'),
            d_v=cfg.getint('model', 'emb_dim')))
    self.trans_encoders = nn.ModuleList(trans_encoders)

    self.dropout = nn.Dropout(cfg.getfloat('model', 'dropout'))
    self.linear = nn.Linear(
        in_features=cfg.getint('model', 'emb_dim'),
        out_features=num_classes)

    self.init_weights()
def __init__(self, num_classes=2):
    """Constructor"""
    super(TransformerClassifier, self).__init__()

    self.embed = nn.Embedding(
        num_embeddings=cfg.getint('data', 'vocab_size'),
        embedding_dim=cfg.getint('model', 'emb_dim'))

    trans_encoders = []
    for n in range(cfg.getint('model', 'n_layers')):
        trans_encoders.append(EncoderLayer(
            d_model=cfg.getint('model', 'emb_dim'),
            d_inner=cfg.getint('model', 'feedforw_dim'),
            n_head=cfg.getint('model', 'n_heads'),
            d_k=cfg.getint('model', 'emb_dim'),
            d_v=cfg.getint('model', 'emb_dim')))
    self.trans_encoders = nn.ModuleList(trans_encoders)

    self.dropout = nn.Dropout(cfg.getfloat('model', 'dropout'))
    self.linear = nn.Linear(
        in_features=cfg.getint('model', 'emb_dim'),
        out_features=num_classes)

    self.init_weights()
def __init__(self, vocab_size, embed_model=None, emb_size=100, hidden_size=128,
             input_dropout_p=0, dropout_p=0, n_layers=1, bidirectional=False,
             rnn_cell=None, rnn_cell_name='gru', variable_lengths=True,
             d_ff=2048, dropout=0.3, N=1):
    super(EncoderRNN, self).__init__(vocab_size, emb_size, hidden_size,
                                     input_dropout_p, dropout_p, n_layers,
                                     rnn_cell_name)
    self.variable_lengths = variable_lengths
    self.bidirectional = bidirectional
    if bidirectional:
        self.d_model = 2 * hidden_size
    else:
        self.d_model = hidden_size

    ff = PositionwiseFeedForward(self.d_model, d_ff, dropout)

    if embed_model is None:
        self.embedding = nn.Embedding(vocab_size, emb_size)
    else:
        self.embedding = embed_model

    if rnn_cell is None:
        self.rnn = self.rnn_cell(emb_size, hidden_size, n_layers,
                                 batch_first=True, bidirectional=bidirectional,
                                 dropout=dropout_p)
    else:
        self.rnn = rnn_cell

    self.group_attention = GroupAttention(8, self.d_model)
    self.onelayer = Encoder(
        EncoderLayer(self.d_model, deepcopy(self.group_attention),
                     deepcopy(ff), dropout), N)
def __init__(self, input_vocab_size, output_vocab_size, d_model, d_inner,
             n_layers, n_head, d_k, d_v, dropout, max_len, save_config=True):
    """Constructor"""
    super(TransformerEncoder, self).__init__()

    self.embed = nn.Embedding(num_embeddings=input_vocab_size,
                              embedding_dim=d_model)

    trans_encoders = []
    for n in range(n_layers):
        trans_encoders.append(
            EncoderLayer(d_model=d_model, d_inner=d_inner,
                         n_head=n_head, d_k=d_k, d_v=d_v))
    self.trans_encoders = nn.ModuleList(trans_encoders)

    self.dropout = nn.Dropout(dropout)
    self.classifier = nn.Linear(in_features=d_model,
                                out_features=output_vocab_size)

    # save configuration for loading later
    if save_config:
        config = dict(input_vocab_size=input_vocab_size,
                      output_vocab_size=output_vocab_size,
                      d_model=d_model, d_inner=d_inner, n_layers=n_layers,
                      n_head=n_head, d_k=d_k, d_v=d_v, dropout=dropout,
                      max_len=max_len)
        with open(config_path, 'wb') as pickle_file:
            pickle.dump(config, pickle_file)

    self.init_weights()
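# Sketch of restoring a TransformerEncoder from the configuration pickled
# above; config_path is the same module-level path assumed by the constructor,
# and the weights file name is a placeholder.
def load_transformer_encoder(weights_path='model.pt'):
    with open(config_path, 'rb') as pickle_file:
        config = pickle.load(pickle_file)
    model = TransformerEncoder(save_config=False, **config)
    model.load_state_dict(torch.load(weights_path))
    return model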
def __init__(self, n_bins, ip_bin_size, hm, args):
    super(TransformerEnc, self).__init__()
    self.model_n_dim = args.bin_rnn_size
    self.attn = MultiAttn(args.num_heads, self.model_n_dim)
    self.ff = FeedForward(self.model_n_dim, args.dff, args.dropout)
    self.posit = Position(self.model_n_dim, args.dropout, n_bins)
    self.enc = EncoderLayer(self.model_n_dim, self.attn, self.ff, args.dropout)
    self.transformer = TransfromerEncoder(args.num_t, self.enc)
    self.linear = nn.Linear(hm, self.model_n_dim)
    self.pooler = Pooler(args.bin_rnn_size)
    self.norm = None
    if args.norm is not None:
        self.norm = Norm(hm)
def make_model_elmo(N=6, d_model=1024, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embedder(), c(position)),
        nn.Sequential(Embedder(), c(position)),
        generator=None)

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
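# Minimal usage sketch for make_model_elmo, assuming Embedder() takes no
# arguments as in the constructor above; the printed count is illustrative only.
elmo_model = make_model_elmo(N=6, d_model=1024, d_ff=2048, h=8, dropout=0.1)
n_params = sum(p.numel() for p in elmo_model.parameters() if p.requires_grad)
print(f'ELMo-input transformer parameters: {n_params}')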
def BuildModel(vocab_size, encoder_emb, decoder_emb, d_model=512, N=6,
               d_ff=2048, h=8, dropout=0.1):
    target_vocab = vocab_size
    c = copy.deepcopy
    attention = MultiHeadedAttention(h, d_model)
    feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    encoder_layer = EncoderLayer(d_model, c(attention), c(feed_forward), dropout)
    decoder_layer = DecoderLayer(d_model, c(attention), c(attention),
                                 c(feed_forward), dropout)
    encoder = Encoder(encoder_layer, N)
    decoder = Decoder(decoder_layer, N)
    model = EncoderDecoder(
        encoder, decoder,
        nn.Sequential(Embeddings(encoder_emb, d_model), c(position)),
        nn.Sequential(Embeddings(decoder_emb, d_model), c(position)),
        Generator(d_model, target_vocab))

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
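# Minimal usage sketch for BuildModel; the vocab size and the values passed as
# encoder_emb/decoder_emb are placeholders, not taken from the original project.
vocab_size = 32000
seq2seq = BuildModel(vocab_size, encoder_emb=vocab_size, decoder_emb=vocab_size,
                     d_model=512, N=6)
print(sum(p.numel() for p in seq2seq.parameters() if p.requires_grad))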
# y = Dropout(0.1)(x)
# y = Dense(500, activation='relu')(x)
# y = Dropout(0.2)(y)
# y = Dense(500, activation='relu')(y)
# y = Dropout(0.2)(y)
# y = Dense(500, activation='relu')(y)
# y = Dropout(0.3)(y)

# d_model = 1
d_inner_hid = opt.d_inner_hid  # 1
# d_inner_hid = 512
n_head = opt.n_head  # 1
# n_head = 3
d_k = opt.d_k  # 1 # 64
d_v = opt.d_v  # 1 # 64
layers = opt.layers  # 1
dropout_rate = 0.1

encodeLayerList = [
    EncoderLayer(1, d_inner_hid, n_head, d_k, d_v, dropout_rate)
    for _ in range(layers)
]

y = None
for enc_layer in encodeLayerList:
    if y is None:
        y, _ = enc_layer(x)
    else:
        y, _ = enc_layer(y)

y_2dim = Reshape([int(y.shape[1])])(y)
# y_2dim = Reshape([int(x.shape[1])])(x)
out = Dense(nb_classes, activation='softmax')(y_2dim)
model = Model(input=x, output=out)
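# Compile/fit sketch for the Keras model assembled above; the optimizer, loss,
# and training-data names are placeholders, not values from the original script.
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
# model.fit(X_train, y_train, batch_size=32, epochs=10)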
def __init__(self, input_vocab, target_vocab, d_model=512, d_int=2048, d_k=64,
             h=8, n_layers=6, dropout_rate=0.1, max_len_pe=200, bert_name=None):
    """
    :param input_vocab: Vocab based on the BERT tokenizer
    :param target_vocab: Vocab based on the BERT tokenizer; requires an embedding.
        Fields: tokenizer, tokenizer.ids_to_tokens = ordered_dict, pad=0, start=1, end=2
    :param d_model: dimension of transformer embeddings
        # TODO add linear layer to map BERT output to dim 512?
    :param dropout_rate: dropout, default 0.1
    """
    super(TSP, self).__init__()
    self.dropout_rate = dropout_rate
    self.input_vocab = input_vocab
    self.target_vocab = target_vocab

    self.model_embeddings_source = nn.Sequential(
        DecoderEmbeddings(vocab=self.input_vocab, embed_size=d_model),
        PositionalEncoding(d_model=d_model, dropout=dropout_rate,
                           max_len=max_len_pe))
    self.model_embeddings_target = nn.Sequential(
        DecoderEmbeddings(vocab=self.target_vocab, embed_size=d_model),
        PositionalEncoding(d_model=d_model, dropout=dropout_rate,
                           max_len=max_len_pe))

    self.encoder = TransformerEncoder(
        layer=EncoderLayer(d_model=d_model, d_int=d_int, d_k=d_k, d_v=d_k,
                           h=h, p_drop=dropout_rate),
        n_layer=n_layers)
    self.decoder = Transformer(
        layer=DecoderLayer(d_model=d_model, d_int=d_int, d_k=d_k, d_v=d_k,
                           h=h, p_drop=dropout_rate),
        n_layer=n_layers)

    self.linear_projection = nn.Linear(
        d_model, len(self.target_vocab.tokenizer.ids_to_tokens), bias=False)
    self.dropout = nn.Dropout(self.dropout_rate)
    self.device = self.linear_projection.weight.device

    initialize_weights(self.encoder)
    initialize_weights(self.decoder)
    initialize_weights(self.linear_projection)
    initialize_weights(self.model_embeddings_source)
    initialize_weights(self.model_embeddings_target)