import torch
from torch.nn import Transformer


class SimpleTransformer(torch.nn.Module):
    def __init__(
            self,
            number_time_series: int,
            seq_length: int = 48,
            output_seq_len: int = None,
            d_model: int = 128,
            n_heads: int = 8,
            dropout=0.1,
            forward_dim=2048,
            sigmoid=False):
        """
        Full transformer model
        """
        super().__init__()
        if output_seq_len is None:
            output_seq_len = seq_length
        self.out_seq_len = output_seq_len
        self.mask = generate_square_subsequent_mask(seq_length)
        self.dense_shape = torch.nn.Linear(number_time_series, d_model)
        self.pe = SimplePositionalEncoding(d_model)
        self.transformer = Transformer(d_model, nhead=n_heads)
        self.final_layer = torch.nn.Linear(d_model, 1)
        self.sequence_size = seq_length
        self.tgt_mask = generate_square_subsequent_mask(output_seq_len)
        self.sigmoid = None
        if sigmoid:
            self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x: torch.Tensor, t: torch.Tensor, tgt_mask=None, src_mask=None):
        x = self.encode_sequence(x, src_mask)
        return self.decode_seq(x, t, tgt_mask)

    def basic_feature(self, x: torch.Tensor):
        # project the raw series to d_model, add positional information, and move
        # the sequence dimension first for torch.nn.Transformer
        x = self.dense_shape(x)
        x = self.pe(x)
        x = x.permute(1, 0, 2)
        return x

    def encode_sequence(self, x, src_mask=None):
        x = self.basic_feature(x)
        x = self.transformer.encoder(x, src_mask)
        return x

    def decode_seq(self, mem, t, tgt_mask=None, view_number=None) -> torch.Tensor:
        if view_number is None:
            view_number = self.out_seq_len
        if tgt_mask is None:
            tgt_mask = self.tgt_mask
        t = self.basic_feature(t)
        x = self.transformer.decoder(t, mem, tgt_mask=tgt_mask)
        x = self.final_layer(x)
        if self.sigmoid:
            x = self.sigmoid(x)
        return x.view(-1, view_number)
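# SimpleTransformer references SimplePositionalEncoding and
# generate_square_subsequent_mask without defining them. The definitions below
# are minimal sketches of what such helpers typically look like (a standard
# sinusoidal positional encoding and a causal attention mask), not necessarily
# the original implementations.
import math


def generate_square_subsequent_mask(size: int) -> torch.Tensor:
    # -inf above the diagonal blocks attention to future positions; 0 elsewhere.
    return torch.triu(torch.full((size, size), float("-inf")), diagonal=1)


class SimplePositionalEncoding(torch.nn.Module):
    """Sinusoidal positional encoding for (batch, seq_len, d_model) inputs."""

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, d_model); add the encoding for the first seq_len positions.
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)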
from torch import nn
from torch.nn import (LayerNorm, Linear, Transformer, TransformerDecoder,
                      TransformerDecoderLayer, TransformerEncoder,
                      TransformerEncoderLayer)


class TransformerModel(nn.Module):
    def __init__(self, d_model, nhead, num_encoder_layers, num_decoder_layers,
                 dim_feedforward, dropout, activation, src_vocab_size, tgt_vocab_size):
        super(TransformerModel, self).__init__()
        self.pos_encoder = PositionalEncoding(
            d_model=d_model, dropout=0.1)  # , max_len=100)
        encoder_layer = TransformerEncoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation)
        encoder_norm = LayerNorm(d_model)
        self.encoder = TransformerEncoder(
            encoder_layer, num_encoder_layers, encoder_norm)
        decoder_layer = TransformerDecoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation)
        decoder_norm = LayerNorm(d_model)
        self.decoder = TransformerDecoder(
            decoder_layer, num_decoder_layers, decoder_norm)
        self.d_model = d_model
        self.nhead = nhead
        self.linear = Linear(d_model, tgt_vocab_size)
        self.transformer = Transformer(d_model=d_model, nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout, activation=activation)
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self._reset_parameters()
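    # _reset_parameters() and PositionalEncoding are used above but not shown in
    # this snippet. The method below is a hedged sketch based on the
    # Xavier-uniform scheme that torch.nn.Transformer itself applies; the
    # original implementation may differ.
    def _reset_parameters(self):
        # Xavier-uniform init for every weight matrix; 1-D parameters such as
        # biases keep their default initialization.
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)


# Likewise, the PositionalEncoding below is only a sketch: the standard
# sinusoidal encoding (as in the PyTorch sequence-modeling tutorial) for inputs
# of shape (seq_len, batch, d_model).
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe.unsqueeze(1))  # (max_len, 1, d_model)

    def forward(self, x):
        # x: (seq_len, batch, d_model); add the encoding for the first seq_len steps.
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)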
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Transformer


class SynPG(nn.Module):
    def __init__(self, vocab_size, em_size, word_dropout=0.4, dropout=0.1):
        super(SynPG, self).__init__()
        self.vocab_size = vocab_size
        self.em_size = em_size
        self.word_dropout = word_dropout
        self.dropout = dropout
        # vocabulary embeddings
        self.embedding_encoder = nn.Embedding(vocab_size, em_size)
        self.embedding_decoder = nn.Embedding(vocab_size, em_size)
        # positional encoding
        self.pos_encoder = PositionalEncoding(em_size, dropout=0.0)
        self.transformer = Transformer(d_model=em_size, nhead=6, dropout=dropout)
        # linear transformation to vocabulary logits
        self.linear = nn.Linear(em_size, vocab_size)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        # initialize vocabulary embedding matrices
        self.embedding_encoder.weight.data.uniform_(-initrange, initrange)
        self.embedding_decoder.weight.data.uniform_(-initrange, initrange)
        # initialize linear weight
        self.linear.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.fill_(0)

    def load_embedding(self, embedding):
        self.embedding_encoder.weight.data.copy_(torch.from_numpy(embedding))
        self.embedding_decoder.weight.data.copy_(torch.from_numpy(embedding))

    def store_grad_norm(self, grad):
        norm = torch.norm(grad, 2, 1)
        self.grad_norm = norm.detach().data.mean()
        return grad

    def generate_square_mask(self, max_sent_len, max_synt_len):
        size = max_sent_len + max_synt_len + 2
        mask = torch.zeros((size, size))
        mask[:max_sent_len, max_sent_len:] = float("-inf")
        mask[max_sent_len:, :max_sent_len] = float("-inf")
        return mask

    def forward(self, sents, synts, targs):
        batch_size = sents.size(0)
        max_sent_len = sents.size(1)
        max_synt_len = synts.size(1) - 2  # count without <sos> and <eos>
        max_targ_len = targs.size(1) - 2  # count without <sos> and <eos>

        # apply word dropout
        drop_mask = torch.bernoulli(self.word_dropout * torch.ones(max_sent_len)).bool().cuda()
        sents = sents.masked_fill(drop_mask, 0)

        # sentence, syntax => embedding
        sent_embeddings = self.embedding_encoder(sents).transpose(0, 1) * np.sqrt(self.em_size)
        synt_embeddings = self.embedding_encoder(synts).transpose(0, 1) * np.sqrt(self.em_size)
        synt_embeddings = self.pos_encoder(synt_embeddings)
        en_embeddings = torch.cat((sent_embeddings, synt_embeddings), dim=0)

        # record gradient norm
        if en_embeddings.requires_grad:
            en_embeddings.register_hook(self.store_grad_norm)

        # do not allow attention across the sentence/syntax boundary
        src_mask = self.generate_square_mask(max_sent_len, max_synt_len).cuda()

        # target => embedding
        de_embeddings = self.embedding_decoder(targs[:, :-1]).transpose(0, 1) * np.sqrt(self.em_size)
        de_embeddings = self.pos_encoder(de_embeddings)

        # sequential (causal) mask
        tgt_mask = self.transformer.generate_square_subsequent_mask(max_targ_len + 1).cuda()

        # forward pass through the full transformer
        outputs = self.transformer(en_embeddings, de_embeddings,
                                   src_mask=src_mask, tgt_mask=tgt_mask)

        # apply linear layer to get vocabulary-sized logits
        outputs = outputs.transpose(0, 1)
        outputs = self.linear(outputs.contiguous().view(-1, self.em_size))
        outputs = outputs.view(batch_size, max_targ_len + 1, self.vocab_size)

        return outputs

    def generate(self, sents, synts, max_len, sample=True, temp=0.5):
        batch_size = sents.size(0)
        max_sent_len = sents.size(1)
        max_synt_len = synts.size(1) - 2  # count without <sos> and <eos>
        max_targ_len = max_len

        # output indices start with <sos>
        idxs = torch.zeros((batch_size, max_targ_len + 2), dtype=torch.long).cuda()
        idxs[:, 0] = 1

        # sentence, syntax => embedding
        sent_embeddings = self.embedding_encoder(sents).transpose(0, 1) * np.sqrt(self.em_size)
        synt_embeddings = self.embedding_encoder(synts).transpose(0, 1) * np.sqrt(self.em_size)
        synt_embeddings = self.pos_encoder(synt_embeddings)
        en_embeddings = torch.cat((sent_embeddings, synt_embeddings), dim=0)

        # do not allow attention across the sentence/syntax boundary
        src_mask = self.generate_square_mask(max_sent_len, max_synt_len).cuda()

        # starting index => embedding
        de_embeddings = self.embedding_decoder(idxs[:, :1]).transpose(0, 1) * np.sqrt(self.em_size)
        de_embeddings = self.pos_encoder(de_embeddings)

        # sequential (causal) mask
        tgt_mask = self.transformer.generate_square_subsequent_mask(de_embeddings.size(0)).cuda()

        # encode
        memory = self.transformer.encoder(en_embeddings, mask=src_mask)

        # auto-regressively generate output
        for i in range(1, max_targ_len + 2):
            # decode
            outputs = self.transformer.decoder(de_embeddings, memory, tgt_mask=tgt_mask)
            outputs = self.linear(outputs[-1].contiguous().view(-1, self.em_size))

            # take the argmax index or sample an index
            if not sample:
                values, idx = torch.max(outputs, 1)
            else:
                probs = F.softmax(outputs / temp, dim=1)
                idx = torch.multinomial(probs, 1).squeeze(1)

            # save to output indices
            idxs[:, i] = idx

            # append the new index and re-embed the decoder input
            de_embeddings = self.embedding_decoder(idxs[:, :i + 1]).transpose(0, 1) * np.sqrt(self.em_size)
            de_embeddings = self.pos_encoder(de_embeddings)

            # new sequential mask
            tgt_mask = self.transformer.generate_square_subsequent_mask(de_embeddings.size(0)).cuda()

        return idxs[:, 1:]
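# A minimal smoke test for SynPG. It assumes a CUDA device (the class calls
# .cuda() internally) and that PositionalEncoding behaves like the sinusoidal
# sketch above; the vocabulary size, sequence lengths, and the use of index 1 as
# <sos> are illustrative choices, not values from the original project.
if __name__ == "__main__" and torch.cuda.is_available():
    model = SynPG(vocab_size=5000, em_size=300, word_dropout=0.4).cuda()

    batch_size, sent_len, synt_len, targ_len = 2, 10, 8, 12
    sents = torch.randint(2, 5000, (batch_size, sent_len)).cuda()      # source sentence ids
    synts = torch.randint(2, 5000, (batch_size, synt_len + 2)).cuda()  # parse ids incl. <sos>/<eos>
    targs = torch.randint(2, 5000, (batch_size, targ_len + 2)).cuda()  # target ids incl. <sos>/<eos>

    logits = model(sents, synts, targs)   # (batch, targ_len + 1, vocab_size)
    generated = model.generate(sents, synts, max_len=15, sample=True, temp=0.5)
    print(logits.shape, generated.shape)  # torch.Size([2, 13, 5000]) torch.Size([2, 16])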
# A documented variant of SimpleTransformer: same architecture as the class
# above, but forward() drops the last input time step before encoding.
class SimpleTransformer(torch.nn.Module):
    def __init__(self,
                 number_time_series: int,
                 seq_length: int = 48,
                 output_seq_len: int = None,
                 d_model: int = 128,
                 n_heads: int = 8,
                 dropout=0.1,
                 forward_dim=2048,
                 sigmoid=False):
        """A full transformer model

        :param number_time_series: The total number of time series present
            (e.g. n_feature_time_series + n_targets)
        :type number_time_series: int
        :param seq_length: The length of your input sequence, defaults to 48
        :type seq_length: int, optional
        :param output_seq_len: The length of your output sequence, defaults to None
        :type output_seq_len: int, optional
        :param d_model: The dimensions of your model, defaults to 128
        :type d_model: int, optional
        :param n_heads: The number of heads in each encoder/decoder block, defaults to 8
        :type n_heads: int, optional
        :param dropout: The fraction of dropout you wish to apply during training,
            defaults to 0.1 (currently not functional)
        :type dropout: float, optional
        :param forward_dim: Currently not functional, defaults to 2048
        :type forward_dim: int, optional
        :param sigmoid: Whether to apply a sigmoid activation to the final layer
            (useful for binary classification), defaults to False
        :type sigmoid: bool, optional
        """
        super().__init__()
        if output_seq_len is None:
            output_seq_len = seq_length
        self.out_seq_len = output_seq_len
        self.mask = generate_square_subsequent_mask(seq_length)
        self.dense_shape = torch.nn.Linear(number_time_series, d_model)
        self.pe = SimplePositionalEncoding(d_model)
        self.transformer = Transformer(d_model, nhead=n_heads)
        self.final_layer = torch.nn.Linear(d_model, 1)
        self.sequence_size = seq_length
        self.tgt_mask = generate_square_subsequent_mask(output_seq_len)
        self.sigmoid = None
        if sigmoid:
            self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x: torch.Tensor, t: torch.Tensor, tgt_mask=None, src_mask=None):
        # encode all but the last input time step, then decode the target sequence
        x = self.encode_sequence(x[:, :-1, :], src_mask)
        return self.decode_seq(x, t, tgt_mask)

    def basic_feature(self, x: torch.Tensor):
        x = self.dense_shape(x)
        x = self.pe(x)
        x = x.permute(1, 0, 2)
        return x

    def encode_sequence(self, x, src_mask=None):
        x = self.basic_feature(x)
        x = self.transformer.encoder(x, src_mask)
        return x

    def decode_seq(self, mem, t, tgt_mask=None, view_number=None) -> torch.Tensor:
        if view_number is None:
            view_number = self.out_seq_len
        if tgt_mask is None:
            tgt_mask = self.tgt_mask
        t = self.basic_feature(t)
        x = self.transformer.decoder(t, mem, tgt_mask=tgt_mask)
        x = self.final_layer(x)
        if self.sigmoid:
            x = self.sigmoid(x)
        return x.view(-1, view_number)
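# A minimal smoke test for the documented SimpleTransformer variant above, using
# random data. The (batch, seq_len, n_time_series) layout and the default
# sequence lengths follow the constructor; the concrete numbers are illustrative.
if __name__ == "__main__":
    model = SimpleTransformer(number_time_series=3, seq_length=48)

    batch_size = 4
    src = torch.rand(batch_size, 48, 3)  # historical window of all series
    tgt = torch.rand(batch_size, 48, 3)  # decoder input, e.g. shifted targets

    out = model(src, tgt)  # the causal tgt_mask built in __init__ is applied internally
    print(out.shape)       # torch.Size([4, 48])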