def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super().__init__()

    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    # Linear projections for queries, keys, and values across all heads.
    self.w_qs = nn.Linear(d_model, n_head * d_k)
    self.w_ks = nn.Linear(d_model, n_head * d_k)
    self.w_vs = nn.Linear(d_model, n_head * d_v)
    nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

    # Scaled dot-product attention with temperature sqrt(d_k).
    self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
    self.layer_norm = LayerNorm(d_model)

    # Final projection back to d_model after concatenating the heads.
    self.fc = nn.Linear(n_head * d_v, d_model)
    nn.init.xavier_normal_(self.fc.weight)

    self.dropout = nn.Dropout(dropout)
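# A minimal, self-contained sketch of the scaled dot-product attention that the
# ScaledDotProductAttention module above is assumed to implement (temperature =
# sqrt(d_k)). The function name, tensor names, and shapes here are illustrative,
# not taken from the source.
import torch

def scaled_dot_product_attention(q, k, v, temperature):
    # q, k: (batch, len, d_k); v: (batch, len, d_v)
    attn = torch.bmm(q, k.transpose(1, 2)) / temperature  # (batch, len, len)
    attn = torch.softmax(attn, dim=-1)
    return torch.bmm(attn, v)                              # (batch, len, d_v)

q = torch.randn(2, 5, 64)
k = torch.randn(2, 5, 64)
v = torch.randn(2, 5, 64)
out = scaled_dot_product_attention(q, k, v, temperature=64 ** 0.5)  # (2, 5, 64)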
def __init__(self, seq_length: int, output_seq_length: int, n_time_series: int, d_model=128,
             output_dim=1, n_layers_encoder=6, forward_dim=2048, dropout=0.1, use_mask=False,
             meta_data=None, n_heads=8):
    """Uses a number of encoder layers with a simple linear decoder layer."""
    super().__init__()
    self.dense_shape = torch.nn.Linear(n_time_series, d_model)
    self.pe = SimplePositionalEncoding(d_model)
    encoder_layer = TransformerEncoderLayer(d_model, n_heads, forward_dim, dropout)
    encoder_norm = LayerNorm(d_model)
    self.transformer_enc = TransformerEncoder(encoder_layer, n_layers_encoder, encoder_norm)
    self.output_dim_layer = torch.nn.Linear(d_model, output_dim)
    self.output_seq_length = output_seq_length
    self.out_length_lay = torch.nn.Linear(seq_length, output_seq_length)
    self.mask = generate_square_subsequent_mask(seq_length)
    self.mask_it = use_mask
    if meta_data:
        self.meta_merger = MergingModel(meta_data["method"], meta_data["params"])
def __init__(self, input_dim=13, num_classes=9, d_model=64, n_head=2, n_layers=5,
             d_inner=128, activation="relu", dropout=0.017998950510888446, max_len=200):
    super(PETransformerModel, self).__init__()
    self.modelname = f"PeTransformerEncoder_input-dim={input_dim}_num-classes={num_classes}_" \
                     f"d-model={d_model}_d-inner={d_inner}_n-layers={n_layers}_n-head={n_head}_" \
                     f"dropout={dropout}"

    encoder_layer = TransformerEncoderLayer(d_model, n_head, d_inner, dropout, activation)
    encoder_norm = LayerNorm(d_model)

    self.inlinear = Linear(input_dim, d_model)
    self.relu = ReLU()
    self.transformerencoder = TransformerEncoder(encoder_layer, n_layers, encoder_norm)
    self.flatten = Flatten()
    self.outlinear = Linear(d_model, num_classes)
    self.pe = PositionalEncoding(d_model, max_len=max_len)
def __init__(self, seq_length: int, output_seq_length: int, n_time_series: int, d_model=128,
             output_dim=1, n_layers_encoder=6, use_mask=False, n_heads=8):
    """Uses a number of encoder layers with a simple linear decoder layer."""
    super().__init__()
    self.dense_shape = torch.nn.Linear(n_time_series, d_model)
    self.pe = SimplePositionalEncoding(d_model)
    encoder_layer = TransformerEncoderLayer(d_model, n_heads)
    encoder_norm = LayerNorm(d_model)
    self.transformer_enc = TransformerEncoder(encoder_layer, n_layers_encoder, encoder_norm)
    self.output_dim_layer = torch.nn.Linear(d_model, output_dim)
    self.output_seq_length = output_seq_length
    self.out_length_lay = torch.nn.Linear(seq_length, output_seq_length)
    self.mask = generate_square_subsequent_mask(seq_length)
    self.mask_it = use_mask
def __init__(self, d_model, dropout=0.1, max_len=512):
    super(PositionalEncodings, self).__init__()
    self.dropout = nn.Dropout(p=dropout)
    self.position_embeddings = nn.Embedding(max_len, d_model)
    self.token_type_embeddings = nn.Embedding(2, d_model)
    self.embedding_layer_norm = LayerNorm(d_model, eps=1e-12)
def __init__(self, d_model, vocab_size=30522, dropout=0.1, max_len=512):
    super(BERTStyleEmbedding, self).__init__()
    self.dropout = nn.Dropout(p=dropout)
    # Word, learned position, and token-type (segment) embeddings, as in BERT.
    self.word_embeddings = nn.Embedding(vocab_size, d_model)
    self.position_embeddings = nn.Embedding(max_len, d_model)
    self.token_type_embeddings = nn.Embedding(2, d_model)
    self.embedding_layer_norm = LayerNorm(d_model, eps=1e-12)
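# A minimal sketch of how BERT-style embeddings are typically combined in the
# forward pass (sum of word, position, and token-type embeddings, then LayerNorm
# and dropout). This is illustrative only; the source above shows just the
# constructor, and all sizes below are made up.
import torch
import torch.nn as nn

d_model, vocab_size, max_len = 64, 30522, 512
word_emb = nn.Embedding(vocab_size, d_model)
pos_emb = nn.Embedding(max_len, d_model)
type_emb = nn.Embedding(2, d_model)
norm = nn.LayerNorm(d_model, eps=1e-12)
drop = nn.Dropout(0.1)

input_ids = torch.randint(0, vocab_size, (2, 16))           # (batch, seq_len)
token_type_ids = torch.zeros_like(input_ids)                 # single segment
positions = torch.arange(16).unsqueeze(0).expand(2, 16)      # (batch, seq_len)

embeddings = word_emb(input_ids) + pos_emb(positions) + type_emb(token_type_ids)
embeddings = drop(norm(embeddings))                          # (2, 16, 64)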
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)

    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)

    self.activation = _get_activation_fn(activation)
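# A sketch of the post-norm forward pass these submodules pair with: self-attention
# with dropout, a residual connection and LayerNorm, then the feed-forward block
# with another residual and LayerNorm. It is written against torch.nn for
# illustration; the original module's forward method is not shown in the source.
import torch
import torch.nn as nn

def encoder_layer_forward(layer, src):
    # Self-attention block: attention -> dropout -> residual -> LayerNorm.
    attn_out, _ = layer.self_attn(src, src, src)
    src = layer.norm1(src + layer.dropout1(attn_out))
    # Feed-forward block: linear -> activation -> dropout -> linear -> residual -> LayerNorm.
    ff = layer.linear2(layer.dropout(layer.activation(layer.linear1(src))))
    return layer.norm2(src + layer.dropout2(ff))

layer = nn.TransformerEncoderLayer(d_model=64, nhead=2, dim_feedforward=256)
src = torch.randn(10, 4, 64)              # (seq_len, batch, d_model)
out = encoder_layer_forward(layer, src)   # (10, 4, 64)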
def __init__(self, dim_model=300, num_heads=12, dim_feedforward=2048, dropout=0.2):
    super().__init__()
    encoder_layer = nn.TransformerEncoderLayer(dim_model, num_heads, dim_feedforward, dropout)
    encoder_norm = LayerNorm(dim_model)
    self.transformer = nn.TransformerEncoder(encoder_layer, 1, encoder_norm)
def build_transformer_model(src_vocab_size: int,
                            tgt_vocab_size: int,
                            rnn_size: int = RNN_SIZE,
                            num_head: int = 4,
                            num_layers: int = 3,
                            dim_ff: int = 1024,
                            dropout: float = DROPOUT) -> EncoderDecoder:
    """Build a transformer model based on the paper "Attention Is All You Need".

    Arguments:
        src_vocab_size: vocab size for the encoder
        tgt_vocab_size: vocab size for the decoder
        rnn_size: size of the hidden states in the encoder/decoder
        num_head: the number of heads in the multi-headed attention
        num_layers: number of encoder/decoder layers
        dim_ff: the dimension of the feed-forward layer
        dropout: the dropout probability
    """
    # Build encoder
    encoder_layer = TransformerEncoderLayer(rnn_size, num_head, dim_ff, dropout)
    encoder_norm = LayerNorm(rnn_size)
    encoder = TransformerEncoder(encoder_layer, num_layers, encoder_norm)

    # Build decoder
    decoder_layer = TransformerDecoderLayer(rnn_size, num_head, dim_ff, dropout)
    decoder_norm = LayerNorm(rnn_size)
    decoder = TransformerDecoder(decoder_layer, num_layers, decoder_norm)

    # Build generator
    generator = Generator(rnn_size, tgt_vocab_size)

    return EncoderDecoder(encoder, decoder, generator, rnn_size, src_vocab_size, tgt_vocab_size)
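# The same encoder/decoder stacking pattern, sketched with torch.nn directly so it
# runs standalone. EncoderDecoder, Generator, RNN_SIZE, and DROPOUT come from the
# surrounding project and are not defined here; the sizes below are illustrative.
import torch.nn as nn

rnn_size, num_head, num_layers, dim_ff, dropout = 512, 4, 3, 1024, 0.1

encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(rnn_size, num_head, dim_ff, dropout),
    num_layers,
    nn.LayerNorm(rnn_size))
decoder = nn.TransformerDecoder(
    nn.TransformerDecoderLayer(rnn_size, num_head, dim_ff, dropout),
    num_layers,
    nn.LayerNorm(rnn_size))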
def __init__(self, d_model: int = 512, nhead: int = 8, num_encoder_layers: int = 6,
             num_decoder_layers: int = 6, dim_feedforward: int = 2048, dropout: float = 0.1,
             activation: str = "relu", custom_encoder: Optional[Any] = None,
             custom_decoder: Optional[Any] = None) -> None:
    super(Transformer, self).__init__()

    if custom_encoder is not None:
        self.encoder = custom_encoder
    else:
        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        encoder_norm = LayerNorm(d_model)
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

    if custom_decoder is not None:
        self.decoder = custom_decoder
    else:
        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        decoder_norm = LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)

    self._reset_parameters()

    self.d_model = d_model
    self.nhead = nhead
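# This constructor mirrors torch.nn.Transformer, so a minimal usage sketch with the
# stock module works the same way; tensors follow the default (seq_len, batch,
# d_model) layout, and the sizes below are arbitrary.
import torch
import torch.nn as nn

model = nn.Transformer(d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6)
src = torch.randn(10, 32, 512)   # source sequence
tgt = torch.randn(20, 32, 512)   # target sequence
out = model(src, tgt)            # (20, 32, 512)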
def __init__(self, input_dim=13, num_classes=9, sequencelength=13, d_model=64, n_head=1,
             n_layers=3, d_inner=256, activation="relu", dropout=0.39907201621346594):
    super(TransformerModel, self).__init__()
    self.modelname = f"TransformerEncoder_input-dim={input_dim}_num-classes={num_classes}_" \
                     f"d-model={d_model}_d-inner={d_inner}_n-layers={n_layers}_n-head={n_head}_" \
                     f"dropout={dropout}"

    encoder_layer = TransformerEncoderLayer(d_model, n_head, d_inner, dropout, activation)
    encoder_norm = LayerNorm(d_model)

    self.sequential = Sequential(
        Linear(input_dim, d_model),
        ReLU(),
        TransformerEncoder(encoder_layer, n_layers, encoder_norm),
        Flatten(),
        ReLU(),
        Linear(d_model * sequencelength, num_classes))
def __init__(self, seq_length: int, output_seq_length: int, n_time_series: int, d_model=128,
             output_dim=1, n_layers_encoder=6, forward_dim=2048, dropout=0.1, use_mask=False,
             meta_data=None, final_act=None, squashed_embedding=False, n_heads=8):
    """Uses a number of encoder layers with a simple linear decoder layer.

    :param seq_length: The number of historical time-steps fed into the model on each forward pass.
    :type seq_length: int
    :param output_seq_length: The number of forecasted time-steps output by the model.
    :type output_seq_length: int
    :param n_time_series: The total number of time series present (targets + features).
    :type n_time_series: int
    :param d_model: The embedding dimension of the model, defaults to 128
    :type d_model: int, optional
    :param output_dim: The output dimension (should correspond to n_targets), defaults to 1
    :type output_dim: int, optional
    :param n_layers_encoder: The number of encoder layers, defaults to 6
    :type n_layers_encoder: int, optional
    :param forward_dim: The dimension of the feed-forward layer, defaults to 2048
    :type forward_dim: int, optional
    :param dropout: How much dropout to use, defaults to 0.1
    :type dropout: float, optional
    :param use_mask: Whether to use a subsequent-sequence mask during training, defaults to False
    :type use_mask: bool, optional
    :param meta_data: Configuration for merging static meta-data (method and params), defaults to None
    :type meta_data: dict, optional
    :param final_act: The final activation function to use, if any, defaults to None
    :type final_act: str, optional
    :param squashed_embedding: Whether to squash the sequence into a single 1-D time embedding, defaults to False
    :type squashed_embedding: bool, optional
    :param n_heads: The number of attention heads, defaults to 8
    :type n_heads: int, optional
    """
    super().__init__()
    self.dense_shape = torch.nn.Linear(n_time_series, d_model)
    self.pe = SimplePositionalEncoding(d_model)
    encoder_layer = TransformerEncoderLayer(d_model, n_heads, forward_dim, dropout)
    encoder_norm = LayerNorm(d_model)
    self.transformer_enc = TransformerEncoder(encoder_layer, n_layers_encoder, encoder_norm)
    self.output_dim_layer = torch.nn.Linear(d_model, output_dim)
    self.output_seq_length = output_seq_length
    self.out_length_lay = torch.nn.Linear(seq_length, output_seq_length)
    self.mask = generate_square_subsequent_mask(seq_length)
    self.out_dim = output_dim
    self.mask_it = use_mask
    self.final_act = None
    self.squashed = None
    if final_act:
        self.final_act = activation_dict[final_act]
    if meta_data:
        self.meta_merger = MergingModel(meta_data["method"], meta_data["params"])
    if squashed_embedding:
        self.squashed = torch.nn.Linear(seq_length, 1)
        self.unsquashed = torch.nn.Linear(1, seq_length)
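# A standalone sketch of how the layers above can compose into an encoder-only
# forecaster, built from torch.nn modules so it runs without SimplePositionalEncoding
# or the project's other helpers. This is one plausible shape flow, not the project's
# actual forward method (which also applies positional encoding, masking, and
# optional meta-data merging); all sizes are illustrative.
import torch
import torch.nn as nn

seq_length, output_seq_length, n_time_series, d_model, output_dim = 30, 5, 4, 128, 1

dense_shape = nn.Linear(n_time_series, d_model)
encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model, 8, 2048, 0.1), 6,
                                nn.LayerNorm(d_model))
output_dim_layer = nn.Linear(d_model, output_dim)
out_length_lay = nn.Linear(seq_length, output_seq_length)

x = torch.randn(2, seq_length, n_time_series)     # (batch, seq_len, n_time_series)
x = dense_shape(x)                                # (batch, seq_len, d_model)
x = encoder(x.permute(1, 0, 2))                   # (seq_len, batch, d_model)
x = output_dim_layer(x)                           # (seq_len, batch, output_dim)
x = out_length_lay(x.permute(2, 1, 0))            # (output_dim, batch, output_seq_length)
forecast = x.permute(1, 2, 0)                     # (batch, output_seq_length, output_dim)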
def __init__(self, d_in, d_hid, dropout=0.1):
    super().__init__()
    self.w_1 = nn.Linear(d_in, d_hid)  # position-wise
    self.w_2 = nn.Linear(d_hid, d_in)  # position-wise
    self.layer_norm = LayerNorm(d_in)
    self.dropout = nn.Dropout(dropout)
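# A sketch of the usual position-wise feed-forward pass these layers are used for:
# two linear maps with a non-linearity, dropout, a residual connection, and LayerNorm.
# The original module's forward method is not shown in the source, and the sizes
# below are illustrative.
import torch
import torch.nn as nn
import torch.nn.functional as F

d_in, d_hid = 64, 256
w_1, w_2 = nn.Linear(d_in, d_hid), nn.Linear(d_hid, d_in)
layer_norm, dropout = nn.LayerNorm(d_in), nn.Dropout(0.1)

x = torch.randn(2, 10, d_in)                        # (batch, seq_len, d_in)
out = layer_norm(x + dropout(w_2(F.relu(w_1(x)))))  # same shape as x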