def __init__(self, embeddings, feat_dim=512, max_word=32, multi_image=1,
             image_pe=True, layer_norm=False, num_layers=6, teacher_forcing=False,
             image_model=None, image_pretrained=None, finetune_image=False,
             image_finetune_epoch=None, rl_opts=None, word_idxs=None,
             device='gpu', verbose=False):
    super(Visual_GPTSimpleCaptioner, self).__init__(
        embeddings, feat_dim, max_word, multi_image, image_pe, layer_norm,
        teacher_forcing, image_model, image_pretrained, finetune_image,
        image_finetune_epoch, rl_opts, word_idxs, device, verbose)
    # Transformer Decoder
    decoder_layer = TransformerMaxDecoderLayer(feat_dim, nhead=8)
    self.decoder = TransformerDecoder(decoder_layer, num_layers=num_layers)

def _build_transformer_decoder(
    d_model: int,
    nhead: int,
    num_decoder_layers: int,
    dim_feedforward: int,
    dropout: float,
) -> nn.TransformerDecoder:
    """Build a transformer decoder with the given parameters.

    Parameters
    ----------
    d_model : int
    nhead : int
    num_decoder_layers : int
    dim_feedforward : int
    dropout : float

    Returns
    -------
    nn.TransformerDecoder
    """
    decoder_layer = nn.TransformerDecoderLayer(
        d_model=d_model,
        nhead=nhead,
        dim_feedforward=dim_feedforward,
        dropout=dropout,
    )
    decoder_norm = nn.LayerNorm(d_model)
    decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
    return decoder

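# Hedged usage sketch (not from the original source): calling the helper above and
# running one decoder pass. Shapes follow PyTorch's default (seq_len, batch, d_model)
# layout; the sizes and the causal-mask construction are illustrative assumptions.
import torch

decoder = _build_transformer_decoder(d_model=512, nhead=8, num_decoder_layers=6,
                                     dim_feedforward=2048, dropout=0.1)
tgt = torch.randn(32, 4, 512)     # (target_len, batch, d_model) shifted target embeddings
memory = torch.randn(50, 4, 512)  # (source_len, batch, d_model) encoder output
tgt_mask = torch.triu(torch.full((32, 32), float('-inf')), diagonal=1)  # additive causal mask
out = decoder(tgt, memory, tgt_mask=tgt_mask)  # -> (32, 4, 512)
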
def __init__(self, embeddings, feat_dim=512, max_word=32, multi_image=1,
             layer_norm=False, num_memory=40, num_enc_layers=6, num_dec_layers=6,
             teacher_forcing=False, image_model=None, image_pretrained=None,
             finetune_image=False, image_finetune_epoch=None, rl_opts=None,
             word_idxs=None, device='gpu', verbose=False):
    super(M2Transformer, self).__init__(
        embeddings, feat_dim, max_word, multi_image, False, layer_norm,
        teacher_forcing, image_model, image_pretrained, finetune_image,
        image_finetune_epoch, rl_opts, word_idxs, device, verbose)
    # Transformer Encoder
    encoder_layer = TransformerEncoderLayerWithMem(feat_dim, nhead=8, nmem=num_memory)
    self.encoder = MeshedTransformerEncoder(encoder_layer, num_layers=num_enc_layers)
    # Transformer Decoder
    decoder_layer = MeshedTransformerMaxDecoderLayer(
        feat_dim, nhead=8, nlayer_enc=num_enc_layers)
    self.decoder = TransformerDecoder(decoder_layer, num_layers=num_dec_layers)

def __init__(self, vocab_size, hidden_size, num_layers, max_len):
    super(AbsDecoder, self).__init__()
    self.decoder_embedding = nn.Embedding(vocab_size, hidden_size)
    from torch.nn.modules.transformer import TransformerDecoder, TransformerDecoderLayer
    d_model = hidden_size  # the number of expected features in the input
    nhead = 8              # the number of heads in the multiheadattention models
    dim_feedforward = 2048
    dropout = 0.1
    self.positional_encoder = PositionalEncoding(d_model, dropout=dropout, max_len=max_len)
    transformer_decoder_layer = TransformerDecoderLayer(
        d_model, nhead, dim_feedforward, dropout)
    self.transformer_decoder = TransformerDecoder(
        transformer_decoder_layer, num_layers, norm=None)
    # Linear & Softmax layers
    self.linear_decoder = nn.Linear(in_features=hidden_size, out_features=vocab_size, bias=True)
    self.logsoftmax_decoder = nn.LogSoftmax(dim=-1)

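# The PositionalEncoding module referenced above (and in several later snippets) is not
# shown. Below is a minimal sinusoidal sketch in the style of the PyTorch sequence-modeling
# tutorial, given only as an assumption about its shape; the actual implementations (and
# their argument orders, which differ between the snippets here) may vary. Assumes an even
# d_model and the (seq_len, batch, d_model) layout.
import math
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)                 # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)               # even dims
        pe[:, 0, 1::2] = torch.cos(position * div_term)               # odd dims
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch, d_model)
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
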
def __init__(self, embedding, vocab2id, args):
    super().__init__()
    self.embedding = embedding
    self.vocab2id = vocab2id
    self.args = args

    self.input_dim = args.input_dim
    self.head_size = args.target_head_size
    self.feed_forward_dim = args.feed_forward_dim
    self.dropout = args.target_dropout
    self.num_layers = args.target_layers
    self.target_max_len = args.max_target_len
    self.max_oov_count = args.max_oov_count
    self.vocab_size = embedding.num_embeddings

    layer = TransformerDecoderLayer(d_model=self.input_dim, nhead=self.head_size,
                                    dim_feedforward=self.feed_forward_dim,
                                    dropout=self.dropout)
    self.decoder = TransformerDecoder(decoder_layer=layer, num_layers=self.num_layers)

    self.input_copy_proj = nn.Linear(self.input_dim, self.input_dim, bias=False)
    self.copy_proj = nn.Linear(self.input_dim, self.input_dim, bias=False)
    self.embed_proj = nn.Linear(2 * self.input_dim, self.input_dim, bias=False)
    self.generate_proj = nn.Linear(self.input_dim, self.vocab_size, bias=False)

def __init__(self, input_size, output_size, z_size, depth, params, embedding=None,
             highway=False, sbn=None, dropout=0., batchnorm=False, residual=None,
             bidirectional=False, n_mems=20, memory=None, targets=None, nheads=2):
    super(ConditionalCoattentiveTransformerLink, self).__init__(
        input_size, output_size, z_size, depth, params, embedding, highway,
        dropout=dropout, batchnorm=batchnorm, residual=residual)
    output_size = int(output_size / n_mems)
    self.input_to_hidden = nn.Linear(input_size, output_size)
    self.transformer_enc = TransformerEncoder(
        SpecialTransformerEncoder(output_size, nheads,
                                  dim_feedforward=output_size * n_mems,
                                  dropout=dropout, activation='gelu', n_mems=n_mems),
        depth)
    self.transformer_dec = TransformerDecoder(
        TransformerDecoderLayer(output_size, nheads, dim_feedforward=output_size,
                                dropout=dropout, activation='gelu'),
        depth)
    self.memory, self.targets = memory, targets
    self.pe = PositionalEncoding(output_size)
    self.bn = nn.BatchNorm1d(z_size)
    self.n_mems, self.output_size = n_mems, output_size
    self.bidirectional = bidirectional
    if embedding is not None:
        self.sbn = sbn
        if sbn is not None:
            z_params_size = int(embedding.weight.shape[1] / sbn.n_experts)
        else:
            z_params_size = embedding.weight.shape[1]
        self.hidden_to_z_params = nn.ModuleDict(
            {param: nn.Linear(output_size, z_params_size) for param in params})
    else:
        self.hidden_to_z_params = nn.ModuleDict(
            {param: nn.Linear(output_size, z_size) for param in params})
    assert self.residual is None, "Named links still can't have residuals"

def __init__(self, input_size, output_size, z_size, depth, params, embedding=None,
             highway=False, sbn=None, dropout=0., batchnorm=False, residual=None,
             bidirectional=False, n_targets=20, nheads=2, sequence=None, memory=None,
             n_mems=None):
    super(CoattentiveTransformerLink, self).__init__(
        input_size, output_size, z_size, depth, params, embedding, highway,
        dropout=dropout, batchnorm=batchnorm, residual=residual)
    assert output_size % n_targets == 0
    assert z_size % n_targets == 0
    output_size = int(output_size / n_targets)
    self.target = nn.Embedding(n_targets, output_size).weight
    self.n_mems = n_mems
    self.memory = memory
    self.sequence = sequence
    self.input_to_hidden = nn.Linear(input_size, output_size)
    self.transformer_dec = TransformerDecoder(
        TransformerDecoderLayer(output_size, nheads,
                                dim_feedforward=output_size * n_targets,
                                dropout=dropout, activation='gelu'),
        depth)
    self.transformer_enc = TransformerEncoder(
        TransformerEncoderLayer(output_size, nheads, dim_feedforward=output_size,
                                dropout=dropout, activation='gelu'),
        depth)
    self.pe = PositionalEncoding(output_size)
    self.bn = nn.BatchNorm1d(z_size)
    if embedding is not None:
        self.sbn = sbn
        if sbn is not None:
            z_params_size = int(embedding.weight.shape[1] / sbn.n_experts)
        else:
            z_params_size = embedding.weight.shape[1]
        self.hidden_to_z_params = nn.ModuleDict(
            {param: nn.Linear(output_size, z_params_size) for param in params})
    else:
        self.hidden_to_z_params = nn.ModuleDict(
            {param: nn.Linear(output_size, int(z_size / n_targets)) for param in params})

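# Hedged sketch (not from the original source) of the pattern both links above rely on:
# a learned bank of target embeddings is repeated over the batch and fed to a
# TransformerDecoder as `tgt`, cross-attending over an encoded input sequence.
# Sizes and names below are illustrative only.
import torch
import torch.nn as nn
from torch.nn import TransformerDecoder, TransformerDecoderLayer

n_targets, d, batch = 20, 64, 8
target = nn.Embedding(n_targets, d).weight           # (n_targets, d), learned query bank
decoder = TransformerDecoder(
    TransformerDecoderLayer(d, nhead=2, dim_feedforward=d * n_targets,
                            dropout=0.0, activation='gelu'),
    num_layers=2)
memory = torch.randn(50, batch, d)                   # encoded source, (source_len, batch, d)
tgt = target.unsqueeze(1).repeat(1, batch, 1)        # (n_targets, batch, d)
z_hidden = decoder(tgt, memory)                      # (n_targets, batch, d)
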
def __init__(self, ntoken, ninp, nhead, nhid, nlayers, batch_size, dropout=0.5,
             pretrain_cnn=None, pretrain_emb=None, freeze_cnn=True):
    super(TransformerModel, self).__init__()
    self.model_type = 'cnn+transformer'
    decoder_layers = TransformerDecoderLayer(d_model=nhid, nhead=nhead, dropout=dropout)
    self.transformer_decoder = TransformerDecoder(decoder_layers, nlayers)
    self.word_emb = nn.Embedding(ntoken, nhid)
    self.ninp = ninp
    self.nhid = nhid
    self.fc = nn.Linear(512, 512, bias=True)
    self.fc1 = nn.Linear(512, nhid, bias=True)
    self.dec_fc = nn.Linear(nhid, ntoken)
    self.batch_size = batch_size
    self.ntoken = ntoken
    self.encoder = Cnn10()
    self.dropout = nn.Dropout(dropout)
    self.pos_encoder = PositionalEncoding(nhid, dropout)
    self.generator = nn.Softmax(dim=-1)
    self.init_weights()
    if pretrain_cnn is not None:
        # Copy the pretrained CNN weights into the encoder, matching keys by position.
        dict_trained = pretrain_cnn
        dict_new = self.encoder.state_dict().copy()
        new_list = list(self.encoder.state_dict().keys())
        trained_list = list(dict_trained.keys())
        for i in range(len(new_list)):
            dict_new[new_list[i]] = dict_trained[trained_list[i]]
        self.encoder.load_state_dict(dict_new)
    if freeze_cnn:
        self.freeze_cnn()
    if pretrain_emb is not None:
        self.word_emb.weight.data = pretrain_emb

def __init__(
    self,
    vocab_size: int,
    max_seq_len: int,
    d_model: int,
    nhead: int,
    num_layers: int,
    dropout: float,
):
    super(Decoder, self).__init__()
    self.max_seq_len = max_seq_len
    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoder = PositionalEncoding(dropout, d_model)
    decoder_layer = TransformerDecoderLayer(d_model, nhead, 4 * d_model, dropout,
                                            norm_first=True)
    self.decoder = TransformerDecoder(decoder_layer, num_layers, nn.LayerNorm(d_model))
    self.output = nn.Linear(d_model, vocab_size)

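# The forward pass of the Decoder above is not shown. Below is a hedged sketch of greedy,
# token-by-token generation that such a module typically supports; the actual interface
# (argument order of PositionalEncoding, special-token ids, embedding scaling) may differ.
# `bos_id`, `eos_id`, and `memory` are assumptions, and `dec.eval()` is assumed to have
# been called so dropout is disabled.
import torch

@torch.no_grad()
def greedy_decode(dec, memory, bos_id, eos_id, max_len=32):
    # memory: (source_len, 1, d_model) output of some encoder, batch size 1
    ys = torch.tensor([[bos_id]], device=memory.device)            # running target tokens, (L, 1)
    for _ in range(max_len - 1):
        tgt = dec.pos_encoder(dec.embedding(ys))                   # embed + positional encoding
        L = tgt.size(0)
        tgt_mask = torch.triu(torch.full((L, L), float('-inf'), device=memory.device),
                              diagonal=1)                          # additive causal mask
        logits = dec.output(dec.decoder(tgt, memory, tgt_mask=tgt_mask))  # (L, 1, vocab)
        next_id = logits[-1, 0].argmax().item()                    # greedy pick for last step
        ys = torch.cat([ys, torch.tensor([[next_id]], device=memory.device)], dim=0)
        if next_id == eos_id:
            break
    return ys.squeeze(1)
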
def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=None,
             dim_feedforward=2048, dropout=0.1, activation="relu"):
    super(MultiDecodersTransformer, self).__init__()
    encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
    encoder_norm = LayerNorm(d_model)
    self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

    decoders = {}
    if num_decoder_layers:
        for k, v in num_decoder_layers.items():
            decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
                                                    dropout, activation)
            decoder_norm = LayerNorm(d_model)
            decoder = TransformerDecoder(decoder_layer, v, decoder_norm)
            decoders[k] = decoder
    self.decoders = ModuleDict(decoders.items())

    self._reset_parameters()
    self.d_model = d_model
    self.nhead = nhead

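# Hedged usage sketch (not from the original source): `num_decoder_layers` is a dict
# mapping a decoder name to its depth, so the model holds one shared encoder and one
# TransformerDecoder per task in an nn.ModuleDict. Task names and shapes below are
# illustrative, and the rest of the class (e.g. _reset_parameters) is assumed to exist.
import torch

model = MultiDecodersTransformer(d_model=512, nhead=8, num_encoder_layers=6,
                                 num_decoder_layers={'caption': 6, 'tags': 2})
src = torch.randn(50, 4, 512)        # (source_len, batch, d_model)
tgt = torch.randn(20, 4, 512)        # (target_len, batch, d_model)
memory = model.encoder(src)          # shared encoder output
out_caption = model.decoders['caption'](tgt, memory)
out_tags = model.decoders['tags'](tgt, memory)
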
def __init__(self,
             decoding_dim: int,
             target_embedding_dim: int,
             feedforward_hidden_dim: int,
             num_layers: int,
             num_attention_heads: int,
             use_positional_encoding: bool = True,
             positional_encoding_max_steps: int = 1000,
             dropout_prob: float = 0.1) -> None:
    super().__init__(decoding_dim=decoding_dim,
                     target_embedding_dim=target_embedding_dim,
                     decodes_parallel=True)
    decoder_layer = TransformerDecoderLayer(decoding_dim, num_attention_heads,
                                            feedforward_hidden_dim, dropout_prob)
    decoder_norm = LayerNorm(decoding_dim)
    self._decoder = TransformerDecoder(decoder_layer, num_layers, decoder_norm)
    self._dropout = Dropout(dropout_prob)
    self._use_positional_encoding = use_positional_encoding
    self._reset_parameters()

def decoder(self, decoder_layer: nn.Module) -> nn.Module:
    return TransformerDecoder(decoder_layer, num_layers=6)

def __init__(self,
             vocab: Vocabulary,
             metrics_dict_seq: dict,
             metrics_dict_reg: dict,
             input_dim=512,
             num_attention_heads=8,
             num_encoder_layers=6,
             num_decoder_layers=6,
             feedforward_hidden_dim=2048,
             dropout=0.1,
             transformer_dropout=0.1,
             activation='relu',
             linear_layers_activation='relu',
             custom_encoder=None,
             custom_decoder=None,
             positional_encoding: Optional[str] = None,
             predict_avg_total_payoff: bool = True,
             predict_seq: bool = True,
             attention: Attention = DotProductAttention(),
             seq_weight_loss: float = 0.5,
             reg_weight_loss: float = 0.5,
             batch_size: int = 9,
             linear_dim: int = None,
             only_raisha: bool = False,  # True when no saifa input is given (raisha only)
             ):
    super(TransformerBasedModel, self).__init__(vocab)
    if custom_encoder is not None:
        self.encoder = custom_encoder
    else:
        encoder_layer = TransformerEncoderLayer(input_dim, num_attention_heads,
                                                feedforward_hidden_dim,
                                                transformer_dropout, activation)
        encoder_norm = LayerNorm(input_dim)
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
    if custom_decoder is not None:
        self.decoder = custom_decoder
    else:
        decoder_layer = TransformerDecoderLayer(input_dim, num_attention_heads,
                                                feedforward_hidden_dim,
                                                transformer_dropout, activation)
        decoder_norm = LayerNorm(input_dim)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
    self._reset_parameters()
    self._input_dim = input_dim
    self.num_attention_heads = num_attention_heads

    if positional_encoding is None:
        self._sinusoidal_positional_encoding = False
        self._positional_embedding = None
    elif positional_encoding == "sinusoidal":
        self._sinusoidal_positional_encoding = True
        self._positional_embedding = None
    else:
        raise ValueError(
            "positional_encoding must be one of None, 'sinusoidal', or 'embedding'")

    if predict_avg_total_payoff:  # need attention and regression layer
        self.attention = attention
        if linear_dim is not None and predict_seq:  # avg_turn_linear models
            input_dim_attention = linear_dim
        else:
            input_dim_attention = input_dim
        self.linear_after_attention_layer = LinearLayer(input_size=input_dim_attention,
                                                        output_size=batch_size,
                                                        activation=linear_layers_activation)
        self.regressor = LinearLayer(input_size=batch_size, output_size=1,
                                     dropout=dropout, activation=linear_layers_activation)
        self.attention_vector = torch.randn((batch_size, input_dim_attention),
                                            requires_grad=True)
        if torch.cuda.is_available():
            self.attention_vector = self.attention_vector.cuda()
        self.mse_loss = nn.MSELoss()

    if predict_seq:  # need hidden2tag layer
        if linear_dim is not None:  # add linear layer before hidden2tag
            self.linear_layer = LinearLayer(input_size=input_dim, output_size=linear_dim,
                                            dropout=dropout,
                                            activation=linear_layers_activation)
            hidden2tag_input_size = linear_dim
        else:
            self.linear_layer = None
            hidden2tag_input_size = input_dim
        self.hidden2tag = LinearLayer(input_size=hidden2tag_input_size,
                                      output_size=vocab.get_vocab_size('labels'),
                                      dropout=dropout,
                                      activation=linear_layers_activation)

    self.metrics_dict_seq = metrics_dict_seq
    self.metrics_dict_reg = metrics_dict_reg
    self.seq_predictions = defaultdict(dict)
    self.reg_predictions = pd.DataFrame()
    self._epoch = 0
    self._first_pair = None
    self.seq_weight_loss = seq_weight_loss
    self.reg_weight_loss = reg_weight_loss
    self.predict_avg_total_payoff = predict_avg_total_payoff
    self.predict_seq = predict_seq
    self.only_raisha = only_raisha