def __init__(self, vocab_size, embed_size, hidden_size, memory_bank_size,
             copy_attn, pad_idx, dropout, cur_model):
    super(RNNDecoder, self).__init__()
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.memory_bank_size = memory_bank_size
    self.dropout = nn.Dropout(dropout)
    self.copy_attn = copy_attn
    self.pad_token = pad_idx
    self.cur_model = cur_model
    self.use_img = 'img' in self.cur_model
    self.use_attr = 'attr' in self.cur_model
    self.embedding = nn.Embedding(self.vocab_size, self.embed_size,
                                  self.pad_token)
    self.input_size = embed_size
    self.rnn = nn.GRU(input_size=self.input_size,
                      hidden_size=hidden_size,
                      num_layers=1,
                      bidirectional=False,
                      batch_first=False)
    self.attention_layer = Attention(decoder_size=hidden_size,
                                     memory_bank_size=memory_bank_size,
                                     need_mask=True)
    self.combine_pred = 'combine' in cur_model
    self.combine_pred_type = 'direct' if 'direct' in cur_model else 'embed'
    if self.combine_pred:
        if self.combine_pred_type == 'embed':
            self.pred_att = Attention(decoder_size=hidden_size,
                                      memory_bank_size=memory_bank_size,
                                      need_mask=True)
        self.cls_pred_p_gen_linear = nn.Linear(
            embed_size + hidden_size + memory_bank_size, 1)
    if copy_attn:
        self.p_gen_linear = nn.Linear(
            embed_size + hidden_size + memory_bank_size, 1)
    self.sigmoid = nn.Sigmoid()
    self.vocab_dist_linear_1 = nn.Linear(hidden_size + memory_bank_size,
                                         hidden_size)
    self.vocab_dist_linear_2 = nn.Linear(hidden_size, vocab_size)
    self.softmax = MaskedSoftmax(dim=1)
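# Usage sketch (illustrative values, not from the source): the signature above
# suggests a construction like the following, where cur_model is a string whose
# substrings ('img', 'attr', 'combine', 'direct') toggle sub-modules.
#   decoder = RNNDecoder(vocab_size=50000, embed_size=256, hidden_size=512,
#                        memory_bank_size=512, copy_attn=True, pad_idx=0,
#                        dropout=0.1, cur_model='combine_direct_img')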
def __init__(self, layer_size=8, input_channels=3, att=False):
    super().__init__()
    self.layer_size = layer_size
    # Encoder: two VSR layers, then partial-convolution downsampling.
    # Use the configured channel count so input_channels != 3 also works
    # (dec_1 below already consumes the raw input again).
    self.enc_1 = VSRLayer(input_channels, 64, kernel_size=7)
    self.enc_2 = VSRLayer(64, 128, kernel_size=5)
    self.enc_3 = PConvLayer(128, 256, sample='down-5')
    self.enc_4 = PConvLayer(256, 512, sample='down-3')
    for i in range(4, self.layer_size):
        name = 'enc_{:d}'.format(i + 1)
        setattr(self, name, PConvLayer(512, 512, sample='down-3'))
    self.deconv = nn.ConvTranspose2d(512, 512, 4, 2, 1)
    # Decoder: mirror the encoder, concatenating skip connections.
    for i in range(4, self.layer_size):
        name = 'dec_{:d}'.format(i + 1)
        setattr(self, name,
                PConvLayer(512 + 512, 512, activ='leaky', deconv=True))
    self.dec_4 = PConvLayer(512 + 256, 256, activ='leaky', deconv=True)
    if att:
        self.att = Attention.AttentionModule()
    else:
        self.att = lambda x: x
    self.dec_3 = PConvLayer(256 + 128, 128, activ='leaky', deconv=True)
    self.dec_2 = VSRLayer(128 + 64, 64, stride=1, activation='leaky',
                          deconv=True)
    self.dec_1 = VSRLayer(64 + input_channels, 64, stride=1,
                          activation=None, batch_norm=False)
    self.resolver = Bottleneck(64, 16)
    self.output = nn.Conv2d(128, 3, 1)
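# Usage sketch (hypothetical class name, since the enclosing class is not
# shown here): with layer_size=8 the loops register enc_5..enc_8 and
# dec_5..dec_8 via setattr, so those layers exist only for the configured depth.
#   net = PConvUNet(layer_size=8, input_channels=3, att=True)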
def __init__(self, vocab, config, pretrained_embedding):
    super(HANEncoder, self).__init__()
    self.config = config
    self.word_embed = nn.Embedding(vocab.vocab_size, config.word_dims,
                                   padding_idx=0)
    self.extword_embed = nn.Embedding(vocab.extvocab_size, config.word_dims,
                                      padding_idx=0)
    word_init = np.zeros((vocab.vocab_size, config.word_dims),
                         dtype=np.float32)
    self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
    self.extword_embed.weight.data.copy_(
        torch.from_numpy(pretrained_embedding))
    self.extword_embed.weight.requires_grad = False
    self.role_embed = nn.Embedding(vocab.role_size, config.role_dims,
                                   padding_idx=0)
    role_init = np.zeros((vocab.role_size, config.role_dims),
                         dtype=np.float32)
    self.role_embed.weight.data.copy_(torch.from_numpy(role_init))
    self.sent_lstm = MyLSTM(
        input_size=config.word_dims,
        hidden_size=config.lstm_hiddens,
        num_layers=config.lstm_layers,
        batch_first=True,
        bidirectional=True,
        dropout_in=config.dropout_lstm_input,
        dropout_out=config.dropout_lstm_hidden,
    )
    self.turn_lstm = MyLSTM(
        input_size=config.lstm_hiddens * 2,
        hidden_size=config.lstm_hiddens,
        num_layers=config.lstm_layers,
        batch_first=True,
        bidirectional=True,
        dropout_in=config.dropout_lstm_input,
        dropout_out=config.dropout_lstm_hidden,
    )
    self.sent_att = Attention(config.lstm_hiddens * 2)
    self.turn_att = Attention(config.lstm_hiddens * 2)
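# Usage sketch (illustrative): the encoder expects a vocab object exposing
# vocab_size/extvocab_size/role_size, a config with the fields read above, and
# a float32 matrix of shape (vocab.extvocab_size, config.word_dims), which is
# copied into extword_embed and frozen.
#   encoder = HANEncoder(vocab, config, pretrained_embedding)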
def __init__(self, embedding_size, hidden_size, seq_len, n_glimpses,
             tanh_exploration):
    super(RNNTSP, self).__init__()
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.n_glimpses = n_glimpses
    self.seq_len = seq_len
    self.embedding = GraphEmbedding(2, embedding_size)
    self.encoder = nn.LSTM(embedding_size, hidden_size, batch_first=True)
    self.decoder = nn.LSTM(embedding_size, hidden_size, batch_first=True)
    self.pointer = Attention(hidden_size, C=tanh_exploration)
    self.glimpse = Attention(hidden_size)
    self.decoder_start_input = nn.Parameter(
        torch.FloatTensor(embedding_size))
    self.decoder_start_input.data.uniform_(
        -(1. / math.sqrt(embedding_size)), 1. / math.sqrt(embedding_size))
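# Usage sketch (illustrative hyperparameters): a pointer network for 2-D TSP
# instances; tanh_exploration is the tanh logit-clipping constant C passed to
# the pointer attention.
#   model = RNNTSP(embedding_size=128, hidden_size=128, seq_len=20,
#                  n_glimpses=1, tanh_exploration=10)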
def __init__(self, class_num, word_embedding_matrix,
             position1_embedding_matrix, position2_embedding_matrix,
             filters, kernel_size, padding=0, activation=nn.Tanh(),
             dropout=0.5, nis_hidden_dims=None):
    super(APCNN_NIS, self).__init__()
    # Avoid a mutable default argument for the hidden-dim list.
    if nis_hidden_dims is None:
        nis_hidden_dims = []
    self.filters = filters
    word_dim = word_embedding_matrix.shape[1]
    self.embedding = Embedding(word_embedding_matrix,
                               position1_embedding_matrix,
                               position2_embedding_matrix)
    self.pcnn = PCNN(1, filters, kernel_size, padding=padding,
                     activation=activation, dropout=dropout)
    self.attention = Attention(3 * filters + word_dim,
                               activation=activation)
    self.attention_weight = None
    self.nis = NIS(3 * filters, nis_hidden_dims)
    self.linear = nn.Linear(3 * filters, class_num, bias=True)
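# Usage sketch (illustrative values): the embedding matrices are arrays whose
# second dimension fixes word_dim (word_embedding_matrix.shape[1] above).
#   model = APCNN_NIS(class_num=53, word_embedding_matrix=word_emb,
#                     position1_embedding_matrix=pos1_emb,
#                     position2_embedding_matrix=pos2_emb,
#                     filters=230, kernel_size=3)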
def __init__(self, config, device):
    super(CRNNet, self).__init__()
    self.config = config
    self.stages = {
        'Trans': config['transform'],
        'Feat': config['backbone'],
        'Seq': config['sequence'],
        'Pred': config['prediction'],
    }
    if config['transform'] == 'TPS':
        self.Transformation = TPS_SpatialTransformerNetwork(
            F=config['num_fiducial'],
            im_size=(config['height'], config['width']),
            im_rectified=(config['height'], config['width']),
            num_channels=config['input_channel'],
            device=device)
    else:
        print('No tps specified')
    if config['backbone'] == 'ResNet':
        self.FeatureExtraction = ResNet_FeatureExtractor(
            config['input_channel'], config['output_channel'])
    else:
        raise Exception('No backbone module specified')
    self.FeatureExtraction_output = config['output_channel']  # int(imgH/16-1) * 512
    self.AdaptiveAvgPool = nn.AdaptiveAvgPool2d(
        (None, 1))  # transform final (imgH/16-1) -> 1
    if config['sequence'] == 'biLSTM':
        self.SequenceModeling = nn.Sequential(
            BidirectionalLSTM(self.FeatureExtraction_output,
                              config['hidden_size'], config['hidden_size']),
            BidirectionalLSTM(config['hidden_size'], config['hidden_size'],
                              config['hidden_size']))
        self.SequenceModeling_output = config['hidden_size']
    else:
        print('No sequence module specified')
        self.SequenceModeling_output = self.FeatureExtraction_output
    if config['prediction'] == 'CTC':
        self.Prediction = nn.Linear(self.SequenceModeling_output,
                                    config['num_classes'])
    elif config['prediction'] == 'Attention':
        self.Prediction = Attention(self.SequenceModeling_output,
                                    config['hidden_size'],
                                    config['num_classes'],
                                    device=device)
    else:
        raise Exception('prediction needs to be either CTC or '
                        'attention-based sequence prediction')
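# Config sketch (keys taken from the lookups above; values illustrative):
#   config = {'transform': 'TPS', 'backbone': 'ResNet', 'sequence': 'biLSTM',
#             'prediction': 'Attention', 'num_fiducial': 20, 'height': 32,
#             'width': 100, 'input_channel': 1, 'output_channel': 512,
#             'hidden_size': 256, 'num_classes': 96}
#   model = CRNNet(config, device=torch.device('cuda'))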
def __init__(
    self,
    dim,
    depth,
    heads,
    mlp_ratio=4.0,
    attn_dropout=0.0,
    dropout=0.0,
    qkv_bias=True,
    revised=False,
):
    super().__init__()
    self.layers = nn.ModuleList([])
    assert isinstance(
        mlp_ratio, float
    ), "MLP ratio should be a float for a valid MLP hidden dimension"
    mlp_dim = int(mlp_ratio * dim)
    for _ in range(depth):
        self.layers.append(
            nn.ModuleList([
                PreNorm(
                    dim,
                    Attention(
                        dim,
                        num_heads=heads,
                        qkv_bias=qkv_bias,
                        attn_drop=attn_dropout,
                        proj_drop=dropout,
                    ),
                ),
                PreNorm(
                    dim,
                    FeedForward(dim, mlp_dim, dropout_rate=dropout),
                ) if not revised else FeedForward(
                    dim,
                    mlp_dim,
                    dropout_rate=dropout,
                    revised=True,
                ),
            ]))
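# Usage sketch (hypothetical class name, since the enclosing class is not
# shown): a stack of depth pre-norm attention + feed-forward blocks in the
# ViT style. Note mlp_ratio must be a float to pass the assert above.
#   blocks = TransformerEncoder(dim=768, depth=12, heads=12, mlp_ratio=4.0)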
def __init__(self, input_size, hidden_size, text_embed_size,
             reduction_factor=2):
    super(Decoder_Mel, self).__init__()
    self.text_embed_size = text_embed_size
    self.prenet = Prenet(input_size=input_size,
                         hidden_size=hps.prenet_size[0],
                         output_size=hidden_size // 2,
                         dropout_rate=hps.prenet_dropout_rate)
    self.attnRNN = AttentionRNN(input_size=hidden_size // 2,
                                hidden_size=hidden_size,
                                text_embed_size=text_embed_size)
    self.attn = Attention(query_size=hidden_size,
                          context_size=text_embed_size)
    self.decRNN = DecoderRNN(input_size=hidden_size + text_embed_size,
                             output_size=hps.n_mels,
                             r=reduction_factor)
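# Usage sketch (illustrative values; hps is the hyperparameter module read
# above): the prenet emits hidden_size // 2 features, the attention RNN
# expands them back to hidden_size, and the decoder RNN predicts
# reduction_factor mel frames per step.
#   decoder = Decoder_Mel(input_size=hps.n_mels, hidden_size=256,
#                         text_embed_size=256, reduction_factor=2)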
def __init__(self, rnn_hidden_dims, attn_hidden_dims, hops, word_embedding):
    super(SelfAttentionNetwork, self).__init__()
    self._rnn_hidden_dims = rnn_hidden_dims
    self._attn_hidden_dims = attn_hidden_dims
    self._device = None
    # Output dimension of the bidirectional RNN (forward + backward).
    self.embedding_dims = self._rnn_hidden_dims * 2
    self.hops = hops
    self._embedding = word_embedding
    self._embedding_dims = self._embedding._embedding_dim
    self._birnn = BiRNN(self._embedding_dims, self._rnn_hidden_dims)
    self._attention = Attention(self._rnn_hidden_dims * 2,
                                self._attn_hidden_dims, self.hops)
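# Usage sketch (illustrative values): hops is the number of attention "hops"
# (heads) in the structured self-attention pooling; word_embedding is a module
# exposing _embedding_dim as read above.
#   net = SelfAttentionNetwork(rnn_hidden_dims=150, attn_hidden_dims=350,
#                              hops=4, word_embedding=embedding)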
def __init__(self, vocab_size, encoder_dim):
    super(Decoder, self).__init__()
    self.hidden_size = HIDDEN_SIZE
    self.encoder_dim = encoder_dim
    self.num_pixels = FEATURE_MAP_DIM * FEATURE_MAP_DIM
    self.embedding_size = EMBEDDING_SIZE
    self.attention_dim = ATTENTION_DIM
    self.vocab_size = vocab_size
    self.num_layers = NUM_LAYERS_DECODER
    self.embedding = nn.Embedding(vocab_size, self.embedding_size,
                                  padding_idx=3).to(DEVICE)
    self.attention = Attention(encoder_dim=self.encoder_dim,
                               decoder_dim=self.hidden_size,
                               attention_dim=self.attention_dim)
    # Linear layer to create a sigmoid-activated gate.
    self.f_beta = nn.Linear(self.hidden_size, self.encoder_dim)
    self.sigmoid = nn.Sigmoid()
    self.input_drop = VariationalDropout(INPUT_DROPOUT, batch_first=True)
    self.output_drop = VariationalDropout(OUTPUT_DROPOUT, batch_first=True)
    # TODO: change to LSTMCell! (Multiple layers?)
    self.decode_step = nn.LSTMCell(self.embedding_size + self.encoder_dim,
                                   self.hidden_size, bias=True)
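# Usage sketch (illustrative): the capitalized names (HIDDEN_SIZE,
# FEATURE_MAP_DIM, DEVICE, ...) are module-level constants assumed to be
# defined elsewhere in this file; encoder_dim matches the CNN encoder's
# channel count, e.g. 2048 for a ResNet backbone.
#   decoder = Decoder(vocab_size=10000, encoder_dim=2048)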
def __init__(self, opt):
    """Initialize model."""
    super(MultimodalEncoder, self).__init__()
    self.data_path = opt.data_path
    self.emb_path = opt.emb_path
    self.bidirectional = opt.bidirectional
    self.num_directions = 2 if self.bidirectional else 1
    self.hidden_size = opt.hidden_size
    self.bi_hidden_size = self.num_directions * opt.hidden_size
    opt.bi_hidden_size = self.bi_hidden_size
    self.cur_model = opt.cur_model
    self.use_text = opt.use_text
    assert self.use_text
    self.use_img = opt.use_img
    self.use_attr = opt.use_attr
    self.img_ext_model = opt.img_ext_model
    self.text_pooling_type = 'avg' if 'avg_text' in opt.cur_model else 'max'  # default is max
    self.img_pooling_type = 'max' if 'max_img' in opt.cur_model else 'avg'  # default is avg
    self.attr_pooling_type = 'avg' if 'avg_attr' in opt.cur_model else 'max'  # default is max
    self.embedding = nn.Embedding(opt.vocab_size, opt.emb_size, opt.pad_idx)
    self.init_weights(opt.emb_type, opt.pad_idx)
    self.rnn = nn.GRU(input_size=opt.emb_size,
                      hidden_size=opt.hidden_size,
                      num_layers=opt.num_layers,
                      bidirectional=opt.bidirectional,
                      batch_first=True,
                      dropout=opt.dropout)
    if 'text_self_att' in self.cur_model:
        self.attention = Attention(self.bi_hidden_size,
                                   self.bi_hidden_size,
                                   need_mask=True)
    if 'text_self_co_att' in self.cur_model:
        self.text_self_co_att = CoAttention(self.bi_hidden_size,
                                            self.bi_hidden_size,
                                            input_type='text_text')
    if self.use_img:
        # resnet/butd: 2048, vgg: 512
        self.raw_img_feat_size = (2048 if 'resnet' in opt.img_ext_model
                                  or 'butd' in opt.img_ext_model else 512)
        self.linear_img = nn.Linear(self.raw_img_feat_size,
                                    self.bi_hidden_size)
        # single attention
        if 'text_img_att' in self.cur_model:
            self.text_img_att = Attention(self.bi_hidden_size,
                                          self.bi_hidden_size)
        if 'img_text_att' in self.cur_model:
            self.img_text_att = Attention(self.bi_hidden_size,
                                          self.bi_hidden_size,
                                          need_mask=True)
        if 'text_img_add_text_att' in self.cur_model:
            self.text_img_add_text_att = Attention(2 * self.bi_hidden_size,
                                                   self.bi_hidden_size,
                                                   need_mask=True)
        # co-attention
        if 'text_img_co_att' in self.cur_model:
            self.text_img_co_att = CoAttention(self.bi_hidden_size,
                                               self.bi_hidden_size,
                                               input_type='text_img')
        if 'img_text_co_att' in self.cur_model:
            self.img_text_co_att = CoAttention(self.bi_hidden_size,
                                               self.bi_hidden_size,
                                               input_type='img_text')
    # multi-head attention
    if 'multi_head_att' in self.cur_model:
        # e.g. 'img_text_multi_head_att_h4_d256', 'text_img_multi_head_att_h4_d256'
        # The head number and the dimension of the subspace are hard-coded
        # into the model name: 'img_text_multi_head_att_h1_d128' ==> head: 1, dim: 128
        self.is_regu = 'regu' in self.cur_model
        n_head, d_kv, stack_num = get_multi_head_att_paras(self.cur_model)
        if 'img_text_multi_head_att' in self.cur_model:
            self.img_text_multi_head_att = nn.ModuleList([
                MyMultiHeadAttention(n_head, self.bi_hidden_size, d_kv,
                                     dropout=opt.dropout, need_mask=True,
                                     is_regu=self.is_regu)
                for _ in range(stack_num)
            ])
        elif 'text_img_multi_head_att' in self.cur_model:
            self.text_img_multi_head_att = nn.ModuleList([
                MyMultiHeadAttention(n_head, self.bi_hidden_size, d_kv,
                                     dropout=opt.dropout, need_mask=False)
                for _ in range(stack_num)
            ])
        elif 'attr_text_multi_head_att' in self.cur_model:
            self.attr_text_multi_head_att = nn.ModuleList([
                MyMultiHeadAttention(n_head, self.bi_hidden_size, d_kv,
                                     dropout=opt.dropout, need_mask=True)
                for _ in range(stack_num)
            ])
        elif 'img_attr_add_text_multi_head_att' in self.cur_model:
            self.img_attr_add_text_multi_head_att = nn.ModuleList([
                MyMultiHeadAttention(n_head, self.bi_hidden_size, d_kv,
                                     dropout=opt.dropout, need_mask=True)
                for _ in range(stack_num)
            ])
        elif 'img_attr_sep_text_multi_head_att' in self.cur_model:
            self.img_sep_text_multi_head_att = nn.ModuleList([
                MyMultiHeadAttention(n_head, self.bi_hidden_size, d_kv,
                                     dropout=opt.dropout, need_mask=True)
                for _ in range(stack_num)
            ])
            self.attr_sep_text_multi_head_att = nn.ModuleList([
                MyMultiHeadAttention(n_head, self.bi_hidden_size, d_kv,
                                     dropout=opt.dropout, need_mask=True)
                for _ in range(stack_num)
            ])
        elif 'text_text_multi_head_att' in self.cur_model:
            self.text_text_multi_head_att = nn.ModuleList([
                MyMultiHeadAttention(n_head, self.bi_hidden_size, d_kv,
                                     dropout=opt.dropout, need_mask=True)
                for _ in range(stack_num)
            ])
        else:
            raise NotImplementedError
    if self.use_attr:
        self.linear_attr = nn.Linear(opt.emb_size, self.bi_hidden_size)
        if 'text_attr_att' in self.cur_model:
            self.text_attr_att = Attention(self.bi_hidden_size,
                                           self.bi_hidden_size)
        if 'attr_text_att' in self.cur_model:
            self.attr_text_att = Attention(self.bi_hidden_size,
                                           self.bi_hidden_size,
                                           need_mask=True)
        if 'text_attr_add_text_att' in self.cur_model:
            self.text_attr_add_text_att = Attention(2 * self.bi_hidden_size,
                                                    self.bi_hidden_size,
                                                    need_mask=True)
        if 'text_attr_real_add_text_att' in self.cur_model:
            self.text_attr_real_add_text_att = Attention(
                self.bi_hidden_size, self.bi_hidden_size, need_mask=True)
        elif 'text_attr_co_att' in self.cur_model:
            self.text_attr_co_att = CoAttention(self.bi_hidden_size,
                                                self.bi_hidden_size,
                                                input_type='text_img')
        elif 'attr_text_co_att' in self.cur_model:
            self.attr_text_co_att = CoAttention(self.bi_hidden_size,
                                                self.bi_hidden_size,
                                                input_type='img_text')
    self.dropout = nn.Dropout(p=opt.dropout)
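# Usage sketch (illustrative): opt is an argparse-style namespace; cur_model
# both selects sub-modules by substring matching (see above) and encodes the
# multi-head hyperparameters in its name, e.g.:
#   opt.cur_model = 'img_text_multi_head_att_h4_d256'  # 4 heads, dim 256
#   encoder = MultimodalEncoder(opt)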