def __init__(self, __C):
    super(QAtt, self).__init__()
    self.__C = __C
    self.mlp = MLP(
        in_size=__C.LSTM_OUT_SIZE,
        mid_size=__C.HIDDEN_SIZE,
        out_size=__C.Q_GLIMPSES,
        dropout_r=__C.DROPOUT_R,
        use_relu=True
    )
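# A minimal usage sketch (not from the repo), assuming the usual question-glimpse attention
# forward: the MLP scores every token for each of the Q_GLIMPSES glimpses, softmax normalizes
# over tokens, and the glimpse-weighted sums are concatenated. The input name `ques_feat`
# (batch, n_tokens, LSTM_OUT_SIZE) is a hypothetical placeholder.
def forward(self, ques_feat):
    qatt_maps = torch.softmax(self.mlp(ques_feat), dim=1)        # (batch, n_tokens, Q_GLIMPSES)
    qatt_feat_list = []
    for g in range(self.__C.Q_GLIMPSES):
        weights = qatt_maps[:, :, g:g + 1]                       # (batch, n_tokens, 1)
        qatt_feat_list.append((weights * ques_feat).sum(dim=1))  # (batch, LSTM_OUT_SIZE)
    return torch.cat(qatt_feat_list, dim=1)                      # (batch, LSTM_OUT_SIZE * Q_GLIMPSES)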
def __init__(self, __C):
    super(FFN, self).__init__()
    self.mlp = MLP(
        in_size=__C.HIDDEN_SIZE,
        mid_size=__C.FF_SIZE,
        out_size=__C.HIDDEN_SIZE,
        dropout_r=__C.DROPOUT_R,
        use_relu=True
    )
def __init__(self, __C):
    super(AGAttention, self).__init__()
    self.lin_v = FFN(__C)  # let self.lin take care of bias
    self.lin_q = FFN(__C)
    self.lin = MLP(
        in_size=__C.HIDDEN_SIZE,
        mid_size=__C.FF_SIZE,
        out_size=1,
        dropout_r=__C.DROPOUT_R,
        use_relu=True
    )
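# A minimal sketch (not from the repo) of an attention-guided scoring forward that this
# constructor suggests: project the region features and the question vector (assuming FFN
# applies its inner MLP), fuse them elementwise, and score each region with the 1-output MLP.
# The input names `v` (batch, n_regions, HIDDEN_SIZE) and `q` (batch, HIDDEN_SIZE) are
# hypothetical placeholders.
def forward(self, v, q):
    v_proj = self.lin_v(v)                          # (batch, n_regions, HIDDEN_SIZE)
    q_proj = self.lin_q(q).unsqueeze(1)             # (batch, 1, HIDDEN_SIZE)
    logits = self.lin(v_proj * q_proj).squeeze(-1)  # (batch, n_regions)
    return torch.softmax(logits, dim=-1)            # attention weights over regions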
def __init__(self, __C):
    super(AttFlat, self).__init__()
    self.__C = __C
    self.mlp = MLP(
        in_size=__C.HIDDEN_SIZE,
        mid_size=__C.FLAT_MLP_SIZE,
        out_size=__C.FLAT_GLIMPSES,
        dropout_r=__C.DROPOUT_R,
        use_relu=True
    )
    self.linear_merge = nn.Linear(
        __C.HIDDEN_SIZE * __C.FLAT_GLIMPSES,
        __C.FLAT_OUT_SIZE
    )
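# A minimal usage sketch (not from the repo), assuming the standard attention-flattening
# forward: per-glimpse weights over the sequence, a masked softmax, weighted sums concatenated
# and projected by linear_merge to FLAT_OUT_SIZE. The names `x` (batch, seq_len, HIDDEN_SIZE)
# and `x_mask` (batch, 1, 1, seq_len; True at padded positions) are hypothetical placeholders.
def forward(self, x, x_mask):
    att = self.mlp(x)                                            # (batch, seq_len, FLAT_GLIMPSES)
    att = att.masked_fill(x_mask.squeeze(1).squeeze(1).unsqueeze(2), -1e9)
    att = torch.softmax(att, dim=1)
    att_list = []
    for g in range(self.__C.FLAT_GLIMPSES):
        att_list.append(torch.sum(att[:, :, g:g + 1] * x, dim=1))
    x_atted = torch.cat(att_list, dim=1)                         # (batch, HIDDEN_SIZE * FLAT_GLIMPSES)
    return self.linear_merge(x_atted)                            # (batch, FLAT_OUT_SIZE)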
def __init__(self, __C, q_emb, token_size, answer_size):
    super(QNet, self).__init__()
    # self.attflat_lang = AttFlat(__C)
    self.mlp = MLP(
        in_size=__C.FLAT_OUT_SIZE,   # 1024
        mid_size=__C.FLAT_OUT_SIZE,  # 1024
        out_size=answer_size,
        dropout_r=__C.DROPOUT_R,
        use_relu=True
    )
    self.proj_norm = LayerNorm(answer_size)
    self.proj = nn.Linear(answer_size, answer_size)
def __init__(self, __C, img_feat_size, ques_att_feat_size, gen_func):
    super(IAtt, self).__init__()
    self.__C = __C
    self.dropout = nn.Dropout(__C.DROPOUT_R)
    self.mfb = MFB(__C, img_feat_size, ques_att_feat_size, True)
    self.mlp = MLP(
        in_size=__C.MFB_O,
        mid_size=__C.HIDDEN_SIZE,
        out_size=__C.I_GLIMPSES,
        dropout_r=__C.DROPOUT_R,
        use_relu=True
    )
    if str(gen_func) == 'tvmax':
        self.gen_func = 'tvmax'
        self.sparsemax = partial(sparsemax, k=512)
        self.tvmax = TV2DFunction.apply
    else:
        self.gen_func = gen_func
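# A minimal sketch (not from the repo) of how the chosen normalizer might be applied to the
# per-region attention logits inside the forward pass. All call signatures here are assumptions:
# TV2DFunction.apply is assumed to take a 2D logit grid, and sparsemax is assumed to follow the
# entmax-style sparsemax(X, dim=...) interface. The name `logits` (batch, 196) and the helper
# name `_normalize` are hypothetical; the 196 regions are assumed to form a 14x14 grid.
def _normalize(self, logits):
    if self.gen_func == 'tvmax':
        # smooth the logits spatially with the 2D total-variation operator, then sparsify
        smoothed = torch.stack([self.tvmax(l.view(14, 14)) for l in logits])
        return self.sparsemax(smoothed.view(logits.size(0), -1), dim=-1)
    return self.gen_func(logits, dim=-1)  # e.g. torch.softmax or sparsemax over regions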
def __init__(self, __C, gen_func=torch.softmax):
    super(AttFlatText, self).__init__()
    self.__C = __C
    self.gen_func = gen_func
    if str(gen_func) == 'tvmax':
        self.sparsemax = partial(sparsemax, k=512)
        self.tvmax = TV2DFunction.apply
    self.mlp = MLP(
        in_size=__C.HIDDEN_SIZE,
        mid_size=__C.FLAT_MLP_SIZE,
        out_size=__C.FLAT_GLIMPSES,
        dropout_r=__C.DROPOUT_R,
        use_relu=True
    )
    self.linear_merge = nn.Linear(
        __C.HIDDEN_SIZE * __C.FLAT_GLIMPSES,
        __C.FLAT_OUT_SIZE
    )
def __init__(self, __C, pretrained_emb, token_size, answer_size):
    super(Net, self).__init__()
    copy_data = __C

    self.embedding = nn.Embedding(
        num_embeddings=token_size,
        embedding_dim=__C.WORD_EMBED_SIZE
    )
    self.mlp = MLP(
        in_size=__C.HIDDEN_SIZE,
        mid_size=__C.FLAT_MLP_SIZE,
        out_size=__C.FLAT_GLIMPSES,
        dropout_r=__C.DROPOUT_R,
        use_relu=True
    )

    # Load the GloVe embedding weights
    if __C.USE_GLOVE:
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))

    self.lstm = nn.LSTM(
        input_size=__C.WORD_EMBED_SIZE,
        hidden_size=__C.HIDDEN_SIZE,
        num_layers=1,
        batch_first=True
    )
    self.img_feat_linear = nn.Linear(
        __C.IMG_FEAT_SIZE,
        2048
    )

    self.backbone = MCA_ED(__C)
    self.attflat_img = AttFlat(__C)
    self.attflat_lang = AttFlat(__C)

    self.proj_norm = LayerNorm(1024)
    self.proj = nn.Linear(1024, answer_size)

    # Dense co-attention branch (hidden sizes and dropout are hard-coded here);
    # 3129 is the standard VQA v2 answer vocabulary size.
    self.dense_coattn = DCNLayer(2048, 1024, 4, 3, 5, 0.3)
    self.predict = PredictLayer(2048, 1024, 4, 3129, 0.3)
    self.apply(Initializer.xavier_normal)
def __init__(self, __C, gen_func=torch.softmax):
    super(AttFlat, self).__init__()
    self.__C = __C
    self.attention = __C.attention
    self.gen_func = gen_func
    if str(gen_func) == 'tvmax':
        self.sparsemax = partial(sparsemax, k=512)
        self.tvmax = TV2DFunction.apply

    self.mlp = MLP(
        in_size=__C.HIDDEN_SIZE,
        mid_size=__C.FLAT_MLP_SIZE,
        out_size=__C.FLAT_GLIMPSES,
        dropout_r=__C.DROPOUT_R,
        use_relu=True
    )
    self.linear_merge = nn.Linear(
        __C.HIDDEN_SIZE * __C.FLAT_GLIMPSES,
        __C.FLAT_OUT_SIZE
    )

    if self.attention == 'cont-sparsemax':
        # use basis functions in 'psi' to define continuous sparsemax
        self.transform = ContinuousSparsemax(psi=None)
    else:
        # use basis functions in 'psi' to define continuous softmax
        self.transform = ContinuousSoftmax(psi=None)

    device = 'cuda'

    # Compute F and G offline for a single length = 14 * 14 = 196 grid features.
    self.Gs = [None]
    self.psi = [None]
    max_seq_len = 14 * 14   # 196 grid features
    attn_num_basis = 100    # 100 basis functions
    nb_waves = attn_num_basis
    self.psi.append([])
    self.add_gaussian_basis_functions(self.psi[1], nb_waves, device=device)

    # Stack basis functions: build the grid of positions at which they are evaluated.
    # With padding, the 14x14 grid on [0, 1]^2 is embedded in a 28x28 grid on [-0.5, 1.5]^2.
    padding = True
    length = max_seq_len
    if padding:
        shift = 1 / float(2 * math.sqrt(length))
        positions_x = torch.linspace(-0.5 + shift, 1.5 - shift, int(2 * math.sqrt(length)))
        positions_x, positions_y = torch.meshgrid(positions_x, positions_x)
        positions_x = positions_x.flatten()
        positions_y = positions_y.flatten()
    else:
        shift = 1 / float(2 * math.sqrt(length))
        positions_x = torch.linspace(shift, 1 - shift, int(math.sqrt(length)))
        positions_x, positions_y = torch.meshgrid(positions_x, positions_x)
        positions_x = positions_x.flatten()
        positions_y = positions_y.flatten()

    positions = torch.zeros(len(positions_x), 2, 1).to(device)  # (n_positions, 2, 1); 784 positions when padded
    for position in range(1, len(positions_x) + 1):
        positions[position - 1] = torch.tensor(
            [[positions_x[position - 1]], [positions_y[position - 1]]])

    # Evaluate every basis function at every position: F is (nb_waves, n_positions, 1, 1).
    F = torch.zeros(nb_waves, positions.size(0)).unsqueeze(2).unsqueeze(3).to(device)
    basis_functions = self.psi[1][0]
    # basis_functions.evaluate(positions[i]) has shape (nb_waves, 1, 1)
    for i in range(0, positions.size(0)):
        F[:, i] = basis_functions.evaluate(positions[i])[:]

    penalty = .01  # ridge penalty
    I = torch.eye(nb_waves).to(device)
    F = F.squeeze(-2).squeeze(-1)  # (nb_waves, n_positions)
    # Ridge-regression operator mapping per-position values to basis coefficients: (n_positions, nb_waves)
    G = F.t().matmul((F.matmul(F.t()) + penalty * I).inverse())

    if padding:
        # Drop the first and last 7 rows of the padded 28x28 grid (7 * 28 = 196 = length rows each),
        # then keep the central 14 columns of each remaining row, leaving the original 14x14 = 196 positions.
        G = G[length:-length, :]
        G = torch.cat([
            G[7:21, :], G[35:49, :], G[63:77, :], G[91:105, :],
            G[119:133, :], G[147:161, :], G[175:189, :], G[203:217, :],
            G[231:245, :], G[259:273, :], G[287:301, :], G[315:329, :],
            G[343:357, :], G[371:385, :]
        ])
    self.Gs.append(G.to(device))
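# A standalone check (an illustrative sketch, not part of the model): the G computed above is
# the ridge-regression operator that maps per-position value features V (n_positions x d) to
# basis coefficients B = V.T @ G, i.e. the minimizer of ||B @ F - V.T||^2 + penalty * ||B||^2,
# which is how the continuous-attention value function over the 14x14 grid is fitted.
import torch

N, L, d, penalty = 10, 49, 4, 0.01                  # basis functions, positions, feature dim, ridge penalty
F = torch.randn(N, L, dtype=torch.double)           # stand-in for the basis evaluations psi(t_l)
G = F.t().matmul((F.matmul(F.t()) + penalty * torch.eye(N, dtype=torch.double)).inverse())  # (L, N)
V = torch.randn(L, d, dtype=torch.double)           # per-position value features
B = V.t().matmul(G)                                 # (d, N) basis coefficients

# B satisfies the ridge normal equations  B (F F^T + penalty * I) = V^T F^T
lhs = B.matmul(F.matmul(F.t()) + penalty * torch.eye(N, dtype=torch.double))
rhs = V.t().matmul(F.t())
assert torch.allclose(lhs, rhs, atol=1e-8)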