# Common imports assumed by the snippets below (Python 2 / Caffe with the
# custom CompactBilinear, SignedSqrt, L2Normalize, and SoftAttention layers).
# Note: the qlstm variants below come from different experiment files; in a
# single module each definition would shadow the previous one.
import json

import caffe
from caffe import layers as L
from caffe import params as P

import config  # provides NUM_OUTPUT_UNITS for one variant below


def bilinear_distance(self, vec1, vec2):
    reshape_vec1 = L.Reshape(vec1, shape=dict(dim=[self.batch_size, -1, 1, 1]))
    reshape_vec2 = L.Reshape(vec2, shape=dict(dim=[self.batch_size, -1, 1, 1]))
    bilinear = L.CompactBilinear(reshape_vec1, reshape_vec2)
    signed = L.SignedSqrt(bilinear)
    l2_normalize = L.L2Normalize(signed)
    score = L.InnerProduct(
        l2_normalize,
        num_output=1,
        weight_filler=self.uniform_weight_filler(-0.08, 0.08),
        param=self.learning_params([[1, 1], [2, 0]],
                                   ['bilinear_dist', 'bilinear_dist_b']))
    return score
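# bilinear_distance relies on two helpers of its host class that are not
# shown in this section. A plausible minimal sketch of them, inferred from
# the call site (names and exact behavior are assumptions):
def uniform_weight_filler(self, min_value, max_value):
    # Caffe uniform-filler spec over [min_value, max_value].
    return dict(type='uniform', min=min_value, max=max_value)

def learning_params(self, param_list, name_list=None):
    # Each entry of param_list is [lr_mult, decay_mult]; name_list gives
    # optional shared-blob names so layers can tie weights.
    params = []
    for i, p in enumerate(param_list):
        param = dict(lr_mult=p[0], decay_mult=p[1])
        if name_list is not None:
            param['name'] = name_list[i]
        params.append(param)
    return params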
def qlstm(mode, batchsize, T, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)

    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                         weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed = L.TanH(n.embed_ba)
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    # Keep only the last time step; silence the rest.
    for i in xrange(T - 1):
        n.__setattr__('slice_first' + str(i), tops1[int(i)])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[int(i)], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(n.lstm1_out,
                                 reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})
    for i in xrange(T - 1):
        n.__setattr__('slice_second' + str(i), tops2[int(i)])
        n.__setattr__('silence_data_second' + str(i),
                      L.Silence(tops2[int(i)], ntop=0))
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(n.lstm2_out,
                                 reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    concat_bottom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_bottom)

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1,
                                num_output=512, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=2, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.lstm_12_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))
    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
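# Minimal usage sketch for the generator above. Assumptions: the custom
# layers (the Python data layer, CompactBilinear, SignedSqrt, L2Normalize,
# SoftAttention) are compiled into this Caffe build, vqa_data_provider_layer
# is on PYTHONPATH, and the batch size, T=22, and vocabulary size below are
# illustrative values, not ones fixed by this code.
def write_train_prototxt_sketch():
    proto = qlstm(mode='train', batchsize=64, T=22, question_vocab_size=10000)
    with open('mcb_att_train.prototxt', 'w') as f:
        f.write(str(proto))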
def qlstm(mode, batchsize, T, T_c, question_c_vocab_size, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.data1, n.cont1, n.img_feature, n.label = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=6)

    # char embedding
    n.embed_c = L.Embed(n.data1, input_dim=question_c_vocab_size, num_output=15,
                        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed_c_scale = L.Scale(n.embed_c, n.cont1, scale_param=dict(axis=0))
    n.embed_c_scale_resh = L.Reshape(
        n.embed_c_scale,
        reshape_param=dict(shape=dict(dim=[batchsize, 1, T_c * T, -1])))  # N x 1 x (T_c*T) x d_c
    tops = L.Slice(n.embed_c_scale_resh, ntop=T, slice_param={'axis': 2})
    for i in xrange(T):
        n.__setattr__('slice_' + str(i + 1), tops[int(i)])

    # char conv: one convolution plus max-pool plus reshape per word slice.
    # The original listed all T = 22 branches explicitly; the loop below is
    # equivalent because the conv weights are tied across branches via the
    # shared blob names "conv_c_w" / "conv_c_b".
    conv_c_param = {
        'kernel_h': 3, 'kernel_w': 15, 'stride': 1, 'num_output': 150,
        'pad_h': 1, 'pad_w': 0, 'weight_filler': dict(type='xavier')
    }
    for i in xrange(1, T + 1):
        n.__setattr__('c_feature_%d' % i, L.Convolution(
            getattr(n, 'slice_%d' % i),
            convolution_param=conv_c_param,
            param=[dict(name="conv_c_w"), dict(name="conv_c_b")]))
        n.__setattr__('c_vec_%d' % i, L.Pooling(
            getattr(n, 'c_feature_%d' % i),
            kernel_h=T_c, kernel_w=1, stride=T_c, pool=P.Pooling.MAX))
        n.__setattr__('c_embed_%d' % i, L.Reshape(
            getattr(n, 'c_vec_%d' % i),
            reshape_param=dict(shape=dict(dim=[batchsize, 1, 150]))))
    concat_c_embed = [getattr(n, 'c_embed_%d' % i) for i in xrange(1, T + 1)]
    n.concat_char_embed = L.Concat(*concat_c_embed,
                                   concat_param={'axis': 1})  # N x T x d_c

    # word embedding
    n.embed_w = L.Embed(n.data, input_dim=question_vocab_size, num_output=150,
                        weight_filler=dict(type='uniform', min=-0.08, max=0.08))  # N x T x d_w

    # combine word and char embedding
    concat_word_embed = [n.embed_w, n.concat_char_embed]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # N x T x (d_c+d_w)
    n.embed_scale = L.Scale(n.concat_embed, n.cont, scale_param=dict(axis=0))
    n.embed_scale_resh = L.Reshape(
        n.embed_scale,
        reshape_param=dict(shape=dict(dim=[batchsize, 1, T, -1])))  # N x 1 x T x (d_c+d_w)

    # n.glove_scale = L.Scale(n.glove, n.cont, scale_param=dict(axis=0))
    # n.glove_scale_resh = L.Reshape(n.glove_scale,
    #     reshape_param=dict(shape=dict(dim=[batchsize, 1, T, 300])))
    # concat_word_embed = [n.embed_scale_resh, n.glove_scale_resh]
    # n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 1})  # N x 2 x T x 300

    # convolution over n-grams of width 2..5
    n.word_feature_2 = L.Convolution(n.embed_scale_resh, kernel_h=2, kernel_w=300,
                                     stride=1, num_output=512, pad_h=1, pad_w=0,
                                     weight_filler=dict(type='xavier'))  # N x C x ? x 1
    n.word_feature_3 = L.Convolution(n.embed_scale_resh, kernel_h=3, kernel_w=300,
                                     stride=1, num_output=512, pad_h=2, pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_4 = L.Convolution(n.embed_scale_resh, kernel_h=4, kernel_w=300,
                                     stride=1, num_output=512, pad_h=3, pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_5 = L.Convolution(n.embed_scale_resh, kernel_h=5, kernel_w=300,
                                     stride=1, num_output=512, pad_h=4, pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_relu_2 = L.ReLU(n.word_feature_2)
    n.word_relu_3 = L.ReLU(n.word_feature_3)
    n.word_relu_4 = L.ReLU(n.word_feature_4)
    n.word_relu_5 = L.ReLU(n.word_feature_5)
    n.word_vec_2 = L.Pooling(n.word_relu_2, kernel_h=T + 1, kernel_w=1,
                             stride=T + 1, pool=P.Pooling.MAX)  # N x C x 1 x 1
    n.word_vec_3 = L.Pooling(n.word_relu_3, kernel_h=T + 2, kernel_w=1,
                             stride=T + 2, pool=P.Pooling.MAX)
    n.word_vec_4 = L.Pooling(n.word_relu_4, kernel_h=T + 3, kernel_w=1,
                             stride=T + 3, pool=P.Pooling.MAX)
    n.word_vec_5 = L.Pooling(n.word_relu_5, kernel_h=T + 4, kernel_w=1,
                             stride=T + 4, pool=P.Pooling.MAX)
    word_vec = [n.word_vec_2, n.word_vec_3, n.word_vec_4, n.word_vec_5]
    n.concat_vec = L.Concat(*word_vec, concat_param={'axis': 1})  # N x 4C x 1 x 1
    n.concat_vec_dropped = L.Dropout(n.concat_vec,
                                     dropout_param={'dropout_ratio': 0.5})

    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped, axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1,
                                num_output=512, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=2, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and question features with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    # n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.concat_vec_dropped,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))
    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
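# The CompactBilinear -> SignedSqrt -> L2Normalize chain used throughout is
# multimodal compact bilinear pooling (Fukui et al., 2016), built on the
# compact bilinear count-sketch of Gao et al. (2016): each input vector is
# count-sketched, and the sketches are circularly convolved via FFT, which
# approximates the outer product projected down to d dimensions. A minimal
# NumPy sketch of that computation for a single vector pair -- an
# illustration of the algorithm, not the actual layer implementation:
import numpy as np

def _compact_bilinear_sketch(x, y, d=16000, seed=0):
    rng = np.random.RandomState(seed)
    sketches = []
    for v in (x, y):
        h = rng.randint(d, size=v.shape[0])           # random bucket per input dim
        s = rng.choice([-1.0, 1.0], size=v.shape[0])  # random sign per input dim
        sk = np.zeros(d)
        np.add.at(sk, h, s * v)                       # count sketch of v
        sketches.append(sk)
    # circular convolution of the two sketches via FFT
    return np.real(np.fft.ifft(np.fft.fft(sketches[0]) * np.fft.fft(sketches[1])))

def _mcb_postprocess(z):
    z = np.sign(z) * np.sqrt(np.abs(z))     # SignedSqrt
    return z / (np.linalg.norm(z) + 1e-8)   # L2Normalize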
def qlstm(mode, batchsize, T, question_vocab_size):
    # Used to build the network directly, without a hand-written prototxt.
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    # Python data layer:
    # https://stackoverflow.com/questions/41344168/what-is-a-python-layer-in-caffe
    # Caffe instantiates the given class as a layer; the returned tops are
    # filled automatically on each forward pass, and the actual data-batch
    # loading happens inside that class.
    # GloVe = Global Vectors for word representation
    # https://www.aclweb.org/anthology/D14-1162
    # Pretrained GloVe vectors are fed to the Concat below.
    # img_feature is already preprocessed: a feature map extracted by a
    # ResNet and L2-normalized.
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)
    # module    = name of the python file
    # layer     = python class implementing the layer interface
    # param_str = JSON parameters used for data loading; stored on the layer
    #             object as self.param_str = mode_str
    # ntop      = number of top blobs in setup / forward / backward

    # Embed maps token IDs to dense vectors: the question vocabulary is
    # represented compactly as 300-d vectors.
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                         weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    # apply TanH
    n.embed = L.TanH(n.embed_ba)
    # concatenate with the GloVe vectors
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    for i in xrange(T - 1):
        n.__setattr__('slice_first' + str(i), tops1[int(i)])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[int(i)], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(n.lstm1_out,
                                 reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})
    # see https://www.programcreek.com/python/example/107865/caffe.NetSpec
    # give tops2[i] the name 'slice_second<i>' (named-attribute assignment)
    for i in xrange(T - 1):
        n.__setattr__('slice_second' + str(i), tops2[int(i)])
        n.__setattr__('silence_data_second' + str(i),
                      L.Silence(tops2[int(i)], ntop=0))
    # use only the last LSTM output
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(n.lstm2_out,
                                 reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    # lstm1 output -> reshape to 1024, then dropout; lstm2 output likewise;
    # then concatenate the two
    concat_bottom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_bottom)

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # L.Tile does not broadcast automatically, so tile explicitly:
    # 2048 x 1 x 1 -> (tiles=14, axis=2) -> 2048 x 14 x 1
    #              -> (tiles=14, axis=3) -> 2048 x 14 x 14
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    # unlike the figure in the paper, an extra Dropout is applied here
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1,
                                num_output=512, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    # unlike the paper's figure, the output dim is 2 (two glimpses)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=2, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    # softmax produces the attention maps: two 14x14 maps
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    # slice the two attention maps apart
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    # each attention-weighted image feature is computed, then concatenated
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling:
    # reshape the concatenated glimpses to 4096
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # unlike the paper, the two inputs differ in size:
    # paper: 2048 x 2048; this code: 4096 x 2048
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.lstm_12_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    # SignedSqrt
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    # L2Normalize
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    # Dropout
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))
    # fully connected classifier
    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
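# SoftAttention is a custom layer whose implementation is not shown in this
# section. Assuming the usual semantics -- each attention map weights the
# image feature map and the result is summed over the spatial grid, with the
# constant `dummy` blob acting as a per-sample scale -- an equivalent NumPy
# sketch (an assumption, not the layer's actual code):
def _soft_attention_sketch(feat, att, scale):
    # feat: N x C x H x W image features (NumPy array); att: N x 1 x H x W
    # attention map summing to 1 over H*W; scale: N x 1 constants (`dummy`).
    weighted = feat * att                # broadcast the map over channels
    pooled = weighted.sum(axis=(2, 3))   # N x C attention-weighted sum
    return pooled * scale                # N x C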
def qlstm(mode, batchsize, T, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    # n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
    #     module='vqa_data_provider_layer', layer='VQADataProviderLayer',
    #     param_str=mode_str, ntop=5)
    n.data, n.cont, n.img_feature, n.label = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=4)

    # word embedding
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                         weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    # n.embed = L.TanH(n.embed_ba)
    n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(axis=0))
    n.embed_scale_resh = L.Reshape(
        n.embed_scale,
        reshape_param=dict(shape=dict(dim=[batchsize, 1, T, 300])))

    # gated convolutions over n-grams of width 2..5: each width pairs a
    # content convolution (TanH) with a gate convolution (Sigmoid)
    n.word_feature_2 = L.Convolution(n.embed_scale_resh, kernel_h=2, kernel_w=300,
                                     stride=1, num_output=512, pad_h=1, pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_2_g = L.Convolution(n.embed_scale_resh, kernel_h=2, kernel_w=300,
                                       stride=1, num_output=512, pad_h=1, pad_w=0,
                                       weight_filler=dict(type='xavier'))
    n.word_feature_3 = L.Convolution(n.embed_scale_resh, kernel_h=3, kernel_w=300,
                                     stride=1, num_output=512, pad_h=2, pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_3_g = L.Convolution(n.embed_scale_resh, kernel_h=3, kernel_w=300,
                                       stride=1, num_output=512, pad_h=2, pad_w=0,
                                       weight_filler=dict(type='xavier'))
    n.word_feature_4 = L.Convolution(n.embed_scale_resh, kernel_h=4, kernel_w=300,
                                     stride=1, num_output=512, pad_h=3, pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_4_g = L.Convolution(n.embed_scale_resh, kernel_h=4, kernel_w=300,
                                       stride=1, num_output=512, pad_h=3, pad_w=0,
                                       weight_filler=dict(type='xavier'))
    n.word_feature_5 = L.Convolution(n.embed_scale_resh, kernel_h=5, kernel_w=300,
                                     stride=1, num_output=512, pad_h=4, pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_5_g = L.Convolution(n.embed_scale_resh, kernel_h=5, kernel_w=300,
                                       stride=1, num_output=512, pad_h=4, pad_w=0,
                                       weight_filler=dict(type='xavier'))
    n.word_2_acti = L.TanH(n.word_feature_2)
    n.word_3_acti = L.TanH(n.word_feature_3)
    n.word_4_acti = L.TanH(n.word_feature_4)
    n.word_5_acti = L.TanH(n.word_feature_5)
    n.word_2_gate = L.Sigmoid(n.word_feature_2_g)
    n.word_3_gate = L.Sigmoid(n.word_feature_3_g)
    n.word_4_gate = L.Sigmoid(n.word_feature_4_g)
    n.word_5_gate = L.Sigmoid(n.word_feature_5_g)
    n.word_2 = L.Eltwise(n.word_2_acti, n.word_2_gate, operation=P.Eltwise.PROD)
    n.word_3 = L.Eltwise(n.word_3_acti, n.word_3_gate, operation=P.Eltwise.PROD)
    n.word_4 = L.Eltwise(n.word_4_acti, n.word_4_gate, operation=P.Eltwise.PROD)
    n.word_5 = L.Eltwise(n.word_5_acti, n.word_5_gate, operation=P.Eltwise.PROD)
    n.word_vec_2 = L.Pooling(n.word_2, kernel_h=T + 1, kernel_w=1,
                             stride=T + 1, pool=P.Pooling.MAX)
    n.word_vec_3 = L.Pooling(n.word_3, kernel_h=T + 2, kernel_w=1,
                             stride=T + 2, pool=P.Pooling.MAX)
    n.word_vec_4 = L.Pooling(n.word_4, kernel_h=T + 3, kernel_w=1,
                             stride=T + 3, pool=P.Pooling.MAX)
    n.word_vec_5 = L.Pooling(n.word_5, kernel_h=T + 4, kernel_w=1,
                             stride=T + 4, pool=P.Pooling.MAX)
    word_vec = [n.word_vec_2, n.word_vec_3, n.word_vec_4, n.word_vec_5]
    n.concat_vec = L.Concat(*word_vec, concat_param={'axis': 1})  # N x 4*d_w x 1 x 1
    n.concat_vec_dropped = L.Dropout(n.concat_vec,
                                     dropout_param={'dropout_ratio': 0.5})

    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped, axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1,
                                num_output=512, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=2, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and question features with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    # n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.concat_vec_dropped,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))
    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
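# Each n-gram width above pairs a TanH "content" convolution with a Sigmoid
# "gate" convolution, multiplied elementwise -- a GLU-style gated unit:
# tanh(conv(x)) * sigmoid(conv_g(x)). A minimal NumPy sketch of the gating,
# with plain matmuls standing in for the convolutions (shapes illustrative):
def _gated_unit_sketch(x, w, w_g):
    import numpy as np
    a = np.tanh(x.dot(w))                   # content branch (TanH)
    g = 1.0 / (1.0 + np.exp(-x.dot(w_g)))   # gate branch (Sigmoid)
    return a * g                            # Eltwise PROD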
def qlstm(mode, batchsize, T, question_vocab_size, embed_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=4)

    # word embedding (static + dynamic)
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=embed_size,
                         weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(axis=0))  # N x T x d_w
    n.embed_scale_resh = L.Reshape(
        n.embed_scale,
        reshape_param=dict(shape=dict(dim=[batchsize, T, embed_size, 1])))

    # bag-of-words sum of the word embeddings: a frozen 1x1 convolution over
    # the T input channels with constant weight 1, no bias, and lr_mult=0.
    # Note this is a sum over time steps, not a true average (that would
    # require weights of 1/T or a subsequent scaling).
    n.embed_avg = L.Convolution(n.embed_scale_resh,
                                convolution_param={
                                    'kernel_size': 1,
                                    'num_output': 1,
                                    'bias_term': False,
                                    'weight_filler': dict(type='constant', value=1)
                                },
                                param=dict(lr_mult=0, decay_mult=0))  # N x 1 x d_w x 1
    n.embed_avg_resh = L.Reshape(
        n.embed_avg,
        reshape_param=dict(shape=dict(dim=[batchsize, embed_size, 1, 1])))
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.embed_avg_resh, axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1,
                                num_output=512, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=2, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and question features with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    # n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.embed_avg_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))
    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
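# The frozen constant-weight convolution above reduces to a masked sum of
# word embeddings over the T time steps (channels = T, num_output = 1,
# weights fixed at 1, learning disabled). An equivalent NumPy computation:
def _bag_of_words_sketch(embed, cont):
    # embed: N x T x d word embeddings; cont: N x T 0/1 validity mask
    # (the Scale layer multiplies each time step by its mask entry).
    masked = embed * cont[:, :, None]
    return masked.sum(axis=1)  # N x d, matching the constant 1x1 conv output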
def qlstm(mode, batchsize, max_words_in_question, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                         weight_filler=dict(type='uniform', min=-0.08, max=0.08))  # T x N -> T x N x 300
    n.embed = L.TanH(n.embed_ba)
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1, ntop=max_words_in_question, slice_param={'axis': 0})
    for i in xrange(max_words_in_question - 1):
        n.__setattr__('slice_first' + str(i), tops1[int(i)])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[int(i)], ntop=0))
    n.lstm1_out = tops1[max_words_in_question - 1]
    n.lstm1_reshaped = L.Reshape(n.lstm1_out,
                                 reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_drop = L.Dropout(n.lstm1_reshaped,
                                      dropout_param={'dropout_ratio': 0.3})
    n.lstm1_drop = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = L.LSTM(
        n.lstm1_drop, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2, ntop=max_words_in_question, slice_param={'axis': 0})
    for i in xrange(max_words_in_question - 1):
        n.__setattr__('slice_second' + str(i), tops2[int(i)])
        n.__setattr__('silence_data_second' + str(i),
                      L.Silence(tops2[int(i)], ntop=0))
    n.lstm2_out = tops2[max_words_in_question - 1]
    n.lstm2_reshaped = L.Reshape(n.lstm2_out,
                                 reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_drop = L.Dropout(n.lstm2_reshaped,
                                      dropout_param={'dropout_ratio': 0.3})
    concat_lstms = [n.lstm1_reshaped_drop, n.lstm2_reshaped_drop]
    n.lstm_12 = L.Concat(*concat_lstms)

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh, n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})
    n.blcf_droped_resh = L.Reshape(
        n.blcf_droped, reshape_param=dict(shape=dict(dim=[-1, 16000])))
    n.prediction = L.InnerProduct(n.blcf_droped_resh,
                                  num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
def qlstm(mode, batchsize, T, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=4)

    # # word embedding (static + dynamic)
    # n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
    #                      weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    # n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(axis=0))
    # n.embed_scale_resh = L.Reshape(n.embed_scale,
    #     reshape_param=dict(shape=dict(dim=[batchsize, 1, T, 300])))
    # n.glove_scale = L.Scale(n.glove, n.cont, scale_param=dict(axis=0))
    # n.glove_scale_resh = L.Reshape(n.glove_scale,
    #     reshape_param=dict(shape=dict(dim=[batchsize, 1, T, 300])))
    # concat_word_embed = [n.embed_scale_resh, n.glove_scale_resh]
    # n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 1})  # N x 2 x T x 300

    # char embedding (here n.data holds character IDs; T is the maximum
    # character count, e.g. 100 given the shape comments below)
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=50,
                         weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(axis=0))
    n.embed_scale_resh = L.Reshape(
        n.embed_scale,
        reshape_param=dict(shape=dict(dim=[batchsize, 1, T, 50])))

    # char deep convolution
    n.char_conv_1 = L.Convolution(
        n.embed_scale_resh, kernel_h=5, kernel_w=50, stride=1, num_output=256,
        weight_filler=dict(type='gaussian', std=0.05))  # N x 1 x 100 x 50 -> N x 256 x 96 x 1
    n.char_relu_1 = L.ReLU(n.char_conv_1)
    n.char_pool_1 = L.Pooling(n.char_relu_1, kernel_h=2, kernel_w=1, stride=2,
                              pool=P.Pooling.MAX)  # N x 256 x 96 x 1 -> N x 256 x 48 x 1
    n.char_conv_2 = L.Convolution(
        n.char_pool_1, kernel_h=5, kernel_w=1, stride=1, num_output=256,
        weight_filler=dict(type='gaussian', std=0.05))  # N x 256 x 48 x 1 -> N x 256 x 44 x 1
    n.char_relu_2 = L.ReLU(n.char_conv_2)
    n.char_pool_2 = L.Pooling(n.char_relu_2, kernel_h=2, kernel_w=1, stride=2,
                              pool=P.Pooling.MAX)  # N x 256 x 44 x 1 -> N x 256 x 22 x 1
    n.char_conv_3 = L.Convolution(
        n.char_pool_2, kernel_h=3, kernel_w=1, stride=1, num_output=256,
        weight_filler=dict(type='gaussian', std=0.05))  # N x 256 x 22 x 1 -> N x 256 x 20 x 1
    n.char_relu_3 = L.ReLU(n.char_conv_3)
    n.char_conv_4 = L.Convolution(
        n.char_relu_3, kernel_h=3, kernel_w=1, stride=1, num_output=256,
        weight_filler=dict(type='gaussian', std=0.05))  # N x 256 x 20 x 1 -> N x 256 x 18 x 1
    n.char_relu_4 = L.ReLU(n.char_conv_4)
    n.char_conv_5 = L.Convolution(
        n.char_relu_4, kernel_h=3, kernel_w=1, stride=1, num_output=256,
        weight_filler=dict(type='gaussian', std=0.05))  # N x 256 x 18 x 1 -> N x 256 x 16 x 1
    n.char_relu_5 = L.ReLU(n.char_conv_5)
    n.char_pool_3 = L.Pooling(n.char_relu_5, kernel_h=2, kernel_w=1, stride=2,
                              pool=P.Pooling.MAX)  # N x 256 x 16 x 1 -> N x 256 x 8 x 1
    n.vec_reshape = L.Reshape(
        n.char_pool_3, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))  # 256 * 8 = 2048
    n.concat_vec_dropped = L.Dropout(n.vec_reshape,
                                     dropout_param={'dropout_ratio': 0.5})

    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped, axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1,
                                num_output=512, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=2, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and question features with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    # n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.concat_vec_dropped,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))
    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
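# Usage sketch for the char-CNN variant above. Assumptions: here n.data
# carries character IDs, so T is the maximum character count (100 matches
# the shape comments) and question_vocab_size is the size of the character
# vocabulary; both values below are illustrative, not fixed by this code.
def write_char_cnn_prototxt_sketch():
    proto = qlstm(mode='train', batchsize=32, T=100, question_vocab_size=70)
    with open('char_cnn_train.prototxt', 'w') as f:
        f.write(str(proto))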