Example #1
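All of the examples below build Caffe NetSpec networks for VQA models with
compact bilinear pooling. The source files are not shown, so the shared
imports below are an assumption based on how the snippets use caffe, L, P,
and json; the code itself is Python 2 (note the xrange calls).

import json

import caffe
from caffe import layers as L
from caffe import params as P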
    def bilinear_distance(self, vec1, vec2):
        reshape_vec1 = L.Reshape(vec1,
                                 shape=dict(dim=[self.batch_size, -1, 1, 1]))
        reshape_vec2 = L.Reshape(vec2,
                                 shape=dict(dim=[self.batch_size, -1, 1, 1]))
        bilinear = L.CompactBilinear(reshape_vec1, reshape_vec2)
        signed = L.SignedSqrt(bilinear)
        l2_normalize = L.L2Normalize(signed)
        score = L.InnerProduct(
            l2_normalize,
            num_output=1,
            weight_filler=self.uniform_weight_filler(-0.08, .08),
            param=self.learning_params([[1, 1], [2, 0]],
                                       ['bilinear_dist', 'bilinear_dist_b']))

        return score
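bilinear_distance references two helpers, uniform_weight_filler and
learning_params, that the snippet does not include. A minimal sketch of what
they plausibly look like, judging only from the call sites above (an
assumption, not the original implementations):

    def uniform_weight_filler(self, min_value, max_value):
        # Caffe filler spec: uniform initialization in [min_value, max_value]
        return dict(type='uniform', min=min_value, max=max_value)

    def learning_params(self, param_list, name_list=None):
        # param_list holds [lr_mult, decay_mult] pairs, one per blob
        # (weights first, then bias); name_list optionally names the
        # params so other layers can share them.
        params = []
        for i, (lr_mult, decay_mult) in enumerate(param_list):
            p = dict(lr_mult=lr_mult, decay_mult=decay_mult)
            if name_list is not None:
                p['name'] = name_list[i]
            params.append(p)
        return params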
Example #2
def qlstm(mode, batchsize, T, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\
        module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 )

    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
        weight_filler=dict(type='uniform',min=-0.08,max=0.08))
    n.embed = L.TanH(n.embed_ba)
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(\
                   n.concat_embed, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    for i in xrange(T - 1):
        n.__setattr__('slice_first' + str(i), tops1[int(i)])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[int(i)], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})
    # LSTM2
    n.lstm2 = L.LSTM(\
                   n.lstm1_droped, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})
    for i in xrange(T - 1):
        n.__setattr__('slice_second' + str(i), tops2[int(i)])
        n.__setattr__('silence_data_second' + str(i),
                      L.Silence(tops2[int(i)], ntop=0))
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(n.lstm2_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    concat_bottom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_bottom)

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.lstm_12_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
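A minimal driver sketch for the function above (the file name and sizes are
illustrative assumptions): NetSpec.to_proto() returns a NetParameter message
whose string form is a valid prototxt.

    # Hypothetical sizes: batches of 64, questions padded to T=22 tokens.
    net_proto = qlstm('train', batchsize=64, T=22, question_vocab_size=10000)
    with open('proto_train.prototxt', 'w') as f:
        f.write(str(net_proto))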
Example #3
def qlstm(mode, batchsize, T, T_c, question_c_vocab_size, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.data1, n.cont1, n.img_feature, n.label = L.Python(\
        module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=6)

    # char embedding
    n.embed_c = L.Embed(n.data1, input_dim=question_c_vocab_size, num_output=15, \
         weight_filler=dict(type='uniform',min=-0.08,max=0.08))
    n.embed_c_scale = L.Scale(n.embed_c,
                              n.cont1,
                              scale_param=dict(axis=0))
    n.embed_c_scale_resh = L.Reshape(
        n.embed_c_scale,
        reshape_param=dict(shape=dict(
            dim=[batchsize, 1, T_c * T, -1])))  # N x 1 x (T_c*T) x d_c
    tops = L.Slice(n.embed_c_scale_resh, ntop=T, slice_param={'axis': 2})
    for i in xrange(T):
        n.__setattr__('slice_' + str(i + 1), tops[int(i)])

    # char conv: one convolution per word slice; the named params
    # "conv_c_w"/"conv_c_b" share a single set of weights across all T
    # slices (the original unrolled these blocks by hand; a loop builds
    # the identical layers)
    for i in xrange(1, T + 1):
        n.__setattr__(
            'c_feature_' + str(i),
            L.Convolution(getattr(n, 'slice_' + str(i)),
                          convolution_param={
                              'kernel_h': 3,
                              'kernel_w': 15,
                              'stride': 1,
                              'num_output': 150,
                              'pad_h': 1,
                              'pad_w': 0,
                              'weight_filler': dict(type='xavier')
                          },
                          param=[dict(name="conv_c_w"),
                                 dict(name="conv_c_b")]))

    # max-pool each per-word feature map over the character axis
    for i in xrange(1, T + 1):
        n.__setattr__(
            'c_vec_' + str(i),
            L.Pooling(getattr(n, 'c_feature_' + str(i)),
                      kernel_h=T_c,
                      kernel_w=1,
                      stride=T_c,
                      pool=P.Pooling.MAX))

    # reshape each pooled char vector to N x 1 x 150
    for i in xrange(1, T + 1):
        n.__setattr__(
            'c_embed_' + str(i),
            L.Reshape(getattr(n, 'c_vec_' + str(i)),
                      reshape_param=dict(shape=dict(dim=[batchsize, 1,
                                                         150]))))

    concat_c_embed = [
        getattr(n, 'c_embed_' + str(i)) for i in xrange(1, T + 1)
    ]
    n.concat_char_embed = L.Concat(*concat_c_embed,
                                   concat_param={'axis': 1})  # N x T x d_c

    # word embedding
    n.embed_w = L.Embed(n.data, input_dim=question_vocab_size, num_output=150, \
        weight_filler=dict(type='uniform',min=-0.08,max=0.08)) # N x T x d_w

    # combine word and char embedding
    concat_word_embed = [n.embed_w, n.concat_char_embed]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # N x T x (d_c+d_w)

    n.embed_scale = L.Scale(n.concat_embed,
                            n.cont,
                            scale_param=dict(axis=0))
    n.embed_scale_resh = L.Reshape(
        n.embed_scale,
        reshape_param=dict(shape=dict(
            dim=[batchsize, 1, T, -1])))  # N x 1 x T x (d_c+d_w)

    # n.glove_scale = L.Scale(n.glove, n.cont, scale_param=dict(dict(axis=0)))
    # n.glove_scale_resh = L.Reshape(n.glove_scale,\
    #                       reshape_param=dict(\
    #                           shape=dict(dim=[batchsize,1,T,300])))
    # concat_word_embed = [n.embed_scale_resh, n.glove_scale_resh]
    # n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 1}) # N x 2 x T x 300

    # convolution
    n.word_feature_2 = L.Convolution(
        n.embed_scale_resh,
        kernel_h=2,
        kernel_w=300,
        stride=1,
        num_output=512,
        pad_h=1,
        pad_w=0,
        weight_filler=dict(type='xavier'))  # N x C x ? x 1
    n.word_feature_3 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=3,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=2,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_4 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=4,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=3,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_5 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=5,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=4,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_relu_2 = L.ReLU(n.word_feature_2)
    n.word_relu_3 = L.ReLU(n.word_feature_3)
    n.word_relu_4 = L.ReLU(n.word_feature_4)
    n.word_relu_5 = L.ReLU(n.word_feature_5)
    n.word_vec_2 = L.Pooling(n.word_relu_2,
                             kernel_h=T + 1,
                             kernel_w=1,
                             stride=T + 1,
                             pool=P.Pooling.MAX)  # N x C x 1 x 1
    n.word_vec_3 = L.Pooling(n.word_relu_3,
                             kernel_h=T + 2,
                             kernel_w=1,
                             stride=T + 2,
                             pool=P.Pooling.MAX)
    n.word_vec_4 = L.Pooling(n.word_relu_4,
                             kernel_h=T + 3,
                             kernel_w=1,
                             stride=T + 3,
                             pool=P.Pooling.MAX)
    n.word_vec_5 = L.Pooling(n.word_relu_5,
                             kernel_h=T + 4,
                             kernel_w=1,
                             stride=T + 4,
                             pool=P.Pooling.MAX)
    word_vec = [n.word_vec_2, n.word_vec_3, n.word_vec_4, n.word_vec_5]
    n.concat_vec = L.Concat(*word_vec,
                            concat_param={'axis': 1})  # N x 4C x 1 x 1
    n.concat_vec_dropped = L.Dropout(n.concat_vec,
                                     dropout_param={'dropout_ratio': 0.5})

    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.concat_vec_dropped,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
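This char-CNN variant takes two extra sizes compared with Example #2; a
hedged invocation sketch with illustrative values only:

    # T words per question, T_c characters per word; the vocabulary sizes
    # come from the dataset preprocessing and are assumptions here.
    net_proto = qlstm('train', batchsize=64, T=22, T_c=15,
                      question_c_vocab_size=100, question_vocab_size=10000)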
Example #4
def qlstm(mode, batchsize, T, question_vocab_size):

    # Build the network in Python without a hand-written prototxt
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})

    # Custom Python layer format:
    # https://stackoverflow.com/questions/41344168/what-is-a-python-layer-in-caffe
    # Caffe instantiates the layer from the given class, and filling in the
    # returned tops runs it automatically.
    # The actual data-batch loading happens inside that class.

    # GloVe = Global Vectors for Word Representation
    # https://www.aclweb.org/anthology/D14-1162
    # Pretrained GloVe vectors are used in the Concat below.

    # img_feature is an already-preprocessed feature vector: the image has
    # been passed through Resnet512 and L2-normalized.

    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\
        module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 )
    # module = name of the Python file
    # layer = Python class implementing the layer interface
    # param_str = JSON parameters for data loading; stored inside the class
    #             as self.param_str = mode_str
    # ntop = number of top blobs in setup/forward/backward

    # A textual Embed usually means text -> number:
    # it represents ~3000 distinct token vectors
    # compactly in 300 dimensions
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
        weight_filler=dict(type='uniform',min=-0.08,max=0.08))
    # Apply TanH
    n.embed = L.TanH(n.embed_ba)
    # Concat with the GloVe data
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(\
                   n.concat_embed, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    for i in xrange(T - 1):
        n.__setattr__('slice_first' + str(i), tops1[int(i)])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[int(i)], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})
    # LSTM2
    n.lstm2 = L.LSTM(\
                   n.lstm1_droped, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})

    # See https://www.programcreek.com/python/example/107865/caffe.NetSpec
    # give tops2[i] the name 'slice_second<i>'
    # (assigns named attributes on the net)
    for i in xrange(T - 1):
        n.__setattr__('slice_second' + str(i), tops2[int(i)])
        n.__setattr__('silence_data_second' + str(i),
                      L.Silence(tops2[int(i)], ntop=0))

    # Use the last LSTM output.
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(n.lstm2_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    concat_bottom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_bottom)

    # lstm1 output => reshaped to 1024, then dropout
    # lstm2 output => reshaped to 1024, then dropout
    # then concatenated

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # L.Tile does not match dimensions automatically; tile explicitly:
    # 2048x1x1 => 2048x14x14 (tiles=14 on axes 2 and 3)
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)

    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))

    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    # Unlike the figure in the paper, a Dropout is added here
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    # Unlike the figure in the paper, the output dim is 2
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    # Softmax produces the attention maps:
    # two 14x14 softmax maps

    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    # Slice the two attention maps apart
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]

    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)
    # Multiply the image features by each attention map, then concatenate the results.

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    # Then reshape to 4096

    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))

    # Unlike the paper, the two input vector sizes differ:
    # paper: 2048 and 2048
    # code:  4096 and 2048
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.lstm_12_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    #SignedSqrt
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    #L2_Normalize
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    #Dropout
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    #FullyConnected
    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))

    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
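For context on the n.cont bottom used by both LSTMs: Caffe's recurrent
layers expect a T x N sequence-continuation blob that is 0 at the first
timestep of each sequence and 1 afterwards. A sketch of how the data
provider presumably fills it (an assumption; the real logic lives in
vqa_data_provider_layer):

    import numpy as np

    T, N = 22, 64  # illustrative sizes
    cont = np.ones((T, N), dtype=np.float32)
    cont[0, :] = 0  # mark the start of every sequence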
Example #5
def qlstm(mode, batchsize, T, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    # n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\
    #     module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 )
    n.data, n.cont, n.img_feature, n.label = L.Python(\
        module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=4 )

    # word embedding
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
        weight_filler=dict(type='uniform',min=-0.08,max=0.08))
    # n.embed = L.TanH(n.embed_ba)
    n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(axis=0))
    n.embed_scale_resh = L.Reshape(n.embed_scale,\
                          reshape_param=dict(\
                              shape=dict(dim=[batchsize,1,T,300])))

    # Convolution
    n.word_feature_2 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=2,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=1,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_2_g = L.Convolution(n.embed_scale_resh,
                                       kernel_h=2,
                                       kernel_w=300,
                                       stride=1,
                                       num_output=512,
                                       pad_h=1,
                                       pad_w=0,
                                       weight_filler=dict(type='xavier'))
    n.word_feature_3 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=3,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=2,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_3_g = L.Convolution(n.embed_scale_resh,
                                       kernel_h=3,
                                       kernel_w=300,
                                       stride=1,
                                       num_output=512,
                                       pad_h=2,
                                       pad_w=0,
                                       weight_filler=dict(type='xavier'))
    n.word_feature_4 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=4,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=3,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_4_g = L.Convolution(n.embed_scale_resh,
                                       kernel_h=4,
                                       kernel_w=300,
                                       stride=1,
                                       num_output=512,
                                       pad_h=3,
                                       pad_w=0,
                                       weight_filler=dict(type='xavier'))
    n.word_feature_5 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=5,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=4,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_5_g = L.Convolution(n.embed_scale_resh,
                                       kernel_h=5,
                                       kernel_w=300,
                                       stride=1,
                                       num_output=512,
                                       pad_h=4,
                                       pad_w=0,
                                       weight_filler=dict(type='xavier'))

    n.word_2_acti = L.TanH(n.word_feature_2)
    n.word_3_acti = L.TanH(n.word_feature_3)
    n.word_4_acti = L.TanH(n.word_feature_4)
    n.word_5_acti = L.TanH(n.word_feature_5)

    n.word_2_gate = L.Sigmoid(n.word_feature_2_g)
    n.word_3_gate = L.Sigmoid(n.word_feature_3_g)
    n.word_4_gate = L.Sigmoid(n.word_feature_4_g)
    n.word_5_gate = L.Sigmoid(n.word_feature_5_g)

    n.word_2 = L.Eltwise(n.word_2_acti,
                         n.word_2_gate,
                         operation=P.Eltwise.PROD)
    n.word_3 = L.Eltwise(n.word_3_acti,
                         n.word_3_gate,
                         operation=P.Eltwise.PROD)
    n.word_4 = L.Eltwise(n.word_4_acti,
                         n.word_4_gate,
                         operation=P.Eltwise.PROD)
    n.word_5 = L.Eltwise(n.word_5_acti,
                         n.word_5_gate,
                         operation=P.Eltwise.PROD)

    n.word_vec_2 = L.Pooling(n.word_2,
                             kernel_h=T + 1,
                             kernel_w=1,
                             stride=T + 1,
                             pool=P.Pooling.MAX)
    n.word_vec_3 = L.Pooling(n.word_3,
                             kernel_h=T + 2,
                             kernel_w=1,
                             stride=T + 2,
                             pool=P.Pooling.MAX)
    n.word_vec_4 = L.Pooling(n.word_4,
                             kernel_h=T + 3,
                             kernel_w=1,
                             stride=T + 3,
                             pool=P.Pooling.MAX)
    n.word_vec_5 = L.Pooling(n.word_5,
                             kernel_h=T + 4,
                             kernel_w=1,
                             stride=T + 4,
                             pool=P.Pooling.MAX)

    word_vec = [n.word_vec_2, n.word_vec_3, n.word_vec_4, n.word_vec_5]
    n.concat_vec = L.Concat(*word_vec,
                            concat_param={'axis': 1})  # N x 4C x 1 x 1

    n.concat_vec_dropped = L.Dropout(n.concat_vec,
                                     dropout_param={'dropout_ratio': 0.5})

    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.concat_vec_dropped,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
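The Eltwise PROD layers above implement a gated activation: tanh features
multiplied elementwise by sigmoid gates, in the spirit of gated
convolutional language models. A hedged numpy sketch of the computation
(illustrative only; the real work happens inside Caffe):

    import numpy as np

    def gated_unit(feature, gate):
        # word_k = tanh(conv_k(x)) * sigmoid(conv_k_g(x)), elementwise
        return np.tanh(feature) * (1.0 / (1.0 + np.exp(-gate)))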
Example #6
def qlstm(mode, batchsize, T, question_vocab_size, embed_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label = L.Python(\
        module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=4 )

    # word embedding (static + dynamic)
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=embed_size, \
        weight_filler=dict(type='uniform',min=-0.08,max=0.08))
    n.embed_scale = L.Scale(n.embed_ba, n.cont,
                            scale_param=dict(axis=0))  # N x T x d_w
    n.embed_scale_resh = L.Reshape(
        n.embed_scale,
        reshape_param=dict(shape=dict(dim=[batchsize, T, embed_size, 1])))

    # avg of word embedding
    n.embed_avg = L.Convolution(n.embed_scale_resh,
                                convolution_param={
                                    'kernel_size': 1,
                                    'num_output': 1,
                                    'bias_term': False,
                                    'weight_filler': dict(type='constant',
                                                          value=1)
                                },
                                param=dict(lr_mult=0,
                                           decay_mult=0))  # N x 1 x d_w x 1
    n.embed_avg_resh = L.Reshape(
        n.embed_avg,
        reshape_param=dict(shape=dict(dim=[batchsize, embed_size, 1, 1])))

    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.embed_avg_resh,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.embed_avg_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
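A note on the "avg of word embedding" block above: the frozen 1x1
convolution with constant weight 1 sums over the T channel axis, so
n.embed_avg is the sum of the (cont-masked) word embeddings; nothing
divides by the question length. A hedged numpy equivalent:

    import numpy as np

    # embed_scale: (N, T, d_w), already zeroed at padded positions by cont
    embed_scale = np.zeros((64, 22, 300), dtype=np.float32)  # illustrative
    embed_sum = embed_scale.sum(axis=1)  # (N, d_w), what the conv computes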
Example #7
def qlstm(mode, batchsize, max_words_in_question, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python( \
        module='vqa_data_provider_layer', layer='VQADataProviderLayer', \
        param_str=mode_str, ntop=5 )

    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
        weight_filler=dict(type='uniform',min=-0.08,max=0.08)) # T x N -> T x N x 300
    n.embed = L.TanH(n.embed_ba)
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(\
                   n.concat_embed, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)))
    tops1 = L.Slice(n.lstm1,
                    ntop=max_words_in_question,
                    slice_param={'axis': 0})
    for i in xrange(max_words_in_question - 1):
        n.__setattr__('slice_first' + str(i), tops1[int(i)])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[int(i)], ntop=0))
    n.lstm1_out = tops1[max_words_in_question - 1]
    n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm1_reshaped_drop = L.Dropout(n.lstm1_reshaped,
                                      dropout_param={'dropout_ratio': 0.3})
    n.lstm1_drop = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = L.LSTM(\
                   n.lstm1_drop, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)))
    tops2 = L.Slice(n.lstm2,
                    ntop=max_words_in_question,
                    slice_param={'axis': 0})
    for i in xrange(max_words_in_question - 1):
        n.__setattr__('slice_second' + str(i), tops2[int(i)])
        n.__setattr__('silence_data_second' + str(i),
                      L.Silence(tops2[int(i)], ntop=0))
    n.lstm2_out = tops2[max_words_in_question - 1]
    n.lstm2_reshaped = L.Reshape(n.lstm2_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm2_reshaped_drop = L.Dropout(n.lstm2_reshaped,
                                      dropout_param={'dropout_ratio': 0.3})
    concat_lstms = [n.lstm1_reshaped_drop, n.lstm2_reshaped_drop]
    n.lstm_12 = L.Concat(*concat_lstms)

    n.q_emb_tanh_droped_resh = L.Reshape(n.lstm_12, \
        reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.i_emb_tanh_droped_resh = L.Reshape(n.img_feature, \
        reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh, n.i_emb_tanh_droped_resh, \
        compact_bilinear_param=dict(num_output=16000,sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)

    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})
    n.blcf_droped_resh = L.Reshape(
        n.blcf_droped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.blcf_droped_resh, num_output=config.NUM_OUTPUT_UNITS, \
        weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
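CompactBilinear, SignedSqrt, and L2Normalize are likewise custom layers from
the vqa-mcb fork. For intuition, here is a minimal NumPy sketch of the Tensor
Sketch approximation they implement (following Gao et al., "Compact Bilinear
Pooling"; function names and the fixed seed are illustrative, not the layers'
API):

import numpy as np

def count_sketch(x, h, s, d):
    # Scatter-add the signed entries of x into d buckets: y[h[i]] += s[i] * x[i].
    y = np.zeros(d)
    np.add.at(y, h, s * x)
    return y

def compact_bilinear(x1, x2, d=16000, seed=0):
    # Approximate the outer product x1 (x) x2 by circularly convolving the two
    # count sketches, computed cheaply in the Fourier domain.
    rng = np.random.RandomState(seed)
    h1, s1 = rng.randint(d, size=len(x1)), rng.choice([-1, 1], size=len(x1))
    h2, s2 = rng.randint(d, size=len(x2)), rng.choice([-1, 1], size=len(x2))
    return np.fft.irfft(np.fft.rfft(count_sketch(x1, h1, s1, d)) *
                        np.fft.rfft(count_sketch(x2, h2, s2, d)), n=d)

def signed_sqrt_l2(z, eps=1e-8):
    # The SignedSqrt -> L2Normalize pair applied after every CompactBilinear above.
    z = np.sign(z) * np.sqrt(np.abs(z))
    return z / (np.linalg.norm(z) + eps)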
Example #8
# Same imports as above; this variant also needs the pooling params.
import json

import caffe
from caffe import layers as L, params as P


def qlstm(mode, batchsize, T, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label = L.Python(\
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=4)  # no glove top in this variant, so ntop=4 rather than 5

    # # word embedding (static + dynamic)
    # n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
    #     weight_filler=dict(type='uniform',min=-0.08,max=0.08))
    # n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(dict(axis=0)))
    # n.embed_scale_resh = L.Reshape(n.embed_scale,\
    #                       reshape_param=dict(\
    #                           shape=dict(dim=[batchsize,1,T,300])))
    # n.glove_scale = L.Scale(n.glove, n.cont, scale_param=dict(dict(axis=0)))
    # n.glove_scale_resh = L.Reshape(n.glove_scale,\
    #                       reshape_param=dict(\
    #                           shape=dict(dim=[batchsize,1,T,300])))
    # concat_word_embed = [n.embed_scale_resh, n.glove_scale_resh]
    # n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 1}) # N x 2 x T x 300

    # char embedding
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=50, \
         weight_filler=dict(type='uniform',min=-0.08,max=0.08))
    n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(axis=0))
    n.embed_scale_resh = L.Reshape(n.embed_scale,\
                           reshape_param=dict(\
                               shape=dict(dim=[batchsize,1,T,50])))

    # char deep convolution
    n.char_conv_1 = L.Convolution(
        n.embed_scale_resh,
        kernel_h=5,
        kernel_w=50,
        stride=1,
        num_output=256,
        weight_filler=dict(type='gaussian',
                           std=0.05))  # N x 1 x 100 x 50 -> N x 256 x 96 x 1
    n.char_relu_1 = L.ReLU(n.char_conv_1)
    n.char_pool_1 = L.Pooling(
        n.char_relu_1, kernel_h=2, kernel_w=1, stride=2,
        pool=P.Pooling.MAX)  # N x 256 x 96 x 1 -> N x 256 x 48 x 1
    n.char_conv_2 = L.Convolution(
        n.char_pool_1,
        kernel_h=5,
        kernel_w=1,
        stride=1,
        num_output=256,
        weight_filler=dict(type='gaussian',
                           std=0.05))  # N x 256 x 48 x 1 -> N x 256 x 44 x 1
    n.char_relu_2 = L.ReLU(n.char_conv_2)
    n.char_pool_2 = L.Pooling(
        n.char_relu_2, kernel_h=2, kernel_w=1, stride=2,
        pool=P.Pooling.MAX)  # N x 256 x 44 x 1 -> N x 256 x 22 x 1
    n.char_conv_3 = L.Convolution(
        n.char_pool_2,
        kernel_h=3,
        kernel_w=1,
        stride=1,
        num_output=256,
        weight_filler=dict(type='gaussian',
                           std=0.05))  # N x 256 x 22 x 1 -> N x 256 x 20 x 1
    n.char_relu_3 = L.ReLU(n.char_conv_3)
    n.char_conv_4 = L.Convolution(
        n.char_relu_3,
        kernel_h=3,
        kernel_w=1,
        stride=1,
        num_output=256,
        weight_filler=dict(type='gaussian',
                           std=0.05))  # N x 256 x 20 x 1 -> N x 256 x 18 x 1
    n.char_relu_4 = L.ReLU(n.char_conv_4)
    n.char_conv_5 = L.Convolution(
        n.char_relu_4,
        kernel_h=3,
        kernel_w=1,
        stride=1,
        num_output=256,
        weight_filler=dict(type='gaussian',
                           std=0.05))  # N x 256 x 18 x 1 -> N x 256 x 16 x 1
    n.char_relu_5 = L.ReLU(n.char_conv_5)
    n.char_pool_3 = L.Pooling(
        n.char_relu_5, kernel_h=2, kernel_w=1, stride=2,
        pool=P.Pooling.MAX)  # N x 256 x 16 x 1 -> N x 256 x 8 x 1
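    # 256 channels x 8 surviving positions = 2048 values per question,
    # matching the 2048-d image feature used in the bilinear merge below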
    n.vec_reshape = L.Reshape(
        n.char_pool_3, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.concat_vec_dropped = L.Dropout(n.vec_reshape,
                                     dropout_param={'dropout_ratio': 0.5})

    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)
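    # the question vector is now replicated at each of the 14x14 spatial
    # positions, so CompactBilinear below fuses question and image per location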
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention: predict two glimpse maps over the 14x14 grid
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)  # normalize over the 196 spatial positions
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge the attended image feature with the char-CNN question vector
    # via a second compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.concat_vec_dropped,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
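None of these snippets show the generated NetSpec being consumed. A minimal,
hypothetical driver is sketched below (the filename, mode, batch size, and
vocabulary size are illustrative; T must be 100 here for the char-convolution
shape arithmetic in the comments above to hold):

if __name__ == '__main__':
    # Serialize the net to a prototxt that the MCB Caffe fork can load.
    proto = qlstm('train', batchsize=64, T=100, question_vocab_size=72)
    with open('mcb_char_train.prototxt', 'w') as f:
        f.write(str(proto))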