def MeanFieldUpdate(n, bottom_send, bottom_receive, feat_ind, mf_iter,
                    feat_num):
    '''
    Add one attention-gated mean-field update step to the net spec ``n``.

    A single-channel attention map is computed from the concatenation of the
    sending and receiving features, squashed by a sigmoid and tiled to
    ``feat_num`` channels. A message is computed from ``bottom_send`` by a 3x3
    convolution, gated by the tiled attention, passed through a Scale layer
    (with bias) and finally summed with ``bottom_receive``.

    All layers are registered on ``n`` under names suffixed with
    ``f{feat_ind}_mf{mf_iter}``; the result is available as
    ``n['updated_f{feat_ind}_mf{mf_iter}']``. Nothing is returned.

    Args:
        n: net spec supporting item assignment/lookup by layer name.
        bottom_send: top of the feature that sends the message.
        bottom_receive: top of the feature that receives the message.
        feat_ind: index of the feature pair; used in layer and parameter names.
        mf_iter: index of the mean-field iteration; used in layer names only.
        feat_num: number of output channels of the message convolution and
            number of tiles applied to the attention map.
    '''
    suffix = 'f{}_mf{}'.format(feat_ind, mf_iter)
    concat_f = 'concat_' + suffix
    atten_f = 'atten_' + suffix
    norm_atten_f = 'norm_atten_' + suffix
    norm_atten_f_tile = 'norm_atten_f_tile{}_mf{}'.format(feat_ind, mf_iter)
    message_f = 'message_' + suffix
    filter_message_f = 'filter_message_' + suffix
    message_scaled = 'message_scaled_' + suffix
    updated_f = 'updated_' + suffix

    # Generate the attention map from both features.
    n[concat_f] = L.Concat(bottom_send, bottom_receive)
    # Parameter names depend only on feat_ind (not mf_iter) so that the
    # weights are shared between different mean-field iterations.
    n[atten_f] = L.Convolution(n[concat_f],
                               num_output=1,
                               kernel_size=3,
                               stride=1,
                               pad=1,
                               param=[
                                   dict(name='atten_f{}_w'.format(feat_ind),
                                        lr_mult=1,
                                        decay_mult=1),
                                   dict(name='atten_f{}_b'.format(feat_ind),
                                        lr_mult=2,
                                        decay_mult=0)
                               ])
    n[norm_atten_f] = L.Sigmoid(n[atten_f])
    # Fixed: the original read `net[...]`, a name that does not exist in this
    # function; the net spec is the parameter `n`.
    n[norm_atten_f_tile] = L.Tile(n[norm_atten_f],
                                  tile_param=dict(axis=1, tiles=feat_num))

    # Message computed from the sending feature (weights shared as above).
    n[message_f] = L.Convolution(
        bottom_send,
        num_output=feat_num,
        kernel_size=3,
        stride=1,
        pad=1,
        param=[
            dict(name='message_f{}_w'.format(feat_ind),
                 lr_mult=1,
                 decay_mult=1),
            dict(name='message_f{}_b'.format(feat_ind),
                 lr_mult=2,
                 decay_mult=0)
        ])
    # Gate the message with the attention map.
    n[filter_message_f] = L.Eltwise(n[message_f],
                                    n[norm_atten_f_tile],
                                    operation=P.Eltwise.PROD)
    # Scale the messages before adding. No parameter names are given here,
    # so this layer's scale/bias are not tied across iterations by name.
    n[message_scaled] = L.Scale(n[filter_message_f],
                                bias_term=True,
                                in_place=True)
    n[updated_f] = L.Eltwise(bottom_receive,
                             n[message_scaled],
                             operation=P.Eltwise.SUM)
def decode_features(pixel_spixel_assoc, spixel_feat, spixel_init,
                    num_spixels_h, num_spixels_w, num_spixels, num_channels):
    """Build the layers that map superpixel features back onto pixels.

    The superpixel features are reshaped to a ``num_spixels_h`` x
    ``num_spixels_w`` grid, expanded by a frozen grouped 3x3 convolution to
    ``9 * num_channels`` channels, flattened, spread to pixels with a
    ``Smear`` layer driven by ``spixel_init``, weighted by the tiled
    pixel-superpixel association and reduced back to ``num_channels`` by a
    frozen grouped 1x1 convolution whose weights are filled with 1.0.

    Returns the top of the final reconstruction convolution.
    """
    num_channels = int(num_channels)
    gather_name = 'concat_spixel_feat_' + str(num_channels)
    recon_name = 'recon_feat_' + str(num_channels)

    # Lay the superpixel features out on the k_h x k_w grid.
    grid_shape = dict(shape={'dim': [0, 0, num_spixels_h, num_spixels_w]})
    spixel_grid = L.Reshape(spixel_feat, reshape_param=grid_shape)

    # Gather the features of neighbouring superpixels. The layer is frozen
    # (lr_mult = decay_mult = 0) and no weight filler is specified here.
    gather_conv = dict(num_output=num_channels * 9,
                       kernel_size=3,
                       stride=1,
                       pad=1,
                       group=num_channels,
                       bias_term=False)
    neighbour_feat = L.Convolution(
        spixel_grid,
        name=gather_name,
        convolution_param=gather_conv,
        param=[dict(name=gather_name, lr_mult=0, decay_mult=0)])

    # Spread the gathered features to pixels.
    flat_shape = dict(shape={'dim': [0, 0, 1, num_spixels]})
    flat_neighbour_feat = L.Reshape(neighbour_feat, reshape_param=flat_shape)
    pixel_neighbour_feat = L.Smear(flat_neighbour_feat, spixel_init)

    # Repeat the association map once per feature channel.
    assoc_per_channel = L.Tile(pixel_spixel_assoc,
                               tile_param=dict(tiles=num_channels))

    weighted = L.Eltwise(pixel_neighbour_feat,
                         assoc_per_channel,
                         eltwise_param=dict(operation=P.Eltwise.PROD))

    # Frozen grouped 1x1 convolution with all-ones weights: sums each group.
    recon_conv = dict(num_output=num_channels,
                      kernel_size=1,
                      stride=1,
                      pad=0,
                      group=num_channels,
                      bias_term=False,
                      weight_filler=dict(type='constant', value=1.0))
    return L.Convolution(
        weighted,
        name=recon_name,
        convolution_param=recon_conv,
        param=[dict(name=recon_name, lr_mult=0, decay_mult=0)])
Пример #3
0
    def l2normed(self, vec, dim):
        """Return L2-normalized instances of ``vec``.

        For each instance x in vec this builds x / ((x ** 2).sum() ** 0.5)
        (with a 1e-12 shift inside the power for numerical stability).
        Assumes vec has shape N x dim.
        """
        # Per-instance sum of squares over axis 1.
        sum_of_squares = L.Reduction(vec,
                                     axis=1,
                                     operation=P.Reduction.SUMSQ)
        # (sum_of_squares + 1e-12) ** -0.5, i.e. the inverse L2 norm.
        inverse_norm = L.Power(sum_of_squares, power=(-0.5), shift=1e-12)
        # Append a singleton axis so the value can be tiled along axis 1.
        inverse_norm_column = L.Reshape(inverse_norm,
                                        num_axes=0,
                                        axis=-1,
                                        shape=dict(dim=[1]))
        inverse_norm_tiled = L.Tile(inverse_norm_column, axis=1, tiles=dim)

        return L.Eltwise(vec, inverse_norm_tiled, operation=P.Eltwise.PROD)
Пример #4
0
 def normalize(self, bottom, axis=1, numtiles=4096):
     """Build layers that L2-normalize ``bottom`` along ``axis``.

     Squares the input, sums it along ``axis``, raises the (shifted) sum to
     the power -0.5, reshapes and tiles the result ``numtiles`` times along
     ``axis`` and multiplies it element-wise with ``bottom``.

     Args:
         bottom: top to normalize.
         axis: axis to normalize over; only 1 and 2 are supported. For
             ``axis == 2`` the reshape uses ``self.batch_size``.
         numtiles: number of tiles along ``axis`` (presumably the size of
             ``bottom`` along that axis -- confirm against the caller).

     Returns:
         The top of the final element-wise product.

     Raises:
         ValueError: if ``axis`` is neither 1 nor 2. The original code left
             the reshape undefined in that case and failed later with an
             UnboundLocalError.
     """
     if axis not in (1, 2):
         raise ValueError(
             'normalize only supports axis 1 or 2, got {!r}'.format(axis))

     squared = L.Power(bottom, power=2)
     # operation=1 is ReductionParameter.SUM in caffe.proto.
     squared_sum = L.Reduction(squared, axis=axis, operation=1)
     # (squared_sum + 1e-5) ** -0.5: the inverse norm, shifted for stability.
     inverse_norm = L.Power(squared_sum, power=-0.5, shift=0.00001)
     if axis == 1:
         target_dims = [-1, 1]
     else:
         target_dims = [self.batch_size, -1, 1]
     reshaped = L.Reshape(inverse_norm, shape=dict(dim=target_dims))
     tiled = L.Tile(reshaped, axis=axis, tiles=numtiles)
     # operation=0 is EltwiseParameter.PROD in caffe.proto.
     return L.Eltwise(tiled, bottom, operation=0)
Пример #5
0
    def build_relational_model_deploy(self, save_tag, visual_feature_dim,
                                      language_feature_dim):
        """Build and write the deploy version of the relational model.

        Inputs are DummyData layers registered on ``self.n`` as
        ``image_data``, ``global_data``, ``text_data`` and ``cont_data``.
        The image input is tiled 21 times along axis 1, concatenated with
        the global input along axis 2 and passed through the visual model;
        the language model output is reshaped and tiled 21 times along
        axis 1. The first output of ``self.distance_function`` is registered
        as the ``scores`` top and the net is written via ``self.write_net``.
        """
        tile_count = 21
        visual_dim = visual_feature_dim + 2

        image_input = L.DummyData(
            shape=[dict(dim=[tile_count, 1, visual_dim])], ntop=1)
        self.n.image_data = image_input

        image_global = L.DummyData(
            shape=[dict(dim=[tile_count, tile_count, visual_dim])], ntop=1)
        self.n.global_data = image_global

        im_model, lang_model = self.get_models()

        self.silence_count += 1

        # Visual branch: pair every input with each global entry.
        tiled_image = L.Tile(image_input, axis=1, tiles=tile_count)
        visual_pairs = L.Concat(tiled_image, image_global, axis=2)
        bottom_visual = im_model(visual_pairs, axis=2)

        # Language branch.
        sentence_length = self.params['sentence_length']
        text_input = L.DummyData(
            shape=[dict(dim=[sentence_length, tile_count,
                             language_feature_dim])],
            ntop=1)
        self.n.text_data = text_input
        cont_input = L.DummyData(
            shape=[dict(dim=[sentence_length, tile_count])], ntop=1)
        self.n.cont_data = cont_input
        bottom_text = lang_model(text_input, cont_input)

        text_reshaped = L.Reshape(
            bottom_text, shape=dict(dim=[self.batch_size, 1, -1]))
        text_tiled = L.Tile(text_reshaped, axis=1, tiles=tile_count)

        distances = self.distance_function(bottom_visual, text_tiled)
        self.n.tops['scores'] = distances[0]

        self.write_net(save_tag, self.n)
Пример #6
0
def normalize(bottom, dim):
    """Build layers that rectify ``bottom`` and divide it by a channel sum.

    The input goes through a ReLU; a frozen 1x1 convolution with constant
    weights 1 and bias 0 produces a single-channel sum of the rectified
    values; its (shifted) reciprocal is tiled ``dim`` times along axis 1
    and multiplied element-wise with the rectified input.

    Returns the top of the element-wise product.
    """
    rectified = L.ReLU(bottom)

    # Frozen all-ones 1x1 convolution: sums the input channels.
    ones_conv = dict(num_output=1,
                     kernel_size=1,
                     stride=1,
                     weight_filler=dict(type='constant', value=1),
                     bias_filler=dict(type='constant', value=0))
    frozen = [{'lr_mult': 0, 'decay_mult': 0},
              {'lr_mult': 0, 'decay_mult': 0}]
    channel_sum = L.Convolution(rectified,
                                convolution_param=ones_conv,
                                param=frozen)

    # (channel_sum + 1e-12) ** -1, repeated for each of the dim channels.
    reciprocal = L.Power(channel_sum, power=(-1.0), shift=1e-12)
    reciprocal_tiled = L.Tile(reciprocal, axis=1, tiles=dim)

    return L.Eltwise(rectified, reciprocal_tiled, operation=P.Eltwise.PROD)
Пример #7
0
def exp_proto(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size):
    """Build the explanation-generation network and return its proto.

    Data comes from the ``ExpDataProviderLayer`` Python layer (5 tops). The
    explanation tokens are embedded and fed to a first LSTM whose dropped
    output is multiplied with the attended feature tiled ``exp_T`` times
    along axis 0; the product is signed-sqrt and L2 normalized, dropped,
    fed to a second LSTM and projected to ``exp_vocab_size`` scores.

    ``T`` and ``question_vocab_size`` are accepted but not used here.
    """

    def uniform_filler():
        # Fresh dict per use; same values as used throughout this net.
        return dict(type='uniform', min=-0.08, max=0.08)

    def zero_filler():
        return dict(type='constant', value=0)

    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    (n.exp_att_feature, n.exp, n.exp_out, n.exp_cont_1,
     n.exp_cont_2) = L.Python(module='exp_data_provider_layer',
                              layer='ExpDataProviderLayer',
                              param_str=mode_str,
                              ntop=5)

    # Word embedding of the explanation tokens.
    n.exp_embed_ba = L.Embed(n.exp,
                             input_dim=exp_vocab_size,
                             num_output=300,
                             weight_filler=uniform_filler())
    n.exp_embed = L.TanH(n.exp_embed_ba)

    # LSTM1 for Explanation
    n.exp_lstm1 = L.LSTM(n.exp_embed,
                         n.exp_cont_1,
                         recurrent_param=dict(num_output=2048,
                                              weight_filler=uniform_filler(),
                                              bias_filler=zero_filler()))
    n.exp_lstm1_dropped = L.Dropout(n.exp_lstm1,
                                    dropout_param={'dropout_ratio': 0.3})

    # Merge with LSTM1 for explanation
    n.exp_att_resh = L.Reshape(
        n.exp_att_feature,
        reshape_param=dict(shape=dict(dim=[1, -1, 2048])))
    n.exp_att_tiled = L.Tile(n.exp_att_resh, axis=0, tiles=exp_T)
    n.exp_eltwise_all = L.Eltwise(
        n.exp_lstm1_dropped,
        n.exp_att_tiled,
        eltwise_param={'operation': P.Eltwise.PROD})
    n.exp_eltwise_all_sqrt = L.SignedSqrt(n.exp_eltwise_all)
    n.exp_eltwise_all_l2 = L.L2Normalize(n.exp_eltwise_all_sqrt)
    n.exp_eltwise_all_drop = L.Dropout(
        n.exp_eltwise_all_l2, dropout_param={'dropout_ratio': 0.3})

    # LSTM2 for Explanation
    n.exp_lstm2 = L.LSTM(n.exp_eltwise_all_drop,
                         n.exp_cont_2,
                         recurrent_param=dict(num_output=1024,
                                              weight_filler=uniform_filler(),
                                              bias_filler=zero_filler()))
    n.exp_lstm2_dropped = L.Dropout(n.exp_lstm2,
                                    dropout_param={'dropout_ratio': 0.3})

    # Per-timestep vocabulary scores (inner product applied from axis 2).
    n.exp_prediction = L.InnerProduct(n.exp_lstm2_dropped,
                                      num_output=exp_vocab_size,
                                      weight_filler=dict(type='xavier'),
                                      axis=2)

    n.silence_exp_prediction = L.Silence(n.exp_prediction, ntop=0)

    return n.to_proto()
def l2normed(dim):
    """Build a net that L2-normalizes the data top and return its proto.

    The data and label tops come from the ``tripletDataLayer`` Python layer.
    For each instance x in the data, the net computes
    x / ((x ** 2).sum() ** 0.5). Assumes the data has shape N x dim.
    """
    n = caffe.NetSpec()
    data_layer = dict(module='layers', layer='tripletDataLayer', ntop=2)
    n.data, n.label = L.Python(**data_layer)

    # Per-instance sum of squares.
    n.denom = L.Reduction(n.data, axis=1, operation=P.Reduction.SUMSQ)
    # Inverse norm; the small shift is for numerical stability.
    n.power = L.Power(n.denom, power=(-0.5), shift=1e-12)
    # Append a singleton axis, then repeat it dim times along axis 1.
    n.reshape = L.Reshape(n.power,
                          num_axes=0,
                          axis=-1,
                          shape=dict(dim=[1]))
    n.tile = L.Tile(n.reshape, axis=1, tiles=dim)
    n.elwise = L.Eltwise(n.data, n.tile, operation=P.Eltwise.PROD)
    return n.to_proto()
Пример #9
0
def concat(n, q_layer, v_layer):
    """Fuse a question embedding with per-region visual features.

    The question top is reshaped to (N, 1, cfg.RNN_DIM), tiled
    ``cfg.RPN_TOPN`` times along axis 1 and flattened to rows of
    ``cfg.RNN_DIM``; the visual top is flattened to rows of
    ``cfg.SPT_FEAT_DIM + cfg.BOTTOMUP_FEAT_DIM``. Both are concatenated
    along axis 1 and passed through a 512-unit inner product and a ReLU.

    Per the original note the inputs are q_layer:(N,1024) and
    v_layer:(N,100,2053). Returns the ReLU top (``n.qv_relu``).
    """
    question_dim = cfg.RNN_DIM
    visual_dim = cfg.SPT_FEAT_DIM + cfg.BOTTOMUP_FEAT_DIM

    # Question: one copy per region, then one row per (sample, region).
    per_sample = dict(shape=dict(dim=[0, 1, question_dim]))
    n.q_emb_resh1 = L.Reshape(q_layer, reshape_param=per_sample)
    n.q_emb_tile = L.Tile(n.q_emb_resh1, axis=1, tiles=cfg.RPN_TOPN)
    question_rows = dict(shape=dict(dim=[-1, question_dim]))
    n.q_emb_resh = L.Reshape(n.q_emb_tile, reshape_param=question_rows)

    # Visual: one row per (sample, region).
    visual_rows = dict(shape=dict(dim=[-1, visual_dim]))
    n.v_emb_resh = L.Reshape(v_layer, reshape_param=visual_rows)

    n.qv_fuse = L.Concat(n.q_emb_resh,
                         n.v_emb_resh,
                         concat_param={'axis': 1})
    n.qv_fc1 = L.InnerProduct(n.qv_fuse,
                              num_output=512,
                              weight_filler=dict(type='xavier'))
    n.qv_relu = L.ReLU(n.qv_fc1)
    return n.qv_relu
Пример #10
0
def mask_unit(net, input_name, idx, feature_dim, each_dim):
    """Add a mask-attention unit to ``net`` and return its linear top.

    A 1x1 convolution produces a single-channel map from
    ``net[input_name]``; a sigmoid rescales it to (0, 1); the map is tiled
    ``feature_dim`` times and multiplied with the input; the result is
    globally pooled and projected to ``each_dim`` outputs.

    ``idx`` is concatenated to the layer names and therefore must be a
    string.
    """
    conv_key = 'mask_conv' + idx
    map_key = 'mask_map' + idx
    tile_key = 'tile_map' + idx
    masked_key = 'masked' + idx
    pooled_key = 'pooled' + idx
    linear_key = 'linear' + idx

    source = net[input_name]

    # Single-channel attention logits.
    net[conv_key] = L.Convolution(
        source,
        kernel_size=1,
        num_output=1,
        param=[dict(lr_mult=1, decay_mult=1),
               dict(lr_mult=2, decay_mult=0)],
        weight_filler=dict(type="xavier", variance_norm=2),
        bias_filler=dict(type="constant"))
    # Rescale to range (0, 1).
    net[map_key] = L.Sigmoid(net[conv_key])
    net[tile_key] = L.Tile(net[map_key],
                           tile_param=dict(tiles=feature_dim))
    # operation=0 is EltwiseParameter.PROD in caffe.proto.
    net[masked_key] = L.Eltwise(net[input_name],
                                net[tile_key],
                                eltwise_param=dict(operation=0))
    # pool=1 is PoolingParameter.AVE in caffe.proto.
    net[pooled_key] = L.Pooling(
        net[masked_key],
        pooling_param=dict(pool=1, global_pooling=1))
    net[linear_key] = L.InnerProduct(
        net[pooled_key],
        num_output=each_dim,
        param=[dict(lr_mult=1, decay_mult=1),
               dict(lr_mult=2, decay_mult=0)],
        weight_filler=dict(type="xavier"),
        bias_filler=dict(type="constant"))
    return net[linear_key]
Пример #11
0
def SqeezeExcitationLayer(caffe_net, layer_idx, bottom_blob, in_channel, reduced_ch, height, width, bias_term=False):
    """Append a squeeze-and-excitation block to ``caffe_net``.

    The block global-average-pools ``bottom_blob``, passes the result through
    two inner products (``reduced_ch`` then ``in_channel`` outputs) with a
    ReLU in between and a sigmoid at the end, broadcasts the resulting
    per-channel gate to ``in_channel x height x width`` and multiplies it
    element-wise with ``bottom_blob``.

    Args:
        caffe_net: net spec supporting item assignment/lookup by layer name.
        layer_idx: index used to build the layer names.
        bottom_blob: input top; expected to be N x in_channel x height x width
            for the final element-wise product to be shape-compatible.
        in_channel: number of channels of ``bottom_blob``.
        reduced_ch: number of outputs of the bottleneck inner product.
        height, width: spatial size of ``bottom_blob``.
        bias_term: whether the two inner products have a bias.

    Returns:
        A tuple ``(gated_top, layer_idx + 1)``.
    """
    names = ['gPool{}'.format(layer_idx),
             'fc{}a'.format(layer_idx),
             'fc{}a_relu'.format(layer_idx),
             'fc{}b'.format(layer_idx),
             'fc{}b_sigmoid'.format(layer_idx),
             'tile{}'.format(layer_idx),
             'reshape{}'.format(layer_idx),
             'eltwise{}'.format(layer_idx),
            ]
    # Extra layer needed to broadcast the gate correctly (see below).
    pre_tile_name = 'reshape{}_pre'.format(layer_idx)

    start_bottom_blob = bottom_blob

    caffe_net[names[0]] = L.Pooling(bottom_blob, pool=P.Pooling.AVE, global_pooling=True)
    caffe_net[names[1]] = L.InnerProduct(caffe_net[names[0]], num_output=reduced_ch, bias_term=bias_term)
    caffe_net[names[2]] = L.ReLU(caffe_net[names[1]], in_place=True)
    caffe_net[names[3]] = L.InnerProduct(caffe_net[names[2]], num_output=in_channel, bias_term=bias_term)
    caffe_net[names[4]] = L.Sigmoid(caffe_net[names[3]])

    # Fixed: the original tiled the (N, C) gate along axis 1 and reshaped the
    # (N, C*H*W) result to (N, C, H, W). Tile repeats the whole C-vector, so
    # after the reshape position (c, h, w) held gate[(c*H*W + h*W + w) % C]
    # instead of gate[c]. Adding a trailing singleton axis first and tiling
    # along it keeps each channel's gate constant over all spatial positions.
    caffe_net[pre_tile_name] = L.Reshape(caffe_net[names[4]], reshape_param={'shape':{'dim': [0, in_channel, 1]}})
    caffe_net[names[5]] = L.Tile(caffe_net[pre_tile_name], axis = 2, tiles = height*width)
    caffe_net[names[6]] = L.Reshape(caffe_net[names[5]], reshape_param={'shape':{'dim': [0, in_channel, height, width]}})
    caffe_net[names[7]] = L.Eltwise(caffe_net[names[6]], start_bottom_blob, operation=P.Eltwise.PROD )

    return caffe_net[names[7]], layer_idx + 1
Пример #12
0
def generate_model(split, config):
    """Build the image/language scoring network and return its proto.

    A VGG-16 style convolutional stack (made fully convolutional through
    fc6-fc8) processes the image; the language input is embedded and run
    through an LSTM whose last time step is tiled over the feature map.
    Both are L2-normalized, concatenated with the spatial input and scored
    by a small 1x1-convolution MLP trained with a sigmoid cross-entropy loss.
    """
    n = caffe.NetSpec()
    mode_str = str(dict(split=split, batch_size=config.N))
    (n.language, n.cont, n.image, n.spatial,
     n.label) = L.Python(module=config.data_provider,
                         layer=config.data_provider_layer,
                         param_str=mode_str,
                         ntop=5)

    # the base net (VGG-16): (stage index, channels, number of conv layers)
    vgg_stages = ((1, 64, 2), (2, 128, 2), (3, 256, 3), (4, 512, 3),
                  (5, 512, 3))
    vgg_options = dict(fix_param=config.fix_vgg,
                       finetune=(not config.fix_vgg))
    current = n.image
    for stage, channels, depth in vgg_stages:
        for layer in range(1, depth + 1):
            conv_top, relu_top = conv_relu(current, channels, **vgg_options)
            setattr(n, 'conv{}_{}'.format(stage, layer), conv_top)
            setattr(n, 'relu{}_{}'.format(stage, layer), relu_top)
            current = relu_top
        current = max_pool(current)
        setattr(n, 'pool{}'.format(stage), current)

    # fully conv
    n.fcn_fc6, n.fcn_relu6 = conv_relu(n.pool5, 4096, ks=7, pad=3)
    if not config.vgg_dropout:
        n.fcn_fc7, n.fcn_relu7 = conv_relu(n.fcn_relu6, 4096, ks=1, pad=0)
        n.fcn_fc8 = conv(n.fcn_relu7, 1000, ks=1, pad=0)
    else:
        n.fcn_drop6 = L.Dropout(n.fcn_relu6, dropout_ratio=0.5, in_place=True)
        n.fcn_fc7, n.fcn_relu7 = conv_relu(n.fcn_drop6, 4096, ks=1, pad=0)
        n.fcn_drop7 = L.Dropout(n.fcn_relu7, dropout_ratio=0.5, in_place=True)
        n.fcn_fc8 = conv(n.fcn_drop7, 1000, ks=1, pad=0)

    # embedding
    n.embed = L.Embed(n.language,
                      input_dim=config.vocab_size,
                      num_output=config.embed_dim,
                      weight_filler=dict(type='uniform', min=-0.08, max=0.08))

    # LSTM
    lstm_param = dict(num_output=config.lstm_dim,
                      weight_filler=dict(type='uniform', min=-0.08, max=0.08),
                      bias_filler=dict(type='constant', value=0))
    n.lstm = L.LSTM(n.embed, n.cont, recurrent_param=lstm_param)

    # Split per time step; every step but the last is named and silenced.
    tops = L.Slice(n.lstm, ntop=config.T, slice_param=dict(axis=0))
    for i in range(config.T - 1):
        step_top = tops[i]
        setattr(n, 'slice' + str(i), step_top)
        setattr(n, 'silence' + str(i), L.Silence(step_top, ntop=0))
    n.lstm_out = tops[-1]
    n.lstm_feat = L.Reshape(
        n.lstm_out,
        reshape_param=dict(shape=dict(dim=[-1, config.lstm_dim])))

    # Tile LSTM feature
    n.lstm_resh = L.Reshape(
        n.lstm_feat,
        reshape_param=dict(shape=dict(dim=[-1, config.lstm_dim, 1, 1])))
    n.lstm_tile_1 = L.Tile(n.lstm_resh, axis=2, tiles=config.featmap_H)
    n.lstm_tile_2 = L.Tile(n.lstm_tile_1, axis=3, tiles=config.featmap_W)

    # L2 Normalize image and language features
    n.img_l2norm = L.L2Normalize(n.fcn_fc8)
    n.lstm_l2norm = L.L2Normalize(n.lstm_tile_2)

    # Concatenate
    n.feat_all = L.Concat(n.lstm_l2norm,
                          n.img_l2norm,
                          n.spatial,
                          concat_param=dict(axis=1))

    # MLP Classifier over concatenated feature
    n.fcn_l1, n.fcn_relu1 = conv_relu(n.feat_all,
                                      config.mlp_hidden_dims,
                                      ks=1,
                                      pad=0)
    if not config.mlp_dropout:
        n.fcn_scores = conv(n.fcn_relu1, 1, ks=1, pad=0)
    else:
        n.fcn_drop1 = L.Dropout(n.fcn_relu1, dropout_ratio=0.5, in_place=True)
        n.fcn_scores = conv(n.fcn_drop1, 1, ks=1, pad=0)

    # Loss Layer
    n.loss = L.SigmoidCrossEntropyLoss(n.fcn_scores, n.label)

    return n.to_proto()
def act_proto(mode, batchsize, exp_vocab_size, use_gt=True):
    """Build the activity-prediction / explanation-attention net proto.

    Data comes from the ``ActivityDataProviderLayer`` Python layer (6 tops).
    A first attention map pools the image feature for the answer prediction.
    The answer (ground truth when ``use_gt`` is true, otherwise the argmax
    of the prediction) is embedded, merged with the image feature and used
    to compute a second attention map for the explanation branch, whose
    attended feature is multiplied with the answer embedding and silenced.

    ``exp_vocab_size`` is accepted but not used here.
    """
    n = caffe.NetSpec()

    def xavier():
        return dict(type='xavier')

    def attention_map(prefix, bottom):
        # conv(512) -> ReLU -> conv(1) -> softmax over the 14*14 positions;
        # tops are registered as <prefix>conv1, ..., <prefix>map.
        setattr(n, prefix + 'conv1',
                L.Convolution(bottom,
                              kernel_size=1,
                              stride=1,
                              num_output=512,
                              pad=0,
                              weight_filler=xavier()))
        setattr(n, prefix + 'conv1_relu',
                L.ReLU(getattr(n, prefix + 'conv1')))
        setattr(n, prefix + 'conv2',
                L.Convolution(getattr(n, prefix + 'conv1_relu'),
                              kernel_size=1,
                              stride=1,
                              num_output=1,
                              pad=0,
                              weight_filler=xavier()))
        setattr(n, prefix + 'reshaped',
                L.Reshape(getattr(n, prefix + 'conv2'),
                          reshape_param=dict(
                              shape=dict(dim=[-1, 1, 14 * 14]))))
        setattr(n, prefix + 'softmax',
                L.Softmax(getattr(n, prefix + 'reshaped'), axis=2))
        setattr(n, prefix + 'map',
                L.Reshape(getattr(n, prefix + 'softmax'),
                          reshape_param=dict(
                              shape=dict(dim=[-1, 1, 14, 14]))))
        return getattr(n, prefix + 'map')

    def ones_input():
        # Constant all-ones (batchsize, 1) blob fed to SoftAttention.
        return L.DummyData(shape=dict(dim=[batchsize, 1]),
                           data_filler=dict(type='constant', value=1),
                           ntop=1)

    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    (n.img_feature, n.label, n.exp, n.exp_out, n.exp_cont_1,
     n.exp_cont_2) = L.Python(module='activity_data_provider_layer',
                              layer='ActivityDataProviderLayer',
                              param_str=mode_str,
                              ntop=6)

    # Attention
    attention_map('att_', n.img_feature)
    dummy = ones_input()
    n.att_feature = L.SoftAttention(n.img_feature, n.att_map, dummy)
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 2048])))

    # Prediction
    n.prediction = L.InnerProduct(n.att_feature_resh,
                                  num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=xavier(),
                                  param=fixed_weights)

    # Take GT answer or Take the logits of the VQA model and get predicted
    # answer to embed
    if use_gt:
        answer_ids = n.label
    else:
        n.vqa_ans = L.ArgMax(n.prediction, axis=1)
        answer_ids = n.vqa_ans
    n.exp_emb_ans = L.Embed(answer_ids,
                            input_dim=config.NUM_OUTPUT_UNITS,
                            num_output=300,
                            weight_filler=dict(type='uniform',
                                               min=-0.08,
                                               max=0.08))
    n.exp_emb_ans_tanh = L.TanH(n.exp_emb_ans)
    n.exp_emb_ans2 = L.InnerProduct(n.exp_emb_ans_tanh,
                                    num_output=2048,
                                    weight_filler=xavier())

    # Merge activity answer and visual feature
    n.exp_emb_resh = L.Reshape(
        n.exp_emb_ans2,
        reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.exp_emb_tiled_1 = L.Tile(n.exp_emb_resh, axis=2, tiles=14)
    n.exp_emb_tiled = L.Tile(n.exp_emb_tiled_1, axis=3, tiles=14)

    n.img_embed = L.Convolution(n.img_feature,
                                kernel_size=1,
                                stride=1,
                                num_output=2048,
                                pad=0,
                                weight_filler=xavier())
    n.exp_eltwise = L.Eltwise(n.img_embed,
                              n.exp_emb_tiled,
                              eltwise_param={'operation': P.Eltwise.PROD})
    n.exp_eltwise_sqrt = L.SignedSqrt(n.exp_eltwise)
    n.exp_eltwise_l2 = L.L2Normalize(n.exp_eltwise_sqrt)
    n.exp_eltwise_drop = L.Dropout(n.exp_eltwise_l2,
                                   dropout_param={'dropout_ratio': 0.3})

    # Attention for Explanation
    attention_map('exp_att_', n.exp_eltwise_drop)
    exp_dummy = ones_input()
    n.exp_att_feature_prev = L.SoftAttention(n.img_feature, n.exp_att_map,
                                             exp_dummy)
    n.exp_att_feature_resh = L.Reshape(
        n.exp_att_feature_prev,
        reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.exp_att_feature_embed = L.InnerProduct(n.exp_att_feature_resh,
                                             num_output=2048,
                                             weight_filler=xavier())
    n.exp_att_feature = L.Eltwise(
        n.exp_emb_ans2,
        n.exp_att_feature_embed,
        eltwise_param={'operation': P.Eltwise.PROD})

    n.silence_exp_att = L.Silence(n.exp_att_feature, ntop=0)

    return n.to_proto()
Пример #14
0
    def build_relational_model(self, param_str, save_tag):
        """Build and write the training version of the relational model.

        A Python data layer provides ``self.top_size`` tops, which are
        registered on ``self.n`` under the names in
        ``self.params['top_names_dict']``. Positive, negative (and, when
        ``self.inter`` is set, inter) features are tiled 21 times along
        axis 1, concatenated with their context features along axis 2 and
        embedded by the visual model; the query is embedded by the language
        model and tiled likewise. Ranking losses (and optional context
        supervision losses) are registered on ``self.n`` before the net is
        written via ``self.write_net``.
        """
        tile_count = 21

        data = L.Python(module="data_processing",
                        layer=self.data_layer,
                        param_str=str(param_str),
                        ntop=self.top_size)
        for top_name, top_index in self.params['top_names_dict'].items():
            setattr(self.n, top_name, data[top_index])

        def fetch(key):
            # Look up a data top by its logical name.
            return data[self.top_name_dict[key]]

        im_model, lang_model = self.get_models()

        # bottoms which are always produced
        positive = fetch('features_p')
        negative = fetch('features_n')
        # 'global' is carryover name from MCN -- global == context moment
        # here.
        positive_context = fetch('features_global_p')

        positive_tiled = L.Tile(positive, axis=1, tiles=tile_count)
        negative_tiled = L.Tile(negative, axis=1, tiles=tile_count)

        # Note: both branches are paired with the positive context.
        positive_pairs = L.Concat(positive_tiled, positive_context, axis=2)
        negative_pairs = L.Concat(negative_tiled, positive_context, axis=2)

        if self.inter:
            inter = fetch('features_inter')
            inter_context = fetch('features_global_inter')
            inter_tiled = L.Tile(inter, axis=1, tiles=tile_count)
            inter_pairs = L.Concat(inter_tiled, inter_context, axis=2)

        query = fetch('BoG')

        positive_feature = im_model(positive_pairs, axis=2)
        negative_feature = im_model(negative_pairs, axis=2)

        if self.inter:
            inter_feature = im_model(inter_pairs, axis=2)

        # 'cont' is for LSTM in Caffe -- would not need this if using
        # average Glove features.
        cont = fetch('cont')
        query = lang_model(query, cont)

        query_reshaped = L.Reshape(
            query, shape=dict(dim=[self.batch_size, 1, -1]))
        query_tiled = L.Tile(query_reshaped, axis=1, tiles=tile_count)

        # loss function
        distance_p = self.distance_function(positive_feature, query_tiled)
        distance_n = self.distance_function(negative_feature, query_tiled)
        setattr(self.n, 'distance_p', distance_p[0])
        setattr(self.n, 'distance_n', distance_n[0])

        if self.inter:
            distance_inter = self.distance_function(inter_feature,
                                                    query_tiled)
            setattr(self.n, 'distance_inter', distance_inter[0])
            inter_loss = self.relational_ranking_loss(distance_p[0],
                                                      distance_inter[0],
                                                      lw=self.lw_inter)
            self.n.tops['ranking_loss_inter'] = inter_loss
        intra_loss = self.relational_ranking_loss(distance_p[0],
                                                  distance_n[0],
                                                  lw=self.lw_intra)
        self.n.tops['ranking_loss_intra'] = intra_loss

        if self.args.strong_supervise:
            positive_supervision = self.context_supervision_loss(
                distance_p[1],
                lw=self.args.lw_strong_supervision,
                ind_loss=fetch('strong_supervision_loss'))
            self.n.tops['context_supervision_loss'] = positive_supervision
        if self.args.stronger_supervise:
            # can also assert that the model needs to look at the correct
            # context for the neg moment.
            negative_supervision = self.context_supervision_loss(
                distance_n[1],
                lw=self.args.lw_strong_supervision,
                ind_loss=fetch('strong_supervision_loss'))
            self.n.tops[
                'negative_context_supervision_loss'] = negative_supervision

        self.write_net(save_tag, self.n)
Пример #15
0
def pj_x(mode, batchsize, exp_T, exp_vocab_size):
    """Assemble the activity-prediction and explanation network spec.

    The net reads six tops from the ``ActivityDataProviderLayer`` Python
    layer, predicts an activity label from an attended image feature, and
    decodes an explanation with two stacked LSTMs conditioned on the
    ground-truth label embedding and a second attended image feature.

    Args:
        mode: forwarded to the data layer inside its JSON ``param_str``.
        batchsize: forwarded to the data layer and used as the first
            dimension of the constant DummyData inputs.
        exp_T: tile count along axis 0 applied to the attended explanation
            feature before it is multiplied with the first LSTM output
            (presumably the explanation sequence length -- confirm).
        exp_vocab_size: ``input_dim`` of the explanation word embedding and
            ``num_output`` of the explanation prediction layer.

    Returns:
        Whatever ``caffe.NetSpec.to_proto()`` returns for the assembled net.
    """
    n = caffe.NetSpec()

    # ---- small layer factories shared by the sections below ----
    def conv1x1(bottom, channels):
        return L.Convolution(bottom,
                             kernel_size=1,
                             stride=1,
                             num_output=channels,
                             pad=0,
                             weight_filler=dict(type='xavier'))

    def dense(bottom, units, **extra):
        return L.InnerProduct(bottom,
                              num_output=units,
                              weight_filler=dict(type='xavier'),
                              **extra)

    def reshape_to(bottom, dims):
        return L.Reshape(bottom,
                         reshape_param=dict(shape=dict(dim=dims)))

    def drop(bottom):
        return L.Dropout(bottom, dropout_param={'dropout_ratio': 0.3})

    def multiply(left, right):
        return L.Eltwise(left, right,
                         eltwise_param={'operation': P.Eltwise.PROD})

    def word_embedding(bottom, vocab):
        return L.Embed(bottom,
                       input_dim=vocab,
                       num_output=300,
                       weight_filler=dict(type='uniform',
                                          min=-0.08,
                                          max=0.08))

    def recurrent(sequence, cont, units):
        return L.LSTM(sequence, cont,
                      recurrent_param=dict(
                          num_output=units,
                          weight_filler=dict(type='uniform',
                                             min=-0.08,
                                             max=0.08),
                          bias_filler=dict(type='constant', value=0)))

    def ones_column():
        # Constant (batchsize, 1) blob of ones used as the third bottom of
        # SoftAttention.
        return L.DummyData(shape=dict(dim=[batchsize, 1]),
                           data_filler=dict(type='constant', value=1),
                           ntop=1)

    def attention_map(prefix, bottom):
        # conv(512) -> ReLU -> conv(1) -> softmax over the 14*14 positions,
        # registered under '<prefix>att_*' names; returns the 14x14 map.
        hidden = conv1x1(bottom, 512)
        setattr(n, prefix + 'att_conv1', hidden)
        activated = L.ReLU(hidden)
        setattr(n, prefix + 'att_conv1_relu', activated)
        logits = conv1x1(activated, 1)
        setattr(n, prefix + 'att_conv2', logits)
        flat = reshape_to(logits, [-1, 1, 14 * 14])
        setattr(n, prefix + 'att_reshaped', flat)
        weights = L.Softmax(flat, axis=2)
        setattr(n, prefix + 'att_softmax', weights)
        grid = reshape_to(weights, [-1, 1, 14, 14])
        setattr(n, prefix + 'att_map', grid)
        return grid

    # ---- data ----
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    (n.img_feature, n.label, n.exp,
     n.exp_out, n.exp_cont_1, n.exp_cont_2) = L.Python(
         module='activity_data_provider_layer',
         layer='ActivityDataProviderLayer',
         param_str=mode_str,
         ntop=6)

    # ---- attention for the activity prediction ----
    activity_map = attention_map('', n.img_feature)
    n.att_feature = L.SoftAttention(n.img_feature, activity_map,
                                    ones_column())
    n.att_feature_resh = reshape_to(n.att_feature, [-1, 2048])

    # ---- activity prediction ----
    n.prediction = dense(n.att_feature_resh, config.NUM_OUTPUT_UNITS)
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    n.accuracy = L.Accuracy(n.prediction, n.label)

    # ---- embed the ground-truth activity answer ----
    n.exp_emb_ans = word_embedding(n.label, config.NUM_OUTPUT_UNITS)
    n.exp_emb_ans_tanh = L.TanH(n.exp_emb_ans)
    n.exp_emb_ans2 = dense(n.exp_emb_ans_tanh, 2048)

    # ---- fuse the answer embedding with the visual feature ----
    # The answer vector is tiled over the 14x14 grid so it can be
    # multiplied element-wise with the embedded image feature.
    n.exp_emb_resh = reshape_to(n.exp_emb_ans2, [-1, 2048, 1, 1])
    n.exp_emb_tiled_1 = L.Tile(n.exp_emb_resh, axis=2, tiles=14)
    n.exp_emb_tiled = L.Tile(n.exp_emb_tiled_1, axis=3, tiles=14)

    n.img_embed = conv1x1(n.img_feature, 2048)
    n.exp_eltwise = multiply(n.img_embed, n.exp_emb_tiled)
    n.exp_eltwise_sqrt = L.SignedSqrt(n.exp_eltwise)
    n.exp_eltwise_l2 = L.L2Normalize(n.exp_eltwise_sqrt)
    n.exp_eltwise_drop = drop(n.exp_eltwise_l2)

    # ---- attention for the explanation ----
    explanation_map = attention_map('exp_', n.exp_eltwise_drop)
    n.exp_att_feature_prev = L.SoftAttention(n.img_feature, explanation_map,
                                             ones_column())
    n.exp_att_feature_resh = reshape_to(n.exp_att_feature_prev, [-1, 2048])
    n.exp_att_feature_embed = dense(n.exp_att_feature_resh, 2048)
    n.exp_att_feature = multiply(n.exp_emb_ans2, n.exp_att_feature_embed)

    # ---- explanation word embedding ----
    n.exp_embed_ba = word_embedding(n.exp, exp_vocab_size)
    n.exp_embed = L.TanH(n.exp_embed_ba)

    # ---- first explanation LSTM ----
    n.exp_lstm1 = recurrent(n.exp_embed, n.exp_cont_1, 2048)
    n.exp_lstm1_dropped = drop(n.exp_lstm1)

    # ---- merge the attended feature with the first LSTM output ----
    # The attended feature is repeated exp_T times along axis 0 before the
    # element-wise product with the LSTM output.
    n.exp_att_resh = reshape_to(n.exp_att_feature, [1, -1, 2048])
    n.exp_att_tiled = L.Tile(n.exp_att_resh, axis=0, tiles=exp_T)
    n.exp_eltwise_all = multiply(n.exp_lstm1_dropped, n.exp_att_tiled)
    n.exp_eltwise_all_l2 = L.L2Normalize(n.exp_eltwise_all)
    n.exp_eltwise_all_drop = drop(n.exp_eltwise_all_l2)

    # ---- second explanation LSTM and word prediction ----
    n.exp_lstm2 = recurrent(n.exp_eltwise_all_drop, n.exp_cont_2, 1024)
    n.exp_lstm2_dropped = drop(n.exp_lstm2)

    n.exp_prediction = dense(n.exp_lstm2_dropped, exp_vocab_size, axis=2)

    # Targets equal to -1 are ignored by both the loss and the accuracy.
    n.exp_loss = L.SoftmaxWithLoss(n.exp_prediction,
                                   n.exp_out,
                                   loss_param=dict(ignore_label=-1),
                                   softmax_param=dict(axis=2))
    n.exp_accuracy = L.Accuracy(n.exp_prediction,
                                n.exp_out,
                                axis=2,
                                ignore_label=-1)

    return n.to_proto()
Пример #16
0
def qlstm(mode, batchsize, T, question_vocab_size):
    """Assemble a character-CNN question encoder with two-glimpse attention.

    The question tokens are embedded (50-d), gated by ``cont``, run through
    a five-layer convolution stack, and fused with the image feature by
    compact bilinear pooling. Two attention glimpses over the 14x14 image
    grid are then fused with the question vector again before the answer
    classifier.

    Args:
        mode: forwarded to the data layer inside its JSON ``param_str``.
        batchsize: forwarded to the data layer; also fixes the first
            dimension of the embedding reshape and the DummyData input.
        T: sequence length used when reshaping the embedding to
            ``[batchsize, 1, T, 50]``. The shape notes below and the
            ``[-1, 2048, 1, 1]`` reshape correspond to T = 100.
        question_vocab_size: ``input_dim`` of the embedding layer.

    Returns:
        Whatever ``caffe.NetSpec.to_proto()`` returns for the assembled net.
    """
    n = caffe.NetSpec()

    def reshape_to(bottom, dims):
        return L.Reshape(bottom,
                         reshape_param=dict(shape=dict(dim=dims)))

    def conv1x1(bottom, channels):
        return L.Convolution(bottom,
                             kernel_size=1,
                             stride=1,
                             num_output=channels,
                             pad=0,
                             weight_filler=dict(type='xavier'))

    def drop(bottom, ratio):
        return L.Dropout(bottom, dropout_param={'dropout_ratio': ratio})

    def bilinear(left, right):
        return L.CompactBilinear(left, right,
                                 compact_bilinear_param=dict(
                                     num_output=16000, sum_pool=False))

    # ---- data ----
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=4)

    # ---- character embedding ----
    n.embed_ba = L.Embed(n.data,
                         input_dim=question_vocab_size,
                         num_output=50,
                         weight_filler=dict(type='uniform',
                                            min=-0.08,
                                            max=0.08))
    n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(axis=0))
    n.embed_scale_resh = reshape_to(n.embed_scale, [batchsize, 1, T, 50])

    # ---- deep character convolution ----
    # Each entry is (kernel_h, kernel_w, followed_by_max_pool). Shapes noted
    # in the original for T = 100:
    #   conv1: N x 1 x 100 x 50 -> N x 256 x 96 x 1, pool -> 48
    #   conv2: 48 -> 44, pool -> 22
    #   conv3: 22 -> 20; conv4: 20 -> 18
    #   conv5: 18 -> 16, pool -> 8
    char_stages = (
        (5, 50, True),
        (5, 1, True),
        (3, 1, False),
        (3, 1, False),
        (3, 1, True),
    )
    current = n.embed_scale_resh
    pools_done = 0
    for stage, (k_h, k_w, pooled) in enumerate(char_stages, 1):
        current = L.Convolution(current,
                                kernel_h=k_h,
                                kernel_w=k_w,
                                stride=1,
                                num_output=256,
                                weight_filler=dict(type='gaussian',
                                                   std=0.05))
        setattr(n, 'char_conv_%d' % stage, current)
        current = L.ReLU(current)
        setattr(n, 'char_relu_%d' % stage, current)
        if pooled:
            pools_done += 1
            current = L.Pooling(current,
                                kernel_h=2,
                                kernel_w=1,
                                stride=2,
                                pool=P.Pooling.MAX)
            setattr(n, 'char_pool_%d' % pools_done, current)

    # Flatten the pooled map (256 x 8 for T = 100) into one question vector.
    n.vec_reshape = reshape_to(current, [-1, 2048, 1, 1])
    n.concat_vec_dropped = drop(n.vec_reshape, 0.5)

    # ---- first fusion: question vector tiled over the 14x14 image grid ----
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)
    n.i_emb_tanh_droped_resh = reshape_to(n.img_feature, [-1, 2048, 14, 14])
    n.blcf = bilinear(n.q_emb_tanh_droped_resh_tiled,
                      n.i_emb_tanh_droped_resh)
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = drop(n.blcf_sign_sqrt_l2, 0.1)

    # ---- multi-channel attention: two softmax maps over 14*14 positions ----
    n.att_conv1 = conv1x1(n.blcf_droped, 512)
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = conv1x1(n.att_conv1_relu, 2)
    n.att_reshaped = reshape_to(n.att_conv2, [-1, 2, 14 * 14])
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = reshape_to(n.att_softmax, [-1, 2, 14, 14])
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0, n.att_map1 = att_maps

    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)

    # Both attended features are registered before either reshape so the
    # layers keep the same registration order as before.
    glimpses = []
    for idx, glimpse_map in enumerate(att_maps):
        attended = L.SoftAttention(n.i_emb_tanh_droped_resh, glimpse_map,
                                   dummy)
        setattr(n, 'att_feature%d' % idx, attended)
        glimpses.append(attended)
    flattened = []
    for idx, attended in enumerate(glimpses):
        flat = reshape_to(attended, [-1, 2048])
        setattr(n, 'att_feature%d_resh' % idx, flat)
        flattened.append(flat)
    n.att_feature = L.Concat(*flattened)

    # ---- second fusion: attended image feature x question vector ----
    n.att_feature_resh = reshape_to(n.att_feature, [-1, 4096, 1, 1])
    n.bc_att_lstm = bilinear(n.att_feature_resh, n.concat_vec_dropped)
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = drop(n.bc_sign_sqrt_l2, 0.1)
    n.bc_dropped_resh = reshape_to(n.bc_dropped, [-1, 16000])

    # ---- answer classifier ----
    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
Пример #17
0
def mfb_coatt(mode, batchsize, T, question_vocab_size, folder):
    """Assemble the MFB co-attention VQA network spec.

    The question is encoded with an LSTM over concatenated learned and
    GloVe embeddings, attended with ``config.NUM_QUESTION_GLIMPSE`` glimpses,
    and used to attend over the image with ``config.NUM_IMG_GLIMPSE``
    glimpses through an MFB (element-wise product, sum pooling, signed sqrt,
    L2 norm) fusion. A second MFB fusion of the attended question and image
    features feeds the answer classifier.

    Args:
        mode: forwarded to the data layer; ``'val'`` selects the
            ``vqa_data_layer_hdf5`` provider and a SoftmaxWithLoss loss,
            any other value selects ``vqa_data_layer_kld_hdf5`` and a
            SoftmaxKLDLoss loss.
        batchsize: forwarded to the data layer and used as the first
            dimension of the constant DummyData inputs.
        T: accepted for signature compatibility; not referenced here.
        question_vocab_size: ``input_dim`` of the word embedding layer.
        folder: forwarded to the data layer inside its JSON ``param_str``.

    Returns:
        Whatever ``caffe.NetSpec.to_proto()`` returns for the assembled net.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder})
    if mode == 'val':
        provider_module = 'vqa_data_layer_hdf5'
    else:
        provider_module = 'vqa_data_layer_kld_hdf5'
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module=provider_module, layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)

    n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                      weight_filler=dict(type='xavier'))
    n.embed_tanh = L.TanH(n.embed)
    concat_word_embed = [n.embed_tanh, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2}) # T x N x 600

    # LSTM
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=config.LSTM_UNIT_NUM,
            weight_filler=dict(type='xavier')))
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
    # Move the first axis of the LSTM output to the end and append a
    # singleton axis so that 1x1 convolutions can be applied to it.
    n.lstm1_resh = L.Permute(n.lstm1_droped, permute_param=dict(order=[1,2,0]))
    n.lstm1_resh2 = L.Reshape(n.lstm1_resh,
                              reshape_param=dict(shape=dict(dim=[0,0,0,1])))

    # ---- Question attention ----
    n.qatt_conv1 = L.Convolution(n.lstm1_resh2, kernel_size=1, stride=1, num_output=512, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.qatt_relu = L.ReLU(n.qatt_conv1)
    n.qatt_conv2 = L.Convolution(n.qatt_relu, kernel_size=1, stride=1,
                                 num_output=config.NUM_QUESTION_GLIMPSE, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.qatt_reshape = L.Reshape(n.qatt_conv2, reshape_param=dict(shape=dict(
        dim=[-1,config.NUM_QUESTION_GLIMPSE,config.MAX_WORDS_IN_QUESTION,1]))) # N*NUM_QUESTION_GLIMPSE*15
    n.qatt_softmax = L.Softmax(n.qatt_reshape, axis=2)

    qatt_maps = L.Slice(n.qatt_softmax, ntop=config.NUM_QUESTION_GLIMPSE, slice_param={'axis':1})
    dummy_lstm = L.DummyData(shape=dict(dim=[batchsize, 1]),
                             data_filler=dict(type='constant', value=1), ntop=1)
    qatt_feature_list = []
    # `range` instead of the Python-2-only `xrange`, so the builder runs on
    # both Python 2 and Python 3.
    for i in range(config.NUM_QUESTION_GLIMPSE):
        # With a single glimpse the Slice result is used directly as the
        # attention map; with several it is indexed per glimpse.
        if config.NUM_QUESTION_GLIMPSE == 1:
            glimpse_map = qatt_maps
        else:
            glimpse_map = qatt_maps[i]
        qatt_feat = L.SoftAttention(n.lstm1_resh2, glimpse_map, dummy_lstm)
        setattr(n, 'qatt_feat%d' % i, qatt_feat)
        qatt_feature_list.append(qatt_feat)
    n.qatt_feat_concat = L.Concat(*qatt_feature_list)

    # ---- Image attention with MFB ----
    n.q_feat_resh = L.Reshape(n.qatt_feat_concat, reshape_param=dict(shape=dict(dim=[0,-1,1,1])))
    n.i_feat_resh = L.Reshape(n.img_feature, reshape_param=dict(shape=dict(
        dim=[0,-1,config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH])))

    # Project the question feature and tile it over the image grid.
    n.iatt_q_proj = L.InnerProduct(n.q_feat_resh, num_output=config.JOINT_EMB_SIZE,
                                   weight_filler=dict(type='xavier'))
    n.iatt_q_resh = L.Reshape(n.iatt_q_proj, reshape_param=dict(shape=dict(
        dim=[-1,config.JOINT_EMB_SIZE,1,1])))
    n.iatt_q_tile1 = L.Tile(n.iatt_q_resh, axis=2, tiles=config.IMG_FEAT_WIDTH)
    n.iatt_q_tile2 = L.Tile(n.iatt_q_tile1, axis=3, tiles=config.IMG_FEAT_WIDTH)

    n.iatt_i_conv = L.Convolution(n.i_feat_resh, kernel_size=1, stride=1,
                                  num_output=config.JOINT_EMB_SIZE, pad=0,
                                  weight_filler=dict(type='xavier'))
    n.iatt_i_resh1 = L.Reshape(n.iatt_i_conv, reshape_param=dict(shape=dict(
        dim=[-1,config.JOINT_EMB_SIZE,config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH])))
    # operation=0 is the element-wise product (PROD) in stock Caffe's
    # EltwiseParameter enum.
    n.iatt_iq_eltwise = L.Eltwise(n.iatt_q_tile2, n.iatt_i_resh1, eltwise_param=dict(operation=0))
    n.iatt_iq_droped = L.Dropout(n.iatt_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
    # This reshape used to be registered as `iatt_iq_resh2`, the same name
    # as the reshape after the permute. NetSpec keeps only the last top
    # bound to a name, so this layer ended up with an auto-generated name.
    # It now has its own explicit name.
    n.iatt_iq_resh1 = L.Reshape(n.iatt_iq_droped, reshape_param=dict(shape=dict(
        dim=[-1,config.JOINT_EMB_SIZE,config.IMG_FEAT_SIZE,1])))
    n.iatt_iq_permute1 = L.Permute(n.iatt_iq_resh1, permute_param=dict(order=[0,2,1,3]))
    n.iatt_iq_resh2 = L.Reshape(n.iatt_iq_permute1, reshape_param=dict(shape=dict(
        dim=[-1,config.IMG_FEAT_SIZE,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
    # Sum-pool across the MFB factor axis.
    n.iatt_iq_sumpool = L.Pooling(n.iatt_iq_resh2, pool=P.Pooling.SUM,
                                  pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.iatt_iq_permute2 = L.Permute(n.iatt_iq_sumpool, permute_param=dict(order=[0,2,1,3]))

    n.iatt_iq_sqrt = L.SignedSqrt(n.iatt_iq_permute2)
    n.iatt_iq_l2 = L.L2Normalize(n.iatt_iq_sqrt)

    ## 2 conv layers 1000 -> 512 -> 2
    n.iatt_conv1 = L.Convolution(n.iatt_iq_l2, kernel_size=1, stride=1, num_output=512, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.iatt_relu = L.ReLU(n.iatt_conv1)
    n.iatt_conv2 = L.Convolution(n.iatt_relu, kernel_size=1, stride=1,
                                 num_output=config.NUM_IMG_GLIMPSE, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.iatt_resh = L.Reshape(n.iatt_conv2, reshape_param=dict(shape=dict(
        dim=[-1,config.NUM_IMG_GLIMPSE,config.IMG_FEAT_SIZE])))
    n.iatt_softmax = L.Softmax(n.iatt_resh, axis=2)
    n.iatt_softmax_resh = L.Reshape(n.iatt_softmax, reshape_param=dict(shape=dict(
        dim=[-1,config.NUM_IMG_GLIMPSE,config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH])))
    iatt_maps = L.Slice(n.iatt_softmax_resh, ntop=config.NUM_IMG_GLIMPSE, slice_param={'axis':1})
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1), ntop=1)
    iatt_feature_list = []
    for i in range(config.NUM_IMG_GLIMPSE):
        # Same single-glimpse special case as for the question attention.
        if config.NUM_IMG_GLIMPSE == 1:
            glimpse_map = iatt_maps
        else:
            glimpse_map = iatt_maps[i]
        iatt_feat = L.SoftAttention(n.i_feat_resh, glimpse_map, dummy)
        setattr(n, 'iatt_feat%d' % i, iatt_feat)
        iatt_feat_resh = L.Reshape(iatt_feat,
                                   reshape_param=dict(shape=dict(dim=[0,-1])))
        setattr(n, 'iatt_feat%d_resh' % i, iatt_feat_resh)
        iatt_feature_list.append(iatt_feat_resh)
    n.iatt_feat_concat = L.Concat(*iatt_feature_list)
    n.iatt_feat_concat_resh = L.Reshape(n.iatt_feat_concat, reshape_param=dict(shape=dict(dim=[0,-1,1,1])))

    # ---- Fine-grained image-question MFB fusion ----
    n.mfb_q_proj = L.InnerProduct(n.q_feat_resh, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_i_proj = L.InnerProduct(n.iatt_feat_concat_resh, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_iq_eltwise = L.Eltwise(n.mfb_q_proj, n.mfb_i_proj, eltwise_param=dict(operation=0))
    n.mfb_iq_drop = L.Dropout(n.mfb_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
    n.mfb_iq_resh = L.Reshape(n.mfb_iq_drop, reshape_param=dict(shape=dict(
        dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
    n.mfb_iq_sumpool = L.Pooling(n.mfb_iq_resh, pool=P.Pooling.SUM,
                                 pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_out = L.Reshape(n.mfb_iq_sumpool,
                          reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
    n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out)
    n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt)

    n.prediction = L.InnerProduct(n.mfb_l2, num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'))
    if mode == 'val':
        n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    else:
        n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
    return n.to_proto()
Пример #18
0
def generator_proto(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size, use_gt=True):
    """Assemble the VQA + explanation-attention network spec.

    The VQA branch (question embedding, two stacked LSTMs, image embedding,
    attention and answer classifier) passes the module-level
    ``fixed_weights`` / ``fixed_weights_lstm`` as ``param`` to its learnable
    layers. The explanation branch embeds an answer, fuses it with the
    question-image product and computes a second attended image feature,
    which is finally silenced.

    Args:
        mode: forwarded to the data layer inside its JSON ``param_str``.
        batchsize: forwarded to the data layer and used as the first
            dimension of the constant DummyData inputs.
        T: number of slices taken along axis 0 of each LSTM output; all but
            the last slice are silenced.
        exp_T: accepted for signature compatibility; not referenced here.
        question_vocab_size: ``input_dim`` of the question embedding.
        exp_vocab_size: accepted for signature compatibility; not
            referenced here.
        use_gt: when true the ground-truth label is embedded as the answer;
            otherwise the ArgMax of the VQA prediction is embedded.

    Returns:
        Whatever ``caffe.NetSpec.to_proto()`` returns for the assembled net.
    """
    n = caffe.NetSpec()

    # ---- layer factories; `frozen` adds param=fixed_weights ----
    def conv1x1(bottom, channels, frozen):
        options = dict(kernel_size=1, stride=1, num_output=channels, pad=0,
                       weight_filler=dict(type='xavier'))
        if frozen:
            options['param'] = fixed_weights
        return L.Convolution(bottom, **options)

    def dense(bottom, units, frozen):
        options = dict(num_output=units, weight_filler=dict(type='xavier'))
        if frozen:
            options['param'] = fixed_weights
        return L.InnerProduct(bottom, **options)

    def reshape_to(bottom, dims):
        return L.Reshape(bottom, reshape_param=dict(shape=dict(dim=dims)))

    def drop(bottom):
        return L.Dropout(bottom, dropout_param={'dropout_ratio': 0.3})

    def multiply(left, right):
        return L.Eltwise(left, right, eltwise_param={'operation': P.Eltwise.PROD})

    def question_lstm(sequence):
        return L.LSTM(sequence, n.cont,
                      recurrent_param=dict(
                          num_output=1024,
                          weight_filler=dict(type='uniform', min=-0.08, max=0.08),
                          bias_filler=dict(type='constant', value=0)),
                      param=fixed_weights_lstm)

    def ones_column():
        return L.DummyData(shape=dict(dim=[batchsize, 1]),
                           data_filler=dict(type='constant', value=1), ntop=1)

    def last_step(lstm_top, tag):
        # Slice the LSTM output into T pieces along axis 0, register and
        # silence the first T-1 of them, and return the final piece.
        steps = L.Slice(lstm_top, ntop=T, slice_param={'axis': 0})
        for idx in range(T - 1):
            setattr(n, 'slice_' + tag + str(idx), steps[idx])
            setattr(n, 'silence_data_' + tag + str(idx),
                    L.Silence(steps[idx], ntop=0))
        return steps[T - 1]

    def attention_map(prefix, bottom, frozen):
        # conv(512) -> ReLU -> conv(1) -> softmax over the 14*14 positions,
        # registered under '<prefix>att_*' names; returns the 14x14 map.
        hidden = conv1x1(bottom, 512, frozen)
        setattr(n, prefix + 'att_conv1', hidden)
        activated = L.ReLU(hidden)
        setattr(n, prefix + 'att_conv1_relu', activated)
        logits = conv1x1(activated, 1, frozen)
        setattr(n, prefix + 'att_conv2', logits)
        flat = reshape_to(logits, [-1, 1, 14 * 14])
        setattr(n, prefix + 'att_reshaped', flat)
        weights = L.Softmax(flat, axis=2)
        setattr(n, prefix + 'att_softmax', weights)
        grid = reshape_to(weights, [-1, 1, 14, 14])
        setattr(n, prefix + 'att_map', grid)
        return grid

    def normalized_dropout(prefix, bottom):
        # signed sqrt -> L2 normalize -> dropout, registered as
        # '<prefix>_sqrt', '<prefix>_l2' and '<prefix>_drop'.
        rooted = L.SignedSqrt(bottom)
        setattr(n, prefix + '_sqrt', rooted)
        normed = L.L2Normalize(rooted)
        setattr(n, prefix + '_l2', normed)
        dropped = drop(normed)
        setattr(n, prefix + '_drop', dropped)
        return dropped

    # ---- data ----
    mode_str = json.dumps({'mode':mode, 'batchsize':batchsize})
    (n.data, n.cont, n.img_feature, n.label,
     n.exp, n.exp_out, n.exp_cont_1, n.exp_cont_2) = L.Python(
         module='vqa_data_provider_layer',
         layer='VQADataProviderLayer',
         param_str=mode_str,
         ntop=8)

    # ---- question embedding ----
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                         weight_filler=dict(type='uniform', min=-0.08, max=0.08),
                         param=fixed_weights)
    n.embed = L.TanH(n.embed_ba)

    # ---- LSTM1: keep the last step, feed the full sequence to LSTM2 ----
    n.lstm1 = question_lstm(n.embed)
    n.lstm1_out = last_step(n.lstm1, 'first')
    n.lstm1_reshaped = reshape_to(n.lstm1_out, [-1, 1024])
    n.lstm1_reshaped_droped = drop(n.lstm1_reshaped)
    n.lstm1_droped = drop(n.lstm1)

    # ---- LSTM2 ----
    n.lstm2 = question_lstm(n.lstm1_droped)
    n.lstm2_out = last_step(n.lstm2, 'second')
    n.lstm2_reshaped = reshape_to(n.lstm2_out, [-1, 1024])
    n.lstm2_reshaped_droped = drop(n.lstm2_reshaped)
    n.lstm_12 = L.Concat(n.lstm1_reshaped_droped, n.lstm2_reshaped_droped)

    # ---- tile the question feature over the 14x14 grid ----
    n.q_emb_resh = reshape_to(n.lstm_12, [-1, 2048, 1, 1])
    n.q_emb_tiled_1 = L.Tile(n.q_emb_resh, axis=2, tiles=14)
    n.q_emb_resh_tiled = L.Tile(n.q_emb_tiled_1, axis=3, tiles=14)

    # ---- embed the image feature ----
    n.i_emb = conv1x1(n.img_feature, 2048, True)

    # ---- element-wise product and normalization ----
    n.eltwise = multiply(n.q_emb_resh_tiled, n.i_emb)
    fused = normalized_dropout('eltwise', n.eltwise)

    # ---- attention for VQA ----
    vqa_map = attention_map('', fused, True)
    n.att_feature = L.SoftAttention(n.img_feature, vqa_map, ones_column())
    n.att_feature_resh = reshape_to(n.att_feature, [-1, 2048])

    # ---- second product + normalization for VQA ----
    n.i_emb2 = dense(n.att_feature_resh, 2048, True)
    n.eltwise2 = multiply(n.lstm_12, n.i_emb2)
    fused2 = normalized_dropout('eltwise2', n.eltwise2)

    n.prediction = dense(fused2, 3000, True)

    # ---- answer to embed: ground truth, or the VQA model's own argmax ----
    if use_gt:
        answer_ids = n.label
    else:
        n.vqa_ans = L.ArgMax(n.prediction, axis=1)
        answer_ids = n.vqa_ans
    n.exp_emb_ans = L.Embed(answer_ids, input_dim=3000, num_output=300,
                            weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.exp_emb_ans_tanh = L.TanH(n.exp_emb_ans)
    n.exp_emb_ans2 = dense(n.exp_emb_ans_tanh, 2048, False)

    # ---- merge the answer with the visual + textual feature ----
    n.exp_emb_resh = reshape_to(n.exp_emb_ans2, [-1, 2048, 1, 1])
    n.exp_emb_tiled_1 = L.Tile(n.exp_emb_resh, axis=2, tiles=14)
    n.exp_emb_tiled = L.Tile(n.exp_emb_tiled_1, axis=3, tiles=14)

    # The un-normalized question-image product is re-embedded before it is
    # multiplied with the tiled answer embedding.
    n.eltwise_emb = conv1x1(n.eltwise, 2048, False)
    n.exp_eltwise = multiply(n.eltwise_emb, n.exp_emb_tiled)
    exp_fused = normalized_dropout('exp_eltwise', n.exp_eltwise)

    # ---- attention for the explanation ----
    exp_map = attention_map('exp_', exp_fused, False)
    n.exp_att_feature_prev = L.SoftAttention(n.img_feature, exp_map, ones_column())
    n.exp_att_feature_resh = reshape_to(n.exp_att_feature_prev, [-1, 2048])
    n.exp_att_feature_embed = dense(n.exp_att_feature_resh, 2048, False)
    n.exp_lstm12_embed = dense(n.lstm_12, 2048, False)
    n.exp_eltwise2 = multiply(n.exp_lstm12_embed, n.exp_att_feature_embed)
    n.exp_att_feature = multiply(n.exp_emb_ans2, n.exp_eltwise2)

    n.silence_exp_att = L.Silence(n.exp_att_feature, ntop=0)

    return n.to_proto()
Пример #19
0
def qlstm(mode, batchsize, T, question_vocab_size):
    """Assemble the gated-convolution question encoder + two-map attention net.

    The question tokens are embedded, masked by ``cont``, and passed through
    parallel (activation, gate) convolutions of kernel heights 2..5. Each
    gated response is max-pooled and the four results are concatenated into a
    question vector. That vector is combined with the image feature through
    compact bilinear pooling to predict two soft-attention maps, and the
    attended image features are merged with the question vector a second time
    to produce the answer logits and a softmax loss.

    Args:
        mode: Passed through (JSON-encoded) to the Python data layer.
        batchsize: Passed to the data layer and used as the leading dimension
            of the embedding reshape and of the DummyData blob.
        T: Sequence length used in the embedding reshape and to size the
            pooling windows.
        question_vocab_size: ``input_dim`` of the word Embed layer.

    Returns:
        Whatever ``caffe.NetSpec.to_proto()`` returns for the assembled spec.
    """
    spec = caffe.NetSpec()
    provider_cfg = json.dumps({'mode': mode, 'batchsize': batchsize})
    spec.data, spec.cont, spec.img_feature, spec.label = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=provider_cfg,
        ntop=4)

    # Word embedding, scaled by the continuation markers and laid out as a
    # single-channel T x 300 "image" for the convolutions below.
    spec.embed_ba = L.Embed(
        spec.data,
        input_dim=question_vocab_size,
        num_output=300,
        weight_filler={'type': 'uniform', 'min': -0.08, 'max': 0.08})
    spec.embed_scale = L.Scale(spec.embed_ba,
                               spec.cont,
                               scale_param={'axis': 0})
    spec.embed_scale_resh = L.Reshape(
        spec.embed_scale,
        reshape_param={'shape': {'dim': [batchsize, 1, T, 300]}})

    # One (activation, gate) convolution pair per n-gram width. Tops are
    # registered in the same order as a hand-unrolled version would use:
    # all convolutions first, then all TanH, all Sigmoid, all products and
    # finally all poolings.
    widths = (2, 3, 4, 5)
    for k in widths:
        for suffix in ('', '_g'):
            setattr(
                spec, 'word_feature_{}{}'.format(k, suffix),
                L.Convolution(spec.embed_scale_resh,
                              kernel_h=k,
                              kernel_w=300,
                              stride=1,
                              num_output=512,
                              pad_h=k - 1,
                              pad_w=0,
                              weight_filler={'type': 'xavier'}))

    for k in widths:
        setattr(spec, 'word_{}_acti'.format(k),
                L.TanH(getattr(spec, 'word_feature_{}'.format(k))))

    for k in widths:
        setattr(spec, 'word_{}_gate'.format(k),
                L.Sigmoid(getattr(spec, 'word_feature_{}_g'.format(k))))

    for k in widths:
        setattr(
            spec, 'word_{}'.format(k),
            L.Eltwise(getattr(spec, 'word_{}_acti'.format(k)),
                      getattr(spec, 'word_{}_gate'.format(k)),
                      operation=P.Eltwise.PROD))

    for k in widths:
        # Pooling window is T + (k - 1), matching the pad_h = k - 1 used by
        # the corresponding convolution.
        window = T + k - 1
        setattr(
            spec, 'word_vec_{}'.format(k),
            L.Pooling(getattr(spec, 'word_{}'.format(k)),
                      kernel_h=window,
                      kernel_w=1,
                      stride=window,
                      pool=P.Pooling.MAX))

    pooled = [getattr(spec, 'word_vec_{}'.format(k)) for k in widths]
    spec.concat_vec = L.Concat(*pooled,
                               concat_param={'axis': 1})  # N x 4*d_w x 1 x 1

    spec.concat_vec_dropped = L.Dropout(spec.concat_vec,
                                        dropout_param={'dropout_ratio': 0.5})

    # Tile the question vector over the 14 x 14 grid the image feature is
    # reshaped to, then fuse the two with compact bilinear pooling.
    grid = 14
    spec.q_emb_tanh_droped_resh_tiled_1 = L.Tile(spec.concat_vec_dropped,
                                                 axis=2,
                                                 tiles=grid)
    spec.q_emb_tanh_droped_resh_tiled = L.Tile(
        spec.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=grid)
    spec.i_emb_tanh_droped_resh = L.Reshape(
        spec.img_feature,
        reshape_param={'shape': {'dim': [-1, 2048, grid, grid]}})
    spec.blcf = L.CompactBilinear(
        spec.q_emb_tanh_droped_resh_tiled,
        spec.i_emb_tanh_droped_resh,
        compact_bilinear_param={'num_output': 16000, 'sum_pool': False})
    spec.blcf_sign_sqrt = L.SignedSqrt(spec.blcf)
    spec.blcf_sign_sqrt_l2 = L.L2Normalize(spec.blcf_sign_sqrt)
    spec.blcf_droped = L.Dropout(spec.blcf_sign_sqrt_l2,
                                 dropout_param={'dropout_ratio': 0.1})

    # Multi-channel attention: two maps, each softmax-normalised over the
    # flattened grid.
    spec.att_conv1 = L.Convolution(spec.blcf_droped,
                                   kernel_size=1,
                                   stride=1,
                                   num_output=512,
                                   pad=0,
                                   weight_filler={'type': 'xavier'})
    spec.att_conv1_relu = L.ReLU(spec.att_conv1)
    spec.att_conv2 = L.Convolution(spec.att_conv1_relu,
                                   kernel_size=1,
                                   stride=1,
                                   num_output=2,
                                   pad=0,
                                   weight_filler={'type': 'xavier'})
    spec.att_reshaped = L.Reshape(
        spec.att_conv2,
        reshape_param={'shape': {'dim': [-1, 2, grid * grid]}})
    spec.att_softmax = L.Softmax(spec.att_reshaped, axis=2)
    spec.att = L.Reshape(
        spec.att_softmax,
        reshape_param={'shape': {'dim': [-1, 2, grid, grid]}})
    spec.att_map0, spec.att_map1 = L.Slice(spec.att,
                                           ntop=2,
                                           slice_param={'axis': 1})

    # Constant-one blob passed as the third SoftAttention input.
    ones = L.DummyData(shape={'dim': [batchsize, 1]},
                       data_filler={'type': 'constant', 'value': 1},
                       ntop=1)
    glimpses = (0, 1)
    for g in glimpses:
        setattr(
            spec, 'att_feature{}'.format(g),
            L.SoftAttention(spec.i_emb_tanh_droped_resh,
                            getattr(spec, 'att_map{}'.format(g)), ones))
    for g in glimpses:
        setattr(
            spec, 'att_feature{}_resh'.format(g),
            L.Reshape(getattr(spec, 'att_feature{}'.format(g)),
                      reshape_param={'shape': {'dim': [-1, 2048]}}))
    spec.att_feature = L.Concat(spec.att_feature0_resh,
                                spec.att_feature1_resh)

    # Merge the attended image feature with the question vector through a
    # second compact bilinear pooling.
    spec.att_feature_resh = L.Reshape(
        spec.att_feature,
        reshape_param={'shape': {'dim': [-1, 4096, 1, 1]}})
    spec.bc_att_lstm = L.CompactBilinear(
        spec.att_feature_resh,
        spec.concat_vec_dropped,
        compact_bilinear_param={'num_output': 16000, 'sum_pool': False})
    spec.bc_sign_sqrt = L.SignedSqrt(spec.bc_att_lstm)
    spec.bc_sign_sqrt_l2 = L.L2Normalize(spec.bc_sign_sqrt)

    spec.bc_dropped = L.Dropout(spec.bc_sign_sqrt_l2,
                                dropout_param={'dropout_ratio': 0.1})
    spec.bc_dropped_resh = L.Reshape(
        spec.bc_dropped, reshape_param={'shape': {'dim': [-1, 16000]}})

    # Classifier over 3000 outputs and the training loss.
    spec.prediction = L.InnerProduct(spec.bc_dropped_resh,
                                     num_output=3000,
                                     weight_filler={'type': 'xavier'})
    spec.loss = L.SoftmaxWithLoss(spec.prediction, spec.label)
    return spec.to_proto()
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the two-layer-LSTM + GloVe question encoder with two-map attention.

    The question is embedded (learned embedding concatenated with a GloVe
    input from the data layer), run through two stacked LSTMs, and the last
    time step of each LSTM is concatenated into a 2048-d question vector.
    That vector is fused with the image feature by compact bilinear pooling
    to predict two soft-attention maps; the attended image features are then
    fused with the question vector again to produce answer logits and a
    softmax loss.

    Args:
        mode: Passed through (JSON-encoded) to the Python data layer.
        batchsize: Passed to the data layer and used as the leading dimension
            of the DummyData blob.
        T: Number of time steps; the LSTM outputs are sliced into ``T`` tops
            along axis 0 and only the last one is kept.
        question_vocab_size: ``input_dim`` of the word Embed layer.

    Returns:
        Whatever ``caffe.NetSpec.to_proto()`` returns for the assembled spec.
    """
    # NetSpec builds the network from Python, without a hand-written prototxt.
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})

    # Python data layer; see
    # https://stackoverflow.com/questions/41344168/what-is-a-python-layer-in-caffe
    # The layer is created from the named class, and the actual batch loading
    # happens inside that class.
    #
    # GloVe = Global Vectors for Word Representation
    # https://www.aclweb.org/anthology/D14-1162
    # The pretrained GloVe vectors are concatenated with the learned embedding.
    #
    # NOTE(review): img_feature is presumably an already-preprocessed
    # (ResNet + L2-normalised) feature vector -- confirm against the provider.
    #
    #   module    = name of the Python file
    #   layer     = Python class implementing the layer interface
    #   param_str = JSON parameters for data loading (presumably available
    #               inside the layer class as self.param_str)
    #   ntop      = number of top blobs the layer produces
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=5)

    # Learned word embedding: maps each of question_vocab_size token ids to a
    # compact 300-d vector.
    n.embed_ba = L.Embed(n.data,
                         input_dim=question_vocab_size,
                         num_output=300,
                         weight_filler=dict(type='uniform',
                                            min=-0.08,
                                            max=0.08))
    # Apply TanH.
    n.embed = L.TanH(n.embed_ba)
    # Concatenate with the GloVe input.
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(n.concat_embed,
                     n.cont,
                     recurrent_param=dict(
                         num_output=1024,
                         weight_filler=dict(type='uniform',
                                            min=-0.08,
                                            max=0.08),
                         bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    # `range` (not the Python-2-only `xrange`) so this also runs on Python 3.
    # Every time step but the last is named and routed into a Silence layer.
    for i in range(T - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i),
                L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = L.LSTM(n.lstm1_droped,
                     n.cont,
                     recurrent_param=dict(
                         num_output=1024,
                         weight_filler=dict(type='uniform',
                                            min=-0.08,
                                            max=0.08),
                         bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})

    # See https://www.programcreek.com/python/example/107865/caffe.NetSpec
    # Give each tops2[i] an explicit name of the form `slice_second<i>`.
    for i in range(T - 1):
        setattr(n, 'slice_second' + str(i), tops2[i])
        setattr(n, 'silence_data_second' + str(i),
                L.Silence(tops2[i], ntop=0))

    # Use the output of the last LSTM time step.
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    # Question vector: last step of LSTM1 and of LSTM2, each reshaped to
    # 1024-d and passed through dropout, then concatenated.
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # Tile the question vector explicitly along axes 2 and 3 (14 copies each)
    # so it lines up with the 14 x 14 image feature reshape below.
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)

    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))

    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    # Unlike the figure in the paper, a Dropout is added here.
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    # Unlike the figure in the paper, the output has 2 channels.
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    # Softmax over the flattened grid yields two 14 x 14 attention maps.
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)

    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    # Slice the two attention maps apart.
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]

    # Constant-one blob passed as the third SoftAttention input.
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    # Concatenate the two attended features.
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    # The concatenated attention feature is reshaped to 4096 channels first.
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))

    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))

    # Unlike the paper, the two inputs have different sizes:
    #   paper: 2048 and 2048
    #   code:  4096 and 2048
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.lstm_12_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    # Signed square root
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    # L2 normalisation
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    # Dropout
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    # Fully connected classifier
    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))

    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
Пример #21
0
 def net():
     """Build a spec with one Input layer whose output is tiled 3x on axis 3.

     The Input blob shape is read from ``data_shape`` in the enclosing scope.
     Returns the result of ``NetSpec.to_proto()``.
     """
     spec = caffe.NetSpec()
     input_shape = {'dim': data_shape}
     spec.data = L.Input(input_param={'shape': input_shape})
     spec.dataout = L.Tile(spec.data, axis=3, tiles=3)
     return spec.to_proto()
Пример #22
0
def qlstm(mode, batchsize, T, T_c, question_c_vocab_size, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.data1, n.cont1, n.img_feature, n.label = L.Python(\
        module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=6)#5 )

    # char embedding
    n.embed_c = L.Embed(n.data1, input_dim=question_c_vocab_size, num_output=15, \
         weight_filler=dict(type='uniform',min=-0.08,max=0.08))
    n.embed_c_scale = L.Scale(n.embed_c,
                              n.cont1,
                              scale_param=dict(dict(axis=0)))
    n.embed_c_scale_resh = L.Reshape(
        n.embed_c_scale,
        reshape_param=dict(shape=dict(dim=[batchsize, 1, T_c *
                                           T, -1])))  # N x 1 x T_c x d_c
    tops = L.Slice(n.embed_c_scale_resh, ntop=T, slice_param={'axis': 2})
    for i in xrange(T):
        n.__setattr__('slice_' + str(i + 1), tops[int(i)])

    # char conv
    n.c_feature_1 = L.Convolution(
        n.slice_1,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_2 = L.Convolution(
        n.slice_2,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_3 = L.Convolution(
        n.slice_3,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_4 = L.Convolution(
        n.slice_4,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_5 = L.Convolution(
        n.slice_5,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_6 = L.Convolution(
        n.slice_6,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_7 = L.Convolution(
        n.slice_7,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_8 = L.Convolution(
        n.slice_8,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_9 = L.Convolution(
        n.slice_9,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_10 = L.Convolution(
        n.slice_10,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_11 = L.Convolution(
        n.slice_11,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_12 = L.Convolution(
        n.slice_12,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_13 = L.Convolution(
        n.slice_13,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_14 = L.Convolution(
        n.slice_14,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_15 = L.Convolution(
        n.slice_15,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_16 = L.Convolution(
        n.slice_16,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_17 = L.Convolution(
        n.slice_17,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_18 = L.Convolution(
        n.slice_18,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_19 = L.Convolution(
        n.slice_19,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_20 = L.Convolution(
        n.slice_20,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_21 = L.Convolution(
        n.slice_21,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_22 = L.Convolution(
        n.slice_22,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])

    n.c_vec_1 = L.Pooling(n.c_feature_1,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_2 = L.Pooling(n.c_feature_2,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_3 = L.Pooling(n.c_feature_3,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_4 = L.Pooling(n.c_feature_4,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_5 = L.Pooling(n.c_feature_5,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_6 = L.Pooling(n.c_feature_6,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_7 = L.Pooling(n.c_feature_7,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_8 = L.Pooling(n.c_feature_8,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_9 = L.Pooling(n.c_feature_9,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_10 = L.Pooling(n.c_feature_10,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_11 = L.Pooling(n.c_feature_11,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_12 = L.Pooling(n.c_feature_12,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_13 = L.Pooling(n.c_feature_13,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_14 = L.Pooling(n.c_feature_14,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_15 = L.Pooling(n.c_feature_15,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_16 = L.Pooling(n.c_feature_16,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_17 = L.Pooling(n.c_feature_17,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_18 = L.Pooling(n.c_feature_18,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_19 = L.Pooling(n.c_feature_19,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_20 = L.Pooling(n.c_feature_20,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_21 = L.Pooling(n.c_feature_21,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_22 = L.Pooling(n.c_feature_22,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)

    n.c_embed_1 = L.Reshape(
        n.c_vec_1, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_2 = L.Reshape(
        n.c_vec_2, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_3 = L.Reshape(
        n.c_vec_3, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_4 = L.Reshape(
        n.c_vec_4, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_5 = L.Reshape(
        n.c_vec_5, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_6 = L.Reshape(
        n.c_vec_6, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_7 = L.Reshape(
        n.c_vec_7, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_8 = L.Reshape(
        n.c_vec_8, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_9 = L.Reshape(
        n.c_vec_9, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_10 = L.Reshape(
        n.c_vec_10, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_11 = L.Reshape(
        n.c_vec_11, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_12 = L.Reshape(
        n.c_vec_12, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_13 = L.Reshape(
        n.c_vec_13, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_14 = L.Reshape(
        n.c_vec_14, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_15 = L.Reshape(
        n.c_vec_15, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_16 = L.Reshape(
        n.c_vec_16, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_17 = L.Reshape(
        n.c_vec_17, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_18 = L.Reshape(
        n.c_vec_18, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_19 = L.Reshape(
        n.c_vec_19, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_20 = L.Reshape(
        n.c_vec_20, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_21 = L.Reshape(
        n.c_vec_21, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_22 = L.Reshape(
        n.c_vec_22, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))

    concat_c_embed = [n.c_embed_1, n.c_embed_2, n.c_embed_3, n.c_embed_4, n.c_embed_5, n.c_embed_6, n.c_embed_7, n.c_embed_8, n.c_embed_9, n.c_embed_10,\
     n.c_embed_11, n.c_embed_12, n.c_embed_13, n.c_embed_14, n.c_embed_15, n.c_embed_16, n.c_embed_17, n.c_embed_18, n.c_embed_19, n.c_embed_20, n.c_embed_21, n.c_embed_22]
    n.concat_char_embed = L.Concat(*concat_c_embed,
                                   concat_param={'axis': 1})  # N x T x d_c

    # word embedding
    n.embed_w = L.Embed(n.data, input_dim=question_vocab_size, num_output=150, \
        weight_filler=dict(type='uniform',min=-0.08,max=0.08)) # N x T x d_w

    # combine word and char embedding
    concat_word_embed = [n.embed_w, n.concat_char_embed]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # N x T x (d_c+d_w)

    n.embed_scale = L.Scale(n.concat_embed,
                            n.cont,
                            scale_param=dict(dict(axis=0)))
    n.embed_scale_resh = L.Reshape(
        n.embed_scale,
        reshape_param=dict(shape=dict(
            dim=[batchsize, 1, T, -1])))  # N x 1 x T x (d_c+d_w)

    # n.glove_scale = L.Scale(n.glove, n.cont, scale_param=dict(dict(axis=0)))
    # n.glove_scale_resh = L.Reshape(n.glove_scale,\
    #                       reshape_param=dict(\
    #                           shape=dict(dim=[batchsize,1,T,300])))
    # concat_word_embed = [n.embed_scale_resh, n.glove_scale_resh]
    # n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 1}) # N x 2 x T x 300

    # convolution
    n.word_feature_2 = L.Convolution(
        n.embed_scale_resh,
        kernel_h=2,
        kernel_w=300,
        stride=1,
        num_output=512,
        pad_h=1,
        pad_w=0,
        weight_filler=dict(type='xavier'))  # N x C x ? x 1
    n.word_feature_3 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=3,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=2,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_4 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=4,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=3,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_5 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=5,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=4,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_relu_2 = L.ReLU(n.word_feature_2)
    n.word_relu_3 = L.ReLU(n.word_feature_3)
    n.word_relu_4 = L.ReLU(n.word_feature_4)
    n.word_relu_5 = L.ReLU(n.word_feature_5)
    n.word_vec_2 = L.Pooling(n.word_relu_2,
                             kernel_h=T + 1,
                             kernel_w=1,
                             stride=T + 1,
                             pool=P.Pooling.MAX)  # N x C x 1 x 1
    n.word_vec_3 = L.Pooling(n.word_relu_3,
                             kernel_h=T + 2,
                             kernel_w=1,
                             stride=T + 2,
                             pool=P.Pooling.MAX)
    n.word_vec_4 = L.Pooling(n.word_relu_4,
                             kernel_h=T + 3,
                             kernel_w=1,
                             stride=T + 3,
                             pool=P.Pooling.MAX)
    n.word_vec_5 = L.Pooling(n.word_relu_5,
                             kernel_h=T + 4,
                             kernel_w=1,
                             stride=T + 4,
                             pool=P.Pooling.MAX)
    word_vec = [n.word_vec_2, n.word_vec_3, n.word_vec_4, n.word_vec_5]
    n.concat_vec = L.Concat(*word_vec, concat_param={'axis':
                                                     1})  # N x 4C x 1 x 1
    n.concat_vec_dropped = L.Dropout(n.concat_vec,
                                     dropout_param={'dropout_ratio': 0.5})

    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.concat_vec_dropped,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
Пример #23
0
def qlstm(mode, batchsize, T, question_vocab_size, embed_size):
    """Assemble the NetSpec for the pooled-word-embedding question model.

    Each question token is embedded, masked by ``cont`` and the ``T``
    positions are collapsed by a single-output 1x1 convolution whose
    weights are a constant 1 with no bias and zero learning rate (a fixed
    sum over the ``T`` channels; the layer is named ``embed_avg``).  The
    pooled question vector is combined with the 14x14 image feature map
    through CompactBilinear layers, first to produce two attention maps and
    then to produce the features fed to the 3000-way classifier.

    Args:
        mode: forwarded to the Python data layer inside ``param_str``.
        batchsize: batch size; used in ``param_str`` and in fixed reshapes.
        T: number of token positions per question.
        question_vocab_size: ``input_dim`` of the embedding layer.
        embed_size: ``num_output`` of the embedding layer.

    Returns:
        The result of ``NetSpec.to_proto()`` for the assembled net.
    """

    # Small factories for the layer shapes that repeat throughout the net.
    def reshape(blob, dims):
        return L.Reshape(blob, reshape_param=dict(shape=dict(dim=dims)))

    def xavier_conv1x1(blob, channels):
        return L.Convolution(blob,
                             kernel_size=1,
                             stride=1,
                             num_output=channels,
                             pad=0,
                             weight_filler=dict(type='xavier'))

    def dropout(blob, ratio):
        return L.Dropout(blob, dropout_param={'dropout_ratio': ratio})

    def compact_bilinear(first, second):
        return L.CompactBilinear(first,
                                 second,
                                 compact_bilinear_param=dict(
                                     num_output=16000, sum_pool=False))

    net = caffe.NetSpec()
    provider_cfg = json.dumps({'mode': mode, 'batchsize': batchsize})
    net.data, net.cont, net.img_feature, net.label = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=provider_cfg,
        ntop=4)

    # Token embedding, masked by the continuation markers.
    net.embed_ba = L.Embed(net.data,
                           input_dim=question_vocab_size,
                           num_output=embed_size,
                           weight_filler=dict(type='uniform',
                                              min=-0.08,
                                              max=0.08))
    net.embed_scale = L.Scale(net.embed_ba,
                              net.cont,
                              scale_param=dict(axis=0))
    net.embed_scale_resh = reshape(net.embed_scale,
                                   [batchsize, T, embed_size, 1])

    # Collapse the T channels with a frozen all-ones 1x1 convolution.
    net.embed_avg = L.Convolution(
        net.embed_scale_resh,
        convolution_param=dict(kernel_size=1,
                               num_output=1,
                               bias_term=False,
                               weight_filler=dict(type='constant', value=1)),
        param=dict(lr_mult=0, decay_mult=0))
    net.embed_avg_resh = reshape(net.embed_avg, [batchsize, embed_size, 1, 1])

    # Broadcast the question vector over the 14x14 grid and fuse with image.
    net.q_emb_tanh_droped_resh_tiled_1 = L.Tile(net.embed_avg_resh,
                                                axis=2,
                                                tiles=14)
    net.q_emb_tanh_droped_resh_tiled = L.Tile(
        net.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
    net.i_emb_tanh_droped_resh = reshape(net.img_feature, [-1, 2048, 14, 14])
    net.blcf = compact_bilinear(net.q_emb_tanh_droped_resh_tiled,
                                net.i_emb_tanh_droped_resh)
    net.blcf_sign_sqrt = L.SignedSqrt(net.blcf)
    net.blcf_sign_sqrt_l2 = L.L2Normalize(net.blcf_sign_sqrt)
    net.blcf_droped = dropout(net.blcf_sign_sqrt_l2, 0.1)

    # Two-channel attention: softmax over the 14*14 locations per channel.
    net.att_conv1 = xavier_conv1x1(net.blcf_droped, 512)
    net.att_conv1_relu = L.ReLU(net.att_conv1)
    net.att_conv2 = xavier_conv1x1(net.att_conv1_relu, 2)
    net.att_reshaped = reshape(net.att_conv2, [-1, 2, 14 * 14])
    net.att_softmax = L.Softmax(net.att_reshaped, axis=2)
    net.att = reshape(net.att_softmax, [-1, 2, 14, 14])
    net.att_map0, net.att_map1 = L.Slice(net.att,
                                         ntop=2,
                                         slice_param={'axis': 1})
    # Constant-one blob passed as the third SoftAttention input; it is not
    # attached to the NetSpec, so it keeps an auto-generated name.
    ones = L.DummyData(shape=dict(dim=[batchsize, 1]),
                       data_filler=dict(type='constant', value=1),
                       ntop=1)
    net.att_feature0 = L.SoftAttention(net.i_emb_tanh_droped_resh,
                                       net.att_map0, ones)
    net.att_feature1 = L.SoftAttention(net.i_emb_tanh_droped_resh,
                                       net.att_map1, ones)
    net.att_feature0_resh = reshape(net.att_feature0, [-1, 2048])
    net.att_feature1_resh = reshape(net.att_feature1, [-1, 2048])
    net.att_feature = L.Concat(net.att_feature0_resh, net.att_feature1_resh)

    # Fuse attended image features with the question vector.
    net.att_feature_resh = reshape(net.att_feature, [-1, 4096, 1, 1])
    net.bc_att_lstm = compact_bilinear(net.att_feature_resh,
                                       net.embed_avg_resh)
    net.bc_sign_sqrt = L.SignedSqrt(net.bc_att_lstm)
    net.bc_sign_sqrt_l2 = L.L2Normalize(net.bc_sign_sqrt)
    net.bc_dropped = dropout(net.bc_sign_sqrt_l2, 0.1)
    net.bc_dropped_resh = reshape(net.bc_dropped, [-1, 16000])

    # Classifier and loss.
    net.prediction = L.InnerProduct(net.bc_dropped_resh,
                                    num_output=3000,
                                    weight_filler=dict(type='xavier'))
    net.loss = L.SoftmaxWithLoss(net.prediction, net.label)
    return net.to_proto()
Пример #24
0
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the NetSpec for the two-layer LSTM question model with attention.

    The question is encoded by concatenating a learned 300-d embedding
    (after TanH) with the ``glove`` top of the data layer and running the
    result through two stacked LSTMs.  The last time step of each LSTM is
    kept, dropped out and concatenated into a 2048-d question vector, which
    is fused with the 14x14 image feature map through CompactBilinear layers
    to produce two attention maps and then the features for the 3000-way
    classifier.

    Args:
        mode: forwarded to the Python data layer inside ``param_str``.
        batchsize: batch size; used in ``param_str`` and the DummyData shape.
        T: number of time steps; the LSTM outputs are sliced into ``T`` tops.
        question_vocab_size: ``input_dim`` of the embedding layer.

    Returns:
        The result of ``NetSpec.to_proto()`` for the assembled net.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=5)

    n.embed_ba = L.Embed(n.data,
                         input_dim=question_vocab_size,
                         num_output=300,
                         weight_filler=dict(type='uniform',
                                            min=-0.08,
                                            max=0.08))
    n.embed = L.TanH(n.embed_ba)
    concat_word_embed = [n.embed, n.glove]
    # T x N x 600 per the original note (assumes glove is 300-d -- confirm
    # against the data provider).
    n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2})

    # LSTM1
    n.lstm1 = L.LSTM(n.concat_embed,
                     n.cont,
                     recurrent_param=dict(
                         num_output=1024,
                         weight_filler=dict(type='uniform',
                                            min=-0.08,
                                            max=0.08),
                         bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    # Only the last time step is used; the first T-1 slices are named and
    # handed to Silence layers.  `range` (not the Python-2-only `xrange`)
    # keeps this runnable on Python 3 as well.
    for i in range(T - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i),
                L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    # The full LSTM1 sequence (with dropout) is the input to LSTM2.
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = L.LSTM(n.lstm1_droped,
                     n.cont,
                     recurrent_param=dict(
                         num_output=1024,
                         weight_filler=dict(type='uniform',
                                            min=-0.08,
                                            max=0.08),
                         bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})
    for i in range(T - 1):
        setattr(n, 'slice_second' + str(i), tops2[i])
        setattr(n, 'silence_data_second' + str(i),
                L.Silence(tops2[i], ntop=0))
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    # Broadcast the question vector over the 14x14 grid and fuse with image.
    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    # Softmax over the 14*14 locations of each of the two attention channels.
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    # Constant-one blob passed as the third SoftAttention input; it is not
    # attached to the NetSpec, so it keeps an auto-generated name.
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.lstm_12_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    # Classifier and loss.
    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
Пример #25
0
    def lrcn_reinforce(self, save_name, RL_loss='lstm_classification', lw=20):
        """Build the reinforcement-learning caption net and write it out.

        Adds layers to ``self.n``: the Python input layer, an ``fc8`` inner
        product over the image data, the caption model (through
        ``self.make_caption_model``), a reward branch and two losses
        (``sum_rewards`` and ``sentence_loss``).  Finally calls
        ``self.write_net(save_name)``.

        Args:
            save_name: forwarded unchanged to ``self.write_net``.
            RL_loss: which reward branch to build.  Only
                ``'lstm_classification'`` is handled here; for any other
                value this method does not create the ``'reward'`` top
                before reading it.
            lw: NOTE(review): not read anywhere in this method (the
                sentence loss weight below is the literal 20) -- confirm
                whether it was meant to be wired in.

        Side effects:
            When ``self.separate_sents`` is set, inserts a default
            ``batch_size`` of 100 into ``self.param_str`` if missing and
            sets ``self.slice_point`` and ``self.batch_size``.
        """

        data_inputs = self.data_inputs
        param_str = self.param_str

        #reg sentences will be the first part of the batch
        if self.separate_sents:
            if 'batch_size' not in param_str:
                param_str['batch_size'] = 100
            # Floor division keeps slice_point an integer even under true
            # division (Python 3 or `from __future__ import division`); it
            # is passed as a Slice `slice_point` below.
            self.slice_point = param_str['batch_size'] // 2
            self.batch_size = param_str['batch_size']

        # Parameters for the Python reward layer.
        param_str_loss = {}
        param_str_loss['vocab'] = param_str['vocabulary']
        param_str_loss['avoid_words'] = ['red', 'small']
        if self.baseline:
            param_str_loss['baseline'] = True
        data_input = 'fc8'

        data_tops = self.python_input_layer(data_inputs['module'],
                                            data_inputs['layer'], param_str)
        self.rename_tops(data_tops, data_inputs['param_str']['top_names'])
        feature_name = 'fc8'
        self.n.tops[feature_name] = L.InnerProduct(
            self.n.tops[param_str['image_data_key']],
            num_output=1000,
            weight_filler=self.uniform_weight_filler(-.08, .08),
            bias_filler=self.constant_filler(0),
            param=self.init_params([[1, 1], [2, 0]]))

        if self.cc:
            #If class conditional
            data_top = self.n.tops['fc8']
            class_top = self.n.tops[param_str['data_label_feat']]
            self.n.tops['class_input'] = L.Concat(data_top, class_top, axis=1)
            data_input = 'class_input'
        else:
            self.silence(self.n.tops[param_str['data_label_feat']])

        bottom_sent = self.n.tops[param_str['text_data_key']]
        bottom_cont = self.n.tops[param_str['text_marker_key']]

        #prep for caption model
        bottom_cont_slice = L.Slice(bottom_cont, ntop=self.T, axis=0)
        self.rename_tops(bottom_cont_slice,
                         ['bottom_cont_%d' % i for i in range(self.T)])

        if not self.separate_sents:
            bottom_sent_slice = L.Slice(bottom_sent, ntop=self.T, axis=0)
            self.rename_tops(bottom_sent_slice,
                             ['input_sent_%d' % i for i in range(self.T)])
            target_sentence = self.n.tops['target_sentence']
        else:
            # Split the batch axis into the "reg" half and the "rl" half.
            bottom_sents = L.Slice(bottom_sent,
                                   slice_point=[self.slice_point],
                                   axis=1,
                                   ntop=2)
            self.rename_tops(bottom_sents, ['reg_input_sent', 'rl_input_sent'])
            # NOTE(review): ntop is the literal 20 here while the other
            # per-step slices use self.T -- confirm T is always 20.
            reg_bottom_sents_slice = L.Slice(self.n.tops['reg_input_sent'],
                                             axis=0,
                                             ntop=20)
            rl_bottom_sents_slice = L.Slice(self.n.tops['rl_input_sent'],
                                            axis=0,
                                            ntop=20)
            # Only step 0 of the rl half is used as input; the remaining
            # steps are silenced.
            self.silence([rl_bottom_sents_slice[i] for i in range(1, self.T)])
            self.n.tops['input_sent_0'] = L.Concat(reg_bottom_sents_slice[0],
                                                   rl_bottom_sents_slice[0],
                                                   axis=1)
            # NOTE(review): the same tops are renamed twice, first with
            # T-1 names and then with T names; kept as in the original
            # because rename_tops is not visible here -- confirm whether
            # the first call is needed.
            self.rename_tops(
                reg_bottom_sents_slice,
                ['reg_input_sent_%d' % i for i in range(1, self.T)])

            self.rename_tops(reg_bottom_sents_slice,
                             ['reg_input_sent_%d' % i for i in range(self.T)])
            slice_target_sentence = L.Slice(self.n.tops['target_sentence'],
                                            slice_point=[self.slice_point],
                                            axis=1,
                                            ntop=2)
            self.rename_tops(slice_target_sentence,
                             ['reg_target_sentence', 'rl_target_sentence'])
            self.silence(self.n.tops['rl_target_sentence'])
            target_sentence = self.n.tops['reg_target_sentence']

        # Zero-filled initial hidden/cell blobs for both LSTMs.
        self.n.tops['lstm1_h0'] = self.dummy_data_layer(
            [1, self.N, self.lstm_dim], 0)
        self.n.tops['lstm1_c0'] = self.dummy_data_layer(
            [1, self.N, self.lstm_dim], 0)
        self.n.tops['lstm2_h0'] = self.dummy_data_layer(
            [1, self.N, self.lstm_dim], 0)
        self.n.tops['lstm2_c0'] = self.dummy_data_layer(
            [1, self.N, self.lstm_dim], 0)

        self.make_caption_model(static_input=data_input)

        #prep bottoms for loss
        predict_tops = [self.n.tops['predict_%d' % i] for i in range(self.T)]
        self.n.tops['predict_concat'] = L.Concat(*predict_tops, axis=0)
        if self.separate_sents:
            word_sample_tops = [
                self.n.tops['rl_word_sample_reshape_%d' % i]
                for i in range(1, self.T + 1)
            ]
            self.n.tops['word_sample_concat'] = L.Concat(*word_sample_tops,
                                                         axis=0)
            # First half of the predictions feeds the sentence loss, second
            # half feeds the RL loss.
            concat_predict_tops = L.Slice(self.n.tops['predict_concat'],
                                          slice_point=[self.slice_point],
                                          axis=1,
                                          ntop=2)
            reg_predict = concat_predict_tops[0]
            RL_predict = concat_predict_tops[1]
            bottom_cont_tops = L.Slice(bottom_cont,
                                       slice_point=[self.slice_point],
                                       axis=1,
                                       ntop=2)
            self.silence(bottom_cont_tops[0])
            label_tops = L.Slice(self.n.tops[param_str['data_label']],
                                 slice_point=[self.slice_point],
                                 axis=0,
                                 ntop=2)
            self.silence(label_tops[0])
            self.rename_tops([bottom_cont_tops[1], label_tops[1]],
                             ['rl_bottom_cont', 'rl_label_top'])
            label_top = self.n.tops['rl_label_top']
            bottom_cont = self.n.tops['rl_bottom_cont']
        else:
            word_sample_tops = [
                self.n.tops['word_sample_reshape_%d' % i]
                for i in range(1, self.T + 1)
            ]
            self.n.tops['word_sample_concat'] = L.Concat(*word_sample_tops,
                                                         axis=0)
            reg_predict = self.n.tops['predict_concat']
            RL_predict = self.n.tops['predict_concat']
            label_top = self.n.tops[param_str['data_label']]

        #RL loss
        if RL_loss == 'lstm_classification':
            # The classifier layers are built with zero learning-rate
            # multipliers passed to init_params.
            self.n.tops['embed_classification'] = self.embed(
                self.n.tops['word_sample_concat'],
                1000,
                input_dim=self.vocab_size,
                bias_term=False,
                learning_param=self.init_params([[0, 0]]))
            self.n.tops['lstm_classification'] = self.lstm(
                self.n.tops['embed_classification'],
                bottom_cont,
                learning_param_lstm=self.init_params([[0, 0], [0, 0], [0, 0]]),
                lstm_hidden=1000)
            self.n.tops['predict_classification'] = L.InnerProduct(
                self.n.tops['lstm_classification'], num_output=200, axis=2)
            self.n.tops['probs_classification'] = L.Softmax(
                self.n.tops['predict_classification'], axis=2)
            #classification reward layer: classification, word_sample_concat (to get sentence length),
            #data label should be single stream; even though trained with 20 stream...
            self.n.tops['reward'] = self.python_layer([
                self.n.tops['probs_classification'],
                self.n.tops['word_sample_concat'], label_top
            ], 'loss_layers', 'sequenceClassificationLoss', param_str_loss)

        # Repeat the reward along axis 0 once per time step.
        self.n.tops['reward_reshape'] = L.Reshape(self.n.tops['reward'],
                                                  shape=dict(dim=[1, -1]))
        self.n.tops['reward_tile'] = L.Tile(self.n.tops['reward_reshape'],
                                            axis=0,
                                            tiles=self.T)

        #softmax with sampled words as "correct" word
        self.n.tops['sample_loss'] = self.softmax_per_inst_loss(
            RL_predict, self.n.tops['word_sample_concat'], axis=2)
        # operation=0 is EltwiseOp PROD in caffe.proto; propagate_down
        # is set for the sample loss only, not for the reward.
        self.n.tops['sample_reward'] = L.Eltwise(self.n.tops['sample_loss'],
                                                 self.n.tops['reward_tile'],
                                                 propagate_down=[1, 0],
                                                 operation=0)
        avoid_lw = 100
        self.n.tops['normalized_reward'] = L.Power(
            self.n.tops['sample_reward'], scale=(1. / self.N) * avoid_lw)
        self.n.tops['sum_rewards'] = L.Reduction(
            self.n.tops['normalized_reward'], loss_weight=[1])
        self.n.tops['sentence_loss'] = self.softmax_loss(reg_predict,
                                                         target_sentence,
                                                         axis=2,
                                                         loss_weight=20)

        self.write_net(save_name)