def fcn(obj_cls, part, split):
    """Build an FCN-16s part-segmentation net for one PASCAL-Part object class.

    Args:
        obj_cls: object class whose parts are segmented (forwarded to the data layer).
        part: part annotation set to use (forwarded to the data layer).
        split: dataset split name, e.g. 'train'/'val' (forwarded to the data layer).

    Returns:
        The caffe NetParameter protobuf for the assembled net.
    """
    n = caffe.NetSpec()
    # Python data layer yields (image, part-label) pairs; seed fixes its shuffling.
    n.data, n.label = L.Python(
        module='pascalpart_layers', layer='PASCALPartSegDataLayer', ntop=2,
        param_str=str(
            dict(voc_dir='/home/cv/hdl/caffe/data/pascal/VOC',
                 part_dir='/home/cv/hdl/caffe/data/pascal/pascal-part',
                 obj_cls=obj_cls, part=part, split=split, seed=1337)))
    # the base net (VGG-16 conv stack); pad=100 on the first conv gives enough
    # border context so the final crop can align scores with the input.
    n.conv1_1, n.relu1_1 = conv_relu(n.data, 64, pad=100)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64)
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128)
    n.pool2 = max_pool(n.relu2_2)
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256)
    n.pool3 = max_pool(n.relu3_3)
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512)
    n.pool4 = max_pool(n.relu4_3)
    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512)
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512)
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512)
    n.pool5 = max_pool(n.relu5_3)
    # fully conv: fc6/fc7 reinterpreted as convolutions, with dropout.
    n.fc6, n.relu6 = conv_relu(n.pool5, 4096, ks=7, pad=0)
    n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
    n.fc7, n.relu7 = conv_relu(n.drop6, 4096, ks=1, pad=0)
    n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)
    # 1x1 scoring conv; num_output=11 is the number of part classes here.
    n.score_fr = L.Convolution(
        n.drop7, num_output=11, kernel_size=1, pad=0,
        param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)])
    # 2x upsample (fixed weights: lr_mult=0), then fuse with pool4 scores.
    n.upscore2 = L.Deconvolution(n.score_fr,
        convolution_param=dict(num_output=11, kernel_size=4, stride=2,
            bias_term=False),
        param=[dict(lr_mult=0)])
    n.score_pool4 = L.Convolution(
        n.pool4, num_output=11, kernel_size=1, pad=0,
        param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)])
    n.score_pool4c = crop(n.score_pool4, n.upscore2)
    n.fuse_pool4 = L.Eltwise(n.upscore2, n.score_pool4c,
        operation=P.Eltwise.SUM)
    # 16x upsample back to (padded) input resolution, crop to input size.
    n.upscore16 = L.Deconvolution(n.fuse_pool4,
        convolution_param=dict(num_output=11, kernel_size=32, stride=16,
            bias_term=False),
        param=[dict(lr_mult=0)])
    n.score = crop(n.upscore16, n.data)
    # 255 marks unlabeled pixels and is excluded from the loss.
    n.loss = L.SoftmaxWithLoss(n.score, n.label,
        loss_param=dict(normalize=False, ignore_label=255))
    return n.to_proto()
def generate_model(split, config):
    """Build a two-branch (Siamese VGG-16) matching net.

    Branch 1 encodes ``image1`` (query), branch 2 encodes ``image2`` (search);
    both branches share weights via identical ``param_names``. The cropped
    query feature acts as a dynamic convolution filter over the search
    feature map, producing a correlation score map trained with a weighted
    sigmoid cross-entropy loss.

    Args:
        split: dataset split name forwarded to the data provider.
        config: experiment config; uses N (batch size), dataset,
            data_provider(_layer), fix_vgg, finetune, query_featmap_H/W.

    Returns:
        The caffe NetParameter protobuf.
    """
    n = caffe.NetSpec()
    dataset = config.dataset
    batch_size = config.N
    mode_str = str(dict(dataset=dataset, split=split, batch_size=batch_size))
    n.image1, n.image2, n.label, n.sample_weights, n.feat_crop = L.Python(
        module=config.data_provider, layer=config.data_provider_layer,
        param_str=mode_str, ntop=5)
    ################################
    # the base net (VGG-16) branch 1
    # conv1/conv2 are always frozen; conv3/conv4 follow config.fix_vgg.
    n.conv1_1, n.relu1_1 = conv_relu(n.image1, 64,
                                     param_names=('conv1_1_w', 'conv1_1_b'),
                                     fix_param=True, finetune=False)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64,
                                     param_names=('conv1_2_w', 'conv1_2_b'),
                                     fix_param=True, finetune=False)
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128,
                                     param_names=('conv2_1_w', 'conv2_1_b'),
                                     fix_param=True, finetune=False)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128,
                                     param_names=('conv2_2_w', 'conv2_2_b'),
                                     fix_param=True, finetune=False)
    n.pool2 = max_pool(n.relu2_2)
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256,
                                     param_names=('conv3_1_w', 'conv3_1_b'),
                                     fix_param=config.fix_vgg,
                                     finetune=config.finetune)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256,
                                     param_names=('conv3_2_w', 'conv3_2_b'),
                                     fix_param=config.fix_vgg,
                                     finetune=config.finetune)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256,
                                     param_names=('conv3_3_w', 'conv3_3_b'),
                                     fix_param=config.fix_vgg,
                                     finetune=config.finetune)
    n.pool3 = max_pool(n.relu3_3)
    # spatial L2 norm (LRN over all 512 channels approximates per-position
    # L2 normalization; k=1e-16 avoids division by zero)
    n.pool3_lrn = L.LRN(n.pool3, local_size=513, alpha=513, beta=0.5, k=1e-16)
    # NOTE: conv4_1 consumes the *un-normalized* n.pool3; the normalized copy
    # feeds only the skip concat below.
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512,
                                     param_names=('conv4_1_w', 'conv4_1_b'),
                                     fix_param=config.fix_vgg,
                                     finetune=config.finetune)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512,
                                     param_names=('conv4_2_w', 'conv4_2_b'),
                                     fix_param=config.fix_vgg,
                                     finetune=config.finetune)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512,
                                     param_names=('conv4_3_w', 'conv4_3_b'),
                                     fix_param=config.fix_vgg,
                                     finetune=config.finetune)
    # spatial L2 norm
    n.relu4_3_lrn = L.LRN(n.relu4_3, local_size=1025, alpha=1025, beta=0.5,
                          k=1e-16)
    # -- disabled conv5 skip path (kept for reference) --
    #n.pool4 = max_pool(n.relu4_3)
    #n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512,
    #                                 param_names=('conv5_1_w', 'conv5_1_b'),
    #                                 fix_param=config.fix_vgg,
    #                                 finetune=config.finetune)
    #n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512,
    #                                 param_names=('conv5_2_w', 'conv5_2_b'),
    #                                 fix_param=config.fix_vgg,
    #                                 finetune=config.finetune)
    #n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512,
    #                                 param_names=('conv5_3_w', 'conv5_3_b'),
    #                                 fix_param=config.fix_vgg,
    #                                 finetune=config.finetune)
    # upsampling feature map
    #n.relu5_3_upsampling = L.Deconvolution(n.relu5_3,
    #                                       convolution_param=dict(num_output=512,
    #                                                              group=512,
    #                                                              kernel_size=4,
    #                                                              stride=2,
    #                                                              pad=1,
    #                                                              bias_term=False,
    #                                                              weight_filler=dict(type='bilinear')),
    #                                       param=[dict(lr_mult=0, decay_mult=0)])
    #n.relu5_3_lrn = L.LRN(n.relu5_3_upsampling, local_size=1025, alpha=1025, beta=0.5, k=1e-16)
    # concat all skip features
    #n.feat_all1 = n.relu4_3_lrn
    n.feat_all1 = L.Concat(n.pool3_lrn, n.relu4_3_lrn,
                           concat_param=dict(axis=1))
    #n.feat_all1 = L.Concat(n.pool3_lrn, n.relu4_3_lrn, n.relu5_3_lrn, concat_param=dict(axis=1))
    # Crop the central query feature patch used as the dynamic filter.
    n.feat_all1_crop = L.Crop(n.feat_all1, n.feat_crop,
                              crop_param=dict(axis=2,
                                              offset=[
                                                  config.query_featmap_H // 3,
                                                  config.query_featmap_W // 3
                                              ]))
    ################################
    # the base net (VGG-16) branch 2 ('_p' suffix; shares weights via the
    # same param_names as branch 1)
    n.conv1_1_p, n.relu1_1_p = conv_relu(n.image2, 64,
                                         param_names=('conv1_1_w', 'conv1_1_b'),
                                         fix_param=True, finetune=False)
    n.conv1_2_p, n.relu1_2_p = conv_relu(n.relu1_1_p, 64,
                                         param_names=('conv1_2_w', 'conv1_2_b'),
                                         fix_param=True, finetune=False)
    n.pool1_p = max_pool(n.relu1_2_p)
    n.conv2_1_p, n.relu2_1_p = conv_relu(n.pool1_p, 128,
                                         param_names=('conv2_1_w', 'conv2_1_b'),
                                         fix_param=True, finetune=False)
    n.conv2_2_p, n.relu2_2_p = conv_relu(n.relu2_1_p, 128,
                                         param_names=('conv2_2_w', 'conv2_2_b'),
                                         fix_param=True, finetune=False)
    n.pool2_p = max_pool(n.relu2_2_p)
    n.conv3_1_p, n.relu3_1_p = conv_relu(n.pool2_p, 256,
                                         param_names=('conv3_1_w', 'conv3_1_b'),
                                         fix_param=config.fix_vgg,
                                         finetune=config.finetune)
    n.conv3_2_p, n.relu3_2_p = conv_relu(n.relu3_1_p, 256,
                                         param_names=('conv3_2_w', 'conv3_2_b'),
                                         fix_param=config.fix_vgg,
                                         finetune=config.finetune)
    n.conv3_3_p, n.relu3_3_p = conv_relu(n.relu3_2_p, 256,
                                         param_names=('conv3_3_w', 'conv3_3_b'),
                                         fix_param=config.fix_vgg,
                                         finetune=config.finetune)
    n.pool3_p = max_pool(n.relu3_3_p)
    # spatial L2 norm
    n.pool3_lrn_p = L.LRN(n.pool3_p, local_size=513, alpha=513, beta=0.5,
                          k=1e-16)
    n.conv4_1_p, n.relu4_1_p = conv_relu(n.pool3_p, 512,
                                         param_names=('conv4_1_w', 'conv4_1_b'),
                                         fix_param=config.fix_vgg,
                                         finetune=config.finetune)
    n.conv4_2_p, n.relu4_2_p = conv_relu(n.relu4_1_p, 512,
                                         param_names=('conv4_2_w', 'conv4_2_b'),
                                         fix_param=config.fix_vgg,
                                         finetune=config.finetune)
    n.conv4_3_p, n.relu4_3_p = conv_relu(n.relu4_2_p, 512,
                                         param_names=('conv4_3_w', 'conv4_3_b'),
                                         fix_param=config.fix_vgg,
                                         finetune=config.finetune)
    # spatial L2 norm
    n.relu4_3_lrn_p = L.LRN(n.relu4_3_p, local_size=1025, alpha=1025,
                            beta=0.5, k=1e-16)
    # -- disabled conv5 skip path for branch 2 (kept for reference) --
    #n.pool4_p = max_pool(n.relu4_3_p)
    #n.conv5_1_p, n.relu5_1_p = conv_relu(n.pool4_p, 512,
    #                                     param_names=('conv5_1_w', 'conv5_1_b'),
    #                                     fix_param=config.fix_vgg,
    #                                     finetune=config.finetune)
    #n.conv5_2_p, n.relu5_2_p = conv_relu(n.relu5_1_p, 512,
    #                                     param_names=('conv5_2_w', 'conv5_2_b'),
    #                                     fix_param=config.fix_vgg,
    #                                     finetune=config.finetune)
    #n.conv5_3_p, n.relu5_3_p = conv_relu(n.relu5_2_p, 512,
    #                                     param_names=('conv5_3_w', 'conv5_3_b'),
    #                                     fix_param=config.fix_vgg,
    #                                     finetune=config.finetune)
    #n.relu5_3_upsampling_p = L.Deconvolution(n.relu5_3_p,
    #                                         convolution_param=dict(num_output=512,
    #                                                                group=512,
    #                                                                kernel_size=4,
    #                                                                stride=2,
    #                                                                pad=1,
    #                                                                bias_term=False,
    #                                                                weight_filler=dict(type='bilinear')),
    #                                         param=[dict(lr_mult=0, decay_mult=0)])
    #n.relu5_3_lrn_p = L.LRN(n.relu5_3_upsampling_p, local_size=1025, alpha=1025, beta=0.5, k=1e-16)
    # concat all skip features
    #n.feat_all2 = n.relu4_3_lrn_p
    n.feat_all2 = L.Concat(n.pool3_lrn_p, n.relu4_3_lrn_p,
                           concat_param=dict(axis=1))
    #n.feat_all2 = L.Concat(n.pool3_lrn_p, n.relu4_3_lrn_p, n.relu5_3_lrn_p, concat_param=dict(axis=1))
    # Dyn conv layer: query feature patch convolved over the search features.
    n.fcn_scores = L.DynamicConvolution(n.feat_all2, n.feat_all1_crop,
                                        convolution_param=dict(
                                            num_output=1,
                                            kernel_size=11,
                                            stride=1,
                                            pad=5,
                                            bias_term=False))
    # scale scores with zero mean 0.01196 -> 0.02677
    n.fcn_scaled_scores = L.Power(n.fcn_scores,
                                  power_param=dict(scale=0.01196,
                                                   shift=-1.0,
                                                   power=1))
    # Loss Layer (per-sample weights rebalance positives/negatives).
    n.loss = L.WeightedSigmoidCrossEntropyLoss(n.fcn_scaled_scores, n.label,
                                               n.sample_weights)
    return n.to_proto()
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build a VQA net: CNN question encoder + compact bilinear fusion
    with two-glimpse soft attention over 14x14 image features.

    Args:
        mode: provider mode string passed to the Python data layer.
        batchsize: batch size N (also baked into reshape/dummy shapes).
        T: max question length in tokens.
        question_vocab_size: embedding input dimension.

    Returns:
        The caffe NetParameter protobuf.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    # n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\
    #     module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 )
    n.data, n.cont, n.img_feature, n.label = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=4)

    # word embedding (static + dynamic)
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size,
                         num_output=300,
                         weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    # n.embed = L.TanH(n.embed_ba)
    # Mask out padding positions using the continuation indicator.
    # NOTE(review): dict(dict(axis=0)) is equivalent to dict(axis=0) — the
    # outer dict() is a no-op copy.
    n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(dict(axis=0)))
    n.embed_scale_resh = L.Reshape(n.embed_scale,
                                   reshape_param=dict(
                                       shape=dict(dim=[batchsize, 1, T, 300])))

    # convolution: n-gram filters over the word dimension (heights 2..5);
    # pad_h = kernel_h - 1 keeps output height at T + pad_h.
    n.word_feature_2 = L.Convolution(n.embed_scale_resh, kernel_h=2, kernel_w=300, stride=1, num_output=512, pad_h=1, pad_w=0, weight_filler=dict(type='xavier'))  # N x 512 x ? x 1
    n.word_feature_3 = L.Convolution(n.embed_scale_resh, kernel_h=3, kernel_w=300, stride=1, num_output=512, pad_h=2, pad_w=0, weight_filler=dict(type='xavier'))
    n.word_feature_4 = L.Convolution(n.embed_scale_resh, kernel_h=4, kernel_w=300, stride=1, num_output=512, pad_h=3, pad_w=0, weight_filler=dict(type='xavier'))
    n.word_feature_5 = L.Convolution(n.embed_scale_resh, kernel_h=5, kernel_w=300, stride=1, num_output=512, pad_h=4, pad_w=0, weight_filler=dict(type='xavier'))
    n.word_relu_2 = L.ReLU(n.word_feature_2)
    n.word_relu_3 = L.ReLU(n.word_feature_3)
    n.word_relu_4 = L.ReLU(n.word_feature_4)
    n.word_relu_5 = L.ReLU(n.word_feature_5)
    # Max-pool over time collapses each n-gram map to a single 512-d vector.
    n.word_vec_2 = L.Pooling(n.word_relu_2, kernel_h=T+1, kernel_w=1, stride=T+1, pool=P.Pooling.MAX)  # N x 512 x 1 x 1
    n.word_vec_3 = L.Pooling(n.word_relu_3, kernel_h=T+2, kernel_w=1, stride=T+2, pool=P.Pooling.MAX)
    n.word_vec_4 = L.Pooling(n.word_relu_4, kernel_h=T+3, kernel_w=1, stride=T+3, pool=P.Pooling.MAX)
    n.word_vec_5 = L.Pooling(n.word_relu_5, kernel_h=T+4, kernel_w=1, stride=T+4, pool=P.Pooling.MAX)
    word_vec = [n.word_vec_2, n.word_vec_3, n.word_vec_4, n.word_vec_5]
    n.concat_vec = L.Concat(*word_vec, concat_param={'axis': 1})  # N x 2048 x 1 x 1
    n.concat_vec_dropped = L.Dropout(n.concat_vec, dropout_param={'dropout_ratio': 0.5})

    # Tile the question vector over the 14x14 image grid for fusion.
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped, axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    # Compact bilinear fusion of question x image at every spatial position.
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    # signed sqrt + L2 normalization (standard MCB post-processing)
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2, dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention: two glimpse maps, softmax over positions.
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0, weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14*14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax, reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    # constant-1 dummy blob required by the SoftAttention layer interface
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.concat_vec_dropped,
                                      compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    # classifier over the 3000 most frequent answers
    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000, weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
def generate_model(split, config):
    """Build a referring-expression scoring net: VGG-16 image features +
    LSTM sentence encoding + spatial features, concatenated and scored by
    a small MLP with a sigmoid cross-entropy loss.

    Args:
        split: dataset split name forwarded to the data provider.
        config: experiment config; uses N, data_provider(_layer), fix_vgg,
            vgg_dropout, vocab_size, embed_dim, lstm_dim, T,
            mlp_hidden_dims, mlp_dropout.

    Returns:
        The caffe NetParameter protobuf.
    """
    n = caffe.NetSpec()
    batch_size = config.N
    mode_str = str(dict(split=split, batch_size=batch_size))
    n.language, n.cont, n.image, n.spatial, n.label = L.Python(module=config.data_provider,
                                                               layer=config.data_provider_layer,
                                                               param_str=mode_str,
                                                               ntop=5)
    # the base net (VGG-16); when fix_vgg is False the whole stack finetunes.
    n.conv1_1, n.relu1_1 = conv_relu(n.image, 64,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool2 = max_pool(n.relu2_2)
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool3 = max_pool(n.relu3_3)
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool4 = max_pool(n.relu4_3)
    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool5 = max_pool(n.relu5_3)
    n.fc6, n.relu6 = fc_relu(n.pool5, 4096,
                             fix_param=config.fix_vgg,
                             finetune=(not config.fix_vgg))
    # fc6->fc8 head, with or without dropout between the FC layers.
    if config.vgg_dropout:
        n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
        n.fc7, n.relu7 = fc_relu(n.drop6, 4096,
                                 fix_param=config.fix_vgg,
                                 finetune=(not config.fix_vgg))
        n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)
        n.fc8 = fc(n.drop7, 1000,
                   fix_param=config.fix_vgg,
                   finetune=(not config.fix_vgg))
    else:
        n.fc7, n.relu7 = fc_relu(n.relu6, 4096,
                                 fix_param=config.fix_vgg,
                                 finetune=(not config.fix_vgg))
        n.fc8 = fc(n.relu7, 1000,
                   fix_param=config.fix_vgg,
                   finetune=(not config.fix_vgg))

    # embedding
    n.embed = L.Embed(n.language, input_dim=config.vocab_size,
                      num_output=config.embed_dim,
                      weight_filler=dict(type='uniform', min=-0.08, max=0.08))

    # LSTM over the word sequence (axis 0 is time).
    n.lstm = L.LSTM(n.embed, n.cont,
                    recurrent_param=dict(num_output=config.lstm_dim,
                                         weight_filler=dict(type='uniform', min=-0.08, max=0.08),
                                         bias_filler=dict(type='constant', value=0)))
    # Keep only the last timestep; silence the rest so Caffe doesn't complain
    # about unconsumed tops.
    tops = L.Slice(n.lstm, ntop=config.T, slice_param=dict(axis=0))
    for i in range(config.T - 1):
        n.__setattr__('slice'+str(i), tops[i])
        n.__setattr__('silence'+str(i), L.Silence(tops[i], ntop=0))
    n.lstm_out = tops[-1]
    n.lstm_feat = L.Reshape(n.lstm_out, reshape_param=dict(shape=dict(dim=[-1, config.lstm_dim])))

    # L2 Normalize image and language features
    n.img_l2norm = L.L2Normalize(n.fc8)
    n.lstm_l2norm = L.L2Normalize(n.lstm_feat)
    n.img_l2norm_resh = L.Reshape(n.img_l2norm,
                                  reshape_param=dict(shape=dict(dim=[-1, 1000])))
    n.lstm_l2norm_resh = L.Reshape(n.lstm_l2norm,
                                   reshape_param=dict(shape=dict(dim=[-1, config.lstm_dim])))

    # Concatenate language, image and spatial features.
    n.feat_all = L.Concat(n.lstm_l2norm_resh, n.img_l2norm_resh, n.spatial,
                          concat_param=dict(axis=1))

    # MLP Classifier over concatenated feature
    n.mlp_l1, n.mlp_relu1 = fc_relu(n.feat_all, config.mlp_hidden_dims)
    if config.mlp_dropout:
        n.mlp_drop1 = L.Dropout(n.mlp_relu1, dropout_ratio=0.5, in_place=True)
        n.scores = fc(n.mlp_drop1, 1)
    else:
        n.scores = fc(n.mlp_relu1, 1)

    # Loss Layer
    n.loss = L.SigmoidCrossEntropyLoss(n.scores, n.label)

    return n.to_proto()
def build_AlexNet(split, num_classes, batch_size, resize_w, resize_h,
                  crop_w=0, crop_h=0, crop_margin=0, mirror=0, rotate=0,
                  HSV_prob=0, HSV_jitter=0, train=True):
    """Build an AlexNet classifier with a soft-label softmax loss and write
    it to 'train.prototxt' or 'val.prototxt'.

    Args:
        split: dataset split name forwarded to the custom data layer.
        num_classes: number of output classes.
        batch_size, resize_w/h, crop_w/h, crop_margin, mirror, rotate,
            HSV_prob, HSV_jitter: augmentation/config values forwarded to
            the data layer via param_str.
        train: if True, insert dropout and emit 'train.prototxt';
            otherwise add a Softmax probs top and emit 'val.prototxt'.

    Returns:
        The path of the prototxt file written ('train.prototxt' or
        'val.prototxt').
    """
    weight_param = dict(lr_mult=1, decay_mult=1)
    bias_param = dict(lr_mult=2, decay_mult=0)
    learned_param = [weight_param, bias_param]
    # NOTE(review): frozen_param is never used below, and the FC layers use
    # `boosted_param`, which is NOT defined in this function — presumably a
    # module-level global (e.g. higher lr_mult for the new layers); confirm,
    # otherwise this raises NameError at build time.
    frozen_param = [dict(lr_mult=0)] * 2
    n = caffe.NetSpec()
    pydata_params = dict(split=split, mean=(104.00699, 116.66877, 122.67892))
    pydata_params['dir'] = '../../../datasets/WebVision'
    pydata_params['train'] = True
    pydata_params['batch_size'] = batch_size
    pydata_params['resize'] = False
    pydata_params['resize_w'] = resize_w
    pydata_params['resize_h'] = resize_h
    pydata_params['crop_w'] = crop_w
    pydata_params['crop_h'] = crop_h
    pydata_params['crop_margin'] = crop_margin
    pydata_params['mirror'] = mirror
    pydata_params['rotate'] = rotate
    pydata_params['HSV_prob'] = HSV_prob
    pydata_params['HSV_jitter'] = HSV_jitter
    pydata_params['num_classes'] = num_classes
    # Data layer yields image, hard label, and a per-label confidence score.
    n.data, n.label, n.label_score = L.Python(module='layers',
                                              layer='customDataLayerWithLabelScore',
                                              ntop=3,
                                              param_str=str(pydata_params))
    # standard AlexNet conv stack
    n.conv1, n.relu1 = conv_relu(n.data, 11, 96, stride=4, param=learned_param)
    n.pool1 = max_pool(n.relu1, 3, stride=2)
    n.norm1 = L.LRN(n.pool1, local_size=5, alpha=1e-4, beta=0.75)
    n.conv2, n.relu2 = conv_relu(n.norm1, 5, 256, pad=2, group=2, param=learned_param)
    n.pool2 = max_pool(n.relu2, 3, stride=2)
    n.norm2 = L.LRN(n.pool2, local_size=5, alpha=1e-4, beta=0.75)
    n.conv3, n.relu3 = conv_relu(n.norm2, 3, 384, pad=1, param=learned_param)
    n.conv4, n.relu4 = conv_relu(n.relu3, 3, 384, pad=1, group=2, param=learned_param)
    n.conv5, n.relu5 = conv_relu(n.relu4, 3, 256, pad=1, group=2, param=learned_param)
    n.pool5 = max_pool(n.relu5, 3, stride=2)
    n.fc6, n.relu6 = fc_relu(n.pool5, 4096, param=boosted_param)  #4096
    # dropout only at train time
    if train:
        n.drop6 = fc7input = L.Dropout(n.relu6, in_place=True)
    else:
        fc7input = n.relu6
    n.fc7, n.relu7 = fc_relu(fc7input, 4096, param=boosted_param)  #4096
    if train:
        n.drop7 = fc8input = L.Dropout(n.relu7, in_place=True)
    else:
        fc8input = n.relu7
    fc8 = L.InnerProduct(fc8input, num_output=num_classes,
                         weight_filler=dict(type='gaussian', std=0.005),
                         bias_filler=dict(type='constant', value=0.1),
                         param=boosted_param)
    # expose the classifier top under a fixed layer name
    n.__setattr__('classifier', fc8)
    if not train:
        n.probs = L.Softmax(fc8)
    #n.loss = L.SoftmaxWithLoss(fc8, n.label)
    # soft-label loss implemented as a Python layer
    n.loss = L.Python(fc8, n.label, n.label_score, module='layers',
                      layer='SoftmaxSoftLabel', ntop=1)
    n.acc = L.Accuracy(fc8, n.label)
    if train:
        with open('train.prototxt', 'w') as f:
            f.write(str(n.to_proto()))
            return f.name
    else:
        with open('val.prototxt', 'w') as f:
            f.write(str(n.to_proto()))
            return f.name
def fcn(split, tops):
    """Build an FCN-32s segmentation net over NYUDv2 color images.

    Args:
        split: dataset split name forwarded to the data layer.
        tops: top-blob configuration forwarded to the data layer.

    Returns:
        The caffe NetParameter protobuf.
    """
    n = caffe.NetSpec()
    # BUG FIX: ntop must match the number of unpacked tops. The original
    # used ntop=3 while unpacking only (color, label), which raises
    # ValueError ("too many values to unpack") as soon as the net is built.
    n.color, n.label = L.Python(module='nyud_layers',
            layer='NYUDSegDataLayer', ntop=2,
            param_str=str(dict(nyud_dir='../data/nyud', split=split,
                tops=tops, seed=1337)))
    # single-input Concat just renames the blob to the conventional 'data'
    n.data = L.Concat(n.color)
    # the base net (VGG-16); pad=100 gives border context for the final crop
    n.conv1_1_bgrd, n.relu1_1 = conv_relu(n.data, 64, pad=100)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64)
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128)
    n.pool2 = max_pool(n.relu2_2)
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256)
    n.pool3 = max_pool(n.relu3_3)
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512)
    n.pool4 = max_pool(n.relu4_3)
    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512)
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512)
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512)
    n.pool5 = max_pool(n.relu5_3)
    # fully conv head with dropout
    n.fc6, n.relu6 = conv_relu(n.pool5, 4096, ks=7, pad=0)
    n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
    n.fc7, n.relu7 = conv_relu(n.drop6, 4096, ks=1, pad=0)
    n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)
    # 1x1 scoring conv; 40 = number of NYUDv2 classes
    n.score_fr = L.Convolution(
        n.drop7, num_output=40, kernel_size=1, pad=0,
        param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)])
    # 32x bilinear-style upsampling (weights frozen: lr_mult=0)
    n.upscore = L.Deconvolution(n.score_fr,
        convolution_param=dict(num_output=40, kernel_size=64, stride=32,
            bias_term=False),
        param=[dict(lr_mult=0)])
    n.score = crop(n.upscore, n.data)
    # 255 marks unlabeled pixels and is excluded from the loss
    n.loss = L.SoftmaxWithLoss(n.score, n.label,
            loss_param=dict(normalize=False, ignore_label=255))
    return n.to_proto()
def add_multilabel_err_layer(net, bottom, name):
    """Attach a Python-backed MultiLabelErr layer to `net` under `name`.

    `bottom` is a two-element sequence (predictions, targets); both blobs
    feed the layer implemented by `layers.multilabel_err.MultiLabelErr`.
    """
    predictions, targets = bottom[0], bottom[1]
    layer_cfg = dict(module='layers.multilabel_err', layer='MultiLabelErr')
    net[name] = L.Python(predictions, targets, python_param=layer_cfg)
def mfb_baseline(mode, batchsize, T, question_vocab_size, folder):
    """Build the MFB baseline VQA net: LSTM question encoding fused with
    image features via Multi-modal Factorized Bilinear pooling.

    Args:
        mode: 'val' selects the plain data layer + softmax loss; any other
            value selects the KLD data layer + SoftmaxKLDLoss.
        batchsize: batch size forwarded to the data provider.
        T: max question length (unused here; kept for interface parity).
        question_vocab_size: embedding input dimension.
        folder: data folder forwarded to the data provider.

    Returns:
        The caffe NetParameter protobuf.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({
        'mode': mode,
        'batchsize': batchsize,
        'folder': folder
    })
    if mode == 'val':
        n.data, n.cont, n.img_feature, n.label = L.Python(
            module='vqa_data_layer', layer='VQADataProviderLayer',
            param_str=mode_str, ntop=4)
    else:
        n.data, n.cont, n.img_feature, n.label = L.Python(
            module='vqa_data_layer_kld', layer='VQADataProviderLayer',
            param_str=mode_str, ntop=4)
    n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                      weight_filler=dict(type='xavier'))
    n.embed_tanh = L.TanH(n.embed)

    # LSTM over the word sequence (axis 0 is time).
    n.lstm1 = L.LSTM(
        n.embed_tanh, n.cont,
        recurrent_param=dict(
            num_output=config.LSTM_UNIT_NUM,
            weight_filler=dict(type='xavier')))
    # Keep only the last timestep; silence the others so Caffe doesn't
    # complain about unconsumed tops.
    # FIX: use range() instead of Python-2-only xrange() — the sibling
    # generate_model() in this file already uses range().
    tops1 = L.Slice(n.lstm1, ntop=config.MAX_WORDS_IN_QUESTION,
                    slice_param={'axis': 0})
    for i in range(config.MAX_WORDS_IN_QUESTION - 1):
        n.__setattr__('slice_first' + str(i), tops1[i])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[config.MAX_WORDS_IN_QUESTION - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.q_feat = L.Dropout(
        n.lstm1_reshaped,
        dropout_param={'dropout_ratio': config.LSTM_DROPOUT_RATIO})
    '''
    Coarse Image-Question MFB fusion
    '''
    # Project both modalities to the joint space, elementwise-multiply
    # (Eltwise operation=0 is PROD), then sum-pool over the factor dim.
    n.mfb_q_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_i_proj = L.InnerProduct(n.img_feature,
                                  num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_iq_eltwise = L.Eltwise(n.mfb_q_proj, n.mfb_i_proj,
                                 eltwise_param=dict(operation=0))
    n.mfb_iq_drop = L.Dropout(
        n.mfb_iq_eltwise,
        dropout_param={'dropout_ratio': config.MFB_DROPOUT_RATIO})
    n.mfb_iq_resh = L.Reshape(
        n.mfb_iq_drop,
        reshape_param=dict(shape=dict(
            dim=[-1, 1, config.MFB_OUT_DIM, config.MFB_FACTOR_NUM])))
    n.mfb_iq_sumpool = L.Pooling(
        n.mfb_iq_resh, pool=P.Pooling.SUM,
        pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_out = L.Reshape(
        n.mfb_iq_sumpool,
        reshape_param=dict(shape=dict(dim=[-1, config.MFB_OUT_DIM])))
    # power normalization: signed sqrt + L2
    n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out)
    n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt)
    n.prediction = L.InnerProduct(n.mfb_l2,
                                  num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'))
    # hard labels at val time, soft (KLD) labels otherwise
    if mode == 'val':
        n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    else:
        n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
    return n.to_proto()
def fcn(mode):
    """Build an FCN-16s semantic-segmentation net over PASCAL VOC 2012.

    Args:
        mode: 'train' or 'val'; selects the Python data layer class.

    Returns:
        The caffe NetParameter protobuf.

    Raises:
        ValueError: if `mode` is neither 'train' nor 'val'.
    """
    net = caffe.NetSpec()
    data_params = dict(mode=mode, mean=(104.00699, 116.66877, 122.67892),
                       seed=1337)
    if mode == 'train':
        data_params['data_dir'] = '/jet/prs/workspace/VOCdevkit/VOC2012'  ##TODO
        data_layer = 'TrainingDataLayer'
    elif mode == 'val':
        data_params['data_dir'] = '/jet/prs/workspace/VOCdevkit/VOC2012'  ##TODO
        data_layer = 'ValidDataLayer'
    else:
        # FIX: previously an unknown mode fell through both branches and
        # raised a confusing NameError on `data_layer`; fail fast instead.
        raise ValueError("mode must be 'train' or 'val', got %r" % (mode,))
    net.data, net.label = layers.Python(module='data_layer', layer=data_layer,
                                        ntop=2, param_str=str(data_params))
    # layer1 , conv+relu -> conv+relu -> max_pooling
    # (pad=100 gives border context so the final crop can align with input)
    net.conv1_1 = conv(net.data, 64, pad=100)
    net.relu1_1 = relu(net.conv1_1)
    net.conv1_2 = conv(net.relu1_1, 64)
    net.relu1_2 = relu(net.conv1_2)
    net.pool1 = max_pooling(net.relu1_2)
    # layer2, conv+relu -> conv+relu -> max_pooling
    net.conv2_1 = conv(net.pool1, 128)
    net.relu2_1 = relu(net.conv2_1)
    net.conv2_2 = conv(net.relu2_1, 128)
    net.relu2_2 = relu(net.conv2_2)
    net.pool2 = max_pooling(net.relu2_2)
    # layer3, conv+relu -> conv+relu -> conv+relu -> max_pooling
    net.conv3_1 = conv(net.pool2, 256)
    net.relu3_1 = relu(net.conv3_1)
    net.conv3_2 = conv(net.relu3_1, 256)
    net.relu3_2 = relu(net.conv3_2)
    net.conv3_3 = conv(net.relu3_2, 256)
    net.relu3_3 = relu(net.conv3_3)
    net.pool3 = max_pooling(net.relu3_3)
    # layer4
    net.conv4_1 = conv(net.pool3, 512)
    net.relu4_1 = relu(net.conv4_1)
    net.conv4_2 = conv(net.relu4_1, 512)
    net.relu4_2 = relu(net.conv4_2)
    net.conv4_3 = conv(net.relu4_2, 512)
    net.relu4_3 = relu(net.conv4_3)
    net.pool4 = max_pooling(net.relu4_3)
    # layer5
    net.conv5_1 = conv(net.pool4, 512)
    net.relu5_1 = relu(net.conv5_1)
    net.conv5_2 = conv(net.relu5_1, 512)
    net.relu5_2 = relu(net.conv5_2)
    net.conv5_3 = conv(net.relu5_2, 512)
    net.relu5_3 = relu(net.conv5_3)
    net.pool5 = max_pooling(net.relu5_3)
    # layer6, conv + relu -> dropout (fc6 as 7x7 convolution)
    net.fc6 = conv(net.pool5, 4096, ks=7, pad=0)
    net.relu6 = relu(net.fc6)
    net.drop6 = dropout(net.relu6)
    # layer7, conv + relu -> dropout (fc7 as 1x1 convolution)
    net.fc7 = conv(net.drop6, 4096, ks=1, pad=0)
    net.relu7 = relu(net.fc7)
    net.drop7 = dropout(net.relu7)
    # layer8, forward score (21 = 20 VOC classes + background), 2x upsample
    net.score_fr = conv(net.drop7, 21, ks=1, pad=0)
    net.upscore1 = deconv(net.score_fr, 21, ks=4, stride=2)
    # layer9, FCN-16s skip: score pool4, crop to match, fuse, 16x upsample
    net.score_pool4 = conv(net.pool4, 21, ks=1, pad=0)
    net.score_pool4_crop = crop(net.score_pool4, net.upscore1)
    net.integrate_pool4 = sumup(net.upscore1, net.score_pool4_crop)
    net.upscore2 = deconv(net.integrate_pool4, 21, ks=32, stride=16)
    net.score = crop(net.upscore2, net.data)
    net.loss = softmax(net.score, net.label)
    return net.to_proto()
def generate_conv_features(split, config):
    """Build a feature-extraction net: VGG-16 through conv4_3 with
    spatially L2-normalized pool3 and conv4_3 skip features concatenated.

    Single-branch counterpart of generate_model(); produces `feat_all`
    (and passes `label` through) instead of computing a loss.

    Args:
        split: dataset split name forwarded to the data provider.
        config: experiment config; uses N, dataset, data_provider,
            data_provider_layer_1, fix_vgg.

    Returns:
        The caffe NetParameter protobuf.
    """
    n = caffe.NetSpec()
    dataset = config.dataset
    batch_size = config.N
    mode_str = str(dict(dataset=dataset, split=split, batch_size=batch_size))
    n.image, n.label = L.Python(module=config.data_provider,
                                layer=config.data_provider_layer_1,
                                param_str=mode_str,
                                ntop=2)
    # the base net (VGG-16)
    n.conv1_1, n.relu1_1 = conv_relu(n.image, 64,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool2 = max_pool(n.relu2_2)
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool3 = max_pool(n.relu3_3)
    # spatial L2 norm (LRN across all channels approximates per-position
    # L2 normalization; k=1e-16 avoids division by zero)
    n.pool3_lrn = L.LRN(n.pool3, local_size=513, alpha=513, beta=0.5, k=1e-16)
    # NOTE: conv4_1 consumes the *un-normalized* n.pool3; the normalized
    # copy feeds only the skip concat below.
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    # spatial L2 norm
    n.relu4_3_lrn = L.LRN(n.relu4_3, local_size=1025, alpha=1025, beta=0.5,
                          k=1e-16)
    # -- disabled conv5 skip path (kept for reference) --
    #n.pool4 = max_pool(n.relu4_3)
    #n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512,
    #                                 fix_param=config.fix_vgg,
    #                                 finetune=(not config.fix_vgg))
    #n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512,
    #                                 fix_param=config.fix_vgg,
    #                                 finetune=(not config.fix_vgg))
    #n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512,
    #                                 fix_param=config.fix_vgg,
    #                                 finetune=(not config.fix_vgg))
    # upsampling feature map
    #n.relu5_3_upsampling = L.Deconvolution(n.relu5_3,
    #                                       convolution_param=dict(num_output=512,
    #                                                              group=512,
    #                                                              kernel_size=4,
    #                                                              stride=2,
    #                                                              pad=1,
    #                                                              bias_term=False,
    #                                                              weight_filler=dict(type='bilinear')),
    #                                       param=[dict(lr_mult=0, decay_mult=0)])
    #n.relu5_3_lrn = L.LRN(n.relu5_3_upsampling, local_size=1025, alpha=1025, beta=0.5, k=1e-16)
    # concat all skip features
    #n.feat_all = n.relu4_3_lrn
    n.feat_all = L.Concat(n.pool3_lrn, n.relu4_3_lrn,
                          concat_param=dict(axis=1))
    #n.feat_all = L.Concat(n.pool3_lrn, n.relu4_3_lrn, n.relu5_3_lrn, concat_param=dict(axis=1))
    return n.to_proto()
def fcn(split):
    """Build an HED-style edge-detection net on a VGG-16 trunk.

    split: 'train' (Python data layer over HED-BSDS pairs, adds
    BalanceCrossEntropyLoss per side output) or 'test' (fixed 1x3x500x500
    Input, adds Sigmoid per side output). Any other value raises.
    Five deeply-supervised side outputs (DSN1..DSN5) are upsampled/cropped
    to the input size, concatenated, and fused by a 1x1 conv initialized to
    the 0.2 average. Returns the NetParameter proto.
    """
    n = caffe.NetSpec()
    if split=='train':
        data_params = dict(mean=(104.00699, 116.66877, 122.67892))
        data_params['root'] = 'data/HED-BSDS'
        data_params['source'] = "train_pair.lst"
        data_params['shuffle'] = True
        n.data, n.label = L.Python(module='pylayer',
                                   layer='ImageLabelmapDataLayer', ntop=2,
                                   param_str=str(data_params))
    elif split == 'test':
        n.data = L.Input(name='data',
                         input_param=dict(shape=dict(dim=[1,3,500,500])))
    else:
        raise Exception("Invalid phase")

    # VGG-16 trunk. pad=1 on conv1_1 keeps spatial size (no FCN-style
    # pad=100 here; side outputs are cropped back to n.data instead).
    n.conv1_1, n.relu1_1 = conv_relu(n.data, 64, pad=1)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64)
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128)
    n.pool2 = max_pool(n.relu2_2)
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256)
    n.pool3 = max_pool(n.relu3_3)
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512)
    n.pool4 = max_pool(n.relu4_3)
    # conv5 gets boosted learning rates (mult = [w_lr, w_decay, b_lr, b_decay]
    # multipliers per the conv_relu helper -- TODO confirm order).
    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512, mult=[100,1,200,0])
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512, mult=[100,1,200,0])
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512, mult=[100,1,200,0])

    # DSN1: full resolution, no upsampling needed.
    n.score_dsn1=full_conv(n.conv1_2, 'score-dsn1', lr=1)
    n.upscore_dsn1 = crop(n.score_dsn1, n.data)
    if split=='train':
        n.loss1 = L.BalanceCrossEntropyLoss(n.upscore_dsn1, n.label,
                                            loss_param=dict(normalize=False))
    if split=='test':
        n.sigmoid_dsn1 = L.Sigmoid(n.upscore_dsn1)
    # DSN2: 2x upsample back to input size.
    n.score_dsn2 = full_conv(n.conv2_2, 'score-dsn2', lr=1)
    n.score_dsn2_up = upsample(n.score_dsn2, stride=2)
    n.upscore_dsn2 = crop(n.score_dsn2_up, n.data)
    if split=='train':
        n.loss2 = L.BalanceCrossEntropyLoss(n.upscore_dsn2, n.label,
                                            loss_param=dict(normalize=False))
    if split=='test':
        n.sigmoid_dsn2 = L.Sigmoid(n.upscore_dsn2)
    # DSN3: 4x upsample.
    n.score_dsn3=full_conv(n.conv3_3, 'score-dsn3', lr=1)
    n.score_dsn3_up = upsample(n.score_dsn3, stride=4)
    n.upscore_dsn3 = crop(n.score_dsn3_up, n.data)
    if split=='train':
        n.loss3 = L.BalanceCrossEntropyLoss(n.upscore_dsn3, n.label,
                                            loss_param=dict(normalize=False))
    if split=='test':
        n.sigmoid_dsn3 = L.Sigmoid(n.upscore_dsn3)
    # DSN4: 8x upsample.
    n.score_dsn4 = full_conv(n.conv4_3, 'score-dsn4', lr=1)
    n.score_dsn4_up = upsample(n.score_dsn4, stride=8)
    n.upscore_dsn4 = crop(n.score_dsn4_up, n.data)
    if split=='train':
        n.loss4 = L.BalanceCrossEntropyLoss(n.upscore_dsn4, n.label,
                                            loss_param=dict(normalize=False))
    if split=='test':
        n.sigmoid_dsn4 = L.Sigmoid(n.upscore_dsn4)
    # DSN5: 16x upsample. (This branch alone re-validates split; DSN1-4
    # use two independent ifs.)
    n.score_dsn5=full_conv(n.conv5_3, 'score-dsn5', lr=1)
    n.score_dsn5_up = upsample(n.score_dsn5, stride=16)
    n.upscore_dsn5 = crop(n.score_dsn5_up, n.data)
    if split=='train':
        n.loss5 = L.BalanceCrossEntropyLoss(n.upscore_dsn5, n.label,
                                            loss_param=dict(normalize=False))
    elif split=='test':
        n.sigmoid_dsn5 = L.Sigmoid(n.upscore_dsn5)
    else:
        raise Exception("Error")

    # Concatenate all side outputs and fuse with a 1x1 conv whose weights
    # start as the uniform average (constant 0.2 over 5 channels).
    n.concat_upscore = L.Concat(n.upscore_dsn1,
                                n.upscore_dsn2,
                                n.upscore_dsn3,
                                n.upscore_dsn4,
                                n.upscore_dsn5,
                                name='concat',
                                concat_param=dict({'concat_dim':1}))
    n.upscore_fuse = L.Convolution(n.concat_upscore,
                                   name='new-score-weighting',
                                   num_output=1, kernel_size=1,
                                   param=[dict(lr_mult=0.001, decay_mult=1),
                                          dict(lr_mult=0.002, decay_mult=0)],
                                   weight_filler=dict(type='constant',
                                                      value=0.2))
    if split=='test':
        n.sigmoid_fuse = L.Sigmoid(n.upscore_fuse)
    if split=='train':
        n.loss_fuse = L.BalanceCrossEntropyLoss(n.upscore_fuse, n.label,
                                                loss_param=dict(normalize=False))
    return n.to_proto()
def cnn(split):
    """Build an AlexNet-style 40-class classifier over ModelNet depth maps.

    split: 'train' / 'test' (Python data layer supplies img+label, and
    Accuracy + SoftmaxWithLoss are attached) or 'deploy' (a single
    1x1x227x227 Input blob, no label or loss).
    Returns the NetParameter proto.
    """
    n = caffe.NetSpec()
    # NOTE(review): mean=(2) is just the int 2, not a 1-tuple -- confirm
    # what ModelNetDataLayer expects before changing it to (2,).
    pydata_params = dict(dataset_dir='/home/kevin/dataset/processed_data3',
                         variable='depth_map', split=split, mean=(2),
                         seed=1337, batch_size=256, frame_num=30,
                         img_size=(227, 227))
    if split == 'deploy':
        # BUG FIX: the original declared ntop=2 on a single-top Input layer
        # (while assigning it to one attribute) and passed four separate
        # one-dim shapes; declare one top with a single 1x1x227x227 shape.
        n.img = L.Input(name='input',
                        input_param=dict(shape=dict(dim=[1, 1, 227, 227])))
    else:
        # BUG FIX: original used "split is 'train'" -- identity comparison
        # against a string literal, which only works by CPython interning.
        pydata_params['dtype'] = 'frame' if split == 'train' else 'object'
        pylayer = 'ModelNetDataLayer'
        n.img, n.label = L.Python(module='data_layers.model_net_layer',
                                  layer=pylayer, ntop=2,
                                  param_str=str(pydata_params))

    # the base net (AlexNet)
    n.conv1, n.relu1 = conv_relu("conv1", n.img, 96, ks=11, stride=4, pad=0)
    n.pool1 = max_pool(n.relu1, ks=3)
    n.norm1 = L.LRN(n.pool1, lrn_param=dict(local_size=5, alpha=0.0005,
                                            beta=0.75, k=2))
    n.conv2, n.relu2 = conv_relu("conv2", n.norm1, 256, ks=5, pad=2, group=2)
    n.pool2 = max_pool(n.relu2, ks=3)
    n.norm2 = L.LRN(n.pool2, lrn_param=dict(local_size=5, alpha=0.0005,
                                            beta=0.75, k=2))
    n.conv3, n.relu3 = conv_relu("conv3", n.norm2, 384, ks=3, pad=1)
    n.conv4, n.relu4 = conv_relu("conv4", n.relu3, 384, ks=3, pad=1, group=2)
    n.conv5, n.relu5 = conv_relu("conv5", n.relu4, 256, ks=3, pad=1, group=2)
    n.pool5 = max_pool(n.relu5, ks=3)

    # Classifier head: two dropout-regularized FC layers + 40-way output.
    n.fc6, n.relu6 = fc_relu(n.pool5, 4096, lr1=1, lr2=2)
    n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
    n.fc7, n.relu7 = fc_relu(n.drop6, 4096, lr1=1, lr2=2)
    n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)
    n.fc8 = fc(n.drop7, 40, lr1=1, lr2=2)

    if split != 'deploy':
        n.accuracy = L.Accuracy(n.fc8, n.label)
        n.loss = L.SoftmaxWithLoss(n.fc8, n.label)
    return n.to_proto()
def python_layer(self, inputs, module, layer, param_str, ntop=1):
    """Create a generic Python layer over *inputs*.

    inputs: sequence of bottom blobs; module/layer: the Python class to
    instantiate; param_str is stringified and forwarded; ntop: number of
    top blobs to produce (default 1).

    BUG FIX: the original hard-coded ntop=1 in the L.Python call, silently
    ignoring the caller's ntop argument; forward it so multi-top Python
    layers work.
    """
    return L.Python(*inputs, module=module, layer=layer,
                    param_str=str(param_str), ntop=ntop)
def python_input_layer(self, module, layer, param_str):
    """Create a bottom-less Python (input) layer and register its tops.

    The top count comes from param_str['top_names']; the produced tops are
    then renamed to those names via self.rename_tops. Returns None.
    """
    top_names = param_str['top_names']
    produced_tops = L.Python(module=module, layer=layer,
                             param_str=str(param_str),
                             ntop=len(top_names))
    self.rename_tops(produced_tops, top_names)
def pj_x(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size):
    """Build the PJ-X VQA + explanation net.

    mode/batchsize are forwarded to the data provider; T is the question
    sequence length, exp_T the explanation sequence length;
    question_vocab_size / exp_vocab_size size the two embedding tables.
    The VQA branch (embed -> 2-layer LSTM -> attention over 14x14 image
    features -> 3000-way answer) uses the module-level `fixed_weights` /
    `fixed_weights_lstm` param specs; the explanation branch trains its own
    weights. Returns the NetParameter proto.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode':mode, 'batchsize':batchsize})
    n.data, n.cont, n.img_feature, n.label, n.exp, n.exp_out, \
        n.exp_cont_1, n.exp_cont_2 = L.Python(
            module='vqa_data_provider_layer', layer='VQADataProviderLayer',
            param_str=mode_str, ntop=8)

    # Question word embedding (frozen); explanation embedding (trainable).
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size,
                         num_output=300,
                         weight_filler=dict(type='uniform',min=-0.08,max=0.08),
                         param=fixed_weights)
    n.embed = L.TanH(n.embed_ba)
    n.exp_embed_ba = L.Embed(n.exp, input_dim=exp_vocab_size, num_output=300,
                             weight_filler=dict(type='uniform', min=-0.08,
                                                max=0.08))
    n.exp_embed = L.TanH(n.exp_embed_ba)

    # LSTM1 over the question (frozen weights).
    n.lstm1 = L.LSTM(
        n.embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform',min=-0.08,max=0.08),
            bias_filler=dict(type='constant',value=0)),
        param=fixed_weights_lstm)
    # Slice out the last timestep; Silence the first T-1 so they are not
    # treated as dangling outputs.
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis':0})
    for i in range(T-1):
        n.__setattr__('slice_first'+str(i), tops1[int(i)])
        n.__setattr__('silence_data_first'+str(i),
                      L.Silence(tops1[int(i)],ntop=0))
    n.lstm1_out = tops1[T-1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1,1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio':0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio':0.3})

    # LSTM2 stacked on LSTM1 (frozen weights); same last-step extraction.
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform',min=-0.08,max=0.08),
            bias_filler=dict(type='constant',value=0)),
        param=fixed_weights_lstm)
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis':0})
    for i in range(T-1):
        n.__setattr__('slice_second'+str(i), tops2[int(i)])
        n.__setattr__('silence_data_second'+str(i),
                      L.Silence(tops2[int(i)],ntop=0))
    n.lstm2_out = tops2[T-1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out, reshape_param=dict(shape=dict(dim=[-1,1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio':0.3})
    # Question feature = concat of both LSTM final states (N x 2048).
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    # Tile the question feature over the 14x14 image-feature grid.
    n.q_emb_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.q_emb_tiled_1 = L.Tile(n.q_emb_resh, axis=2, tiles=14)
    n.q_emb_resh_tiled = L.Tile(n.q_emb_tiled_1, axis=3, tiles=14)

    # Embed image feature to the same 2048 channels (frozen 1x1 conv).
    n.i_emb = L.Convolution(n.img_feature, kernel_size=1, stride=1,
                            num_output=2048, pad=0,
                            weight_filler=dict(type='xavier'),
                            param=fixed_weights)

    # Multimodal fusion: elementwise product + signed-sqrt + L2 + dropout.
    n.eltwise = L.Eltwise(n.q_emb_resh_tiled, n.i_emb,
                          eltwise_param={'operation': P.Eltwise.PROD})
    n.eltwise_sqrt = L.SignedSqrt(n.eltwise)
    n.eltwise_l2 = L.L2Normalize(n.eltwise_sqrt)
    n.eltwise_drop = L.Dropout(n.eltwise_l2,
                               dropout_param={'dropout_ratio': 0.3})

    # Attention for VQA: 2-layer 1x1 conv -> softmax over the 196 positions.
    n.att_conv1 = L.Convolution(n.eltwise_drop, kernel_size=1, stride=1,
                                num_output=512, pad=0,
                                weight_filler=dict(type='xavier'),
                                param=fixed_weights)
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=1, pad=0,
                                weight_filler=dict(type='xavier'),
                                param=fixed_weights)
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1,1,14*14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att_map = L.Reshape(
        n.att_softmax, reshape_param=dict(shape=dict(dim=[-1,1,14,14])))
    # Constant-1 dummy blob required by the SoftAttention layer interface.
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature = L.SoftAttention(n.img_feature, n.att_map, dummy)
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1,2048])))

    # Second fusion of attended image feature with the question feature,
    # then the 3000-way answer classifier.
    n.i_emb2 = L.InnerProduct(n.att_feature_resh, num_output=2048,
                              weight_filler=dict(type='xavier'),
                              param=fixed_weights)
    n.eltwise2 = L.Eltwise(n.lstm_12, n.i_emb2,
                           eltwise_param={'operation': P.Eltwise.PROD})
    n.eltwise2_sqrt = L.SignedSqrt(n.eltwise2)
    n.eltwise2_l2 = L.L2Normalize(n.eltwise2_sqrt)
    n.eltwise2_drop = L.Dropout(n.eltwise2_l2,
                                dropout_param={'dropout_ratio': 0.3})
    n.prediction = L.InnerProduct(n.eltwise2_drop, num_output=3000,
                                  weight_filler=dict(type='xavier'),
                                  param=fixed_weights)
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)

    # Embed the ground-truth VQA answer (training-time teacher signal).
    n.exp_emb_ans = L.Embed(n.label, input_dim=3000, num_output=300,
                            weight_filler=dict(type='uniform', min=-0.08,
                                               max=0.08))
    n.exp_emb_ans_tanh = L.TanH(n.exp_emb_ans)
    n.exp_emb_ans2 = L.InnerProduct(n.exp_emb_ans_tanh, num_output=2048,
                                    weight_filler=dict(type='xavier'))

    # Merge the answer embedding with the visual+textual fusion map.
    n.exp_emb_resh = L.Reshape(
        n.exp_emb_ans2, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.exp_emb_tiled_1 = L.Tile(n.exp_emb_resh, axis=2, tiles=14)
    n.exp_emb_tiled = L.Tile(n.exp_emb_tiled_1, axis=3, tiles=14)
    n.eltwise_emb = L.Convolution(n.eltwise, kernel_size=1, stride=1,
                                  num_output=2048, pad=0,
                                  weight_filler=dict(type='xavier'))
    n.exp_eltwise = L.Eltwise(n.eltwise_emb, n.exp_emb_tiled,
                              eltwise_param={'operation': P.Eltwise.PROD})
    n.exp_eltwise_sqrt = L.SignedSqrt(n.exp_eltwise)
    n.exp_eltwise_l2 = L.L2Normalize(n.exp_eltwise_sqrt)
    n.exp_eltwise_drop = L.Dropout(n.exp_eltwise_l2,
                                   dropout_param={'dropout_ratio': 0.3})

    # Attention for the explanation branch (trainable weights).
    n.exp_att_conv1 = L.Convolution(n.exp_eltwise_drop, kernel_size=1,
                                    stride=1, num_output=512, pad=0,
                                    weight_filler=dict(type='xavier'))
    n.exp_att_conv1_relu = L.ReLU(n.exp_att_conv1)
    n.exp_att_conv2 = L.Convolution(n.exp_att_conv1_relu, kernel_size=1,
                                    stride=1, num_output=1, pad=0,
                                    weight_filler=dict(type='xavier'))
    n.exp_att_reshaped = L.Reshape(
        n.exp_att_conv2, reshape_param=dict(shape=dict(dim=[-1,1,14*14])))
    n.exp_att_softmax = L.Softmax(n.exp_att_reshaped, axis=2)
    n.exp_att_map = L.Reshape(
        n.exp_att_softmax, reshape_param=dict(shape=dict(dim=[-1,1,14,14])))
    exp_dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                            data_filler=dict(type='constant', value=1),
                            ntop=1)
    n.exp_att_feature_prev = L.SoftAttention(n.img_feature, n.exp_att_map,
                                             exp_dummy)
    n.exp_att_feature_resh = L.Reshape(
        n.exp_att_feature_prev,
        reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.exp_att_feature_embed = L.InnerProduct(n.exp_att_feature_resh,
                                             num_output=2048,
                                             weight_filler=dict(type='xavier'))
    n.exp_lstm12_embed = L.InnerProduct(n.lstm_12, num_output=2048,
                                        weight_filler=dict(type='xavier'))
    n.exp_eltwise2 = L.Eltwise(n.exp_lstm12_embed, n.exp_att_feature_embed,
                               eltwise_param={'operation': P.Eltwise.PROD})
    # Final explanation conditioning vector: answer * (question * image).
    n.exp_att_feature = L.Eltwise(n.exp_emb_ans2, n.exp_eltwise2,
                                  eltwise_param={'operation': P.Eltwise.PROD})

    # Explanation LSTM1 over the explanation token embedding.
    n.exp_lstm1 = L.LSTM(
        n.exp_embed, n.exp_cont_1,
        recurrent_param=dict(
            num_output=2048,
            weight_filler=dict(type='uniform',min=-0.08,max=0.08),
            bias_filler=dict(type='constant',value=0)))
    n.exp_lstm1_dropped = L.Dropout(n.exp_lstm1,
                                    dropout_param={'dropout_ratio':0.3})

    # Broadcast the conditioning vector across all exp_T timesteps and
    # merge with the LSTM1 states (product + signed-sqrt + L2 + dropout).
    n.exp_att_resh = L.Reshape(
        n.exp_att_feature,
        reshape_param=dict(shape=dict(dim=[1, -1, 2048])))
    n.exp_att_tiled = L.Tile(n.exp_att_resh, axis=0, tiles=exp_T)
    n.exp_eltwise_all = L.Eltwise(n.exp_lstm1_dropped, n.exp_att_tiled,
                                  eltwise_param={'operation': P.Eltwise.PROD})
    n.exp_eltwise_all_sqrt = L.SignedSqrt(n.exp_eltwise_all)
    n.exp_eltwise_all_l2 = L.L2Normalize(n.exp_eltwise_all_sqrt)
    n.exp_eltwise_all_drop = L.Dropout(n.exp_eltwise_all_l2,
                                       dropout_param={'dropout_ratio': 0.3})

    # Explanation LSTM2 -> per-timestep vocabulary logits (axis=2 keeps the
    # time axis), with loss/accuracy ignoring padded positions (-1).
    n.exp_lstm2 = L.LSTM(
        n.exp_eltwise_all_drop, n.exp_cont_2,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform',min=-0.08,max=0.08),
            bias_filler=dict(type='constant',value=0)))
    n.exp_lstm2_dropped = L.Dropout(n.exp_lstm2,
                                    dropout_param={'dropout_ratio':0.3})
    n.exp_prediction = L.InnerProduct(n.exp_lstm2_dropped,
                                      num_output=exp_vocab_size,
                                      weight_filler=dict(type='xavier'),
                                      axis=2)
    n.exp_loss = L.SoftmaxWithLoss(n.exp_prediction, n.exp_out,
                                   loss_param=dict(ignore_label=-1),
                                   softmax_param=dict(axis=2))
    n.exp_accuracy = L.Accuracy(n.exp_prediction, n.exp_out, axis=2,
                                ignore_label=-1)
    return n.to_proto()
def createEmbeddingNetwork(self, database_list_path='.', batch_size=20, phase=0):
    """Assemble the neighbor-embedding net on self.net.

    Reads (target, neighbor_0..neighbor_{k-1}, negative) blobs from an
    HDF5 data layer, pushes each through a shared-structure inner product
    + ReLU, averages the neighbor embeddings, forms (target - negative),
    and attaches a Python hinge-dot-product loss over the two.
    k = self.number_of_neighbors; phase selects TRAIN(0)/TEST(1).
    """
    dataset_path = database_list_path
    dataLayer = L.HDF5Data(name='dataLayer', source=dataset_path,
                           batch_size=batch_size,
                           ntop=2+self.number_of_neighbors,
                           include=list([dict(phase=phase)]))# tops-> target, [neighbors], negative
    #data -> [target, neighbor1, neighbor2, ..., neighbork, negative]
    self.net.target = dataLayer[0]
    self.net.negative = dataLayer[-1]
    for l in range(1, self.number_of_neighbors+1):
        setattr(self.net, 'neighbor{0}'.format(l-1), dataLayer[l])

    # First layer of inner product (level 1), one per input blob.
    self.net.inner_product_target = self.getInnerProduct(
        'target', 'inner_product_target', 1)
    self.net.inner_product_negative = self.getInnerProduct(
        'negative', 'inner_product_negative', 1)
    for i in range(0, self.number_of_neighbors):
        layer = self.getInnerProduct('neighbor{0}'.format(i),
                                     'inner_product_neighbor{0}'.format(i), 1)
        setattr(self.net, 'inner_product_neighbor{0}'.format(i), layer)

    # In-place ReLU on every embedding.
    self.net.relu_target = L.ReLU(self.net.inner_product_target,
                                  name='relu_target', in_place=True)
    self.net.relu_negative = L.ReLU(self.net.inner_product_negative,
                                    name='relu_negative', in_place=True)
    for i in range(0, self.number_of_neighbors):
        layer = L.ReLU(getattr(self.net,
                               'inner_product_neighbor{0}'.format(i)),
                       name='relu_neighbor{0}'.format(i), in_place=True)
        setattr(self.net, 'relu_neighbor{0}'.format(i), layer)

    # Second inner-product level is disabled; the single-level variant
    # below is used instead.
    #self.net.inner_product2_target = self.getInnerProduct('inner_product_target', 'inner_product2_target', 2)
    #self.net.inner_product2_negative = self.getInnerProduct('inner_product_negative', 'inner_product2_negative', 2)
    #for i in range(0, self.number_of_neighbors):
    #    layer = self.getInnerProduct('inner_product_neighbor{0}'.format(i),
    #                                 'inner_product2_neighbor{0}'.format(i), 2)
    #    setattr(self.net, 'inner_product2_neighbor{0}'.format(i), layer)
    #Context
    '''
    context_sum_bottom = []
    for i in range(0, self.number_of_neighbors):
        context_sum_bottom.append(getattr(self.net, 'inner_product2_neighbor{0}'.format(i)))
    coeff = 1.0/self.number_of_neighbors
    self.net.context_sum = L.Eltwise(*context_sum_bottom, name='context_sum', operation=P.Eltwise.SUM, # 1 -> SUM
                                     coeff=list([coeff for i in range(self.number_of_neighbors)]))
    #Target - Negative
    self.net.target_negative_diff = L.Eltwise(self.net.inner_product2_target, self.net.inner_product2_negative,
                                              name='target_negative_diff', operation=P.Eltwise.SUM, # SUM
                                              coeff=list([1,-1])) # target - negative
    '''

    # Context: coefficient-weighted Eltwise SUM = mean of the k neighbor
    # embeddings.
    context_sum_bottom = []
    for i in range(0, self.number_of_neighbors):
        context_sum_bottom.append(
            getattr(self.net, 'inner_product_neighbor{0}'.format(i)))
    coeff = 1.0/self.number_of_neighbors
    self.net.context_sum = L.Eltwise(
        *context_sum_bottom, name='context_sum',
        operation=P.Eltwise.SUM,  # SUM
        coeff=list([coeff for i in range(self.number_of_neighbors)]))
    # Target - Negative, via SUM with coefficients (1, -1).
    self.net.target_negative_diff = L.Eltwise(
        self.net.inner_product_target,
        self.net.inner_product_negative,
        name='target_negative_diff',
        operation=P.Eltwise.SUM,  # SUM
        coeff=list([1,-1]))  # target - negative

    # Loss layer: Python hinge loss over dot(context, target - negative).
    self.net.loss = L.Python(self.net.context_sum,
                             self.net.target_negative_diff,
                             name='loss', module='my_dot_product_layer',
                             layer='MyHingLossDotProductLayer')
def fcn(split):
    """Build FCN-32s for 21-class VOC semantic segmentation.

    split: 'train' uses the SBDD data layer, anything else the VOC2012
    data layer (both via voc_layers). The VGG-16 trunk starts with pad=100
    so the 32x-upsampled score map can always be cropped back to the input.
    Returns the NetParameter proto.
    """
    n = caffe.NetSpec()
    pydata_params = dict(split=split,
                         mean=(104.00699, 116.66877, 122.67892),
                         seed=1337)
    if split == 'train':
        pydata_params['sbdd_dir'] = '../data/sbdd/dataset'
        pylayer = 'SBDDSegDataLayer'
    else:
        pydata_params['voc_dir'] = '/home/tramac/mydata/VOCdevkit/VOC2012'
        pylayer = 'VOCSegDataLayer'
    n.data, n.label = L.Python(module='voc_layers', layer=pylayer, ntop=2,
                               param_str=str(pydata_params))

    # the base net (VGG-16); pad=100 on the first conv for FCN cropping.
    n.conv1_1, n.relu1_1 = conv_relu(n.data, 64, pad=100)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64)
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128)
    n.pool2 = max_pool(n.relu2_2)
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256)
    n.pool3 = max_pool(n.relu3_3)
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512)
    n.pool4 = max_pool(n.relu4_3)
    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512)
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512)
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512)
    n.pool5 = max_pool(n.relu5_3)

    # Fully-convolutional head (fc6/fc7 as 7x7 / 1x1 convolutions).
    n.fc6, n.relu6 = conv_relu(n.pool5, 4096, ks=7, pad=0)
    n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
    n.fc7, n.relu7 = conv_relu(n.drop6, 4096, ks=1, pad=0)
    n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)
    # 21-way per-pixel scores (20 VOC classes + background).
    n.score_fr = L.Convolution(
        n.drop7, num_output=21, kernel_size=1, pad=0,
        param=[dict(lr_mult=1, decay_mult=1),
               dict(lr_mult=2, decay_mult=0)])
    # Single 32x learned-but-frozen (lr_mult=0) upsampling, then crop to
    # the input geometry.
    n.upscore = L.Deconvolution(
        n.score_fr,
        convolution_param=dict(num_output=21, kernel_size=64, stride=32,
                               bias_term=False),
        param=[dict(lr_mult=0)])
    n.score = crop(n.upscore, n.data)
    # Unnormalized loss; 255 marks VOC "void" pixels.
    n.loss = L.SoftmaxWithLoss(n.score, n.label,
                               loss_param=dict(normalize=False,
                                               ignore_label=255))
    return n.to_proto()
# Script fragment: build the param_str for the WebVision custom data layer
# and dump just that data layer to prototxt/data_layer.prototxt.
# NOTE(review): split_train, num_labels, batch_size, resize*, crop*, the
# augmentation knobs, and the NetSpec `n` are all defined earlier in the
# full script -- this fragment only consumes them.
pydata_params = dict(split=split_train, mean=(104, 117, 123))
pydata_params['dir'] = '../../../datasets/WebVision'
pydata_params['train'] = True
pydata_params['num_classes'] = num_labels
pydata_params['batch_size'] = batch_size
# Geometry: optional resize to (resize_w, resize_h), then a random crop of
# (crop_w, crop_h) with crop_margin slack.
pydata_params['resize'] = resize
pydata_params['resize_w'] = resize_w
pydata_params['resize_h'] = resize_h
pydata_params['crop_w'] = crop_w
pydata_params['crop_h'] = crop_h
pydata_params['crop_margin'] = crop_margin
# Photometric / geometric augmentation probabilities and magnitudes.
pydata_params['mirror'] = mirror
pydata_params['rotate_prob'] = rotate_prob
pydata_params['rotate_angle'] = rotation_angle
pydata_params['HSV_prob'] = HSV_prob
pydata_params['HSV_jitter'] = HSV_jitter
pydata_params['color_casting_prob'] = color_casting_prob
pydata_params['color_casting_jitter'] = color_casting_jitter
pydata_params['scaling_prob'] = scaling_prob
pydata_params['scaling_factor'] = scaling_factor
pylayer = 'customDataLayer'
n.data, n.label = L.Python(module='layers', layer=pylayer, ntop=2,
                           param_str=str(pydata_params))
with open('prototxt/data_layer.prototxt', 'w') as f:
    f.write(str(n.to_proto()))
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the MCB-with-attention VQA net (question LSTM + compact
    bilinear pooling against 14x14 image features, two attention glimpses,
    3000-way answer classifier).

    mode/batchsize go to the data provider; T is the question length;
    question_vocab_size sizes the word embedding. Python 2 (`xrange`).
    Returns the NetParameter proto.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    # GloVe-augmented variant kept for reference:
    # n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\
    #     module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 )
    n.data, n.cont, n.img_feature, n.label = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=4)
    n.embed_ba = L.Embed(
        n.data, input_dim=question_vocab_size, num_output=300,
        weight_filler=dict(type='uniform',min=-0.08,max=0.08))
    n.embed = L.TanH(n.embed_ba)
    # concat_word_embed = [n.embed, n.glove]
    # n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2})  # T x N x 600

    # LSTM1 over the question sequence.
    n.lstm1 = L.LSTM(
        n.embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform',min=-0.08,max=0.08),
            bias_filler=dict(type='constant',value=0)))
    # Keep only the final timestep; Silence the first T-1 tops so they
    # are not dangling outputs.
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    for i in xrange(T - 1):
        n.__setattr__('slice_first' + str(i), tops1[int(i)])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[int(i)], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1,1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1,
                               dropout_param={'dropout_ratio': 0.3})

    # LSTM2 stacked on LSTM1; same final-timestep extraction.
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform',min=-0.08,max=0.08),
            bias_filler=dict(type='constant',value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})
    for i in xrange(T - 1):
        n.__setattr__('slice_second' + str(i), tops2[int(i)])
        n.__setattr__('silence_data_second' + str(i),
                      L.Silence(tops2[int(i)], ntop=0))
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out, reshape_param=dict(shape=dict(dim=[-1,1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    # Question feature: concat of both LSTM final states (N x 2048).
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    # Tile the question feature across the 14x14 image grid.
    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature,
        reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))

    # Compact bilinear fusion (per-position, no sum pooling), then
    # signed-sqrt + L2 normalization + dropout.
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # Multi-channel attention: two glimpses (num_output=2) softmaxed over
    # the 196 spatial positions.
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1,
                                num_output=512, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=2, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    # Constant-1 dummy blob required by the SoftAttention layer interface.
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    # Both glimpses concatenated -> N x 4096 attended image feature.
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # Merge attended image feature and question feature with a second
    # compact bilinear pooling, then classify.
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(
        n.att_feature_resh, n.lstm_12_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))
    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
def net(split):
    """Build the MCFE + BLSTM + attention text-detection net.

    split: 'train' (Python data layer over Total_Text_WSR, Balance-
    CrossEntropyLoss outputs, ignore_label propagated from the data
    params) or 'test' (fixed 1x3x500x500 Input, Sigmoid outputs). Any
    other value raises.
    Stages conv3/conv4/conv5 each get a multi-dilation MCFE block followed
    by a bidirectional LSTM over im2col'd features; the three per-stage
    score maps are upsampled, cropped, and combined both through a softmax
    attention module (output_fusion) and a plain 1x1 fusion conv
    (upscore_fuse) for comparison. Returns the NetParameter proto.
    """
    n = caffe.NetSpec()
    loss_param = dict(normalize=False)
    if split=='train':
        data_params = dict(mean=(104.00699, 116.66877, 122.67892))
        # Images and label maps.
        data_params['root'] = './datasets/Total_Text_WSR'
        data_params['source'] = "Total_Text_WSR.lst"
        data_params['shuffle'] = True
        data_params['ignore_label'] = -1
        n.data, n.label = L.Python(module='pylayer_old',
                                   layer='ImageLabelmapDataLayer', ntop=2,
                                   param_str=str(data_params))
        # Propagate the data layer's ignore label into every loss.
        # (Python 2 dict.has_key.)
        if data_params.has_key('ignore_label'):
            loss_param['ignore_label'] = int(data_params['ignore_label'])
    elif split == 'test':
        n.data = L.Input(name='data',
                         input_param=dict(shape=dict(dim=[1,3,500,500])))
    else:
        raise Exception("Invalid phase")

    # First conv stage.
    n.conv1_1, n.relu1_1 = conv_relu(n.data, 64, pad=1)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64)
    n.pool1 = max_pool(n.relu1_2)
    # Second conv stage.
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128)
    n.pool2 = max_pool(n.relu2_2)
    # Third conv stage.
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256)

    # MCFE module after conv3_3 (channel 64, kernel 3x3): four parallel
    # dilated convolutions, concatenated along channels.
    n.conv3_dilation1 = conv_dilation01(n.conv3_3, mult=[100,1,200,0])
    n.conv3_dilation2 = conv_dilation03(n.conv3_3, mult=[100,1,200,0])
    n.conv3_dilation3 = conv_dilation05(n.conv3_3, mult=[100,1,200,0])
    n.conv3_dilation4 = conv_dilation07(n.conv3_3, mult=[100,1,200,0])
    # Concatenate along the channel axis.
    n.concat_conv33 = L.Concat(n.conv3_dilation1,
                               n.conv3_dilation2,
                               n.conv3_dilation3,
                               n.conv3_dilation4,
                               concat_param=dict({'concat_dim':1}))
    # BLSTM module after the MCFE block.
    # ===================== prepare lstm inputs =====================
    n.im2col_conv33 = L.Im2col(n.concat_conv33,
                               convolution_param=dict(kernel_size=3, pad=1))
    n.im2col_transpose_conv33 = L.Transpose(
        n.im2col_conv33, transpose_param=dict(dim=[3,2,0,1]))
    n.lstm_input_conv33 = L.Reshape(
        n.im2col_transpose_conv33,
        reshape_param=dict(shape=dict(dim=-1), axis=1, num_axes=2))
    # Forward LSTM.
    n.lstm_conv33 = L.Lstm(
        n.lstm_input_conv33,
        lstm_param=dict(num_output=128,
                        weight_filler=dict(type='gaussian', std=0.01),
                        bias_filler=dict(type='constant'),
                        clipping_threshold=1))
    # Backward LSTM: reverse, run LSTM, reverse back.
    n.rlstm_input_conv33 = L.Reverse(n.lstm_input_conv33,
                                     name='lstm_reverse1_conv33',
                                     reverse_param=dict(axis=0))
    n.rlstm_output_conv33 = L.Lstm(n.rlstm_input_conv33,
                                   name='rlstm_conv33',
                                   lstm_param=dict(num_output=128))
    n.rlstm_conv33 = L.Reverse(n.rlstm_output_conv33,
                               name='lstm_reverse2_conv33',
                               reverse_param=dict(axis=0))
    # Concat forward and backward outputs: n*c*(h1+h2+...+hk)*w.
    n.merge_lstm_rlstm_conv33 = L.Concat(n.lstm_conv33, n.rlstm_conv33,
                                         concat_param=dict(axis=2))
    n.lstm_output_reshape_conv33 = L.Reshape(
        n.merge_lstm_rlstm_conv33,
        reshape_param=dict(shape=dict(dim=[-1,1]), axis=1, num_axes=1))
    # Transpose back to (N, C, H, W).
    n.lstm_output_conv33 = L.Transpose(
        n.lstm_output_reshape_conv33,
        transpose_param=dict(dim=[2,3,1,0]))

    n.pool3 = max_pool(n.relu3_3)
    # Fourth conv stage.
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512)
    # MCFE module after conv4_3 (channel 128, kernel 3x3).
    n.conv4_dilation1 = conv_dilation1(n.conv4_3, mult=[100,1,200,0])
    n.conv4_dilation2 = conv_dilation3(n.conv4_3, mult=[100,1,200,0])
    n.conv4_dilation3 = conv_dilation5(n.conv4_3, mult=[100,1,200,0])
    n.conv4_dilation4 = conv_dilation7(n.conv4_3, mult=[100,1,200,0])
    # Concatenate along channels: n*(c1+c2+...+ck)*h*w.
    n.concat_conv43 = L.Concat(n.conv4_dilation1,
                               n.conv4_dilation2,
                               n.conv4_dilation3,
                               n.conv4_dilation4,
                               concat_param=dict({'concat_dim':1}))
    # BLSTM module.
    # ===================== prepare lstm inputs =====================
    n.im2col_conv43 = L.Im2col(n.concat_conv43,
                               convolution_param=dict(kernel_size=3, pad=1))
    n.im2col_transpose_conv43 = L.Transpose(
        n.im2col_conv43, transpose_param=dict(dim=[3,2,0,1]))
    n.lstm_input_conv43 = L.Reshape(
        n.im2col_transpose_conv43,
        reshape_param=dict(shape=dict(dim=-1), axis=1, num_axes=2))
    # Forward LSTM.
    n.lstm_conv43 = L.Lstm(
        n.lstm_input_conv43,
        lstm_param=dict(num_output=256,
                        weight_filler=dict(type='gaussian', std=0.01),
                        bias_filler=dict(type='constant'),
                        clipping_threshold=1))
    # Backward LSTM.
    n.rlstm_input_conv43 = L.Reverse(n.lstm_input_conv43,
                                     name='lstm_reverse1_conv43',
                                     reverse_param=dict(axis=0))
    n.rlstm_output_conv43 = L.Lstm(n.rlstm_input_conv43,
                                   name='rlstm_conv43',
                                   lstm_param=dict(num_output=256))
    n.rlstm_conv43 = L.Reverse(n.rlstm_output_conv43,
                               name='lstm_reverse2_conv43',
                               reverse_param=dict(axis=0))
    # Concat forward and backward outputs: n*c*(h1+h2+...+hk)*w.
    n.merge_lstm_rlstm_conv43 = L.Concat(n.lstm_conv43, n.rlstm_conv43,
                                         concat_param=dict(axis=2))
    n.lstm_output_reshape_conv43 = L.Reshape(
        n.merge_lstm_rlstm_conv43,
        reshape_param=dict(shape=dict(dim=[-1,1]), axis=1, num_axes=1))
    # Transpose back to (N, C, H, W).
    n.lstm_output_conv43 = L.Transpose(
        n.lstm_output_reshape_conv43,
        transpose_param=dict(dim=[2,3,1,0]))

    n.pool4 = max_pool(n.relu4_3)
    # Fifth conv stage.
    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512)
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512)
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512)
    # MCFE inception module (channel 128, kernel 3x3).
    n.conv5_dilation1 = conv_dilation1(n.conv5_3, mult=[100,1,200,0])
    n.conv5_dilation2 = conv_dilation3(n.conv5_3, mult=[100,1,200,0])
    n.conv5_dilation3 = conv_dilation5(n.conv5_3, mult=[100,1,200,0])
    n.conv5_dilation4 = conv_dilation7(n.conv5_3, mult=[100,1,200,0])
    n.concat_conv53 = L.Concat(n.conv5_dilation1,
                               n.conv5_dilation2,
                               n.conv5_dilation3,
                               n.conv5_dilation4,
                               concat_param=dict({'concat_dim':1}))
    # BLSTM module.
    # ===================== prepare lstm inputs =====================
    n.im2col_conv53 = L.Im2col(n.concat_conv53,
                               convolution_param=dict(kernel_size=3, pad=1))
    n.im2col_transpose_conv53 = L.Transpose(
        n.im2col_conv53, transpose_param=dict(dim=[3,2,0,1]))
    n.lstm_input_conv53 = L.Reshape(
        n.im2col_transpose_conv53,
        reshape_param=dict(shape=dict(dim=-1), axis=1, num_axes=2))
    # Forward LSTM.
    n.lstm_conv53 = L.Lstm(
        n.lstm_input_conv53,
        lstm_param=dict(num_output=256,
                        weight_filler=dict(type='gaussian', std=0.01),
                        bias_filler=dict(type='constant'),
                        clipping_threshold=1))
    # Backward LSTM.
    n.rlstm_input_conv53 = L.Reverse(n.lstm_input_conv53,
                                     name='lstm_reverse1_conv53',
                                     reverse_param=dict(axis=0))
    n.rlstm_output_conv53 = L.Lstm(n.rlstm_input_conv53,
                                   name='rlstm_conv53',
                                   lstm_param=dict(num_output=256))
    n.rlstm_conv53 = L.Reverse(n.rlstm_output_conv53,
                               name='lstm_reverse2_conv53',
                               reverse_param=dict(axis=0))
    # Concat forward and backward outputs: n*c*(h1+h2+...+hk)*w.
    n.merge_lstm_rlstm_conv53 = L.Concat(n.lstm_conv53, n.rlstm_conv53,
                                         concat_param=dict(axis=2))
    n.lstm_output_reshape_conv53 = L.Reshape(
        n.merge_lstm_rlstm_conv53,
        reshape_param=dict(shape=dict(dim=[-1,1]), axis=1, num_axes=1))
    # Transpose back to (N, C, H, W).
    n.lstm_output_conv53 = L.Transpose(
        n.lstm_output_reshape_conv53,
        transpose_param=dict(dim=[2,3,1,0]))

    # Stage 3 side output: 1x1 conv to reduce channels, 4x upsample, crop
    # to the input size.
    n.score_dsn3 = conv1x1(n.lstm_output_conv33, lr=[0.01, 1, 0.02, 0],
                           wf=dict(type='gaussian', std=0.01))
    n.score_dsn3_up = upsample(n.score_dsn3, stride=4)
    n.upscore_dsn3 = L.Crop(n.score_dsn3_up, n.data)
    # BalanceCrossEntropyLoss
    if split=='train':
        n.loss3 = L.BalanceCrossEntropyLoss(n.upscore_dsn3, n.label,
                                            loss_param=loss_param)
    if split=='test':
        n.sigmoid_dsn3 = L.Sigmoid(n.upscore_dsn3)
    # Stage 4 side output: 1x1 conv, 8x upsample, crop.
    n.score_dsn4 = conv1x1(n.lstm_output_conv43, lr=[0.01, 1, 0.02, 0],
                           wf=dict(type='gaussian', std=0.01))
    n.score_dsn4_up = upsample(n.score_dsn4, stride=8)
    n.upscore_dsn4 = L.Crop(n.score_dsn4_up, n.data)
    # BalanceCrossEntropyLoss
    if split=='train':
        n.loss4 = L.BalanceCrossEntropyLoss(n.upscore_dsn4, n.label,
                                            loss_param=loss_param)
    if split=='test':
        n.sigmoid_dsn4 = L.Sigmoid(n.upscore_dsn4)
    # Stage 5 side output: 1x1 conv, 16x upsample, crop.
    n.score_dsn5 = conv1x1(n.lstm_output_conv53, lr=[0.01, 1, 0.02, 0],
                           wf=dict(type='gaussian', std=0.01))
    n.score_dsn5_up = upsample(n.score_dsn5, stride=16)
    n.upscore_dsn5 = L.Crop(n.score_dsn5_up, n.data)
    # BalanceCrossEntropyLoss
    if split=='train':
        n.loss5 = L.BalanceCrossEntropyLoss(n.upscore_dsn5, n.label,
                                            loss_param=loss_param)
    if split=='test':
        n.sigmoid_dsn5 = L.Sigmoid(n.upscore_dsn5)

    # Concatenate the three side outputs along channels as input to the
    # attention module.
    n.concat_upscore = L.Concat(n.upscore_dsn3,
                                n.upscore_dsn4,
                                n.upscore_dsn5,
                                name='concat',
                                concat_param=dict({'concat_dim':1}))
    # 3x3 convs reduce each side output to a single-channel mask.
    n.output_mask_product03 = L.Convolution(
        n.upscore_dsn3, num_output=1, kernel_size=3, pad=1,
        param=[dict(lr_mult=10, decay_mult=1),
               dict(lr_mult=20, decay_mult=0)],
        weight_filler=dict(type='gaussian', std=0.01),
        bias_filler=dict(type='constant'), engine=1)
    n.output_mask_product04 = L.Convolution(
        n.upscore_dsn4, num_output=1, kernel_size=3, pad=1,
        param=[dict(lr_mult=10, decay_mult=1),
               dict(lr_mult=20, decay_mult=0)],
        weight_filler=dict(type='gaussian', std=0.01),
        bias_filler=dict(type='constant'), engine=1)
    n.output_mask_product05 = L.Convolution(
        n.upscore_dsn5, num_output=1, kernel_size=3, pad=1,
        param=[dict(lr_mult=10, decay_mult=1),
               dict(lr_mult=20, decay_mult=0)],
        weight_filler=dict(type='gaussian', std=0.01),
        bias_filler=dict(type='constant'), engine=1)

    ### Attention module.
    # First conv layer: num_output=512, kernel 3x3.
    n.att_conv1_mask_512 = L.Convolution(
        n.concat_upscore, num_output=512, kernel_size=3, pad=1,
        param=[dict(lr_mult=10, decay_mult=1),
               dict(lr_mult=20, decay_mult=0)],
        engine=1)
    n.relu_att_conv1 = L.ReLU(n.att_conv1_mask_512, in_place=True)
    n.drop_att_conv1_mask = L.Dropout(n.relu_att_conv1, dropout_ratio=0.5,
                                      in_place=True)
    # Second conv layer: num_output=3, kernel 1x1.
    n.att_fc_mask_512 = L.Convolution(
        n.drop_att_conv1_mask, num_output=3, kernel_size=1,
        param=[dict(lr_mult=10, decay_mult=1),
               dict(lr_mult=20, decay_mult=0)],
        engine=1)
    n.attention = L.Softmax(n.att_fc_mask_512)
    # Produce the three per-stage attention weight maps.
    n.attention3, n.attention4, n.attention5 = L.Slice(
        n.attention, name='slice_attention',
        slice_param=dict(axis=1, slice_point=[1,2]), ntop=3)
    # Multiply each attention weight with its stage mask and fuse by sum.
    n.output_mask3 = L.Eltwise(n.attention3, n.output_mask_product03,
                               operation=P.Eltwise.PROD)
    n.output_mask4 = L.Eltwise(n.attention4, n.output_mask_product04,
                               operation=P.Eltwise.PROD)
    n.output_mask5 = L.Eltwise(n.attention5, n.output_mask_product05,
                               operation=P.Eltwise.PROD)
    n.output_fusion = L.Eltwise(n.output_mask3, n.output_mask4,
                                n.output_mask5, operation=P.Eltwise.SUM)
    # Baseline without attention: 1x1 conv over the concatenated side
    # outputs (weights start as the 0.2 average).
    n.upscore_fuse = L.Convolution(n.concat_upscore,
                                   name='new-score-weighting',
                                   num_output=1, kernel_size=1,
                                   param=[dict(lr_mult=0.001, decay_mult=1),
                                          dict(lr_mult=0.002, decay_mult=0)],
                                   weight_filler=dict(type='constant',
                                                      value=0.2),
                                   engine=1)
    if split=='train':
        n.loss_fuse = L.BalanceCrossEntropyLoss(n.upscore_fuse, n.label,
                                                loss_param=loss_param)
        n.loss_output_fusion = L.BalanceCrossEntropyLoss(
            n.output_fusion, n.label, loss_param=loss_param)
    if split=='test':
        n.sigmoid_fuse = L.Sigmoid(n.upscore_fuse)
        n.sigmoid_output_fusion = L.Sigmoid(n.output_fusion)
    return n.to_proto()
def generate_fc8(split, config):
    """Build a VGG-16 classification net (conv1 through fc8) as a NetProto.

    Data (language/cont/image/spatial/label) comes from the python data layer
    named by config.data_provider / config.data_provider_layer.  Every VGG
    layer is either frozen or finetuned according to config.fix_vgg.

    Returns the generated caffe NetProto.
    """
    n = caffe.NetSpec()
    batch_size = config.N
    mode_str = str(dict(split=split, batch_size=batch_size))
    n.language, n.cont, n.image, n.spatial, n.label = L.Python(
        module=config.data_provider,
        layer=config.data_provider_layer,
        param_str=mode_str,
        ntop=5)

    # Every VGG layer shares the same freeze/finetune configuration; build
    # the keyword set once instead of repeating it on all 16 layer calls.
    vgg = dict(fix_param=config.fix_vgg, finetune=(not config.fix_vgg))

    # the base net (VGG-16)
    n.conv1_1, n.relu1_1 = conv_relu(n.image, 64, **vgg)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64, **vgg)
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128, **vgg)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128, **vgg)
    n.pool2 = max_pool(n.relu2_2)
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256, **vgg)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256, **vgg)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256, **vgg)
    n.pool3 = max_pool(n.relu3_3)
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512, **vgg)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512, **vgg)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512, **vgg)
    n.pool4 = max_pool(n.relu4_3)
    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512, **vgg)
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512, **vgg)
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512, **vgg)
    n.pool5 = max_pool(n.relu5_3)

    # classifier head; dropout is optional via config.vgg_dropout
    n.fc6, n.relu6 = fc_relu(n.pool5, 4096, **vgg)
    if config.vgg_dropout:
        n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
        n.fc7, n.relu7 = fc_relu(n.drop6, 4096, **vgg)
        n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)
        n.fc8 = fc(n.drop7, 1000, **vgg)
    else:
        n.fc7, n.relu7 = fc_relu(n.relu6, 4096, **vgg)
        n.fc8 = fc(n.relu7, 1000, **vgg)
    return n.to_proto()
def vgg_face(split, mean, opt):
    """Build a VGG-Face binary-classification net as a NetProto.

    Args:
        split: one of 'train' / 'val' / 'test'; selects batch size and dataset.
        mean:  mean value(s) forwarded to the python data layer.
        opt:   options object providing batch sizes, paths and crop sizes.

    Raises:
        ValueError: if split is not one of the three known values.

    Returns the generated caffe NetProto.
    """
    n = caffe.NetSpec()
    # config python data layer
    if split == 'train':
        batch_size = opt.train_batch_size
    elif split == 'val':
        batch_size = opt.val_batch_size
    elif split == 'test':
        batch_size = opt.test_batch_size
    else:
        # Previously an unrecognised split fell through and crashed later
        # with a NameError on batch_size; fail fast with a clear error.
        raise ValueError("unknown split: %r" % (split,))
    if split == 'train' or split == 'val':
        dataset_name = opt.train_dataset_name
    else:
        dataset_name = opt.test_dataset_name
    pydata_params = dict(split=split,
                         data_dir=opt.data_dir,
                         batch_size=batch_size,
                         mean=mean,
                         dataset=dataset_name,
                         load_size=opt.load_size,
                         crop_size=opt.crop_size)
    n.data, n.label = L.Python(module='faceData_layers',
                               layer='FaceDataLayer',
                               ntop=2,
                               param_str=str(pydata_params))
    # vgg-face net
    # conv layers
    n.conv1_1, n.relu1_1 = conv_relu(n.data, 64)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64)
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128)
    n.pool2 = max_pool(n.relu2_2)
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256)
    n.pool3 = max_pool(n.relu3_3)
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512)
    n.pool4 = max_pool(n.relu4_3)
    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512)
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512)
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512)
    n.pool5 = max_pool(n.relu5_3)
    # drop out and fc layers
    n.fc6, n.relu6, n.drop6 = fc_relu_dropout(n.pool5, 4096, 0.5)
    # NOTE(review): fc7 consumes n.fc6 rather than n.drop6 — presumably the
    # dropout is applied in-place on the fc6 blob; verify in fc_relu_dropout.
    n.fc7, n.relu7, n.drop7 = fc_relu_dropout(n.fc6, 4096, 0.5)
    lr_ratio = 100  # lr multiplier for truncated layers
    n.fc8_face = L.InnerProduct(n.fc7,
                                num_output=1024,
                                param=[
                                    dict(lr_mult=1 * lr_ratio, decay_mult=1),
                                    dict(lr_mult=2 * lr_ratio, decay_mult=0)
                                ],
                                weight_filler=dict(type='gaussian', std=0.01),
                                bias_filler=dict(type='constant', value=0))
    n.fc9_face = L.InnerProduct(n.fc8_face,
                                num_output=2,
                                param=[
                                    dict(lr_mult=1 * lr_ratio, decay_mult=1),
                                    dict(lr_mult=2 * lr_ratio, decay_mult=0)
                                ],
                                weight_filler=dict(type='gaussian', std=0.01),
                                bias_filler=dict(type='constant', value=0))
    # loss layer
    n.loss = L.SoftmaxWithLoss(n.fc9_face, n.label)
    # loss and accuracy layer
    n.acc = L.Accuracy(n.fc9_face, n.label)
    return n.to_proto()
def fcn(split):
    """FCN over PASCAL VOC (21 classes): VGG-16 backbone, with scaled
    pool4 and pool3 skip connections fused into progressively upsampled
    score maps, finishing with an 8x deconvolution, a crop back to the
    input and a softmax loss.

    Training reads SBDD; any other split reads VOC2011.  Returns the
    generated caffe NetProto.
    """
    n = caffe.NetSpec()

    # Param-spec factories: fresh dicts per call site, so no mutable
    # structures are shared between layers.
    def learned():
        # score convs: weights learn at 1x with decay, biases at 2x without
        return [dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)]

    def frozen():
        # deconvolution (interpolation) and scale layers stay fixed
        return [dict(lr_mult=0)]

    layer_params = dict(split=split,
                        mean=(104.00699, 116.66877, 122.67892),
                        seed=1337)
    if split == 'train':
        layer_params['sbdd_dir'] = '../data/sbdd/dataset'
        data_layer = 'SBDDSegDataLayer'
    else:
        layer_params['voc_dir'] = '../data/pascal/VOC2011'
        data_layer = 'VOCSegDataLayer'
    n.data, n.label = L.Python(module='voc_layers',
                               layer=data_layer,
                               ntop=2,
                               param_str=str(layer_params))

    # the base net (VGG-16); pad=100 leaves room for the final crops
    n.conv1_1, n.relu1_1 = conv_relu(n.data, 64, pad=100)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64)
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128)
    n.pool2 = max_pool(n.relu2_2)
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256)
    n.pool3 = max_pool(n.relu3_3)
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512)
    n.pool4 = max_pool(n.relu4_3)
    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512)
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512)
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512)
    n.pool5 = max_pool(n.relu5_3)

    # fully convolutional classifier head
    n.fc6, n.relu6 = conv_relu(n.pool5, 4096, ks=7, pad=0)
    n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
    n.fc7, n.relu7 = conv_relu(n.drop6, 4096, ks=1, pad=0)
    n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)

    # 21-class scores at stride 32, then a learned-free 2x upsampling
    n.score_fr = L.Convolution(n.drop7, num_output=21, kernel_size=1, pad=0,
                               param=learned())
    n.upscore2 = L.Deconvolution(n.score_fr,
                                 convolution_param=dict(num_output=21,
                                                        kernel_size=4,
                                                        stride=2,
                                                        bias_term=False),
                                 param=frozen())

    # scale pool4 skip for compatibility
    n.scale_pool4 = L.Scale(n.pool4,
                            filler=dict(type='constant', value=0.01),
                            param=frozen())
    n.score_pool4 = L.Convolution(n.scale_pool4, num_output=21,
                                  kernel_size=1, pad=0, param=learned())
    n.score_pool4c = crop(n.score_pool4, n.upscore2)
    n.fuse_pool4 = L.Eltwise(n.upscore2, n.score_pool4c,
                             operation=P.Eltwise.SUM)
    n.upscore_pool4 = L.Deconvolution(n.fuse_pool4,
                                      convolution_param=dict(num_output=21,
                                                             kernel_size=4,
                                                             stride=2,
                                                             bias_term=False),
                                      param=frozen())

    # scale pool3 skip for compatibility
    n.scale_pool3 = L.Scale(n.pool3,
                            filler=dict(type='constant', value=0.0001),
                            param=frozen())
    n.score_pool3 = L.Convolution(n.scale_pool3, num_output=21,
                                  kernel_size=1, pad=0, param=learned())
    n.score_pool3c = crop(n.score_pool3, n.upscore_pool4)
    n.fuse_pool3 = L.Eltwise(n.upscore_pool4, n.score_pool3c,
                             operation=P.Eltwise.SUM)
    n.upscore8 = L.Deconvolution(n.fuse_pool3,
                                 convolution_param=dict(num_output=21,
                                                        kernel_size=16,
                                                        stride=8,
                                                        bias_term=False),
                                 param=frozen())

    n.score = crop(n.upscore8, n.data)
    n.loss = L.SoftmaxWithLoss(n.score, n.label,
                               loss_param=dict(normalize=False,
                                               ignore_label=255))
    return n.to_proto()
def generate(self):
    """Returns a NetSpec specifying CaffeNet, following the original proto text
    specification (./models/bvlc_reference_caffenet/train_val.prototxt)."""
    conf = self
    n = caffe.NetSpec()
    # fc6/fc7 learn only when conf.train is set; otherwise they are frozen
    param = LT.learned_param if conf.train else LT.frozen_param
    if self.train:
        # training input: the RoI python data layer also emits proposal and
        # bbox-regression target tops alongside the data blob
        n.data = L.Python(top=[
            "rois", 'labels', 'bbox_targets', 'bbox_inside_weights',
            'bbox_outside_weights'
        ],
                          python_param=dict(module='roi_data_layer.layer',
                                            layer='RoIDataLayer',
                                            param_str="num_classes: " +
                                            str(conf.num_classes)))
    else:
        # test-time input: plain data + im_info blobs from the LT helper
        n.data, n.im_info = LT.input()
    # conv1-conv5 freezing is controlled independently of the fc layers
    conv15_param = LT.learned_param if (
        conf.conv_1_to_5_learn) else LT.frozen_param
    LT.conv1_to_5(n, conv15_param)
    if not (self.train):
        # at test time the RPN runs inline to produce the "rois" top that
        # the ROIPooling layer below consumes
        n.rpn_conv1, n.rpn_relu1, n.rpn_cls_score, n.rpn_bbox_pred = LT.rpn_class_and_bbox_predictors(
            n, self, param)
        n.rpn_cls_score_reshape = LT.reshape(n.rpn_cls_score, [0, 2, -1, 0])
        n.rpn_cls_prob, n.rpn_cls_prob_reshape, n.rois = LT.roi_proposal(
            n, self)
    # RoI pooling over conv5 with spatial_scale=0.0625 (= 1/16)
    n.roi_pool = L.ROIPooling(bottom=["conv5", "rois"],
                              pooled_w=6,
                              pooled_h=6,
                              spatial_scale=0.0625)
    n.fc6, n.relu6 = LT.fc_relu(n.roi_pool, 4096, param=param)
    n.drop6 = fc7input = L.Dropout(n.relu6,
                                   in_place=True,
                                   dropout_ratio=0.5,
                                   scale_train=False)
    n.fc7, n.relu7 = LT.fc_relu(fc7input, 4096, param=param)
    n.drop7 = layer7 = L.Dropout(n.relu7,
                                 in_place=True,
                                 dropout_ratio=0.5,
                                 scale_train=False)
    # empty fillers outside training (weights are presumably loaded from a
    # snapshot at test time, so initialization does not matter then)
    weight_filler = (LT.WEIGHT_FILLER if conf.train else dict())
    bias_filler = (LT.BIAS_FILLER if conf.train else dict())
    n.cls_score = L.InnerProduct(layer7,
                                 num_output=conf.num_classes,
                                 weight_filler=weight_filler,
                                 bias_filler=bias_filler,
                                 param=LT.learned_param)
    n.bbox_pred = L.InnerProduct(layer7,
                                 num_output=conf.num_classes * 4,
                                 weight_filler=weight_filler,
                                 bias_filler=bias_filler,
                                 param=LT.learned_param)
    if conf.train:
        # classification + bbox regression losses over the data-layer targets
        n.loss_cls = LT.soft_max_with_loss(["cls_score", "labels"])
        n.loss_bbox = L.SmoothL1Loss(bottom=[
            "bbox_pred", "bbox_targets", "bbox_inside_weights",
            "bbox_outside_weights"
        ],
                                     loss_weight=1)
    else:
        # NOTE(review): loss_param on a plain Softmax looks odd — presumably
        # ignored by the layer; confirm against the caffe layer definition.
        n.cls_prob = L.Softmax(n.cls_score,
                               loss_param=dict(ignore_label=-1,
                                               normalize=True))
    if self.train:
        # during training the RPN is attached with frozen params and its
        # outputs are silenced (the data layer supplies the rois instead)
        n.rpn_conv1, n.rpn_relu1, n.rpn_cls_score, n.rpn_bbox_pred = LT.rpn_class_and_bbox_predictors(
            n, self, LT.frozen_param)
        n.silence_rpn_cls_score = LT.silence(n.rpn_cls_score)
        n.silence_rpn_bbox_pred = LT.silence(n.rpn_bbox_pred)
    # write the net to a temporary file and return its filename
    return self.save(n)
def define_model(self):
    """Define a three-branch 3D classification network and write its
    prototxt to self.model_def.

    Each branch ('arch1'..'arch3') consumes its own data top, runs five
    SingleConv stages, a flatten, an fc+ReLU, and a 2-way classifier with
    its own softmax loss against the shared label top.
    """
    n = caffe.NetSpec()
    pylayer = 'ClsDataLayer'
    pydata_params = dict(
        phase='train',
        data_root=opt.cls_data_root,
        batch_size=16,
        ratio=5,
        augument=True,
    )
    n.arch1_data, n.arch2_data, n.arch3_data, n.label = L.Python(
        module='data.ClsDataLayer',
        layer=pylayer,
        ntop=4,
        param_str=str(pydata_params))

    # shared geometry for the unit-stride, zero-pad 3D convs
    iso = dict(stride=[1, 1, 1], padding=[0, 0, 0])

    def add_branch(idx, conv3_kw, k4, k5, fc_width):
        # Wire one branch end-to-end; idx picks the data top and names.
        name = 'arch%d' % idx
        top = SingleConv(getattr(n, name + '_data'), 64,
                         kernel_size=[3, 5, 5], **iso)
        setattr(n, name + '_conv1', top)
        top = SingleConv(top, 64, kernel_size=2, stride=2, padding=0)
        setattr(n, name + '_conv2', top)
        top = SingleConv(top, 64, **conv3_kw)
        setattr(n, name + '_conv3', top)
        top = SingleConv(top, 64, kernel_size=k4, **iso)
        setattr(n, name + '_conv4', top)
        top = SingleConv(top, 64, kernel_size=k5, **iso)
        setattr(n, name + '_conv5', top)
        flat = L.Flatten(top)
        setattr(n, name + '_flat', flat)
        hidden = L.InnerProduct(flat, num_output=fc_width,
                                weight_filler=dict(type='xavier'))
        setattr(n, name + '_fc1', hidden)
        act = L.ReLU(hidden, engine=3)
        setattr(n, 'fc%d_act' % idx, act)
        logits = L.InnerProduct(act, num_output=2,
                                weight_filler=dict(type='xavier'))
        setattr(n, name, logits)
        setattr(n, name + '_loss', L.SoftmaxWithLoss(logits, n.label))

    # The three branches differ only in their conv3/conv4/conv5 kernels and
    # the hidden fc width.
    add_branch(1, dict(kernel_size=1, stride=1, padding=0),
               [2, 5, 5], [1, 4, 4], 150)
    add_branch(2, dict(kernel_size=[1, 2, 2], stride=[1, 1, 1],
                       padding=[0, 0, 0]), [3, 5, 5], [2, 5, 5], 250)
    add_branch(3, dict(kernel_size=[2, 2, 2], stride=[1, 1, 1],
                       padding=[0, 0, 0]), [3, 5, 5], [3, 5, 5], 250)

    with open(self.model_def, 'w') as f:
        f.write(str(n.to_proto()))
def fcn(split):
    """SIFT Flow FCN-32s with two prediction heads: semantic ('sem') and
    geometric ('geo') segmentation, each 2-class, upsampled 32x and cropped
    back to the input.  Returns the generated caffe NetProto.
    """
    n = caffe.NetSpec()

    def head_params():
        # 1x1 score convs: weights 1x with decay, biases 2x without
        return [dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)]

    n.data, n.sem, n.geo = L.Python(
        module='siftflow_layers',
        layer='SIFTFlowSegDataLayer',
        ntop=3,
        param_str=str(
            dict(siftflow_dir='/home/tramac/caffe/examples/fcn/data/sift-flow',
                 split=split,
                 seed=1337)))

    # the base net (VGG-16); pad=100 leaves room for the final crops
    n.conv1_1, n.relu1_1 = conv_relu(n.data, 64, pad=100)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64)
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128)
    n.pool2 = max_pool(n.relu2_2)
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256)
    n.pool3 = max_pool(n.relu3_3)
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512)
    n.pool4 = max_pool(n.relu4_3)
    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512)
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512)
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512)
    n.pool5 = max_pool(n.relu5_3)

    # fully convolutional head (renamed fc6_new/fc7_new at surgery time)
    n.fc6, n.relu6 = conv_relu(n.pool5, 4096, ks=7, pad=0)
    n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
    n.fc7, n.relu7 = conv_relu(n.drop6, 4096, ks=1, pad=0)
    n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)

    # semantic head: 2 classes (reduced from the original 33)
    n.score_fr_sem = L.Convolution(n.drop7, num_output=2, kernel_size=1,
                                   pad=0, param=head_params())
    n.upscore_sem = L.Deconvolution(n.score_fr_sem,
                                    convolution_param=dict(num_output=2,
                                                           kernel_size=64,
                                                           stride=32,
                                                           bias_term=False),
                                    param=[dict(lr_mult=0)])
    n.score_sem = crop(n.upscore_sem, n.data)
    # named plain 'loss' (instead of loss_sem) to keep score tooling happy
    n.loss = L.SoftmaxWithLoss(n.score_sem, n.sem,
                               loss_param=dict(normalize=False,
                                               ignore_label=255))

    # geometric head mirrors the semantic one (2 classes, reduced from 3)
    n.score_fr_geo = L.Convolution(n.drop7, num_output=2, kernel_size=1,
                                   pad=0, param=head_params())
    n.upscore_geo = L.Deconvolution(n.score_fr_geo,
                                    convolution_param=dict(num_output=2,
                                                           kernel_size=64,
                                                           stride=32,
                                                           bias_term=False),
                                    param=[dict(lr_mult=0)])
    n.score_geo = crop(n.upscore_geo, n.data)
    n.loss_geo = L.SoftmaxWithLoss(n.score_geo, n.geo,
                                   loss_param=dict(normalize=False,
                                                   ignore_label=255))
    return n.to_proto()
def fcn(split):
    """AlexNet-based FCN-16s over PASCAL VOC (21 classes): a pool2 skip
    fused with the 2x-upsampled coarse scores, a 16x deconvolution, a crop
    back to the input, and a softmax loss.

    Training reads SBDD; any other split reads VOC2011.  Returns the
    generated caffe NetProto.
    """
    n = caffe.NetSpec()

    def score_param():
        # fresh per call: weights 1x with decay, biases 2x without
        return [dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)]

    params = dict(split=split,
                  mean=(104.00699, 116.66877, 122.67892),
                  seed=1337)
    if split == 'train':
        params['sbdd_dir'] = '../data/sbdd/dataset'
        layer_name = 'SBDDSegDataLayer'
    else:
        params['voc_dir'] = '../data/pascal/VOC2011'
        layer_name = 'VOCSegDataLayer'
    n.data, n.label = L.Python(module='voc_layers',
                               layer=layer_name,
                               ntop=2,
                               param_str=str(params))

    # the base net (AlexNet-style); pad=100 leaves room for the final crops
    n.conv1, n.relu1 = conv_relu(n.data, 96, ks=11, stride=4, pad=100)
    n.pool1 = max_pool(n.relu1)
    n.lrn1 = lrn(n.pool1)
    n.conv2, n.relu2 = conv_relu(n.lrn1, 128, ks=5, stride=1, pad=2)
    n.pool2 = max_pool(n.relu2)
    n.lrn2 = lrn(n.pool2)
    n.conv3, n.relu3 = conv_relu(n.lrn2, 384, ks=3, stride=1, pad=1)
    n.conv4, n.relu4 = conv_relu(n.relu3, 384, ks=3, stride=1, pad=1)
    n.conv5, n.relu5 = conv_relu(n.relu4, 256, ks=3, stride=1, pad=1)
    n.pool5 = max_pool(n.relu5)

    # fully convolutional classifier head
    n.fc6, n.relu6 = conv_relu(n.pool5, 4096, ks=6, stride=1, pad=0)
    n.drop6 = L.Dropout(n.relu6, dropout_ratio=0.5, in_place=True)
    n.fc7, n.relu7 = conv_relu(n.drop6, 4096, ks=1, stride=1, pad=0)
    n.drop7 = L.Dropout(n.relu7, dropout_ratio=0.5, in_place=True)

    # coarse 21-class scores, then frozen 2x upsampling
    n.score_fr = L.Convolution(n.drop7, num_output=21, kernel_size=1, pad=0,
                               stride=1, param=score_param())
    n.upscore2 = L.Deconvolution(n.score_fr,
                                 convolution_param=dict(num_output=21,
                                                        kernel_size=5,
                                                        stride=2,
                                                        bias_term=False),
                                 param=[dict(lr_mult=0)])

    # skip connection from pool2, cropped and summed with the coarse scores
    n.score_pool2 = L.Convolution(n.pool2, num_output=21, kernel_size=1,
                                  pad=0, param=score_param())
    n.score_pool2c = crop(n.score_pool2, n.upscore2)
    n.fuse_pool2 = L.Eltwise(n.upscore2, n.score_pool2c,
                             operation=P.Eltwise.SUM)
    n.upscore16 = L.Deconvolution(n.fuse_pool2,
                                  convolution_param=dict(num_output=21,
                                                         kernel_size=31,
                                                         stride=16,
                                                         bias_term=False),
                                  param=[dict(lr_mult=0)])

    n.score = crop(n.upscore16, n.data)
    n.loss = L.SoftmaxWithLoss(n.score, n.label,
                               loss_param=dict(normalize=False,
                                               ignore_label=255))
    return n.to_proto()
def net(split):
    """Build a DSS-style saliency net: VGG-16 backbone with six deeply
    supervised side outputs (dsn1..dsn6) that cross-feed upsampled maps,
    fused at the end by a fixed-weight 1x1 convolution.

    Args:
        split: 'train' wires the MSRA-B python data layer and F-measure
            losses; 'test' wires a plain 1x3x500x500 input and sigmoid
            outputs only.

    Raises:
        Exception: for any other split value.

    Returns the generated caffe NetProto.
    """
    n = caffe.NetSpec()
    if split == 'train':
        data_params = dict(mean=(104.00699, 116.66877, 122.67892))
        data_params['root'] = './data/MSRA-B/'
        data_params['source'] = "train_list.txt"
        data_params['shuffle'] = True
        data_params['aug'] = args.aug
        data_params['ignore_label'] = -1  # ignore label
        n.data, n.label = L.Python(module='pylayer',
                                   layer='ImageLabelmapDataLayer', ntop=2,
                                   param_str=str(data_params))
        # NOTE(review): loss_param is built but never used below (all losses
        # go through the FmeasureLossLayer python layer); kept because it
        # also checks that args.lossnorm exists.
        loss_param = dict(normalize=args.lossnorm)
        # BUGFIX: was data_params.has_key('ignore_label') — dict.has_key()
        # only exists on Python 2; the `in` operator works on both.
        if 'ignore_label' in data_params:
            loss_param['ignore_label'] = data_params['ignore_label']
    elif split == 'test':
        n.data = L.Input(name='data',
                         input_param=dict(shape=dict(dim=[1, 3, 500, 500])))
    else:
        raise Exception("Invalid phase")

    # VGG-16 backbone; conv1_1 pad=5 leaves margin for the side-output crops
    n.conv1_1, n.relu1_1 = conv_relu(n.data, 64, pad=5)
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64)
    n.pool1 = max_pool(n.relu1_2)
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128)
    n.pool2 = max_pool(n.relu2_2)
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256)
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256)
    n.pool3 = max_pool(n.relu3_3)
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512)
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512)
    n.pool4 = max_pool(n.relu4_3)
    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512)
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512)
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512)
    n.pool5 = max_pool(n.relu5_3)
    n.pool5a = L.Pooling(n.pool5, pool=P.Pooling.AVE, kernel_size=3,
                         stride=1, pad=1)

    ### DSN conv 6 ###
    n.conv1_dsn6, n.relu1_dsn6 = conv_relu(n.pool5a, 512, ks=7, pad=3)
    n.conv2_dsn6, n.relu2_dsn6 = conv_relu(n.relu1_dsn6, 512, ks=7, pad=3)
    n.conv3_dsn6 = conv1x1(n.relu2_dsn6, 'conv3_dsn6')
    n.score_dsn6_up = upsample(n.conv3_dsn6, stride=32,
                               name='upsample32_in_dsn6')
    n.upscore_dsn6 = crop(n.score_dsn6_up, n.data)
    if split == 'train':
        n.sigmoid_dsn6 = L.Sigmoid(n.upscore_dsn6)
        # floss_param is created once here and mutated by every later loss
        floss_param = dict()
        floss_param['name'] = 'dsn6'
        floss_param['beta'] = args.beta
        n.loss_dsn6 = L.Python(n.sigmoid_dsn6, n.label, module='floss',
                               layer='FmeasureLossLayer',
                               param_str=str(floss_param), ntop=1,
                               loss_weight=1)
    else:
        n.sigmoid_dsn6 = L.Sigmoid(n.upscore_dsn6)

    ### DSN conv 5 ###
    n.conv1_dsn5, n.relu1_dsn5 = conv_relu(n.conv5_3, 512, ks=5, pad=2)
    n.conv2_dsn5, n.relu2_dsn5 = conv_relu(n.relu1_dsn5, 512, ks=5, pad=2)
    n.conv3_dsn5 = conv1x1(n.relu2_dsn5, 'conv3_dsn5')
    n.score_dsn5_up = upsample(n.conv3_dsn5, stride=16,
                               name='upsample16_in_dsn5')
    n.upscore_dsn5 = crop(n.score_dsn5_up, n.data)
    if split == 'train':
        n.sigmoid_dsn5 = L.Sigmoid(n.upscore_dsn5)
        floss_param['name'] = 'dsn5'
        floss_param['beta'] = args.beta
        n.loss_dsn5 = L.Python(n.sigmoid_dsn5, n.label, module='floss',
                               layer='FmeasureLossLayer',
                               param_str=str(floss_param), ntop=1,
                               loss_weight=1)
    else:
        n.sigmoid_dsn5 = L.Sigmoid(n.upscore_dsn5)

    ### DSN conv 4 ###
    n.conv1_dsn4, n.relu1_dsn4 = conv_relu(n.conv4_3, 256, ks=5, pad=2)
    n.conv2_dsn4, n.relu2_dsn4 = conv_relu(n.relu1_dsn4, 256, ks=5, pad=2)
    n.conv3_dsn4 = conv1x1(n.relu2_dsn4, 'conv3_dsn4')
    # deeper side outputs are upsampled to dsn4 resolution and summed in
    n.score_dsn6_up_4 = upsample(n.conv3_dsn6, stride=4,
                                 name='upsample4_dsn6')
    n.upscore_dsn6_4 = crop(n.score_dsn6_up_4, n.conv3_dsn4)
    n.score_dsn5_up_4 = upsample(n.conv3_dsn5, stride=2,
                                 name='upsample2_dsn5')
    n.upscore_dsn5_4 = crop(n.score_dsn5_up_4, n.conv3_dsn4)
    n.concat_dsn4 = L.Eltwise(n.conv3_dsn4, n.upscore_dsn6_4,
                              n.upscore_dsn5_4, name="concat_dsn4")
    n.conv4_dsn4 = conv1x1(n.concat_dsn4, 'conv4_dsn4')
    n.score_dsn4_up = upsample(n.conv4_dsn4, stride=8,
                               name='upsample8_in_dsn4')
    n.upscore_dsn4 = crop(n.score_dsn4_up, n.data)
    if split == 'train':
        n.sigmoid_dsn4 = L.Sigmoid(n.upscore_dsn4)
        floss_param['name'] = 'dsn4'
        floss_param['beta'] = args.beta
        n.loss_dsn4 = L.Python(n.sigmoid_dsn4, n.label, module='floss',
                               layer='FmeasureLossLayer',
                               param_str=str(floss_param), ntop=1,
                               loss_weight=1)
    else:
        n.sigmoid_dsn4 = L.Sigmoid(n.upscore_dsn4)

    ### DSN conv 3 ###
    n.conv1_dsn3, n.relu1_dsn3 = conv_relu(n.conv3_3, 256, ks=5, pad=2)
    n.conv2_dsn3, n.relu2_dsn3 = conv_relu(n.relu1_dsn3, 256, ks=5, pad=2)
    n.conv3_dsn3 = conv1x1(n.relu2_dsn3, 'conv3_dsn3')
    n.score_dsn6_up_3 = upsample(n.conv3_dsn6, stride=8,
                                 name='upsample8_dsn6')
    n.upscore_dsn6_3 = crop(n.score_dsn6_up_3, n.conv3_dsn3)
    n.score_dsn5_up_3 = upsample(n.conv3_dsn5, stride=4,
                                 name='upsample4_dsn5')
    n.upscore_dsn5_3 = crop(n.score_dsn5_up_3, n.conv3_dsn3)
    # NOTE(review): several Eltwise layers below share the name 'concat' —
    # probably an oversight; left unchanged to preserve the emitted proto.
    n.concat_dsn3 = L.Eltwise(n.conv3_dsn3, n.upscore_dsn6_3,
                              n.upscore_dsn5_3, name='concat')
    n.conv4_dsn3 = conv1x1(n.concat_dsn3, 'conv4_dsn3')
    n.score_dsn3_up = upsample(n.conv4_dsn3, stride=4,
                               name='upsample4_in_dsn3')
    n.upscore_dsn3 = crop(n.score_dsn3_up, n.data)
    if split == 'train':
        n.sigmoid_dsn3 = L.Sigmoid(n.upscore_dsn3)
        floss_param['name'] = 'dsn3'
        floss_param['beta'] = args.beta
        n.loss_dsn3 = L.Python(n.sigmoid_dsn3, n.label, module='floss',
                               layer='FmeasureLossLayer',
                               param_str=str(floss_param), ntop=1,
                               loss_weight=1)
    else:
        n.sigmoid_dsn3 = L.Sigmoid(n.upscore_dsn3)

    ### DSN conv 2 ###
    n.conv1_dsn2, n.relu1_dsn2 = conv_relu(n.conv2_2, 128, ks=3, pad=1)
    n.conv2_dsn2, n.relu2_dsn2 = conv_relu(n.relu1_dsn2, 128, ks=3, pad=1)
    n.conv3_dsn2 = conv1x1(n.relu2_dsn2, 'conv3_dsn2')
    n.score_dsn6_up_2 = upsample(n.conv3_dsn6, stride=16,
                                 name='upsample16_dsn6')
    n.upscore_dsn6_2 = crop(n.score_dsn6_up_2, n.conv3_dsn2)
    n.score_dsn5_up_2 = upsample(n.conv3_dsn5, stride=8,
                                 name='upsample8_dsn5')
    n.upscore_dsn5_2 = crop(n.score_dsn5_up_2, n.conv3_dsn2)
    n.score_dsn4_up_2 = upsample(n.conv4_dsn4, stride=4,
                                 name='upsample4_dsn4')
    n.upscore_dsn4_2 = crop(n.score_dsn4_up_2, n.conv3_dsn2)
    n.score_dsn3_up_2 = upsample(n.conv4_dsn3, stride=2,
                                 name='upsample2_dsn3')
    n.upscore_dsn3_2 = crop(n.score_dsn3_up_2, n.conv3_dsn2)
    n.concat_dsn2 = L.Eltwise(n.conv3_dsn2, n.upscore_dsn5_2,
                              n.upscore_dsn4_2, n.upscore_dsn6_2,
                              n.upscore_dsn3_2, name='concat')
    n.conv4_dsn2 = conv1x1(n.concat_dsn2, 'conv4_dsn2')
    n.score_dsn2_up = upsample(n.conv4_dsn2, stride=2,
                               name='upsample2_in_dsn2')
    n.upscore_dsn2 = crop(n.score_dsn2_up, n.data)
    if split == 'train':
        n.sigmoid_dsn2 = L.Sigmoid(n.upscore_dsn2)
        floss_param['name'] = 'dsn2'
        floss_param['beta'] = args.beta
        n.loss_dsn2 = L.Python(n.sigmoid_dsn2, n.label, module='floss',
                               layer='FmeasureLossLayer',
                               param_str=str(floss_param), ntop=1,
                               loss_weight=1)
    else:
        n.sigmoid_dsn2 = L.Sigmoid(n.upscore_dsn2)

    ### DSN conv 1 ###
    n.conv1_dsn1, n.relu1_dsn1 = conv_relu(n.conv1_2, 128, ks=3, pad=1)
    n.conv2_dsn1, n.relu2_dsn1 = conv_relu(n.relu1_dsn1, 128, ks=3, pad=1)
    n.conv3_dsn1 = conv1x1(n.relu2_dsn1, 'conv3_dsn1')
    n.score_dsn6_up_1 = upsample(n.conv3_dsn6, stride=32,
                                 name='upsample32_dsn6')
    n.upscore_dsn6_1 = crop(n.score_dsn6_up_1, n.conv3_dsn1)
    n.score_dsn5_up_1 = upsample(n.conv3_dsn5, stride=16,
                                 name='upsample16_dsn5')
    n.upscore_dsn5_1 = crop(n.score_dsn5_up_1, n.conv3_dsn1)
    n.score_dsn4_up_1 = upsample(n.conv4_dsn4, stride=8,
                                 name='upsample8_dsn4')
    n.upscore_dsn4_1 = crop(n.score_dsn4_up_1, n.conv3_dsn1)
    n.score_dsn3_up_1 = upsample(n.conv4_dsn3, stride=4,
                                 name='upsample4_dsn3')
    n.upscore_dsn3_1 = crop(n.score_dsn3_up_1, n.conv3_dsn1)
    n.concat_dsn1 = L.Eltwise(n.conv3_dsn1, n.upscore_dsn5_1,
                              n.upscore_dsn4_1, n.upscore_dsn6_1,
                              n.upscore_dsn3_1, name='concat')
    # dsn1 is already at input resolution: 1x1 conv only, no upsampling
    n.score_dsn1_up = conv1x1(n.concat_dsn1, 'conv4_dsn1')
    n.upscore_dsn1 = crop(n.score_dsn1_up, n.data)
    if split == 'train':
        n.sigmoid_dsn1 = L.Sigmoid(n.upscore_dsn1)
        floss_param['name'] = 'dsn1'
        floss_param['beta'] = args.beta
        n.loss_dsn1 = L.Python(n.sigmoid_dsn1, n.label, module='floss',
                               layer='FmeasureLossLayer',
                               param_str=str(floss_param), ntop=1,
                               loss_weight=1)
    else:
        n.sigmoid_dsn1 = L.Sigmoid(n.upscore_dsn1)

    ### Eltwise and multiscale weight layer ###
    n.concat_upscore = L.Eltwise(n.upscore_dsn1, n.upscore_dsn2,
                                 n.upscore_dsn3, n.upscore_dsn4,
                                 n.upscore_dsn5, n.upscore_dsn6,
                                 name='concat')
    # BUGFIX: was np.float(1) / 6 — the np.float alias was deprecated in
    # NumPy 1.20 and removed in 1.24; a plain float literal is identical.
    n.upscore_fuse = conv1x1(n.concat_upscore, 'new_score_weighting',
                             wf=dict({'type': 'constant',
                                      'value': 1.0 / 6}))
    if split == 'train':
        n.sigmoid_fuse = L.Sigmoid(n.upscore_fuse)
        floss_param['name'] = 'fuse'
        floss_param['beta'] = args.beta
        n.loss_fuse = L.Python(n.sigmoid_fuse, n.label, module='floss',
                               layer='FmeasureLossLayer',
                               param_str=str(floss_param), ntop=1,
                               loss_weight=1)
    else:
        n.sigmoid_fuse = L.Sigmoid(n.upscore_fuse)
    return n.to_proto()
def create_bnn_cnn_net_fold_stage(num_input_frames, fold_id='0', stage_id='1', phase=None):
    """Build a BNN+CNN video-segmentation net for one fold/stage.

    Pipeline (as constructed below): scale unary/bilateral features ->
    two stages of Permutohedral (bilateral NN) filtering -> map superpixel
    scores back to pixels (Smear) -> fuse with a DeepLab branch -> a small
    3-layer spatial CNN -> normalized 2-channel segmentation output.

    Args:
        num_input_frames: number of input frames in the unary/feature blobs
            (4th-from-last input dimension below).
        fold_id: dataset fold identifier, appended to the data layer's
            param_str.
        stage_id: training stage identifier, appended to the data layer's
            param_str.
        phase: 'TRAIN' or 'TEST' reads data via the Python "input_data_layer"
            and attaches losses/accuracies; any other value (deploy) declares
            explicit Input blobs and attaches a superpixel-averaged output
            instead.

    Returns:
        The serialized NetParameter proto (``n.to_proto()``).

    NOTE(review): relies on module-level ``max_spixels``, ``deeplab`` and
    ``normalize`` defined elsewhere in this file. Input dims 480x854 / padded
    481x857 suggest DAVIS-resolution video frames — confirm against the data
    layer.
    """
    n = caffe.NetSpec()
    if phase == 'TRAIN':
        # Python data layer yields 10 tops; param_str encodes
        # "<SPLIT>_<num-samples>_<fold>_<stage>" for InputRead to parse.
        n.img, n.padimg, n.unary, n.in_features, n.out_features, n.spixel_indices, n.scales1, n.scales2, n.unary_scales, n.label = \
            L.Python(python_param = dict(module = "input_data_layer",
                                         layer = "InputRead",
                                         param_str = "TRAIN_1000000_" + fold_id + '_' + stage_id),
                     include = dict(phase = 0), ntop = 10)
    elif phase == 'TEST':
        n.img, n.padimg, n.unary, n.in_features, n.out_features, n.spixel_indices, n.scales1, n.scales2, n.unary_scales, n.label = \
            L.Python(python_param = dict(module = "input_data_layer",
                                         layer = "InputRead",
                                         param_str = "VAL_50_" + fold_id + '_' + stage_id),
                     include = dict(phase = 1), ntop = 10)
    else:
        # Deploy: no data layer and no label — declare every input explicitly.
        n.img = L.Input(shape=[dict(dim=[1, 3, 480, 854])])
        n.padimg = L.Input(shape=[dict(dim=[1, 3, 481, 857])])
        n.unary = L.Input(
            shape=[dict(dim=[1, 2, num_input_frames, max_spixels])])
        n.in_features = L.Input(
            shape=[dict(dim=[1, 6, num_input_frames, max_spixels])])
        n.out_features = L.Input(shape=[dict(dim=[1, 6, 1, max_spixels])])
        n.spixel_indices = L.Input(shape=[dict(dim=[1, 1, 480, 854])])
        n.scales1 = L.Input(shape=[dict(dim=[1, 6, 1, 1])])
        n.scales2 = L.Input(shape=[dict(dim=[1, 6, 1, 1])])
        n.unary_scales = L.Input(shape=[dict(dim=[1, 1, num_input_frames, 1])])
    # Flatten per-channel scale blobs so they can drive Scale layers.
    n.flatten_scales1 = L.Flatten(n.scales1, flatten_param=dict(axis=0))
    n.flatten_scales2 = L.Flatten(n.scales2, flatten_param=dict(axis=0))
    n.flatten_unary_scales = L.Flatten(n.unary_scales,
                                       flatten_param=dict(axis=0))
    # Two independently-scaled copies of the bilateral features (two lattices).
    n.in_scaled_features1 = L.Scale(n.in_features, n.flatten_scales1,
                                    scale_param=dict(axis=1))
    n.out_scaled_features1 = L.Scale(n.out_features, n.flatten_scales1,
                                     scale_param=dict(axis=1))
    n.in_scaled_features2 = L.Scale(n.in_features, n.flatten_scales2,
                                    scale_param=dict(axis=1))
    n.out_scaled_features2 = L.Scale(n.out_features, n.flatten_scales2,
                                     scale_param=dict(axis=1))
    # Per-frame scaling of the unary predictions (axis=2 is the frame axis).
    n.scaled_unary = L.Scale(n.unary, n.flatten_unary_scales,
                             scale_param=dict(axis=2))
    ### Start of BNN
    # BNN - stage - 1: filter the scaled unaries through two permutohedral
    # lattices (feature scalings 1 and 2), input frames -> output frame.
    n.out_seg1 = L.Permutohedral(n.scaled_unary, n.in_scaled_features1,
                                 n.out_scaled_features1,
                                 permutohedral_param=dict(
                                     num_output=32, group=1,
                                     neighborhood_size=0, bias_term=True,
                                     norm_type=P.Permutohedral.AFTER,
                                     offset_type=P.Permutohedral.NONE),
                                 filter_filler=dict(type='gaussian', std=0.01),
                                 bias_filler=dict(type='constant', value=0),
                                 param=[{'lr_mult': 1, 'decay_mult': 1},
                                        {'lr_mult': 2, 'decay_mult': 0}])
    n.out_seg2 = L.Permutohedral(n.scaled_unary, n.in_scaled_features2,
                                 n.out_scaled_features2,
                                 permutohedral_param=dict(
                                     num_output=32, group=1,
                                     neighborhood_size=0, bias_term=True,
                                     norm_type=P.Permutohedral.AFTER,
                                     offset_type=P.Permutohedral.NONE),
                                 filter_filler=dict(type='gaussian', std=0.01),
                                 bias_filler=dict(type='constant', value=0),
                                 param=[{'lr_mult': 1, 'decay_mult': 1},
                                        {'lr_mult': 2, 'decay_mult': 0}])
    n.concat_out_seg_1 = L.Concat(n.out_seg1, n.out_seg2,
                                  concat_param=dict(axis=1))
    n.concat_out_relu_1 = L.ReLU(n.concat_out_seg_1, in_place=True)
    # BNN - stage - 2: filter stage-1 output again; in- and out-features are
    # both the output-frame features here (output -> output lattice).
    n.out_seg3 = L.Permutohedral(n.concat_out_relu_1, n.out_scaled_features1,
                                 n.out_scaled_features1,
                                 permutohedral_param=dict(
                                     num_output=32, group=1,
                                     neighborhood_size=0, bias_term=True,
                                     norm_type=P.Permutohedral.AFTER,
                                     offset_type=P.Permutohedral.NONE),
                                 filter_filler=dict(type='gaussian', std=0.01),
                                 bias_filler=dict(type='constant', value=0),
                                 param=[{'lr_mult': 1, 'decay_mult': 1},
                                        {'lr_mult': 2, 'decay_mult': 0}])
    n.out_seg4 = L.Permutohedral(n.concat_out_relu_1, n.out_scaled_features2,
                                 n.out_scaled_features2,
                                 permutohedral_param=dict(
                                     num_output=32, group=1,
                                     neighborhood_size=0, bias_term=True,
                                     norm_type=P.Permutohedral.AFTER,
                                     offset_type=P.Permutohedral.NONE),
                                 filter_filler=dict(type='gaussian', std=0.01),
                                 bias_filler=dict(type='constant', value=0),
                                 param=[{'lr_mult': 1, 'decay_mult': 1},
                                        {'lr_mult': 2, 'decay_mult': 0}])
    n.concat_out_seg_2 = L.Concat(n.out_seg3, n.out_seg4,
                                  concat_param=dict(axis=1))
    n.concat_out_relu_2 = L.ReLU(n.concat_out_seg_2, in_place=True)
    # BNN - combination: skip-connect stage-1 and stage-2 outputs, then a 1x1
    # conv reduces them to 2 channels (fg/bg) per superpixel.
    n.connection_out = L.Concat(n.concat_out_relu_1, n.concat_out_relu_2)
    n.spixel_out_seg = L.Convolution(n.connection_out,
                                     convolution_param=dict(
                                         num_output=2, kernel_size=1, stride=1,
                                         weight_filler=dict(type='gaussian',
                                                            std=0.01),
                                         bias_filler=dict(type='constant',
                                                          value=0)),
                                     param=[{'lr_mult': 1, 'decay_mult': 1},
                                            {'lr_mult': 2, 'decay_mult': 0}])
    n.spixel_out_seg_relu = L.ReLU(n.spixel_out_seg, in_place=True)
    # Going from superpixels to pixels: Smear broadcasts each superpixel's
    # score to its member pixels via the index map.
    n.out_seg_bilateral = L.Smear(n.spixel_out_seg_relu, n.spixel_indices)
    ### BNN - DeepLab Combination: fuse the BNN pixel scores with a DeepLab
    # branch (built by the module-level `deeplab` helper) via concat + 1x1 conv.
    n.deeplab_seg_presoftmax = deeplab(n.padimg, n.img, n.spixel_indices)
    n.deeplab_seg = L.Softmax(n.deeplab_seg_presoftmax)
    n.bnn_deeplab_connection = L.Concat(n.out_seg_bilateral, n.deeplab_seg)
    n.bnn_deeplab_seg = L.Convolution(n.bnn_deeplab_connection,
                                      convolution_param=dict(
                                          num_output=2, kernel_size=1,
                                          stride=1,
                                          weight_filler=dict(type='gaussian',
                                                             std=0.01),
                                          bias_filler=dict(type='constant',
                                                           value=0)),
                                      param=[{'lr_mult': 1, 'decay_mult': 1},
                                             {'lr_mult': 2, 'decay_mult': 0}])
    n.bnn_deeplab_seg_relu = L.ReLU(n.bnn_deeplab_seg, in_place=True)
    ### Start of CNN: three 3x3 same-padding conv stages refine the fused map.
    # CNN - Stage 1
    n.out_seg_spatial1 = L.Convolution(n.bnn_deeplab_seg_relu,
                                       convolution_param=dict(
                                           num_output=32, kernel_size=3,
                                           stride=1, pad_h=1, pad_w=1,
                                           weight_filler=dict(type='gaussian',
                                                              std=0.01),
                                           bias_filler=dict(type='constant',
                                                            value=0)),
                                       param=[{'lr_mult': 1, 'decay_mult': 1},
                                              {'lr_mult': 2, 'decay_mult': 0}])
    n.out_seg_spatial_relu1 = L.ReLU(n.out_seg_spatial1, in_place=True)
    # CNN - Stage 2
    n.out_seg_spatial2 = L.Convolution(n.out_seg_spatial_relu1,
                                       convolution_param=dict(
                                           num_output=32, kernel_size=3,
                                           stride=1, pad_h=1, pad_w=1,
                                           weight_filler=dict(type='gaussian',
                                                              std=0.01),
                                           bias_filler=dict(type='constant',
                                                            value=0)),
                                       param=[{'lr_mult': 1, 'decay_mult': 1},
                                              {'lr_mult': 2, 'decay_mult': 0}])
    n.out_seg_spatial_relu2 = L.ReLU(n.out_seg_spatial2, in_place=True)
    # CNN - Stage 3: final 2-channel prediction; bias 0.5 (not 0) here.
    n.out_seg_spatial = L.Convolution(n.out_seg_spatial_relu2,
                                      convolution_param=dict(
                                          num_output=2, kernel_size=3,
                                          stride=1, pad_h=1, pad_w=1,
                                          weight_filler=dict(type='gaussian',
                                                             std=0.01),
                                          bias_filler=dict(type='constant',
                                                           value=0.5)),
                                      param=[{'lr_mult': 1, 'decay_mult': 1},
                                             {'lr_mult': 2, 'decay_mult': 0}])
    # Normalization across the 2 channels (module-level `normalize` helper).
    n.out_seg = normalize(n.out_seg_spatial, 2)
    if phase == 'TRAIN' or phase == 'TEST':
        # Main loss on the normalized output plus an auxiliary loss on the raw
        # DeepLab branch; label value 1000 marks ignored pixels.
        n.loss = L.LossWithoutSoftmax(n.out_seg, n.label,
                                      loss_param=dict(ignore_label=1000),
                                      loss_weight=1)
        n.accuracy = L.Accuracy(n.out_seg, n.label,
                                accuracy_param=dict(ignore_label=1000))
        n.loss2 = L.SoftmaxWithLoss(n.deeplab_seg_presoftmax, n.label,
                                    loss_param=dict(ignore_label=1000),
                                    loss_weight=1)
        n.accuracy2 = L.Accuracy(n.deeplab_seg_presoftmax, n.label,
                                 accuracy_param=dict(ignore_label=1000))
    else:
        # Deploy: average the pixel scores back over superpixels and
        # re-normalize, yielding a per-superpixel final segmentation.
        n.spixel_out_seg_2 = L.SpixelFeature(n.out_seg, n.spixel_indices,
                                             spixel_feature_param=dict(
                                                 type=P.SpixelFeature.AVGRGB,
                                                 max_spixels=12000,
                                                 rgb_scale=1.0))
        n.spixel_out_seg_final = normalize(n.spixel_out_seg_2, 2)
    return n.to_proto()
def rpn(net, bottom, gt_boxes, im_info, data, anchors, feat_stride, scales, fixed=False, deploy=False):
    """Append a Region Proposal Network head to `net` on top of `bottom`.

    Layers added: a shared 3x3 conv + ReLU, then parallel 1x1 convs for
    objectness scores (2 per anchor) and box regression (4 per anchor), and a
    reshape of the scores for softmax. In training mode (neither `deploy` nor
    `fixed`) an AnchorTargetLayer plus the RPN classification and bbox losses
    are attached as well.

    Args:
        net: caffe.NetSpec being built (mutated in place).
        bottom: feature blob the RPN sits on.
        gt_boxes, im_info, data: blobs consumed by AnchorTargetLayer
            (training mode only).
        anchors: number of anchors per spatial position.
        feat_stride: stride of `bottom` w.r.t. the input image, forwarded to
            AnchorTargetLayer via its param_str.
        scales: anchor scales, forwarded the same way.
        fixed: if True, all RPN conv layers get lr_mult=0 (frozen) and no
            losses are attached.
        deploy: if True, no target/loss layers are attached.

    Returns:
        (cls_loss, bbox_loss, cls_score_reshape, bbox_pred) in training mode;
        (cls_score_reshape, bbox_pred) otherwise.
    """
    # The original duplicated every conv definition across fixed/unfixed
    # branches, differing only in lr_mult — build the shared pieces once.
    def _lr_params():
        # lr_mult=0 freezes both weight and bias; otherwise the standard
        # 1x (weights) / 2x (bias) learning-rate multipliers.
        if fixed:
            return [{'lr_mult': 0}, {'lr_mult': 0}]
        return [{'lr_mult': 1}, {'lr_mult': 2}]

    def _fillers():
        return dict(weight_filler=dict(type='gaussian', std=0.01),
                    bias_filler=dict(type='constant', value=0))

    # Shared 3x3 conv + ReLU. engine=2 selects the CUDNN engine.
    net["rpn_conv/3x3"] = L.Convolution(bottom, kernel_size=3, stride=1,
                                        num_output=512, pad=1,
                                        param=_lr_params(), engine=2,
                                        **_fillers())
    net["rpn_relu/3x3"] = L.ReLU(net["rpn_conv/3x3"], in_place=True)
    # Objectness: 2 scores (fg/bg) per anchor.
    net["rpn_cls_score"] = L.Convolution(net["rpn_relu/3x3"], kernel_size=1,
                                         stride=1, num_output=2 * anchors,
                                         pad=0, param=_lr_params(), engine=2,
                                         **_fillers())
    # Box regression: 4 deltas per anchor.
    net["rpn_bbox_pred"] = L.Convolution(net["rpn_relu/3x3"], kernel_size=1,
                                         stride=1, num_output=4 * anchors,
                                         pad=0, param=_lr_params(), engine=2,
                                         **_fillers())
    # Reshape to (N, 2, anchors*H, W) so softmax runs over the fg/bg pair.
    net["rpn_cls_score_reshape"] = L.Reshape(
        net["rpn_cls_score"],
        reshape_param={"shape": {"dim": [0, 2, -1, 0]}})

    if (not deploy) and (not fixed):
        # Anchor targets are produced by a Python layer; feat_stride and
        # scales are serialized into its param_str.
        net["rpn_labels"], net["rpn_bbox_targets"], \
            net["rpn_bbox_inside_weights"], net["rpn_bbox_outside_weights"] = \
            L.Python(net["rpn_cls_score"], gt_boxes, im_info, data,
                     name='rpn-data',
                     python_param=dict(
                         module='rpn.anchor_target_layer',
                         layer='AnchorTargetLayer',
                         param_str='{"feat_stride": %s,"scales": %s}' %
                                   (feat_stride, scales)),
                     ntop=4)
        # label -1 means "don't care" anchors; gradients flow only into the
        # score branch (propagate_down=[1, 0]).
        net["rpn_cls_loss"] = L.SoftmaxWithLoss(
            net["rpn_cls_score_reshape"], net["rpn_labels"],
            name="rpn_loss_cls", propagate_down=[1, 0], loss_weight=1,
            loss_param={"ignore_label": -1, "normalize": True})
        net["rpn_loss_bbox"] = L.SmoothL1Loss(
            net["rpn_bbox_pred"], net["rpn_bbox_targets"],
            net["rpn_bbox_inside_weights"], net["rpn_bbox_outside_weights"],
            name="loss_bbox", loss_weight=1,
            smooth_l1_loss_param={"sigma": 3.0})
        return net["rpn_cls_loss"], net["rpn_loss_bbox"], net[
            "rpn_cls_score_reshape"], net["rpn_bbox_pred"]
    else:
        return net["rpn_cls_score_reshape"], net["rpn_bbox_pred"]