def __init__(self, nr_classes):
    """Assemble the sparse VGG feature extractor and the linear classifier.

    Args:
        nr_classes: Number of output features of the final ``nn.Linear``.
    """
    super(FBSparseVGGTest, self).__init__()

    # Five stages of two 'C' entries followed by 'MP', then one closing
    # 512-plane 'C' entry.
    vgg_layers = []
    for width in (16, 32, 64, 128, 256):
        vgg_layers += [['C', width], ['C', width], 'MP']
    vgg_layers.append(['C', 512])

    backbone = scn.SparseVggNet(2, nInputPlanes=2, layers=vgg_layers)
    backbone = backbone.add(
        scn.Convolution(2, 512, 256, 3, filter_stride=2, bias=False))
    backbone = backbone.add(scn.BatchNormReLU(256))
    backbone = backbone.add(scn.SparseToDense(2, 256))
    self.sparseModel = backbone

    # Requested spatial size of the dense output; the matching input size
    # is derived from it by the network itself.
    cnn_spatial_output_size = [2, 3]
    self.spatial_size = self.sparseModel.input_spatial_size(
        torch.LongTensor(cnn_spatial_output_size))
    self.inputLayer = scn.InputLayer(
        dimension=2, spatial_size=self.spatial_size, mode=2)

    # Flattened size of the dense output: height * width * 256 planes.
    out_rows, out_cols = cnn_spatial_output_size
    self.linear_input_features = out_rows * out_cols * 256
    self.linear = nn.Linear(self.linear_input_features, nr_classes)
def __init__(self):
    """Create the sparse VGG network and a 128 -> 3755 linear layer."""
    nn.Module.__init__(self)

    # Four stages of two 'C' entries plus 'MP', then two 96-plane 'C'
    # entries with no trailing 'MP'.
    vgg_layers = []
    for width in (16, 32, 48, 64):
        vgg_layers += [['C', width], ['C', width], 'MP']
    vgg_layers += [['C', 96], ['C', 96]]

    net = scn.SparseVggNet(2, 3, vgg_layers)
    net = net.add(scn.Convolution(2, 96, 128, 3, 2, False))
    net = net.add(scn.BatchNormReLU(128))
    net = net.add(scn.SparseToDense(2, 128))
    self.sparseModel = net

    self.linear = nn.Linear(128, 3755)
def __init__(self):
    """Create the sparse VGG network and a 64 -> 183 linear layer."""
    nn.Module.__init__(self)

    # Each stage: two 'C' entries of the given width followed by 'MP'.
    vgg_layers = []
    for width in (8, 16, 16 + 8, 24 + 8):
        vgg_layers += [['C', width], ['C', width], 'MP']

    net = scn.SparseVggNet(2, 3, vgg_layers)
    net = net.add(scn.Convolution(2, 32, 64, 5, 1, False))
    net = net.add(scn.BatchNormReLU(64))
    net = net.add(scn.SparseToDense(2, 64))
    self.sparseModel = net

    self.linear = nn.Linear(64, 183)
def __init__(self):
    """Create the sparse VGG network, its input layer and the classifier.

    The input spatial size is derived from a requested 1x1 output via
    ``input_spatial_size`` and handed to ``scn.InputLayer``.
    """
    nn.Module.__init__(self)

    # Four stages of two 'C' entries plus 'MP', then two 96-plane 'C'
    # entries with no trailing 'MP'.
    vgg_layers = []
    for width in (16, 32, 48, 64):
        vgg_layers += [['C', width], ['C', width], 'MP']
    vgg_layers += [['C', 96], ['C', 96]]

    net = scn.SparseVggNet(2, 3, vgg_layers)
    net = net.add(scn.Convolution(2, 96, 128, 3, 2, False))
    net = net.add(scn.BatchNormReLU(128))
    net = net.add(scn.SparseToDense(2, 128))
    self.sparseModel = net

    requested_output = torch.LongTensor([1, 1])
    self.spatial_size = self.sparseModel.input_spatial_size(requested_output)
    self.inputLayer = scn.InputLayer(2, self.spatial_size, 2)

    self.linear = nn.Linear(128, 3755)
def __init__(self, dimension=3, device='cuda'):
    """Build a sparse VGG-style network for ``dimension``-D input.

    Args:
        dimension: Dimensionality passed to every sparse layer and to the
            input layer.
        device: Target passed to ``.to(...)`` on the assembled network.
    """
    nn.Module.__init__(self)

    # Three stages: two 'C' entries of the given width, then ['MP', 3, 2].
    vgg_layers = []
    for width in (8, 16, 24):
        vgg_layers += [['C', width], ['C', width], ['MP', 3, 2]]

    net = scn.Sequential()
    net = net.add(scn.SparseVggNet(dimension, 1, vgg_layers))
    net = net.add(scn.SubmanifoldConvolution(dimension, 24, 32, 3, False))
    net = net.add(scn.BatchNormReLU(32))
    net = net.add(scn.SparseToDense(dimension, 32))
    self.sparseModel = net.to(device)

    # Input size that yields an output of extent 1 along every axis.
    unit_output = torch.LongTensor([1] * dimension)
    self.spatial_size = self.sparseModel.input_spatial_size(unit_output)
    self.inputLayer = scn.InputLayer(dimension, self.spatial_size)
def __init__(self):
    """Create the sparse network, its input layer and a 64 -> 183 classifier."""
    nn.Module.__init__(self)

    # Each stage repeats one 'C' spec twice and ends with 'MP'. The last
    # two stages use the three-element form of the 'C' spec.
    vgg_layers = []
    for conv_spec in (['C', 8], ['C', 16], ['C', 16, 8], ['C', 24, 8]):
        vgg_layers += [list(conv_spec), list(conv_spec), 'MP']

    # Build the stages in the same order the original argument list did.
    vgg = scn.SparseVggNet(2, 3, vgg_layers)
    conv = scn.Convolution(2, 32, 64, 5, 1, False)
    norm = scn.BatchNormReLU(64)
    to_dense = scn.SparseToDense(2, 64)
    self.sparseModel = scn.Sequential(vgg, conv, norm, to_dense)

    requested_output = torch.LongTensor([1, 1])
    self.spatial_size = self.sparseModel.input_spatial_size(requested_output)
    self.inputLayer = scn.InputLayer(2, self.spatial_size, 2)

    self.linear = nn.Linear(64, 183)
def __init__(self, nr_classes, nr_box=2, nr_input_channels=2,
             small_out_map=True):
    """Assemble the sparse backbone and the two-layer dense detection head.

    Args:
        nr_classes: Number of classes; stored on the instance and used to
            size the last linear layer.
        nr_box: Number of boxes per output cell; each contributes 5 values
            to the last linear layer's width.
        nr_input_channels: Number of input planes of the sparse network.
        small_out_map: Selects a [5, 7] dense output map when true,
            otherwise [6, 8].
    """
    super(FBSparseObjectDet, self).__init__()
    self.nr_classes = nr_classes
    self.nr_box = nr_box

    sparse_out_channels = 256

    # Four stages of two 'C' entries plus 'MP', then two 256-plane 'C'
    # entries with no trailing 'MP'.
    vgg_layers = []
    for width in (16, 32, 64, 128):
        vgg_layers += [['C', width], ['C', width], 'MP']
    vgg_layers += [['C', 256], ['C', 256]]

    backbone = scn.SparseVggNet(
        2, nInputPlanes=nr_input_channels, layers=vgg_layers)
    backbone = backbone.add(
        scn.Convolution(2, 256, sparse_out_channels, 3,
                        filter_stride=2, bias=False))
    backbone = backbone.add(scn.BatchNormReLU(sparse_out_channels))
    backbone = backbone.add(scn.SparseToDense(2, sparse_out_channels))
    self.sparseModel = backbone

    self.cnn_spatial_output_size = [5, 7] if small_out_map else [6, 8]
    out_rows, out_cols = self.cnn_spatial_output_size
    spatial_size_product = out_rows * out_cols

    self.spatial_size = self.sparseModel.input_spatial_size(
        torch.LongTensor(self.cnn_spatial_output_size))
    self.inputLayer = scn.InputLayer(
        dimension=2, spatial_size=self.spatial_size, mode=2)

    # Flattened dense output: one feature per output cell and plane
    # (sparse_out_channels == 256).
    self.linear_input_features = spatial_size_product * sparse_out_channels
    self.linear_1 = nn.Linear(self.linear_input_features, 1024)
    # Per output cell: nr_classes values plus 5 values for each box.
    self.linear_2 = nn.Linear(
        1024, spatial_size_product * (nr_classes + 5 * self.nr_box))
# Copyright 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
import sparseconvnet as scn

# Use the GPU if there is one, otherwise CPU
use_gpu = torch.cuda.is_available()

# Assemble the network one stage at a time: VGG-style sparse stack, a 24 -> 32
# convolution, batch-norm + ReLU, and finally the sparse-to-dense conversion.
model = scn.Sequential()
model = model.add(
    scn.SparseVggNet(
        2,
        1,
        [
            ['C', 8], ['C', 8], ['MP', 3, 2],
            ['C', 16], ['C', 16], ['MP', 3, 2],
            ['C', 24], ['C', 24], ['MP', 3, 2],
        ],
    )
)
model = model.add(scn.ValidConvolution(2, 24, 32, 3, False))
model = model.add(scn.BatchNormReLU(32))
model = model.add(scn.SparseToDense(2, 32))

if use_gpu:
    model.cuda()

# output will be 10x10
inputSpatialSize = model.input_spatial_size(torch.LongTensor([10, 10]))
# NOTE: `input` shadows the builtin; the name is kept because later code
# outside this view may refer to it.
input = scn.InputBatch(2, inputSpatialSize)
# Copyright 2016-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. import torch import sparseconvnet as scn # Use the GPU if there is one, otherwise CPU use_gpu = torch.cuda.is_available() model = scn.Sequential().add( scn.SparseVggNet(2, 1, [['C', 8], ['C', 8], ['MP', 3, 2], ['C', 16], ['C', 16], ['MP', 3, 2], ['C', 24], ['C', 24], ['MP', 3, 2] ])).add(scn.ValidConvolution(2, 24, 32, 3, False)).add( scn.BatchNormReLU(32)).add(scn.SparseToDense(2, 32)) if use_gpu: model.cuda() # output will be 10x10 inputSpatialSize = model.input_spatial_size(torch.LongTensor([10, 10])) input = scn.InputBatch(2, inputSpatialSize) msg = [ " X X XXX X X XX X X XX XXX X XXX ", " X X X X X X X X X X X X X X X X ", " XXXXX XX X X X X X X X X X XXX X X X ", " X X X X X X X X X X X X X X X X X X ", " X X XXX XXX XXX XX X X XX X X XXX XXX "
import torch import sparseconvnet as scn # Use the GPU if there is one, otherwise CPU use_gpu = torch.cuda.is_available() model = scn.Sequential().add( scn.SparseVggNet(dimension=2, nInputPlanes=1, layers=[['C', 8], ['C', 8], ['MP', 3, 2], ['C', 16], ['C', 16], ['MP', 3, 2], ['C', 24], ['C', 24], ['MP', 3, 2]])).add( scn.SubmanifoldConvolution( dimension=2, nIn=24, nOut=32, filter_size=3, bias=False)).add( scn.BatchNormReLU(nPlanes=32)).add( scn.SparseToDense(dimension=2, nPlanes=32)) if use_gpu: model.cuda() # output will be 10x10, calculate the input size inputSpatialSize = model.input_spatial_size(torch.LongTensor([10, 10])) print(inputSpatialSize) # (87, 87) #input = scn.InputBatch(dimension=2, spatial_size=inputSpatialSize) input_scn = scn.InputLayer(dimension=2, spatial_size=inputSpatialSize) msg = [
def __init__(self, args, device):
    """Build the text, visual and audio branches plus the fusion heads.

    Args:
        args: Mapping read for the keys 'num_emotions', 'modalities',
            'feature_dim', 'sparse_threshold', 'trans_nlayers',
            'trans_nheads', 'trans_dim' and 'text_model_size'.
        device: Stored on the instance and forwarded to ``MTCNN``.
    """
    super(MME2E_Sparse, self).__init__()
    self.args = args
    self.device = device
    self.num_classes = args['num_emotions']
    self.mod = args['modalities'].lower()
    self.feature_dim = args['feature_dim']
    self.threshold = args['sparse_threshold']

    nlayers = args['trans_nlayers']
    nheads = args['trans_nheads']
    trans_dim = args['trans_dim']

    # Width of the text feature fed to the attention layers and text head.
    if args['text_model_size'] == 'xlarge':
        text_cls_dim = 2048
    elif args['text_model_size'] == 'large':
        text_cls_dim = 1024
    else:
        text_cls_dim = 768

    def build_low_level(in_channels):
        # Dense front end shared by both branches; only the number of
        # input channels differs.
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=64,
                      kernel_size=5, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            VggBasicBlock(in_planes=64, out_planes=64),
            VggBasicBlock(in_planes=64, out_planes=64))

    def build_vgg_stage(n_in, n_out):
        # The leading 2 means the sparse network is 2D.
        return scn.SparseVggNet(
            2, n_in, [['C', n_out], ['C', n_out], ['MP', 2, 2]])

    def build_attn_layers():
        # One attention layer per feature level (64, 128, 256 channels).
        return nn.ModuleList([
            CrossModalAttentionLayer(
                k=64, x_channels=64, y_size=text_cls_dim, spatial=True),
            SparseCrossModalAttentionLayer(
                k=128, x_channels=128, y_size=text_cls_dim,
                sparse_threshold=self.threshold),
            SparseCrossModalAttentionLayer(
                k=256, x_channels=256, y_size=text_cls_dim,
                sparse_threshold=self.threshold),
        ])

    def build_input_layers(sparse_stages, output_sizes):
        # One scn.InputLayer per sparse stage, sized so that the stage
        # produces the requested output size.
        return nn.ModuleList([
            scn.InputLayer(
                2, stage.input_spatial_size(torch.LongTensor(out_size)))
            for stage, out_size in zip(sparse_stages, output_sizes)
        ])

    # Textual
    self.T = MME2E_T(feature_dim=self.feature_dim,
                     size=args['text_model_size'])

    # Visual
    self.mtcnn = MTCNN(image_size=48, margin=2, post_process=False,
                       device=device)
    self.normalize = transforms.Normalize(mean=[159, 111, 102],
                                          std=[37, 33, 32])

    self.V = nn.ModuleDict({
        'low_level': build_low_level(3),
        'sparse_layers': nn.ModuleList([
            scn.Sequential(build_vgg_stage(64, 128)),
            scn.Sequential(build_vgg_stage(128, 256)),
            scn.Sequential(build_vgg_stage(256, 512),
                           scn.SparseToDense(2, 512)),
        ]),
        'attn_layers': build_attn_layers(),
    })

    # Audio: same layout as the visual branch, with a 1-channel front end.
    self.A = nn.ModuleDict({
        'low_level': build_low_level(1),
        'sparse_layers': nn.ModuleList([
            scn.Sequential().add(build_vgg_stage(64, 128)),
            scn.Sequential().add(build_vgg_stage(128, 256)),
            scn.Sequential().add(build_vgg_stage(256, 512)).add(
                scn.SparseToDense(2, 512)),
        ]),
        'attn_layers': build_attn_layers(),
    })

    self.v_sparse_input_layers = build_input_layers(
        self.V['sparse_layers'], ([12, 12], [6, 6], [3, 3]))
    self.a_sparse_input_layers = build_input_layers(
        self.A['sparse_layers'], ([32, 8], [16, 4], [8, 2]))

    # Flatten the final 512-plane maps (3x3 visual, 8x2 audio) and project
    # them to the transformer width.
    self.v_flatten = nn.Sequential(nn.Linear(512 * 3 * 3, 1024),
                                   nn.ReLU(),
                                   nn.Linear(1024, trans_dim))
    self.a_flatten = nn.Sequential(nn.Linear(512 * 8 * 2, 1024),
                                   nn.ReLU(),
                                   nn.Linear(1024, trans_dim))

    self.v_transformer = WrappedTransformerEncoder(
        dim=trans_dim, num_layers=nlayers, num_heads=nheads)
    self.a_transformer = WrappedTransformerEncoder(
        dim=trans_dim, num_layers=nlayers, num_heads=nheads)

    # Output layers
    self.t_out = nn.Linear(text_cls_dim, self.num_classes)
    self.v_out = nn.Linear(trans_dim, self.num_classes)
    self.a_out = nn.Linear(trans_dim, self.num_classes)

    # One fusion weight per character of the lower-cased modality string.
    self.weighted_fusion = nn.Linear(len(self.mod), 1, bias=False)
#['MP',3,2], #['C',512], #['C',512], #['MP',3,2] ] default_layers = [['C', 8], ['C', 8], ['MP', 3, 2], ['C', 16], ['C', 16], ['MP', 3, 2], ['C', 24], ['C', 24], ['MP', 3, 2]] model = scn.Sequential().add( scn.SparseVggNet(2, 1, vgg_layers ) #).add( #scn.SubmanifoldConvolution(2, 24, 32, 3, False) #).add( # scn.BatchNormReLU(32) ).add( scn.SparseToDense(2, 32) ).to(device) print(model) par_dict = {} for par in model.named_parameters(): print("--------------------------------------------") print("setting [",par[0],par[1].shape,"]") if "weight" in par[0] and len(par[0].split("."))==3 and len(par[1].shape)==4: layerid1 = int(par[0].split(".")[0])