def main(input_path, output_path, clip_stride):
    model = C3D().to(device)
    model.load_state_dict(torch.load('c3d.pickle'))
    model.eval()
    if device.type == 'cuda':
        torch.backends.cudnn.benchmark = True
        model = torch.nn.DataParallel(model)

    train_list, test_list = generate_train_test_list(input_path, output_path)
    print('Done generating list')

    for row in train_list:
        source_path, output_folder, output_file = row
        print('Processing', source_path)
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        if not os.path.exists(output_file):
            extract(model, source_path, output_file, clip_stride)

    for row in test_list:
        source_path, output_folder, output_file = row
        print('Processing', source_path)
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        if not os.path.exists(output_file):
            extract(model, source_path, output_file, clip_stride)
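# A minimal sketch of a command-line entry point for the extractor above.
# The flag names and the default stride are assumptions; the sketch assumes
# the function above and its helpers (C3D, generate_train_test_list, extract)
# are in scope, and defines the module-level `device` the function expects.
import argparse

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='C3D feature extraction (sketch)')
    parser.add_argument('--input_path', type=str, required=True)
    parser.add_argument('--output_path', type=str, required=True)
    parser.add_argument('--clip_stride', type=int, default=16)
    args = parser.parse_args()
    main(args.input_path, args.output_path, args.clip_stride)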
def main(): """ Main function. """ # load a clip to be predicted X = get_sport_clip( 'roger') # batch size * channel * frames * height * width X = Variable(X) print(X.size()) X = X.cuda() # get network pretrained model net = C3D() net.load_state_dict(torch.load('c3d.pickle')) net.cuda() net.eval() print("create network") # perform prediction prediction = net(X) print(prediction.size()) print(prediction) prediction = prediction.data.cpu().numpy() print("prediction") # read labels labels = read_labels_from_file('labels.txt') # print top predictions top_inds = prediction[0].argsort( )[::-1][:5] # reverse sort and take five largest items print('\nTop 5:') for i in top_inds: print('{:.5f} {}'.format(prediction[0][i], labels[i]))
def main(): """ 主函数 """ # 载入视频片段做预测 X = get_sport_clip('roger') #roger视频 X = Variable(X) X = X.cuda() #GPU # 载入预训练好了的模型权重 net = C3D() #模型实例化 net.load_state_dict(torch.load('c3d.pickle')) #填入权重 net.cuda() net.eval() # 调到测试模式 # 然后直接拿网络预测就好 prediction = net(X) prediction = prediction.data.cpu().numpy() # 读入真实标签 labels = read_labels_from_file('labels.txt') # 得到topN的预测类别 top_inds = prediction[0].argsort()[::-1][:5] print('\nTop 5:') for i in top_inds: print('{:.5f} {}'.format(prediction[0][i], labels[i]))
def main():
    # playsound('C:\\Users\\Pyo\\Desktop\\PSIML19\\GestureRecognition\\c3d\\fanfare_x.wav')
    net = C3D()
    net = NEW_model.newmodule(net)  ### MAYBE??? ###
    # net.cuda()
    net.load_state_dict(
        torch.load('checkpoints\\adam10e6eps12regul01-epoch27_0.7245'))
    net.eval()
    camera(net)
def __init__(self, anchors, all_anchors, inds_inside):
    super(TPN, self).__init__()
    # init some parameters
    self.image_shape = [[240, 320]]  # for one batch, TODO: maybe need to change here
    self.anchors = anchors  # (630, x, y, xw, yw) anchor coordinates
    self.inds_inside = inds_inside
    self.all_anchors = all_anchors

    # get C3D part, use pretrained weights
    c3d = C3D()
    c3d.load_state_dict(torch.load(c3d_checkpoint))
    self.c3d_part1 = nn.Sequential(*list(c3d.modules())[1:4])  # be careful about these two indices
    # get conv2
    self.c3d_part2 = nn.Sequential(*list(c3d.modules())[4:13])
    # self.BN1 = torch.nn.BatchNorm2d(512)

    # for RPN
    self._CPN = CPN(self.anchors, all_anchors, inds_inside)

    self.n_classes = 22
    self.RCNN_proposal_target = _ProposalTargetLayer(self.n_classes)
    self.RCNN_roi_pool = _RoIPooling(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0 / 16.0)
    self.head_to_tail_ = torch.nn.Sequential(
        nn.Linear(512 * 7 * 7, 1024),  # reduced from 4096, for memory limit
        nn.ReLU(True),
        nn.Dropout(),
        nn.Linear(1024, 4096),
        nn.ReLU(True))
    self.RCNN_bbox_pred = torch.nn.Linear(4096, 4 * self.n_classes)
    self.RCNN_cls_score = torch.nn.Linear(4096, self.n_classes)
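# nn.Module.modules() traverses submodules recursively, so the [1:4] and
# [4:13] slices above depend on the declaration order inside C3D.__init__.
# A quick sanity-check sketch (assumes only that C3D is importable):
c3d = C3D()
for idx, module in enumerate(c3d.modules()):
    print(idx, module.__class__.__name__)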
def train():
    C3dNet = C3D()
    C3dNet.cuda()
    C3dNet.train()

    learning_rate = 0.01
    optimizer = torch.optim.Adam(C3dNet.parameters(), lr=learning_rate)
    # optimizer = torch.optim.SGD(C3dNet.parameters(), lr=learning_rate, momentum=0.9)
    loss_func = torch.nn.CrossEntropyLoss()

    dset_train = ParkinsonDataset(data_type='train')
    train_loader = DataLoader(dset_train, batch_size=16, shuffle=True, num_workers=0)
    print("Training Data : ", len(train_loader.dataset))
    print("training start!")

    for epoch in range(400):
        # halve the learning rate every 100 epochs
        if epoch > 0 and epoch % 100 == 0:
            learning_rate = learning_rate / 2
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

        for batch_index, (data, label) in enumerate(train_loader):
            data, label = data.cuda(), label.cuda()
            # label = label.float()
            predict = C3dNet(data)
            # print("predict and label size: ", predict.size(), label.size())
            loss = loss_func(predict, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch: {}/399 | batch_index: {} | loss: {}".format(epoch, batch_index, loss.item()))

        # save a checkpoint every 100 epochs
        if epoch > 0 and (epoch + 1) % 100 == 0:
            torch.save(C3dNet.state_dict(), './weights/MyC3dNet{}.pth'.format(epoch + 1))
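# A matching evaluation loop is not shown above. A minimal sketch, assuming
# ParkinsonDataset also accepts data_type='test' and that the network returns
# raw class logits:
def evaluate(model):
    model.eval()
    test_loader = DataLoader(ParkinsonDataset(data_type='test'),
                             batch_size=16, shuffle=False, num_workers=0)
    correct, total = 0, 0
    with torch.no_grad():
        for data, label in test_loader:
            data, label = data.cuda(), label.cuda()
            pred = model(data).argmax(dim=1)
            correct += (pred == label).sum().item()
            total += label.size(0)
    print("test accuracy: {:.4f}".format(correct / max(total, 1)))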
def __init__(self, anchors, all_anchors, inds_inside):
    super(CPN, self).__init__()
    self.image_shape = [[240, 320]]  # for one batch, TODO: maybe need to change here
    self.anchors = anchors  # (630, x, y, xw, yw) anchor coordinates
    self.inds_inside = inds_inside
    self.all_anchors = all_anchors

    self.nc_score_out = 2 * 12  # 2 (bg/fg) * 12 (anchors)
    self.nc_bbox_out = 4 * 12   # 4 (coords) * 12 (anchors)

    c3d = C3D()
    self.action_num = 22  # -1  # 21 (classes), do not consider bg
    self.action_anchor_num = self.action_num * 12
    c3d.load_state_dict(torch.load(c3d_checkpoint))

    self.RPN_Conv = nn.Conv2d(512, 512, 3, 1, 1, bias=True)
    self.BN1 = nn.BatchNorm2d(512)
    self.RPN_cls_bbox_action = nn.Conv2d(
        512, self.nc_score_out + self.nc_bbox_out + self.action_anchor_num, 1, 1, 0)
    self.BN2 = nn.BatchNorm2d(self.nc_score_out + self.nc_bbox_out + self.action_anchor_num)

    self.RPN_proposal = _ProposalLayer(self.anchors, self.all_anchors)
    self.RPN_anchor_target = _AnchorTargetLayer(self.anchors, self.inds_inside)
def main(): """ Main function. """ # load a clip to be predicted # X = get_sport_clip('roger') X = get_gesture_clip('c3d\\data\\3919') # X = torch.rand(size=(3,13,112,112)) X = Variable(X) X = X.cuda() # get network pretrained model net = C3D() net.load_state_dict(torch.load('c3d\\c3d.pickle')) # cast net to new net import NEW_model net = NEW_model.newmodule(net) # net.load_state_dict(torch.load('checkpoints\\adam10e6eps1220_0.6531')) net.cuda() # retrainovanje!!! from New_dataset import GesturesDataset from New_dataset import loadlabelsdict labelsdict = loadlabelsdict("c3d\\jester-v1-train.csv") dataloaders = {} dataset_sizes = {} trainset = GesturesDataset("c3d\\splittraindata\\train", labelsdict) validset = GesturesDataset("c3d\\splittraindata\\valid", labelsdict) dataloaders['train'] = torch.utils.data.DataLoader( trainset, batch_size=1, num_workers=2,shuffle=True) dataloaders['valid'] = torch.utils.data.DataLoader( validset, batch_size=1, num_workers=2,shuffle=True) dataset_sizes['train'] = len(trainset) dataset_sizes['valid'] = len(validset) import gesturetrain # print(trainset[0]) gesturetrain.train(net, dataloaders, dataset_sizes) import datetime t = datetime.datetime.now() # perform prediction net.eval() prediction = net(X) prediction = prediction.data.cpu().numpy() print("predict time: " + str(datetime.datetime.now()-t)) # print top predictions # reverse sort and take five largest items print(prediction) print("Predict treba da bude 2") print() top_inds = prediction[0].argsort()[::-1][:5] print('\nTop 5:') for i in top_inds: print('pred:{:.5f} label:{}'.format(prediction[0][i],i))
#--- create dataset folders
# root folder
path_output = args.data_path + args.feature_in + '_' + args.base_model + '/'
if args.structure != 'tsn':
    path_output = args.data_path + args.feature_in + '-' + args.structure + '/'
if not os.path.isdir(path_output):
    os.makedirs(path_output)

###### set up the model ######
# Load the pretrained model
print(Fore.GREEN + 'Pre-trained model:', args.base_model)

if args.base_model == 'c3d':
    from C3D_model import C3D
    c3d_clip_size = 16
    model = C3D()
    model.load_state_dict(torch.load(args.pretrain_weight))

    list_model = list(model.children())
    list_conv = list_model[:-6]
    list_fc = list_model[-6:-4]
    extractor_conv = nn.Sequential(*list_conv)
    extractor_fc = nn.Sequential(*list_fc)

    # multi-gpu
    extractor_conv = torch.nn.DataParallel(extractor_conv.cuda())
    extractor_conv.eval()
    extractor_fc = torch.nn.DataParallel(extractor_fc.cuda())
    extractor_fc.eval()
else:
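# Rough illustration of how the `c3d` branch above splits the network: push a
# dummy clip through the two extractors. The clip shape (B x 3 x 16 x 112 x 112)
# and the 8192-dimensional flattened size follow the standard C3D layout and
# are assumptions about this particular C3D_model.
dummy = torch.rand(2, 3, c3d_clip_size, 112, 112).cuda()
with torch.no_grad():
    feat_conv = extractor_conv(dummy)                               # e.g. B x 512 x 1 x 4 x 4
    feat_fc = extractor_fc(feat_conv.view(feat_conv.size(0), -1))   # B x 4096
print(feat_conv.shape, feat_fc.shape)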
beta = (.5, .999)

# set gpu
os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id

# data load
vol_names = [
    filename for filename in os.listdir(data_dir)
    if int(filename.split("_")[-1].split(".")[0]) in normal
]
vol_names.sort()
vol_names = vol_names[:100]
volumes, labels = data_load_ver2(vol_names, data_dir)

# model
model = C3D(pretrained=True)
model.train()
model.cuda()

# set loss, optimizer
BCE = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr, betas=beta)

# training with two batches
y = torch.tensor(data=np.array([[1.], [0.]]), dtype=torch.float32).cuda()
for epoch in range(1, epoch_num + 1):
    total_loss = 0
    pre_wrong = []
    delay_wrong = []
    correct = []
def main(): """ Main function. """ parser = argparse.ArgumentParser("C3D & ResNet feature extraction") parser.add_argument('-v', '--verbose', action='store_true', help="increase output verbosity") # ---------------------------------------------------------------------------------------------------------------- # parser.add_argument('--videos_root_local', type=str, default='/home/george/datasets/HMDB51', help="set videos root path") parser.add_argument('--videos_root_remote', type=str, default='/shared/datasets/HMDB51', help="set videos root path") parser.add_argument('--remote', action='store_true') parser.add_argument('--c3d_model_root', type=str, default='model', help="set C3D model root path") parser.add_argument('--video_list', type=str, default='video_HMDB51.list', help="set video list path") parser.add_argument( '--preds_c3d_root', type=str, default='preds_c3d_HMDB51', help="set video C3D predictions path, to store .npy files") parser.add_argument( '--preds_cnn_root', type=str, default='preds_cnn_HMDB51', help="set video CNN predictions path, to store .npy files") parser.add_argument('--c3d_batch_size', type=int, default=6, help="set C3D batch size") parser.add_argument('--cnn_batch_size', type=int, default=32, help="set CNN batch size") parser.add_argument('--batch', action='store_true') parser.set_defaults(batch=True) parser.add_argument('--gpu', type=int, default=0, help="set gpu id") parser.add_argument('--cuda', dest='cuda', action='store_true', help="use CUDA during training") parser.set_defaults(cuda=True) args = parser.parse_args() if args.batch == False: print( "Currently, you *have* to run this in batch mode, i.e. batch_size>1. Quitting..." ) quit() # if args.cuda: # os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) # if torch.cuda.is_available(): # print('Using CUDA device {}'.format(args.gpu)) if args.cuda: torch.set_default_tensor_type('torch.cuda.FloatTensor') else: torch.set_default_tensor_type('torch.FloatTensor') if args.remote: args.videos_root = args.videos_root_remote else: args.videos_root = args.videos_root_local args.c3d_model_root = os.path.join(os.getcwd(), args.c3d_model_root) args.video_list = os.path.join(os.getcwd(), args.video_list) args.preds_c3d_root = os.path.join(os.getcwd(), args.preds_c3d_root) model_path = os.path.join(args.c3d_model_root, 'c3d.pickle') mean_path = os.path.join(args.c3d_model_root, 'c3d_mean.npy') labels = read_labels_from_file('labels_Sports-1M.txt') ############################################ # Load ResNet-50 resnet50_full = models.resnet50(pretrained=True) class ResNet50_FC(torch.nn.Module): def __init__(self): super(ResNet50_FC, self).__init__() self.features = torch.nn.Sequential( # stop at FC, to extract FC features, not class scores *list(resnet50_full.children())[:-1]) def forward(self, x): x = self.features(x) return x resnet50 = ResNet50_FC() if args.cuda == True: resnet50 = resnet50.cuda() resnet50.eval() ############################################ if not os.path.exists(args.c3d_model_root): os.mkdir(args.c3d_model_root) if not os.path.exists(model_path): model_url = 'http://imagelab.ing.unimore.it/files/c3d_pytorch/c3d.pickle' download_model_cmd = 'wget {} --directory-prefix {}'.format( model_url, args.c3d_model_root) os.system(download_model_cmd) if not os.path.exists(mean_path): mean_url = 'https://github.com/albertomontesg/keras-model-zoo/raw/master/kerasmodelzoo/data/c3d_mean.npy' download_mean_cmd = 'wget {} --directory-prefix {}'.format( mean_url, args.c3d_model_root) os.system(download_mean_cmd) if not 
    if not os.path.exists(args.preds_c3d_root):
        os.mkdir(args.preds_c3d_root)
    if not os.path.exists(args.preds_cnn_root):
        os.mkdir(args.preds_cnn_root)

    C3D_STEP = 16
    cnn_size = 224
    mean_cube = np.load(mean_path)
    mean_cube = mean_cube[0]
    cnn_mean = np.array((0.485, 0.456, 0.406))
    cnn_std = np.array((0.229, 0.224, 0.225))
    cursor_up = "\x1b[1A"
    erase_line = "\x1b[1A"

    net = C3D()
    net.load_state_dict(torch.load(model_path))
    if args.cuda == True:
        net = net.cuda()
    net.eval()

    if args.verbose:
        print('Reading video list')
    print('')
    print('')

    video_paths = []
    file = open(args.video_list, 'r')
    for line in file:
        line = line.rstrip('\n')
        if args.videos_root != '':
            video_path = os.path.join(args.videos_root, line)
        else:
            video_path = line
        video_paths.append(video_path)

    vid_cnt = 0
    N_vids = len(video_paths)
    for clip_path in video_paths:
        print(cursor_up + erase_line)
        video_name_with_ext = clip_path.split('/')[-1]
        video_name = video_name_with_ext.split('.')[0]
        preds_filename = video_name + '.npy'
        class_name = clip_path.split('/')[-2]
        class_name = class_name.replace(' ', '_')
        class_preds_c3d_folder = os.path.join(args.preds_c3d_root, class_name)
        if not os.path.exists(class_preds_c3d_folder):
            os.mkdir(class_preds_c3d_folder)
        class_preds_cnn_folder = os.path.join(args.preds_cnn_root, class_name)
        if not os.path.exists(class_preds_cnn_folder):
            os.mkdir(class_preds_cnn_folder)
        c3d_video_preds_path = os.path.join(class_preds_c3d_folder, preds_filename)
        cnn_video_preds_path = os.path.join(class_preds_cnn_folder, preds_filename)
        vid_cnt += 1

        # In case that you're having problems with a specific video file, use something like this
        # if vid_cnt == 3021:
        #     continue
        print('{:04d}/{:04d} Processing video "{}"'.format(vid_cnt, len(video_paths), video_name))
        if os.path.exists(c3d_video_preds_path) and os.path.exists(cnn_video_preds_path):
            continue
        print(' ')

        clip_full, clip_full_raw = get_np_clip(clip_path)
        N_frames = clip_full.shape[0]

        #########################################
        # C3D feature extraction
        if not os.path.exists(c3d_video_preds_path):
            N_iters = int(np.ceil(N_frames / C3D_STEP))
            features = []
            frames_t = []
            if args.batch:
                batch_cnt = 0
                batch_clips = []
            for t in range(0, N_iters):
                if t < (N_iters - 1):
                    start_frame = t * C3D_STEP
                else:
                    start_frame = N_frames - C3D_STEP
                batch_c3d_condition = ((N_iters + batch_cnt - t) >= args.c3d_batch_size)
                clip = clip_full[start_frame:(start_frame + C3D_STEP), :, :, :].copy()
                clip = preprocess_clip(clip, mean_cube, args.batch and batch_c3d_condition)
                if args.verbose:
                    print(cursor_up + erase_line)
                    print('Video {:07d}/{:07d} Frame {:07d}/{:07d} | {:02d}% | Using C3D for video "{}" | Batch: {}'
                          .format(vid_cnt, N_vids, start_frame + 1, N_frames,
                                  int(100 * (start_frame / N_frames)), video_name, args.batch))
                frames_t.append(start_frame)
                if (not args.batch) or (not batch_c3d_condition):
                    # print('Gathering single clip')
                    with torch.no_grad():
                        if args.cuda:
                            X = Variable(clip.cuda())
                        probs, feats = net(X)
                        feats_cpu = feats.data.cpu().numpy()
                    features.append(feats_cpu[0])
                elif batch_c3d_condition:
                    batch_cnt += 1
                    batch_clips.append(clip)
                    # print('Gathering video batch {}/{}'.format(batch_cnt, args.c3d_batch_size))
                    if batch_cnt == args.c3d_batch_size:
                        clip = np.array(batch_clips)
                        clip = torch.from_numpy(clip)
                        with torch.no_grad():
                            X = Variable(clip)
                            if args.cuda:
                                X = X.cuda()
                            probs, feats = net(X)
                            feats_cpu = feats.data.cpu().numpy()
                        batch_clips = []
                        for batch_iter in range(0, args.c3d_batch_size):
                            features.append(feats_cpu[batch_iter])
                        batch_cnt = 0
                        clip = []
                        X = []
            assert (len(features) == len(frames_t))
            # print('C3D : gathered %d vectors in %d times' % (len(features), len(frames_t)))
            video_dict_c3d = {'features': features, 'frames_t': frames_t}
            np.save(c3d_video_preds_path, video_dict_c3d)

        #########################################
        # CNN feature extraction
        if not os.path.exists(cnn_video_preds_path):
            if args.batch:
                batch_cnt = 0
                batch_imgs = []
            features = []
            frames_t = []
            for t in range(0, N_frames):
                frame_index = t
                batch_cnn_condition = ((N_frames + batch_cnt - t) >= args.cnn_batch_size)
                img = clip_full_raw[frame_index].copy()
                if args.verbose:
                    print(cursor_up + erase_line)
                    print('Video {:07d}/{:07d} Frame {:07d}/{:07d} | {:02d}% | Using ResNet for video "{}"'
                          .format(vid_cnt, N_vids, frame_index + 1, N_frames,
                                  int(100 * (frame_index / N_frames)), video_name))
                img = preprocess_img(img, cnn_size, args.batch and batch_cnn_condition)
                frames_t.append(frame_index)
                if (not args.batch) or (not batch_cnn_condition):
                    # print('Gathering single image')
                    img = img / 255.0
                    for ch_i in range(0, 3):
                        img[0, ch_i, :, :] = img[0, ch_i, :, :] - cnn_mean[ch_i]
                        img[0, ch_i, :, :] = img[0, ch_i, :, :] / cnn_std[ch_i]
                    img = torch.from_numpy(img)
                    with torch.no_grad():
                        X = Variable(img)
                        if args.cuda:
                            X = X.cuda()
                        feats = resnet50(X)
                        feats_cpu = feats.data.cpu().numpy()
                    features.append(feats_cpu[0].flatten())
                elif batch_cnn_condition:
                    batch_cnt += 1
                    img = img / 255.0
                    for ch_i in range(0, 3):
                        img[ch_i, :, :] = img[ch_i, :, :] - cnn_mean[ch_i]
                        img[ch_i, :, :] = img[ch_i, :, :] / cnn_std[ch_i]
                    batch_imgs.append(img)
                    # print('Gathering image batch {}/{}'.format(batch_cnt, args.cnn_batch_size))
                    if batch_cnt == args.cnn_batch_size:
                        img = np.array(batch_imgs)
                        img = torch.from_numpy(img)
                        with torch.no_grad():
                            X = Variable(img)
                            if args.cuda:
                                X = X.cuda()
                            feats = resnet50(X)
                            feats_cpu = feats.data.cpu().numpy()
                        batch_imgs = []
                        for batch_iter in range(0, args.cnn_batch_size):
                            features.append(feats_cpu[batch_iter].flatten())
                        batch_cnt = 0
                        img = []
                        X = []
            assert (len(features) == len(frames_t))
            # print('CNN : gathered %d vectors in %d times' % (len(features), len(frames_t)))
            video_dict_cnn = {'features': features, 'frames_t': frames_t}
            np.save(cnn_video_preds_path, video_dict_cnn)
    return features


# entry point
if __name__ == '__main__':
    # load data
    features = np.load('sample.npy')
    labels = np.load('label.npy')

    # preprocess the features and labels
    features, labels = pre_treat(features, labels)
    features = filter_none_data(features)

    # change the features to the appropriate type
    features = features.astype(np.int32)
    # N, C, H, W; actually frames as H
    features = features.reshape(-1, 2, 8, 15)

    # initialize the net
    net = C3D()

    # assign the tuning parameters
    max_iterations = 200
    batch_size = 15
    learning_rate = 0.0005

    # train the net parameters
    training(net, features, labels, max_iterations, batch_size, learning_rate)
def _prepare_DA(self, num_class, base_model):  # convert the model to DA framework
    if base_model == 'c3d':  # C3D mode: in construction...
        from C3D_model import C3D
        model_test = C3D()
        self.feature_dim = model_test.fc7.in_features
    elif base_model == 'i3d':
        from dataset_preparation.pytorch_i3d import InceptionI3d as I3D
        model_test = I3D()
        self.feature_dim = model_test.logits.conv3d.in_channels
    else:
        model_test = getattr(torchvision.models, base_model)(True)  # model_test is only used for getting the dim
        # pdb.set_trace()
        self.feature_dim = model_test.fc.in_features

    std = 0.001
    feat_shared_dim = min(self.fc_dim, self.feature_dim) if self.add_fc > 0 and self.fc_dim > 0 else self.feature_dim
    feat_frame_dim = feat_shared_dim

    self.relu = nn.ReLU(inplace=True)
    self.dropout_i = nn.Dropout(p=self.dropout_rate_i)
    self.dropout_v = nn.Dropout(p=self.dropout_rate_v)

    #------ frame-level layers (shared layers + source layers + domain layers) ------#
    if self.add_fc < 1:
        raise ValueError(Back.RED + 'add at least one fc layer')

    # 1. shared feature layers
    self.fc_feature_shared_source = nn.Linear(self.feature_dim, feat_shared_dim)
    normal_(self.fc_feature_shared_source.weight, 0, std)
    constant_(self.fc_feature_shared_source.bias, 0)

    if self.add_fc > 1:
        self.fc_feature_shared_2_source = nn.Linear(feat_shared_dim, feat_shared_dim)
        normal_(self.fc_feature_shared_2_source.weight, 0, std)
        constant_(self.fc_feature_shared_2_source.bias, 0)
    if self.add_fc > 2:
        self.fc_feature_shared_3_source = nn.Linear(feat_shared_dim, feat_shared_dim)
        normal_(self.fc_feature_shared_3_source.weight, 0, std)
        constant_(self.fc_feature_shared_3_source.bias, 0)

    # 2. frame-level feature layers
    self.fc_feature_source = nn.Linear(feat_shared_dim, feat_frame_dim)
    normal_(self.fc_feature_source.weight, 0, std)
    constant_(self.fc_feature_source.bias, 0)

    # 3. domain feature layers (frame-level)
    self.fc_feature_domain = nn.Linear(feat_shared_dim, feat_frame_dim)
    normal_(self.fc_feature_domain.weight, 0, std)
    constant_(self.fc_feature_domain.bias, 0)
    # 4. classifiers (frame-level)
    self.fc_classifier_source = nn.Linear(feat_frame_dim, num_class)
    normal_(self.fc_classifier_source.weight, 0, std)
    constant_(self.fc_classifier_source.bias, 0)

    self.fc_classifier_domain = nn.Linear(feat_frame_dim, 2)
    normal_(self.fc_classifier_domain.weight, 0, std)
    constant_(self.fc_classifier_domain.bias, 0)

    if self.share_params == 'N':
        self.fc_feature_shared_target = nn.Linear(self.feature_dim, feat_shared_dim)
        normal_(self.fc_feature_shared_target.weight, 0, std)
        constant_(self.fc_feature_shared_target.bias, 0)
        if self.add_fc > 1:
            self.fc_feature_shared_2_target = nn.Linear(feat_shared_dim, feat_shared_dim)
            normal_(self.fc_feature_shared_2_target.weight, 0, std)
            constant_(self.fc_feature_shared_2_target.bias, 0)
        if self.add_fc > 2:
            self.fc_feature_shared_3_target = nn.Linear(feat_shared_dim, feat_shared_dim)
            normal_(self.fc_feature_shared_3_target.weight, 0, std)
            constant_(self.fc_feature_shared_3_target.bias, 0)

        self.fc_feature_target = nn.Linear(feat_shared_dim, feat_frame_dim)
        normal_(self.fc_feature_target.weight, 0, std)
        constant_(self.fc_feature_target.bias, 0)
        self.fc_classifier_target = nn.Linear(feat_frame_dim, num_class)
        normal_(self.fc_classifier_target.weight, 0, std)
        constant_(self.fc_classifier_target.bias, 0)

    # BN for the above layers
    if self.use_bn != 'none':  # S & T: use AdaBN (ICLRW 2017) approach
        self.bn_shared_S = nn.BatchNorm1d(feat_shared_dim)  # BN for the shared layers
        self.bn_shared_T = nn.BatchNorm1d(feat_shared_dim)
        self.bn_source_S = nn.BatchNorm1d(feat_frame_dim)  # BN for the source feature layers
        self.bn_source_T = nn.BatchNorm1d(feat_frame_dim)

    #------ aggregate frame-based features (frame feature --> video feature) ------#
    if self.frame_aggregation == 'rnn':  # 2. rnn
        self.hidden_dim = feat_frame_dim
        if self.rnn_cell == 'LSTM':
            self.rnn = nn.LSTM(feat_frame_dim, self.hidden_dim // self.n_directions, self.n_layers,
                               batch_first=True, bidirectional=bool(int(self.n_directions / 2)))
        elif self.rnn_cell == 'GRU':
            self.rnn = nn.GRU(feat_frame_dim, self.hidden_dim // self.n_directions, self.n_layers,
                              batch_first=True, bidirectional=bool(int(self.n_directions / 2)))

        # initialization
        for p in range(self.n_layers):
            kaiming_normal_(self.rnn.all_weights[p][0])
            kaiming_normal_(self.rnn.all_weights[p][1])

        self.bn_before_rnn = nn.BatchNorm2d(1)
        self.bn_after_rnn = nn.BatchNorm2d(1)
    elif self.frame_aggregation == 'trn':  # 4. TRN (ECCV 2018) ==> fix segment # for both train/val
        self.num_bottleneck = 512
        self.TRN = TRNmodule.RelationModule(feat_shared_dim, self.num_bottleneck, self.train_segments)
        self.bn_trn_S = nn.BatchNorm1d(self.num_bottleneck)
        self.bn_trn_T = nn.BatchNorm1d(self.num_bottleneck)
    elif self.frame_aggregation == 'trn-m':  # 4. TRN (ECCV 2018) ==> fix segment # for both train/val
        self.num_bottleneck = 256
        self.TRN = TRNmodule.RelationModuleMultiScale(feat_shared_dim, self.num_bottleneck, self.train_segments)
        self.bn_trn_S = nn.BatchNorm1d(self.num_bottleneck)
        self.bn_trn_T = nn.BatchNorm1d(self.num_bottleneck)
    elif self.frame_aggregation == 'temconv':  # 3. temconv
        self.tcl_3_1 = TCL(3, 1)
        self.tcl_5_1 = TCL(5, 1)
        self.bn_1_S = nn.BatchNorm1d(feat_frame_dim)
        self.bn_1_T = nn.BatchNorm1d(feat_frame_dim)

        self.tcl_3_2 = TCL(3, 1)
        self.tcl_5_2 = TCL(5, 2)
        self.bn_2_S = nn.BatchNorm1d(feat_frame_dim)
        self.bn_2_T = nn.BatchNorm1d(feat_frame_dim)

        self.conv_fusion = nn.Sequential(
            nn.Conv2d(2, 1, kernel_size=(1, 1), padding=(0, 0)),
            nn.ReLU(inplace=True),
        )

    # ------ video-level layers (source layers + domain layers) ------#
    if self.frame_aggregation == 'avgpool':  # 1. avgpool
        feat_aggregated_dim = feat_shared_dim
    if 'trn' in self.frame_aggregation:  # 4. trn
        feat_aggregated_dim = self.num_bottleneck
    elif self.frame_aggregation == 'rnn':  # 2. rnn
        feat_aggregated_dim = self.hidden_dim
    elif self.frame_aggregation == 'temconv':  # 3. temconv
        feat_aggregated_dim = feat_shared_dim

    feat_video_dim = feat_aggregated_dim

    # 1. source feature layers (video-level)
    self.fc_feature_video_source = nn.Linear(feat_aggregated_dim, feat_video_dim)
    normal_(self.fc_feature_video_source.weight, 0, std)
    constant_(self.fc_feature_video_source.bias, 0)

    self.fc_feature_video_source_2 = nn.Linear(feat_video_dim, feat_video_dim)
    normal_(self.fc_feature_video_source_2.weight, 0, std)
    constant_(self.fc_feature_video_source_2.bias, 0)

    # 2. domain feature layers (video-level)
    self.fc_feature_domain_video = nn.Linear(feat_aggregated_dim, feat_video_dim)
    normal_(self.fc_feature_domain_video.weight, 0, std)
    constant_(self.fc_feature_domain_video.bias, 0)

    # 3. classifiers (video-level)
    self.fc_classifier_video_source = nn.Linear(feat_video_dim, num_class)
    normal_(self.fc_classifier_video_source.weight, 0, std)
    constant_(self.fc_classifier_video_source.bias, 0)

    if self.ens_DA == 'MCD':
        self.fc_classifier_video_source_2 = nn.Linear(feat_video_dim, num_class)  # second classifier for self-ensembling
        normal_(self.fc_classifier_video_source_2.weight, 0, std)
        constant_(self.fc_classifier_video_source_2.bias, 0)

    self.fc_classifier_domain_video = nn.Linear(feat_video_dim, 2)
    normal_(self.fc_classifier_domain_video.weight, 0, std)
    constant_(self.fc_classifier_domain_video.bias, 0)

    # domain classifier for TRN-M
    if self.frame_aggregation == 'trn-m':
        self.relation_domain_classifier_all = nn.ModuleList()
        for i in range(self.train_segments - 1):
            relation_domain_classifier = nn.Sequential(
                nn.Linear(feat_aggregated_dim, feat_video_dim),
                nn.ReLU(),
                nn.Linear(feat_video_dim, 2)
            )
            self.relation_domain_classifier_all += [relation_domain_classifier]

    if self.share_params == 'N':
        self.fc_feature_video_target = nn.Linear(feat_aggregated_dim, feat_video_dim)
        normal_(self.fc_feature_video_target.weight, 0, std)
        constant_(self.fc_feature_video_target.bias, 0)
        self.fc_feature_video_target_2 = nn.Linear(feat_video_dim, feat_video_dim)
        normal_(self.fc_feature_video_target_2.weight, 0, std)
        constant_(self.fc_feature_video_target_2.bias, 0)
        self.fc_classifier_video_target = nn.Linear(feat_video_dim, num_class)
        normal_(self.fc_classifier_video_target.weight, 0, std)
        constant_(self.fc_classifier_video_target.bias, 0)

    # BN for the above layers
    if self.use_bn != 'none':  # S & T: use AdaBN (ICLRW 2017) approach
        self.bn_source_video_S = nn.BatchNorm1d(feat_video_dim)
        self.bn_source_video_T = nn.BatchNorm1d(feat_video_dim)
        self.bn_source_video_2_S = nn.BatchNorm1d(feat_video_dim)
        self.bn_source_video_2_T = nn.BatchNorm1d(feat_video_dim)

    self.alpha = torch.ones(1)
    if self.use_bn == 'AutoDIAL':
        self.alpha = nn.Parameter(self.alpha)

    # ------ attention mechanism ------#
    # conventional attention
    if self.use_attn == 'general':
        self.attn_layer = nn.Sequential(
            nn.Linear(feat_aggregated_dim, feat_aggregated_dim),
            nn.Tanh(),
            nn.Linear(feat_aggregated_dim, 1)
        )
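# A minimal sketch of how such a 'general' attention scorer is typically
# applied to frame-level features; the softmax-normalized weighting below is
# an assumed usage, not taken from this code.
import torch
import torch.nn as nn
import torch.nn.functional as F

feat_dim = 256
attn_layer = nn.Sequential(            # mirrors self.attn_layer above
    nn.Linear(feat_dim, feat_dim),
    nn.Tanh(),
    nn.Linear(feat_dim, 1))

feat_frames = torch.rand(4, 5, feat_dim)              # batch x segments x feature dim
weights = F.softmax(attn_layer(feat_frames), dim=1)   # one weight per segment
attended = feat_frames * weights                      # attention-weighted frame features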