class ExtructFeatrue(object):
    """Feature extractor wrapping a pretrained TSN (dpn107, RGB, 201 classes).

    Splits the loaded network into a GPU backbone and classifier head so
    ``loadFeatrue`` can return both intermediate features and softmaxed
    class scores for a preprocessed input batch.
    """

    def __init__(self):
        # Trained weights (hard-coded checkpoint path).
        checkpoint = torch.load(
            '/mnt/workspace/model/activitynet_clip_kinetics600_dpn107_rgb_model/activitynet_clip_600_dpn107_rgb_model_best_074.pth.tar'
        )
        self.model = TSN(201, 3, 'RGB', 'dpn107', 1)
        # Preprocessing parameters published by the TSN model.
        self.train_augmentation = self.model.get_augmentation()
        self.input_mean = self.model.input_mean
        self.input_std = self.model.input_std
        self.softmax = nn.Softmax(dim=-1).cuda()
        # Wrap in DataParallel before loading — presumably the checkpoint
        # was saved from a DataParallel model ("module."-prefixed keys).
        self.model = nn.DataParallel(self.model)
        self.model.load_state_dict(checkpoint['state_dict'])
        # Expose backbone and classifier head separately, each on GPU.
        self.base_model = nn.DataParallel(self.model.module.base_model).cuda()
        self.new_fc = nn.DataParallel(self.model.module.new_fc).cuda()
        # Inference only: freeze dropout / batch-norm behaviour everywhere.
        for module in (self.model, self.base_model, self.new_fc):
            module.eval()

    def loadFeatrue(self, x):
        """Return (backbone features, softmaxed class scores) for batch ``x``."""
        features = self.base_model(x)
        scores = self.softmax(self.new_fc(features))
        return features, scores
def main():
    """Evaluate a TSN model on the test split and log per-batch accuracy.

    Reads CLI options from the module-level ``parser``, loads dygraph
    weights from ``args.load_path``, iterates the Kinetics test reader,
    and writes progress plus the final average accuracy to ``args.log_path``
    (also printed to stdout).

    Raises:
        ValueError: if ``args.dataset`` is not one of the known datasets.
    """
    args = parser.parse_args()
    # Map dataset name to its class count instead of an if/elif chain.
    num_classes = {'ucf101': 101, 'hmdb51': 51, 'kinetics': 400}
    if args.dataset not in num_classes:
        raise ValueError('Unknown dataset ' + args.dataset)
    args.num_class = num_classes[args.dataset]

    place = fluid.CUDAPlace(0)
    with fluid.dygraph.guard(place):
        model = TSN(args.num_class, args.num_segments, args.modality,
                    args.arch, dropout=0)
        # Propagate the model's preprocessing parameters into the reader args.
        args.short_size = model.scale_size
        args.target_size = model.crop_size
        args.input_mean = model.input_mean
        # NOTE(review): input_std is tripled here (presumably one value per
        # RGB channel) — confirm this matches what KineticsReader expects.
        args.input_std = model.input_std * 3

        state_dict = fluid.dygraph.load_dygraph(args.load_path)[0]
        model.set_dict(state_dict)

        test_reader = KineticsReader('test', args,
                                     args.test_list).create_reader()
        model.eval()
        avg_acc = AverageMeter()
        # Context manager guarantees the log file is closed even if a batch
        # raises (the original leaked the handle on any exception).
        with open(args.log_path, 'w') as log:
            for batch_id, data in enumerate(test_reader()):
                dy_x_data = np.array([x[0] for x in data]).astype('float32')
                y_data = np.array([[x[1]] for x in data]).astype('int64')
                img = fluid.dygraph.to_variable(dy_x_data)
                label = fluid.dygraph.to_variable(y_data)
                label.stop_gradient = True  # labels are targets, not trainable
                out, acc = model(img, label)
                avg_acc.update(acc.numpy()[0], label.shape[0])
                if (batch_id + 1) % args.print_freq == 0:
                    output = 'Test batch_id:{} | acc {} | avg acc:{}'.format(
                        batch_id + 1, acc.numpy()[0], avg_acc.avg)
                    print(output)
                    log.write(output + '\n')
                    log.flush()
            output = 'Test Avg acc:{}'.format(avg_acc.avg)
            print(output)
            log.write(output + '\n')
            log.flush()
def eval(args):
    """Validate a TSN model over the UCF validation reader.

    Loads weights from ``args.weights`` (exits if unset), runs every batch
    through the model, and prints per-batch accuracy followed by the mean
    accuracy over the whole validation set.
    """
    # parse config
    config = parse_config(args.config)
    val_config = merge_configs(config, 'valid', vars(args))
    # print_configs(val_config, "Valid")
    with fluid.dygraph.guard():
        val_model = TSN(args.batch_size, 32)

        # Invert the saved label dictionary so ids map back to class names.
        label_dic = np.load('work/UCF-101_jpg/label_dir.npy',
                            allow_pickle=True).item()
        label_dic = {v: k for k, v in label_dic.items()}

        # get infer reader
        val_reader = UCFReader('valid', val_config).create_reader()

        # A weights file is mandatory; bail out early when it is missing.
        if not args.weights:
            print("model path must be specified")
            exit()
        weights = args.weights

        para_state_dict, _ = fluid.load_dygraph(weights)
        val_model.load_dict(para_state_dict)
        val_model.eval()

        acc_list = []
        for batch_id, data in enumerate(val_reader()):
            batch_imgs = np.array([item[0] for item in data]).astype('float32')
            batch_labels = np.array([[item[1]]
                                     for item in data]).astype('int64')

            img = fluid.dygraph.to_variable(batch_imgs)
            label = fluid.dygraph.to_variable(batch_labels)
            label = fluid.layers.reshape(label, [args.batch_size, 1])
            label.stop_gradient = True  # labels are targets, not trainable

            out, acc = val_model(img, label)
            print('batch_id=', batch_id, 'acc=', acc.numpy())
            acc_list.append(acc.numpy())
        print("验证集准确率为:{}".format(np.mean(acc_list)))
def infer(args):
    """Run inference with a TSN model over the UCF infer reader.

    Loads weights from ``args.weights`` (exits if unset) and prints, for
    each batch, the actual label alongside the predicted class name.
    """
    # parse config
    config = parse_config(args.config)
    infer_config = merge_configs(config, 'infer', vars(args))
    # print_configs(infer_config, "Infer")
    with fluid.dygraph.guard():
        infer_model = TSN(args.batch_size, 32)

        # Invert the saved label dictionary so ids map back to class names.
        label_dic = np.load('work/UCF-101_jpg/label_dir.npy',
                            allow_pickle=True).item()
        label_dic = {v: k for k, v in label_dic.items()}

        # get infer reader
        infer_reader = UCFReader('infer', infer_config).create_reader()

        # A weights file is mandatory; bail out early when it is missing.
        if not args.weights:
            print("model path must be specified")
            exit()
        weights = args.weights

        para_state_dict, _ = fluid.load_dygraph(weights)
        # print('para_state_dict:', para_state_dict)
        infer_model.load_dict(para_state_dict)
        infer_model.eval()

        for batch_id, data in enumerate(infer_reader()):
            batch_imgs = np.array([item[0] for item in data]).astype('float32')
            batch_labels = np.array([[item[1]] for item in data])

            img = fluid.dygraph.to_variable(batch_imgs)
            out = infer_model(img)
            # argmax over class scores gives the predicted class id.
            label_id = fluid.layers.argmax(out, axis=1).numpy()[0]
            print("实际标签{}, 预测结果{}".format(batch_labels,
                                          label_dic[label_id]))
# Fragment of a larger evaluation routine: resume from a checkpoint (if one
# exists), switch the network to inference mode, and set up evaluation state.
# NOTE(review): `args`, `net`, `best_prec1`, and `data_loader` come from the
# enclosing scope, which is not visible in this chunk.
if os.path.isfile(args.resume):
    print(("=> loading checkpoint '{}'".format(args.resume)))
    checkpoint = torch.load(args.resume)
    # Restore training progress and the best accuracy recorded so far.
    args.start_epoch = checkpoint['epoch']
    best_prec1 = checkpoint['best_prec1']
    net.load_state_dict(checkpoint['state_dict'])
    print(("=> loaded checkpoint '{}' (epoch {})".format(
        args.resume, checkpoint['epoch'])))
else:
    # A missing checkpoint is not fatal; evaluation proceeds with the
    # network's current weights.
    print(("=> no checkpoint found at '{}'".format(args.resume)))
# ToDo: why
# if len(devices) > 1:  # cause bug
#     device = torch.device('cuda:{}'.format(devices[0]))
#     net = net.to(device)
net.eval()  # inference mode: fixes dropout / batch-norm behaviour
data_gen = enumerate(data_loader)
total_num = len(data_loader.dataset)
output = []  # collected per-video results

def eval_video(video_data):
    # Evaluate one (index, data, label) item; `length` is the per-frame
    # channel count for the modality (3 for RGB, 10 for stacked flow).
    i, data, label = video_data
    num_crop = args.test_crops
    if args.modality == 'RGB':
        length = 3
    elif args.modality == 'Flow':
        length = 10
    # (function body continues beyond this chunk)
# Continuation fragment: the opening of this DataLoader(...) call lies in a
# previous chunk; only the tail of its transform list and keyword arguments
# are visible here.
                       div=(args.arch not in ['BNInception', 'InceptionV3'])),
                   GroupNormalize(originalNet.input_mean,
                                  originalNet.input_std),
               ])),
    batch_size=1,
    shuffle=False,
    num_workers=args.workers * 2,
    pin_memory=False)
# Choose device ids: the explicit --gpus list if given, else the first
# `workers` GPU indices.
if args.gpus is not None:
    devices = [args.gpus[i] for i in range(args.workers)]
else:
    devices = list(range(args.workers))
#net = torch.nn.DataParallel(net.cuda(devices[0]), device_ids=devices)
originalNet = torch.nn.DataParallel(originalNet.cuda())
originalNet.eval()
# NOTE(review): this exit() terminates the process here, making everything
# below dead code — looks like a debugging leftover; confirm before removing.
exit()
data_gen = enumerate(data_loader)
total_num = len(data_loader.dataset)
output = []  # collected per-video results

def eval_video(video_data):
    # NOTE(review): the literal "RGBFlow" can never equal 'RGB' or 'Flow',
    # so both branches below are unreachable as written — presumably a
    # modality variable was meant instead of the hard-coded string.
    i, data, label = video_data
    num_crop = args.test_crops
    if "RGBFlow" == 'RGB':
        length = 3
    elif "RGBFlow" == 'Flow':
        length = 10
    # (function body continues beyond this chunk)
def main():
    """Validate an ECO-based TSN model (4 classes, RGB) over args.val_list.

    Loads weights from args.model_path, initialises any parameters missing
    from the checkpoint (BN weights to 1, other weights Xavier, biases to 0),
    then prints the top-1 prediction alongside the target for every
    validation sample.
    """
    global args, best_prec1
    # Hard-coded: 4 output classes and "<index>.jpg" frame file naming.
    num_class = 4
    rgb_read_format = "{:d}.jpg"
    model = TSN(num_class,
                args.num_segments,
                args.pretrained_parts,
                'RGB',
                base_model='ECO',
                consensus_type='identity',
                dropout=0.3,
                partial_bn=True)
    # Preprocessing parameters published by the model.
    crop_size = model.crop_size
    scale_size = model.scale_size
    input_mean = model.input_mean
    input_std = model.input_std
    # Optimizers also support per-parameter options: pass an iterable of
    # dicts, each defining a parameter group with its own 'params' key plus
    # optimizer keyword arguments (here: lr_mult / decay_mult per group).
    policies = model.get_optim_policies()
    train_augmentation = model.get_augmentation()
    model = torch.nn.DataParallel(model, device_ids=[0, 1]).cuda()
    model_dict = model.state_dict()
    print("pretrained_parts: ", args.pretrained_parts)
    model_dir = args.model_path
    new_state_dict = torch.load(model_dir)['state_dict']
    # Parameters present in the model but absent from the checkpoint.
    un_init_dict_keys = [
        k for k in model_dict.keys() if k not in new_state_dict
    ]
    print("un_init_dict_keys: ", un_init_dict_keys)
    print("\n------------------------------------")
    for k in un_init_dict_keys:
        # NOTE(review): DoubleTensor here, while model weights are typically
        # float32 — confirm load_state_dict casts these as intended.
        new_state_dict[k] = torch.DoubleTensor(model_dict[k].size()).zero_()
        if 'weight' in k:
            if 'bn' in k:
                # BatchNorm scale starts at 1.
                print("{} init as: 1".format(k))
                constant_(new_state_dict[k], 1)
            else:
                print("{} init as: xavier".format(k))
                xavier_uniform_(new_state_dict[k])
        elif 'bias' in k:
            print("{} init as: 0".format(k))
            constant_(new_state_dict[k], 0)
    print("------------------------------------")
    model.load_state_dict(new_state_dict)
    cudnn.benchmark = True
    # Data loading code
    normalize = GroupNormalize(input_mean, input_std)
    data_length = 1  # one RGB frame per segment
    val_loader = torch.utils.data.DataLoader(
        TSNDataSet("",
                   args.val_list,
                   num_segments=args.num_segments,
                   new_length=data_length,
                   modality='RGB',
                   image_tmpl=rgb_read_format,
                   random_shift=False,
                   transform=torchvision.transforms.Compose([
                       GroupScale(int(scale_size)),
                       GroupCenterCrop(crop_size),
                       Stack(roll=True),
                       ToTorchFormatTensor(div=False),
                       normalize,
                   ])),
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True)
    for group in policies:
        print(('group: {} has {} params, lr_mult: {}, decay_mult: {}'.format(
            group['name'], len(group['params']), group['lr_mult'],
            group['decay_mult'])))
    model.eval()  # inference mode for the whole validation pass
    for i, (input, target) in enumerate(val_loader):
        target = target.cuda()
        input_var = input
        target_var = target
        output = model(input_var)
        # Top-1 prediction per sample, printed next to the ground truth.
        _, pred = output.data.topk(1, 1, True, True)
        print(pred, target)
    print('done')
def get_pred(video_path, caption_path, opt):
    """Score every clip under ``video_path`` with a pretrained TRN model.

    Builds its own argparse options (defaults target somethingv2 / RGB /
    BNInception with multiscale TRN), loads the checkpoint named by
    ``--weights``, and returns ``(prob_all, pred_all)``: per-clip top-k
    probabilities and predicted class indices.
    """
    # options
    parser = argparse.ArgumentParser(
        description="TRN testing on the full validation set")
    # parser.add_argument('dataset', type=str, choices=['something','jester','moments','charades'])
    # parser.add_argument('modality', type=str, choices=['RGB', 'Flow', 'RGBDiff'])
    parser.add_argument('--dataset', type=str, default='somethingv2')
    parser.add_argument('--modality', type=str, default='RGB')
    parser.add_argument(
        '--weights',
        type=str,
        default=
        'model/TRN_somethingv2_RGB_BNInception_TRNmultiscale_segment8_best.pth.tar'
    )
    parser.add_argument('--arch', type=str, default="BNInception")
    parser.add_argument('--save_scores', type=str, default=None)
    parser.add_argument('--test_segments', type=int, default=8)
    parser.add_argument('--max_num', type=int, default=-1)
    parser.add_argument('--test_crops', type=int, default=10)
    parser.add_argument('--input_size', type=int, default=224)
    parser.add_argument('--crop_fusion_type',
                        type=str,
                        default='TRNmultiscale',
                        choices=['avg', 'TRN', 'TRNmultiscale'])
    parser.add_argument('-j',
                        '--workers',
                        default=4,
                        type=int,
                        metavar='N',
                        help='number of data loading workers (default: 4)')
    parser.add_argument('--gpus', nargs='+', type=int, default=None)
    parser.add_argument('--img_feature_dim', type=int, default=256)
    parser.add_argument(
        '--num_set_segments',
        type=int,
        default=1,
        help='TODO: select multiply set of n-frames from a video')
    parser.add_argument('--softmax', type=int, default=0)
    # NOTE(review): parse_args() inside a library function also consumes the
    # host process's CLI arguments — confirm this is intended.
    args = parser.parse_args()

    def accuracy(output, target, topk=(1, )):
        """Computes the precision@k for the specified values of k"""
        maxk = max(topk)
        batch_size = target.size(0)
        prob, pred = output.topk(maxk, 1, True, True)
        prob = prob.t().data.numpy().squeeze()
        pred = pred.t().data.numpy().squeeze()
        return prob, pred

    categories, args.train_list, args.val_list, args.root_path, prefix = datasets_video.return_dataset(
        args.dataset, args.modality, opt)
    num_class = len(categories)
    # TRN consensus variants consume all test segments; plain averaging uses 1.
    net = TSN(num_class,
              args.test_segments
              if args.crop_fusion_type in ['TRN', 'TRNmultiscale'] else 1,
              args.modality,
              base_model=args.arch,
              consensus_type=args.crop_fusion_type,
              img_feature_dim=args.img_feature_dim,
              opt=opt)
    # Fall back to the project-relative weights location when the direct path
    # fails.  NOTE(review): the bare `except` also hides unrelated errors.
    try:
        checkpoint = torch.load(args.weights)
    except:
        args.weights = os.path.join(opt.project_root, 'scripts/Eval/',
                                    args.weights)
        checkpoint = torch.load(args.weights)
    print("model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))
    # Strip the leading "module." that DataParallel adds to state-dict keys.
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in list(checkpoint['state_dict'].items())
    }
    net.load_state_dict(base_dict)
    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.input_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose(
            [GroupOverSample(net.input_size, net.scale_size)])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported while we got {}".format(
                args.test_crops))
    data_loader = torch.utils.data.DataLoader(
        TSNDataSet(video_path,
                   caption_path,
                   num_segments=args.test_segments,
                   new_length=1 if args.modality == "RGB" else 5,
                   modality=args.modality,
                   image_tmpl=prefix,
                   test_mode=True,
                   transform=torchvision.transforms.Compose([
                       cropping,
                       Stack(roll=(args.arch
                                   in ['BNInception', 'InceptionV3'])),
                       ToTorchFormatTensor(
                           div=(args.arch
                                not in ['BNInception', 'InceptionV3'])),
                       GroupNormalize(net.input_mean, net.input_std),
                   ])),
        batch_size=1,
        shuffle=False,
        num_workers=args.workers * 2,
        pin_memory=True)
    if args.gpus is not None:
        devices = [args.gpus[i] for i in range(args.workers)]
    else:
        devices = list(range(args.workers))
    #net = torch.nn.DataParallel(net.cuda(devices[0]), device_ids=devices)
    net = torch.nn.DataParallel(net.cuda())
    net.eval()
    data_gen = enumerate(data_loader)
    output = []  # collected (scores, label) pairs per clip

    def eval_video(video_data):
        # Score one (index, data, label) item; returns (index, scores, label).
        i, data, label = video_data
        num_crop = args.test_crops
        # Channels per frame for each modality.
        if args.modality == 'RGB':
            length = 3
        elif args.modality == 'Flow':
            length = 10
        elif args.modality == 'RGBDiff':
            length = 18
        else:
            raise ValueError("Unknown modality " + args.modality)
        # NOTE(review): `volatile=True` is a pre-0.4 PyTorch idiom (ignored on
        # modern versions) — torch.no_grad() is the current equivalent.
        input_var = torch.autograd.Variable(data.view(-1, length, data.size(2),
                                                      data.size(3)),
                                            volatile=True)
        rst = net(input_var)
        if args.softmax == 1:
            # take the softmax to normalize the output to probability
            rst = F.softmax(rst)
        rst = rst.data.cpu().numpy().copy()
        if args.crop_fusion_type in ['TRN', 'TRNmultiscale']:
            rst = rst.reshape(-1, 1, num_class)
        else:
            # Average over crops, keeping one score row per segment.
            rst = rst.reshape((num_crop, args.test_segments,
                               num_class)).mean(axis=0).reshape(
                                   (args.test_segments, 1, num_class))
        return i, rst, label[0]

    max_num = args.max_num if args.max_num > 0 else len(data_loader.dataset)
    prob_all, pred_all = [], []
    for i, (data, label) in data_gen:
        if i >= max_num:
            break
        rst = eval_video((i, data, label))
        output.append(rst[1:])
        # NOTE(review): topk=(1, 174) hard-codes the somethingv2 class count;
        # confirm before reusing with other datasets.
        prob, pred = accuracy(torch.from_numpy(np.mean(rst[1], axis=0)),
                              label,
                              topk=(1, 174))
        prob_all.append(prob)
        pred_all.append(pred)
    return prob_all, pred_all
def main(conf, test_set, test_part=-1):
    """Score an EPIC test set with a pretrained TSN and save an .npz file.

    Args:
        conf: configuration object (paths, modality, arch, crops, gpus, ...).
        test_set: name of the test split directory under the gulp root.
        test_part: optional positive partition index; when > 0 only that part
            of the dataset is processed and the output filename is suffixed.
    """
    gulp_path = os.path.join(conf.gulp_test_dir, conf.modality.lower(),
                             'test', test_set)
    gulp_path = os.path.realpath(gulp_path)
    gulp_path = Path(gulp_path)
    # NOTE(review): the file handle from open() is never closed — consider a
    # `with` block.
    classes_map = pickle.load(open(conf.classes_map, "rb"))
    conf.num_classes = count_num_classes(classes_map)
    net = TSN(conf.num_classes,
              1,
              conf.modality,
              base_model=conf.arch,
              consensus_type=conf.crop_fusion_type,
              dropout=conf.dropout)
    checkpoint = torch.load(conf.weights)
    print("Model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))
    # Strip the leading "module." that DataParallel adds to state-dict keys.
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in list(checkpoint['state_dict'].items())
    }
    net.load_state_dict(base_dict)
    if conf.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.input_size),
        ])
    elif conf.test_crops == 10:
        cropping = torchvision.transforms.Compose(
            [GroupOverSample(net.input_size, net.scale_size)])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported while we got {}".format(
                conf.test_crops))
    # 'action' is evaluated as the combined verb+noun label space.
    class_type = 'verb+noun' if conf.class_type == 'action' else conf.class_type
    if conf.modality == 'Flow':
        dataset = EpicVideoFlowDataset(gulp_path=gulp_path,
                                       class_type=class_type)
    else:
        dataset = EpicVideoDataset(gulp_path=gulp_path, class_type=class_type)
    data_loader = torch.utils.data.DataLoader(
        EpicTSNTestDataset(dataset,
                           classes_map,
                           num_segments=conf.test_segments,
                           new_length=1 if conf.modality == "RGB" else 5,
                           modality=conf.modality,
                           transform=torchvision.transforms.Compose([
                               cropping,
                               Stack(roll=conf.arch == 'BNInception'),
                               ToTorchFormatTensor(
                                   div=conf.arch != 'BNInception'),
                               GroupNormalize(net.input_mean, net.input_std),
                           ]),
                           part=test_part),
        batch_size=1,
        shuffle=False,
        num_workers=conf.workers * 2,
        pin_memory=True)
    net = torch.nn.DataParallel(net, device_ids=conf.gpus).cuda()
    net.eval()
    total_num = len(data_loader.dataset)
    output = []  # collected per-video results from eval_video
    proc_start_time = time.time()
    for i, (keys, input_) in enumerate(data_loader):
        rst = eval_video(conf, (i, keys, input_), net)
        output.append(rst[1:])
        cnt_time = time.time() - proc_start_time
        print('video {} done, total {}/{}, average {} sec/video'.format(
            i, i + 1, total_num,
            float(cnt_time) / (i + 1)))
    video_index = [x[0] for x in output]
    scores = [x[1] for x in output]
    # Output filename encodes the run configuration and checkpoint epoch.
    save_scores = './{}/tsn_{}_{}_testset_{}_{}_lr_{}_model_{:03d}.npz'.format(
        conf.checkpoint, conf.class_type, conf.modality.lower(), test_set,
        conf.arch, conf.lr, checkpoint['epoch'])
    if test_part > 0:
        save_scores = save_scores.replace('.npz',
                                          '_part-{}.npz'.format(test_part))
    np.savez(save_scores, segment_indices=video_index, scores=scores)
def eval_one_model(num_class, modality, weights, devices, args):
    """Evaluate a single TSN checkpoint over the test list for one modality.

    Args:
        num_class: number of output classes.
        modality: input modality string (e.g. 'RGB', 'tvl1', flow variants).
        weights: path to the checkpoint file to load.
        devices: GPU device ids for DataParallel (first is the primary).
        args: namespace with arch, crops, dataset, paths, workers, etc.

    Returns:
        (output, video_labels, class_acc_map): per-video scores/labels, the
        label list, and the per-class accuracy mapping.
    """
    # init model
    net = TSN(num_class,
              1,
              modality,
              base_model=args.arch,
              consensus_type=args.crop_fusion_type,
              dropout=args.dropout,
              mdl=args.mdl,
              pretrained=False)
    # load checkpoint
    checkpoint = torch.load(weights)
    print("model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))
    # Keys are used as-is here (no "module." stripping, unlike the variant
    # kept commented out below).
    base_dict = checkpoint['state_dict']
    # base_dict = {'.'.join(k.split('.')[1:]): v for k,v in list(checkpoint['state_dict'].items())}
    net.load_state_dict(base_dict)
    # transformer
    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.input_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose(
            [GroupOverSample(net.input_size, net.scale_size)])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported while we got {}".format(
                args.test_crops))
    # prepare dataset
    # Frame-name templates differ per dataset and per modality.
    if args.dataset == 'ucf101':
        naming_pattern = "frame{:06d}.jpg" if modality in [
            "RGB", "RGBDiff", 'tvl1'
        ] else args.flow_prefix + "{}_{:06d}.jpg"
    else:
        naming_pattern = "image_{:05d}.jpg" if modality in [
            "RGB", "RGBDiff"
        ] else args.flow_prefix + "{}_{:05d}.jpg"
    data_loader = torch.utils.data.DataLoader(
        TSNDataSet(os.path.join(
            args.data_root_path,
            ('jpegs_256' if modality == 'RGB' else 'tvl1_flow')),
                   args.test_list,
                   num_segments=args.test_segments,
                   new_length=4 if modality == "RGB" else 6,
                   modality=modality,
                   image_tmpl=naming_pattern,
                   test_mode=True,
                   dataset=args.dataset,
                   transform=torchvision.transforms.Compose([
                       cropping,
                       Stack(roll=args.arch == 'BNInception'),
                       ToTorchFormatTensor(div=args.arch != 'BNInception'),
                       GroupNormalize(net.input_mean, net.input_std),
                   ])),
        batch_size=1,
        shuffle=False,
        num_workers=args.workers * 2,
        pin_memory=True)
    data_gen = iter(data_loader)
    total_num = len(data_loader.dataset)
    output = []  # [class probability, label code]
    # Inferencing
    net = torch.nn.DataParallel(net.cuda(devices[0]), device_ids=devices)
    net.eval()
    max_num = len(data_loader.dataset)
    for i in tqdm(range(max_num)):
        data, label = next(data_gen)
        # NOTE(review): this break can never fire — `i` ranges over
        # range(max_num), so i >= max_num is always false.
        if i >= max_num:
            break
        output.append(
            eval_video(net, (i, data, label), num_class, modality, args))
    # Predicted class = argmax of the crop-averaged score per video.
    video_pred = [np.argmax(np.mean(x[1], axis=0)) for x in output]
    video_labels = [x[2] for x in output]
    # summarize results
    cf = confusion_matrix(video_labels, video_pred).astype(float)
    cls_cnt = cf.sum(axis=1)  # samples per class
    cls_hit = np.diag(cf)  # correct predictions per class
    cls_acc = cls_hit / cls_cnt
    print('Accuracy of {}, {:.02f}%'.format(modality, np.mean(cls_acc) * 100))
    # Free GPU memory before evaluating the next model.
    del net
    del data_loader
    class_acc_map = class_acc_mapping(cls_acc, args.dataset)
    return output, video_labels, class_acc_map
# Continuation fragment: tail of a DataLoader(TSNDataSet(...)) call whose
# opening lies in a previous chunk; only the transform list and the loader's
# keyword arguments are visible here.
                       cropping,
                       Stack(roll=args.arch == 'BNInception'),
                       ToTorchFormatTensor(div=args.arch != 'BNInception'),
                       GroupNormalize(net.input_mean, net.input_std),
                   ])),
    batch_size=1,
    shuffle=False,
    num_workers=args.workers * 2,
    pin_memory=True)
# Choose device ids: the explicit --gpus list if given, else the first
# `workers` GPU indices.
if args.gpus is not None:
    devices = [args.gpus[i] for i in range(args.workers)]
else:
    devices = list(range(args.workers))
net = torch.nn.DataParallel(net.cuda(devices[0]), device_ids=devices)
net.eval()  # inference mode for the evaluation pass
data_gen = enumerate(data_loader)
total_num = len(data_loader.dataset)
output = []  # collected per-video results

def eval_video(video_data):
    # Evaluate one (index, data, label) item; `length` is the per-frame
    # channel count for the modality (3 for RGB, 10 for stacked flow).
    i, data, label = video_data
    num_crop = args.test_crops
    if args.modality == 'RGB':
        length = 3
    elif args.modality == 'Flow':
        length = 10
    # (function body continues beyond this chunk)