elif model == 'RGBDiff': input = data.view(-1, 18, data.size(1), data.size(2)) output =torch.cat((pre_scoresRGBDiff,model_RGBDiff(input)),0) pre_scoresRGBDiff = output.data[-(args.num_segments - args.delta):,] output_tensor = output.data.mean(dim = 0,keepdim=True) return output_tensor if args.dataset == 'ucf101': num_class = 101 else: raise ValueError('Unkown dataset: ' + args.dataset) model_RGB = TSN_model(num_class, 1, 'RGB', base_model_name=args.arch, consensus_type='avg', dropout=args.dropout) model_RGBDiff = TSN_model(num_class, 1, 'RGBDiff', base_model_name=args.arch, consensus_type='avg', dropout=args.dropout) for i in range(len(args.weights)): #load the weights of your model training checkpoint = torch.load(args.weights[i]) print("epoch {}, best acc1@: {}" .format(checkpoint['epoch'], checkpoint['best_acc1'])) base_dict = {'.'.join(k.split('.')[1:]): v for k,v in list(checkpoint['state_dict'].items())} if i==0: model_RGB.load_state_dict(base_dict) else: model_RGBDiff.load_state_dict(base_dict)
def First_step():
    """Run the two-stream (RGB + RGBDiff) recognition server loop.

    Builds one TSN model per stream, loads one checkpoint per stream from
    ``args.weights`` (index 0 -> RGB, every later index -> RGBDiff), opens the
    connections through ``set_server`` and then, for every batch of
    ``args.test_segments * 6`` received frames, fuses the scores of both
    streams with ``args.score_weights`` and sends the top-N result back.

    Relies on names defined outside this function: ``args`` and the
    module-level score buffers ``pre_scoresRGB`` / ``pre_scoresRGBDiff``.

    Raises:
        ValueError: if ``args.dataset`` is not ``'ucf101'``.
    """
    test_segments = args.test_segments
    num_crop = args.test_crops

    def eval_video(data, model):
        """Forward ``data`` through one stream and return averaged scores.

        The fresh network output is concatenated after the rows kept from the
        previous call (``pre_scoresRGB`` / ``pre_scoresRGBDiff``), and the
        last 3 rows of the result are stored back for the next call.

        Args:
            data: transformed frame tensor; viewed as
                ``(-1, length, data.size(1), data.size(2))`` where ``length``
                is 3 for ``'RGB'`` and 18 for ``'RGBDiff'``.
            model: ``'RGB'`` or ``'RGBDiff'``; selects the network.

        Returns:
            numpy array of shape ``(1, num_class)``.

        Raises:
            ValueError: if ``model`` names neither stream (previously this
                surfaced as an unbound-variable error).
        """
        global pre_scoresRGB
        global pre_scoresRGBDiff

        with torch.no_grad():
            if model == 'RGB':
                net_input = data.view(-1, 3, data.size(1), data.size(2))
                output = torch.cat((pre_scoresRGB, model_RGB(net_input)))
                pre_scoresRGB = output.data[-3:]
            elif model == 'RGBDiff':
                net_input = data.view(-1, 18, data.size(1), data.size(2))
                output = torch.cat((pre_scoresRGBDiff, model_RGBDiff(net_input)))
                pre_scoresRGBDiff = output.data[-3:]
            else:
                raise ValueError("Unknown modality " + str(model))

            output_np = output.data.cpu().numpy().copy()
            # (num_crop, test_segments*2, num_class): fresh rows plus the rows
            # carried over from the previous call.
            output_np = output_np.reshape((num_crop, test_segments * 2, num_class))
            # Average over crops, then over segments.
            output_np = output_np.mean(axis=0).reshape((test_segments * 2, 1, num_class))
            output_np = output_np.mean(axis=0)
        return output_np

    if args.dataset == 'ucf101':
        num_class = 101
    else:
        raise ValueError('Unkown dataset: ' + args.dataset)

    model_RGB = TSN_model(num_class, 1, 'RGB',
                          base_model_name=args.arch,
                          consensus_type='avg',
                          dropout=args.dropout)
    model_RGBDiff = TSN_model(num_class, 1, 'RGBDiff',
                              base_model_name=args.arch,
                              consensus_type='avg',
                              dropout=args.dropout)

    # Load the trained weights: first path feeds the RGB stream, any later
    # path feeds the RGBDiff stream.
    for i, weights_path in enumerate(args.weights):
        checkpoint = torch.load(weights_path)
        print("epoch {}, best acc1@: {}"
              .format(checkpoint['epoch'], checkpoint['best_acc1']))
        # Drop the first dotted component of every parameter name
        # (presumably the 'module.' prefix of a DataParallel checkpoint).
        base_dict = {'.'.join(k.split('.')[1:]): v
                     for k, v in checkpoint['state_dict'].items()}
        if i == 0:
            model_RGB.load_state_dict(base_dict)
        else:
            model_RGBDiff.load_state_dict(base_dict)

    cropping = torchvision.transforms.Compose([
        GroupScale(model_RGB.scale_size),
        GroupCenterCrop(model_RGB.input_size),
    ])

    # Required transformations (shared by both streams).
    transform = torchvision.transforms.Compose([
        cropping,
        Stack(roll=args.arch == 'BNInception'),
        ToTorchFormatTensor(div=args.arch != 'BNInception'),
        GroupNormalize(model_RGB.input_mean, model_RGB.input_std),
    ])

    if args.gpus is not None:
        devices = [args.gpus[i] for i in range(args.workers)]
    else:
        devices = list(range(args.workers))

    model_RGB = torch.nn.DataParallel(model_RGB.cuda(devices[0]), device_ids=devices)
    model_RGBDiff = torch.nn.DataParallel(model_RGBDiff.cuda(devices[0]), device_ids=devices)
    model_RGB.eval()
    model_RGBDiff.eval()

    frames = []
    action_checker = True

    conn, transport = set_server(ip="0.0.0.0", port=args.port, Tunnel=True,
                                 n_conn=2, hostname=args.hostname)
    if conn is None:
        return

    # Pre-bind so the cleanup in ``finally`` cannot hit an unbound name when
    # setup fails before the worker threads exist.
    rcv_frames = None
    send_results = None
    try:
        top5_actions = Top_N(args.classInd_file)
        rcv_frames = rcv_frames_thread(connection=conn[0])
        send_results = send_results_thread(connection=conn[1], test=args.test)

        # NOTE(review): ``isAlive`` is kept as written; if these classes derive
        # from threading.Thread, that alias no longer exists on Python >= 3.9.
        while rcv_frames.isAlive() and send_results.isAlive():
            if rcv_frames.CheckReset():
                frames = []
                rcv_frames.ConfirmReset()

            frame, status = rcv_frames.get()
            # The integer 0 is the stop sentinel. Compare by type and value:
            # ``is 0`` depends on int caching and ``== 0`` alone would be
            # ambiguous for array frames.
            if isinstance(frame, int) and frame == 0:
                break
            if frame is None:
                continue

            frames.append(Image.fromarray(frame))

            if len(frames) == test_segments * 6:
                frames = transform(frames).cuda()
                # NOTE(review): the step-6 slice runs along dim 0 of the
                # transformed tensor; confirm that dim indexes frames rather
                # than stacked channels.
                scores_RGB = eval_video(frames[0:len(frames):6], 'RGB')
                scores_RGBDiff = eval_video(frames[:], 'RGBDiff')

                final_scores = (args.score_weights[0] * scores_RGB
                                + args.score_weights[1] * scores_RGBDiff)

                top5_actions.import_scores(final_scores[0, ])
                indices_TopN, _, scores_TopN = top5_actions.get_top_N_actions()
                action_checker = Evaluation(scores_TopN, args.psi)
                send_results.put(status=status,
                                 scores=(*indices_TopN, *scores_TopN),
                                 Actf=action_checker)
                frames = []
            else:
                send_results.put(status=status, Actf=action_checker)

    except (KeyboardInterrupt, IOError, OSError):
        # Deliberate: an interrupt or a dropped connection ends the loop and
        # falls through to the cleanup below.
        pass
    finally:
        if rcv_frames is not None:
            rcv_frames.close()
        if send_results is not None:
            send_results.close()
        conn[0].close()
        conn[1].close()
        if transport:
            transport.close()
parser.add_argument('--gpus', nargs='+', type=int, default=None)
parser.add_argument('--flow_prefix', type=str, default='')
parser.add_argument('--classInd_file', type=str, default='')

args = parser.parse_args()

# Only UCF101 (101 classes) is supported; anything else is rejected up front.
if args.dataset != 'ucf101':
    raise ValueError('Unkown dataset: ' + args.dataset)
num_class = 101

# The network is built with num_segments fixed to 1 (second argument).
# Per the original author's note, the number of snippets per video is
# chosen later on — TODO confirm against the code that follows.
model = TSN_model(
    num_class,
    1,
    args.modality,
    base_model_name=args.arch,
    consensus_type=args.crop_fusion_type,
    dropout=args.dropout,
)

# args.weights is the path of the checkpoint file written during training.
checkpoint = torch.load(args.weights)
print("epoch {}, best acc1@: {}".format(checkpoint['epoch'], checkpoint['best_acc1']))

# Strip everything up to and including the first '.' from each parameter
# name (presumably the 'module.' prefix of a DataParallel checkpoint).
base_dict = {}
for param_name, param_value in checkpoint['state_dict'].items():
    base_dict[param_name.partition('.')[2]] = param_value
model.load_state_dict(base_dict)
def main():
    """Train a TSN model, or only validate it when ``args.evaluate`` is set.

    Parses the command line into the module-level ``args``, builds the model
    and its data loaders, optionally resumes from ``args.resume``, then either
    runs a single validation pass or the full train/validate loop, saving a
    checkpoint after every validation.

    Uses the module-level ``best_acc1``; it is overwritten when a checkpoint
    is resumed and otherwise must already be defined before the first
    validation.

    Raises:
        ValueError: if ``args.dataset`` is not ``'ucf101'`` or
            ``args.loss_type`` is not ``'nll'``.
    """
    global args, best_acc1
    args = parser.parse_args()

    if args.dataset == 'ucf101':
        num_classes = 101
    else:
        raise ValueError('Unknown dataset: ' + args.dataset)

    model = TSN_model(num_classes, args.num_segments, args.modality,
                      base_model_name=args.arch,
                      consensus_type=args.consensus_type,
                      dropout=args.dropout,
                      partial_bn=not args.no_partialbn,
                      KinWeights=args.KinWeights)

    # Read everything needed from the bare model before it is wrapped.
    crop_size = model.crop_size
    scale_size = model.scale_size
    input_mean = model.input_mean
    input_std = model.input_std
    policies = model.get_optim_policies()
    train_augmentation = model.get_augmentation()

    # args.gpus is a list of device ids (e.g. [0, 1, 2, 3] for four GPUs).
    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()

    # args.resume is the path of the checkpoint to continue from; an empty
    # value means start from scratch.
    if args.resume:
        if os.path.isfile(args.resume):
            print("Loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            # Report the file that was actually loaded (this message used to
            # print args.evaluate instead of the checkpoint path).
            print("Loaded checkpoint '{}' epoch {}".format(
                args.resume, checkpoint['epoch']))
        else:
            print("No checkpoint found at '{}'".format(args.resume))

    # Let cuDNN auto-tune its algorithms; this helps when input sizes stay
    # constant and can hurt when they change every iteration.
    cudnn.benchmark = True

    # RGBDiff input is passed through IdentityTransform (presumably a no-op)
    # instead of being mean/std normalised.
    if args.modality != 'RGBDiff':
        normalize = GroupNormalize(input_mean, input_std)
    else:
        normalize = IdentityTransform()

    # new_length handed to the dataset: 1 for RGB, 5 for any other modality.
    if args.modality == 'RGB':
        data_length = 1
    else:
        data_length = 5

    train_loader = torch.utils.data.DataLoader(
        TSNDataset(
            args.train_list,
            num_segments=args.num_segments,
            new_length=data_length,
            modality=args.modality,
            image_prefix='frame{:06d}.jpg',
            transform=torchvision.transforms.Compose([
                train_augmentation,
                Stack(roll=args.arch == 'BNInception'),
                # Converts the stacked images to a tensor; the div flag is
                # off for BNInception (presumably it controls 0-255 -> 0-1
                # scaling — confirm in ToTorchFormatTensor).
                ToTorchFormatTensor(div=args.arch != 'BNInception'),
                normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=True,
        # Number of loader subprocesses (0 loads in the main process); more
        # workers with a large batch size cost correspondingly more RAM.
        num_workers=args.workers,
        pin_memory=True)

    val_loader = torch.utils.data.DataLoader(
        TSNDataset(
            args.val_list,
            num_segments=args.num_segments,
            new_length=data_length,
            modality=args.modality,
            image_prefix='frame{:06d}.jpg',
            train_val_switch=False,
            transform=torchvision.transforms.Compose([
                GroupScale(int(scale_size)),
                GroupCenterCrop(crop_size),
                Stack(roll=args.arch == 'BNInception'),
                ToTorchFormatTensor(div=args.arch != 'BNInception'),
                normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    # Only one loss type is defined (cross entropy, selected by 'nll').
    if args.loss_type == 'nll':
        criterion = torch.nn.CrossEntropyLoss().cuda()
    else:
        raise ValueError("Unkown loss type")

    print('---------------------------------------------------')
    for group in policies:
        print('group: {} has {} params, lr_mult: {}, decay_mult: {}'.format(
            group['name'], len(group['params']),
            group['lr_mult'], group['decay_mult']))
    print('---------------------------------------------------')

    # Stochastic gradient descent over the per-group policies.
    optimizer = torch.optim.SGD(policies,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Evaluation-only mode: one validation pass, then stop.
    if args.evaluate:
        validate(val_loader, model, criterion, 0)
        return

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch, args.lr_steps)

        # Train for one epoch.
        train(train_loader, model, criterion, optimizer, epoch)

        # Validate every eval_freq epochs and always after the last epoch.
        if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
            acc1 = validate(val_loader, model, criterion,
                            (epoch + 1) * len(train_loader))

            # Remember the best acc@1 and save a checkpoint.
            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                }, is_best)
def First_step():
    """Run the single-stream recognition server loop.

    Builds one TSN model for ``args.modality``, loads ``args.weights``, opens
    the connections through ``Network.set_server`` and then, on every 5th
    received frame, classifies the buffered frames and sends the top-N
    result back; on all other frames only the status is sent.

    Relies on the module-level ``args``.

    Raises:
        ValueError: if ``args.dataset`` is not ``'ucf101'`` or
            ``args.test_crops`` is neither 1 nor 10.
    """
    num_crop = args.test_crops
    test_segments = args.test_segments

    def eval_video(data):
        """Forward ``data`` through the model and return averaged scores.

        Args:
            data: transformed frame tensor; viewed as
                ``(-1, length, data.size(1), data.size(2))`` where ``length``
                is 3 for RGB and 18 for RGBDiff.

        Returns:
            numpy array of shape ``(1, num_class)``.

        Raises:
            ValueError: if ``args.modality`` is neither ``'RGB'`` nor
                ``'RGBDiff'``.
        """
        if args.modality == 'RGB':
            length = 3
        elif args.modality == 'RGBDiff':
            length = 18
        else:
            raise ValueError("Unknown modality " + args.modality)

        with torch.no_grad():
            net_input = data.view(-1, length, data.size(1), data.size(2))
            output = model(net_input)
            output_np = output.data.cpu().numpy().copy()
            # (num_crop, test_segments, num_class)
            output_np = output_np.reshape((num_crop, test_segments, num_class))
            # Average over crops, then over segments.
            output_np = output_np.mean(axis=0).reshape((test_segments, 1, num_class))
            output_np = output_np.mean(axis=0)
        return output_np

    # NOTE(review): the result is not used below; the call is kept because
    # label_dic is defined elsewhere and may validate the class-index file.
    action_label = label_dic(args.classInd_file)

    if args.dataset == 'ucf101':
        num_class = 101
    else:
        raise ValueError('Unkown dataset: ' + args.dataset)

    model = TSN_model(num_class, 1, args.modality,
                      base_model_name=args.arch,
                      consensus_type='avg',
                      dropout=args.dropout)

    # Load the trained weights, dropping the first dotted component of every
    # parameter name (presumably the 'module.' prefix of DataParallel).
    checkpoint = torch.load(args.weights)
    print("epoch {}, best acc1@: {}"
          .format(checkpoint['epoch'], checkpoint['best_acc1']))
    base_dict = {'.'.join(k.split('.')[1:]): v
                 for k, v in checkpoint['state_dict'].items()}
    model.load_state_dict(base_dict)

    # test_crops is set to 1 for fast video evaluation.
    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(model.scale_size),
            GroupCenterCrop(model.input_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(model.input_size, model.scale_size)
        ])
    else:
        # Format the value actually tested; the bare name ``test_crops`` does
        # not exist here and used to turn this into a NameError.
        raise ValueError("Only 1 and 10 crops are supported while we got {}".format(args.test_crops))

    # Required transformations.
    transform = torchvision.transforms.Compose([
        cropping,
        Stack(roll=args.arch == 'BNInception'),
        ToTorchFormatTensor(div=args.arch != 'BNInception'),
        GroupNormalize(model.input_mean, model.input_std),
    ])

    if args.gpus is not None:
        devices = [args.gpus[i] for i in range(args.workers)]
    else:
        devices = list(range(args.workers))

    model = torch.nn.DataParallel(model.cuda(devices[0]), device_ids=devices)
    model.eval()

    # eval_video returns a (1, num_class) array, so the class axis is dim 1;
    # this is what the implicit-dim Softmax() picked for 2-D input.
    softmax = torch.nn.Softmax(dim=1)
    frames = []
    frame_count = 0

    # Pre-bind so the cleanup in ``finally`` cannot hit an unbound name when
    # setup fails part-way through.
    conn = None
    rcv_frames = None
    send_results = None
    try:
        top5_actions = Top_N(args.classInd_file)
        conn, T_thr = Network.set_server(port=6666, Tunnel=True, n=1)
        rcv_frames = Streaming.rcv_frames_thread(connection=conn[0])
        send_results = Streaming.send_results_thread(connection=conn[1])

        # NOTE(review): ``isAlive`` is kept as written; if these classes derive
        # from threading.Thread, that alias no longer exists on Python >= 3.9.
        while rcv_frames.isAlive() and send_results.isAlive():
            frame, status = rcv_frames.get()
            # The integer 0 is the stop sentinel. Compare by type and value:
            # ``is 0`` depends on int caching and ``== 0`` alone would be
            # ambiguous for array frames.
            if isinstance(frame, int) and frame == 0:
                break

            frame_count += 1
            frame = Image.fromarray(frame)

            # NOTE(review): frames are only buffered for the RGB modality;
            # with any other modality the buffer is still empty when it is
            # evaluated below — confirm the intended handling.
            if args.modality == 'RGB':
                frames.append(frame)

            # frame_count is already >= 1 here, so only the modulo matters.
            if frame_count % 5 == 0:
                frames = transform(frames).cuda()
                raw_scores = eval_video(frames)
                probs = softmax(torch.FloatTensor(raw_scores))
                probs = probs.data.cpu().numpy().copy()
                top5_actions.import_scores(probs[0, ])
                indices, _, top_scores = top5_actions.get_top_N_actions()
                send_results.put(status=status, scores=(*indices, *top_scores))
                frames = []
            else:
                send_results.put(status=status)

    except (KeyboardInterrupt, IOError, OSError):
        # Deliberate: an interrupt or a dropped connection ends the loop and
        # falls through to the cleanup below.
        pass
    finally:
        if rcv_frames is not None:
            rcv_frames.close()
        if send_results is not None:
            send_results.close()
        if conn is not None:
            conn[0].close()
            conn[1].close()
def webcam(ip, port, weight_dir, test_crops, ClassIndDir, arch='BNInception'):
    """Classify frames received from a remote webcam and display the result.

    Builds a 101-class RGB TSN model (BNInception base), loads the checkpoint
    at ``weight_dir``, then accumulates the network output frame by frame.
    Every 50th frame (starting with the very first one) the accumulated
    scores are soft-maxed, the top 5 classes are recorded and the accumulator
    is reset. The latest top 5 are drawn on every displayed frame. Pressing
    ``q`` in the window stops the loop.

    Args:
        ip: address handed to ``Frames_rcv``.
        port: port handed to ``Frames_rcv``.
        weight_dir: path of the checkpoint file to load.
        test_crops: accepted for backward compatibility; the crop pipeline it
            used to select was never applied to the frames, so it has no
            effect here.
        ClassIndDir: class-index file handed to ``IdxtoClass``.
        arch: only controls the ``div`` flag of ``ToTorchFormatTensor``; the
            model itself is always built with the BNInception base.
    """
    # Prepare the model.
    model = TSN_model(num_classes=101, num_segments=1, modality='RGB',
                      consensus_type='avg', base_model_name='BNInception',
                      new_length=None, before_softmax=True, dropout=0.8,
                      crop_num=1, partial_bn=True)
    model.eval()
    model.cuda()

    # Map class idx to class name.
    idx_to_class = IdxtoClass(ClassIndDir)

    # Load the weights, dropping the first dotted component of every
    # parameter name (presumably the 'module.' prefix of DataParallel).
    checkpoint = torch.load(weight_dir)
    print("epoch {}, best acc1@: {}"
          .format(checkpoint['epoch'], checkpoint['best_acc1']))
    base_dict = {'.'.join(k.split('.')[1:]): v
                 for k, v in checkpoint['state_dict'].items()}
    model.load_state_dict(base_dict)

    # Required transformations for a single frame.
    transform = torchvision.transforms.Compose([
        Transforms.Resize(model.scale_size, interpolation=Image.BILINEAR),
        Transforms.CenterCrop(model.input_size),
        ToTorchFormatTensor(div=arch != 'BNInception'),
        GroupNormalize(model.input_mean, model.input_std),
    ])

    # nn_output is (1, 101), so the class axis is dim 1; this is what the
    # implicit-dim Softmax() picked for 2-D input.
    softmax = torch.nn.Softmax(dim=1)
    nn_output = torch.tensor(np.zeros((1, 101)), dtype=torch.float32).cuda()
    frame_count = 0
    pred_classes = []

    # Pre-bind so cleanup is safe even when the receiver cannot be created.
    frameobj = None
    try:
        frameobj = Frames_rcv(ip, port)
        frameobj.start()

        while frameobj.is_alive():
            # Keep the raw frame for display; the original code drew on an
            # undefined ``orig_frame`` and the error was silently swallowed.
            orig_frame = frameobj.get_frame()
            frame = Image.fromarray(orig_frame)
            frame = transform(frame).view(1, 3, 224, 224).cuda()

            # Inference only: do not record an autograd graph while the
            # scores are being accumulated.
            with torch.no_grad():
                nn_output += model(frame)

            # Vote every 50 frames (frame 0 included).
            if frame_count % 50 == 0:
                probs = softmax(nn_output).data.cpu().numpy()
                preds = probs.argsort()[0][-5:][::-1]
                pred_classes = [(idx_to_class[str(pred + 1)], probs[0, pred])
                                for pred in preds]
                # Reset the accumulator for the next voting window.
                nn_output = torch.tensor(np.zeros((1, 101)), dtype=torch.float32).cuda()

            # Display the frame with the most recent top-5 prediction.
            font = cv2.FONT_HERSHEY_SIMPLEX
            y0, dy = 300, 40
            for i, (class_name, class_prob) in enumerate(pred_classes):
                y = y0 + i * dy
                cv2.putText(orig_frame,
                            '{} - {:.2f}'.format(class_name, class_prob),
                            (5, y), font, 1, (0, 0, 255), 2)
            cv2.imshow('Webcam', orig_frame)

            frame_count += 1
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    except KeyboardInterrupt:
        # Ctrl-C is a normal way to stop the viewer.
        pass
    except Exception as err:
        # Still best-effort (no crash), but report the failure instead of
        # hiding it.
        print("webcam loop stopped after an error: {!r}".format(err))
    finally:
        # When everything is done, release the receiver and the windows.
        if frameobj is not None:
            frameobj.exit()
        cv2.destroyAllWindows()