Example #1
              
          elif model == 'RGBDiff':
              input = data.view(-1, 18, data.size(1), data.size(2))
              output = torch.cat((pre_scoresRGBDiff, model_RGBDiff(input)), 0)
              pre_scoresRGBDiff = output.data[-(args.num_segments - args.delta):,]
              
          output_tensor = output.data.mean(dim = 0,keepdim=True)
          
      return output_tensor
  
if args.dataset == 'ucf101':
  num_class = 101
else:
  raise ValueError('Unknown dataset: ' + args.dataset)
  
model_RGB = TSN_model(num_class, 1, 'RGB',
                base_model_name=args.arch, consensus_type='avg', dropout=args.dropout)
  
model_RGBDiff = TSN_model(num_class, 1, 'RGBDiff',
                base_model_name=args.arch, consensus_type='avg', dropout=args.dropout)
  
for i in range(len(args.weights)):
  #load the weights saved during model training
  checkpoint = torch.load(args.weights[i])
  print("epoch {}, best acc1@: {}" .format(checkpoint['epoch'], checkpoint['best_acc1']))

  base_dict = {'.'.join(k.split('.')[1:]): v for k,v in list(checkpoint['state_dict'].items())}
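  #(the checkpoint was presumably saved from a DataParallel-wrapped model,
  # so every key carries a leading 'module.' prefix; the comprehension above
  # strips that first component before loading into a bare model)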
  if i==0:
      model_RGB.load_state_dict(base_dict)
  else:
      model_RGBDiff.load_state_dict(base_dict)
Example #2
def First_step():
  test_segments = args.test_segments
  num_crop = args.test_crops
  
  #this function does forward propagation and returns scores
  def eval_video(data, model):
      """
      Evaluate single video
      video_data : tuple of (data in shape (crop_number, num_segments*length, H, W), label)
      return     : averaged class scores
      """          
      global pre_scoresRGB
      global pre_scoresRGBDiff
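      #pre_scoresRGB/pre_scoresRGBDiff act as a sliding window: the newest
      #segment scores are appended below and only the tail is kept, so
      #successive calls share overlapping temporal context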
      
      with torch.no_grad():
          #reshape data to be in shape of (num_segments*crop_number,length,H,W)
          #Forward Propagation
          if model == 'RGB':
              input = data.view(-1, 3, data.size(1), data.size(2))
              output = torch.cat((pre_scoresRGB, model_RGB(input)))
              pre_scoresRGB = output.data[-3:,]

          elif model == 'RGBDiff':
              input = data.view(-1, 18, data.size(1), data.size(2))
              output = torch.cat((pre_scoresRGBDiff, model_RGBDiff(input)))
              pre_scoresRGBDiff = output.data[-3:,]
      
          output_np = output.data.cpu().numpy().copy()    
          #Reshape numpy array to (num_crop,num_segments,num_classes)
          output_np = output_np.reshape((num_crop, test_segments*2, num_class))
          #Take mean of cropped images to be in shape (num_segments,1,num_classes)
          output_np = output_np.mean(axis=0).reshape((test_segments*2,1,num_class))
          output_np = output_np.mean(axis=0)
      return output_np      
  

  if args.dataset == 'ucf101':
      num_class = 101
  else:
      raise ValueError('Unknown dataset: ' + args.dataset)
  
  model_RGB = TSN_model(num_class, 1, 'RGB',
                    base_model_name=args.arch, consensus_type='avg', dropout=args.dropout)
  
  model_RGBDiff = TSN_model(num_class, 1, 'RGBDiff',
                    base_model_name=args.arch, consensus_type='avg', dropout=args.dropout)
  
  for i in range(len(args.weights)):
      #load the weights saved during model training
      checkpoint = torch.load(args.weights[i])
      print("epoch {}, best acc1@: {}" .format(checkpoint['epoch'], checkpoint['best_acc1']))
    
      base_dict = {'.'.join(k.split('.')[1:]): v for k,v in list(checkpoint['state_dict'].items())}
      if i==0:
          model_RGB.load_state_dict(base_dict)
      else:
          model_RGBDiff.load_state_dict(base_dict)
  
  cropping = torchvision.transforms.Compose([
          GroupScale(model_RGB.scale_size),
          GroupCenterCrop(model_RGB.input_size),
      ])
      
  #Required transformations
  transform = torchvision.transforms.Compose([
          cropping,
          Stack(roll=args.arch == 'BNInception'),
          ToTorchFormatTensor(div=args.arch != 'BNInception'),
          GroupNormalize(model_RGB.input_mean, model_RGB.input_std),
      ])
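  #note: the Caffe-ported BNInception expects BGR frames in the [0, 255]
  #range, which is why roll/div above are toggled on that arch name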
    
    
  if args.gpus is not None:
      devices = [args.gpus[i] for i in range(args.workers)]
  else:
      devices = list(range(args.workers))
    
  model_RGB = torch.nn.DataParallel(model_RGB.cuda(devices[0]), device_ids=devices)
  model_RGBDiff = torch.nn.DataParallel(model_RGBDiff.cuda(devices[0]), device_ids=devices)
         
  model_RGB.eval()
  model_RGBDiff.eval()     

  softmax = torch.nn.Softmax(dim=1)
  scores = torch.tensor(np.zeros((1,101)), dtype=torch.float32).cuda()
   
  frames = []  
  action_checker = True

  conn,transport = set_server(ip="0.0.0.0",port=args.port,Tunnel=True,n_conn=2,hostname=args.hostname)
  if conn is None:
      return 
  
  try: 
    top5_actions = Top_N(args.classInd_file)
    rcv_frames = rcv_frames_thread(connection=conn[0])
    send_results = send_results_thread(connection=conn[1],test=args.test)

    
    while (rcv_frames.is_alive() and send_results.is_alive()):
        if rcv_frames.CheckReset():
            frames = []
            rcv_frames.ConfirmReset()

        frame,status = rcv_frames.get()


        if frame == 0:
            break

        if frame is None:
            continue
      
        frame = Image.fromarray(frame)
        
        frames.append(frame)
      
        if len(frames) == test_segments*6:       
            frames = transform(frames).cuda()
            scores_RGB = eval_video(frames[0:len(frames):6], 'RGB')   
            scores_RGBDiff = eval_video(frames[:], 'RGBDiff')
         
            final_scores = args.score_weights[0]*scores_RGB + args.score_weights[1] * scores_RGBDiff
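            #(late fusion: the two stream scores are combined with the
            # user-supplied weights, e.g. favouring one modality)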
            #final_scores = softmax(torch.FloatTensor(final_scores))
            #final_scores = final_scores.data.cpu().numpy().copy()
            #five_scores = np.argsort(final_scores)[0][::-1][:5]
            top5_actions.import_scores(final_scores[0,])
            indices_TopN,_,scores_TopN = top5_actions.get_top_N_actions()
            action_checker = Evaluation(scores_TopN, args.psi)
            
            send_results.put(status=status,scores=(*indices_TopN,*scores_TopN),Actf=action_checker)
            frames = [] 
          
        else:
            send_results.put(status=status,Actf=action_checker)
          
  except (KeyboardInterrupt,IOError,OSError):
    pass
  finally:
    rcv_frames.close()
    send_results.close()
    conn[0].close()
    conn[1].close()
    if bool(transport):
        transport.close()
Example #3
parser.add_argument('--gpus', nargs='+', type=int, default=None)
parser.add_argument('--flow_prefix', type=str, default='')
parser.add_argument('--classInd_file', type=str, default='')

args = parser.parse_args()

if args.dataset == 'ucf101':
    num_class = 101
else:
    raise ValueError('Unknown dataset: ' + args.dataset)

#at test time 25 segments are sampled from each video, so num_segments is
#set to 1 here and each of the 25 snippets is scored independently.
model = TSN_model(num_class,
                  1,
                  args.modality,
                  base_model_name=args.arch,
                  consensus_type=args.crop_fusion_type,
                  dropout=args.dropout)
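#(with num_segments=1 the average consensus is effectively a no-op, so each
# snippet yields an independent score that can be averaged afterwards)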

#load the weights from the file saved during the training process.
#args.weights is simply a string that refers to the path of the file.
checkpoint = torch.load(args.weights)
print("epoch {}, best acc1@: {}".format(checkpoint['epoch'],
                                        checkpoint['best_acc1']))

base_dict = {
    '.'.join(k.split('.')[1:]): v
    for k, v in list(checkpoint['state_dict'].items())
}
model.load_state_dict(base_dict)
Example #4
def main():
    global args, best_acc1
    args = parser.parse_args()

    if args.dataset == 'ucf101':
        num_classes = 101
    else:
        raise ValueError('Unknown dataset: ' + args.dataset)

    model = TSN_model(num_classes,
                      args.num_segments,
                      args.modality,
                      base_model_name=args.arch,
                      consensus_type=args.consensus_type,
                      dropout=args.dropout,
                      partial_bn=not args.no_partialbn,
                      KinWeights=args.KinWeights)

    crop_size = model.crop_size
    scale_size = model.scale_size
    input_mean = model.input_mean
    input_std = model.input_std
    policies = model.get_optim_policies()
    train_augmentation = model.get_augmentation()

    #to use multiple GPUs, args.gpus is a list (e.g. to use 4 GPUs, device_ids=[0,1,2,3]).
    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()

    #args.resume is an empty string to provide the path to the latest checkpoint.
    if args.resume:
        #if there is a file, do the following:
        if os.path.isfile(args.resume):
            print(("Loading checkpoint '{}'".format(args.resume)))
            #load the parameters.
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            print(("Loaded checkpoint '{}' epoch {}".format(
                args.evaluate, checkpoint['epoch'])))

        else:
            print(("No checkpoint found at '{}'".format(args.resume)))

    #this flag allows you to enable the inbuilt cudnn auto-tuner to find
    #the best algorithm to use for your hardware.
    #but if the input sizes change at each iteration, this will lead to worse runtime.
    cudnn.benchmark = True

    #different normalization techniques are used for RGB & RGB Difference.
    if args.modality != 'RGBDiff':
        normalize = GroupNormalize(input_mean, input_std)
    else:
        normalize = IdentityTransform()

    #for RGB, we only take 1 frame per segment.
    #for RGBDiff, we take 5 consecutive frames per segment.
    if args.modality == 'RGB':
        data_length = 1
    else:
        data_length = 5
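    #(the consecutive frames are differenced pairwise to form the stacked
    # difference channels that the RGBDiff network consumes)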

    #load the data using the built-in PyTorch function torch.utils.data.DataLoader.
    train_loader = torch.utils.data.DataLoader(
        TSNDataset(
            args.train_list,
            num_segments=args.num_segments,
            new_length=data_length,
            modality=args.modality,
            image_prefix='frame{:06d}.jpg',
            transform=torchvision.transforms.Compose([
                train_augmentation,
                Stack(roll=args.arch == 'BNInception'),
                #convert RGB image with (H x W x C) to tensor of shape (C x H x W).
                #from range [0, 255] to [0, 1]
                ToTorchFormatTensor(div=args.arch != 'BNInception'),
                normalize,
            ])),

        # how many subprocesses to use for data loading. 0 means the data
        #will be loaded in the main process.
        # Having more workers increases memory usage. WARNING: a lot of
        #workers with a large batch size will consume all the RAM.
        #The optimal number of workers is the number of CPU cores, as each
        #core is responsible for delivering one of the batches.
        #4 or 8 is usually enough; more will only distract and consume the CPU.
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True)

    val_loader = torch.utils.data.DataLoader(TSNDataset(
        args.val_list,
        num_segments=args.num_segments,
        new_length=data_length,
        modality=args.modality,
        image_prefix='frame{:06d}.jpg',
        train_val_switch=False,
        transform=torchvision.transforms.Compose([
            GroupScale(int(scale_size)),
            GroupCenterCrop(crop_size),
            Stack(roll=args.arch == 'BNInception'),
            ToTorchFormatTensor(div=args.arch != 'BNInception'),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    #only one loss type is defined (CrossEntropy).
    if args.loss_type == 'nll':
        criterion = torch.nn.CrossEntropyLoss().cuda()
    else:
        raise ValueError("Unkown loss type")

    print('---------------------------------------------------')
    for group in policies:

        print(('group: {} has {} params, lr_mult: {}, decay_mult: {}'.format(
            group['name'], len(group['params']), group['lr_mult'],
            group['decay_mult'])))

    print('---------------------------------------------------')

    #Stochastic Gradient Descent.
    optimizer = torch.optim.SGD(policies,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
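    #(each policy group carries its own lr_mult/decay_mult, which
    # adjust_learning_rate presumably uses to scale the base lr per group)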

    if args.evaluate:
        validate(val_loader, model, criterion, 0)
        #return here so the training loop below is skipped when only evaluating.
        return

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch, args.lr_steps)

        #train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        #evaluate on validation set
        if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
            acc1 = validate(val_loader, model, criterion,
                            (epoch + 1) * len(train_loader))
            # remember best acc@1 and save checkpoint.
            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                }, is_best)
Example #5
def First_step():
  num_crop = args.test_crops  
  test_segments = args.test_segments
  
  #this function does forward propagation and returns scores
  def eval_video(data):
      """
      Evaluate single video
      video_data : tuple of (data in shape (crop_number, num_segments*length, H, W), label)
      return     : averaged class scores
      """
      if args.modality == 'RGB':
          length = 3
      elif args.modality == 'RGBDiff':
          length = 18
      else:
          raise ValueError("Unknown modality " + args.modality)
    
      with torch.no_grad():
          #reshape data to be in shape of (num_segments*crop_number,length,H,W)
          input = data.view(-1, length, data.size(1), data.size(2))
          #Forward Propagation
          output = model(input)
          output_np = output.data.cpu().numpy().copy()
          #Reshape numpy array to (num_crop,num_segments,num_classes)
          output_np = output_np.reshape((num_crop, test_segments, num_class))
          #Take mean of cropped images to be in shape (num_segments,1,num_classes)
          output_np = output_np.mean(axis=0).reshape((test_segments,1,num_class))
          output_np = output_np.mean(axis=0)
      return output_np   
    
    
  action_label = label_dic(args.classInd_file)

  if args.dataset == 'ucf101':
      num_class = 101
  else:
      raise ValueError('Unknown dataset: ' + args.dataset)
  
  model = TSN_model(num_class, 1, args.modality,
                    base_model_name=args.arch, consensus_type='avg', dropout=args.dropout)
  
  #load the weights saved during model training
  checkpoint = torch.load(args.weights)
  print("epoch {}, best acc1@: {}" .format(checkpoint['epoch'], checkpoint['best_acc1']))

  base_dict = {'.'.join(k.split('.')[1:]): v for k,v in list(checkpoint['state_dict'].items())}
  model.load_state_dict(base_dict)
  
  #test_crops is set to 1 for fast video evaluation
  if args.test_crops == 1:
      cropping = torchvision.transforms.Compose([
          GroupScale(model.scale_size),
          GroupCenterCrop(model.input_size),
      ])
  elif args.test_crops == 10:
      cropping = torchvision.transforms.Compose([
          GroupOverSample(model.input_size, model.scale_size)
      ])
  else:
      raise ValueError("Only 1 and 10 crops are supported while we got {}".format(test_crops))
      
  #Required transformations
  transform = torchvision.transforms.Compose([
          cropping,
          Stack(roll=args.arch == 'BNInception'),
          ToTorchFormatTensor(div=args.arch != 'BNInception'),
          GroupNormalize(model.input_mean, model.input_std),
      ])
    
    
  if args.gpus is not None:
      devices = [args.gpus[i] for i in range(args.workers)]
  else:
      devices = list(range(args.workers))
    
  model = torch.nn.DataParallel(model.cuda(devices[0]), device_ids=devices)
         
  model.eval()    

  softmax = torch.nn.Softmax(dim=1)
  scores = torch.tensor(np.zeros((1,101)), dtype=torch.float32).cuda()
   
  frames = []  
  frame_count = 0 
  try: 
    top5_actions = Top_N(args.classInd_file)
    Tunnel_ = True
    conn,T_thr = Network.set_server(port=6666,Tunnel=Tunnel_,n=1)
    rcv_frames = Streaming.rcv_frames_thread(connection=conn[0])
    send_results = Streaming.send_results_thread(connection=conn[1])
    while (rcv_frames.is_alive() and send_results.is_alive()):
        frame,status = rcv_frames.get()
        if frame == 0:
            break
        frame_count += 1
        frame = Image.fromarray(frame)
        
        if args.modality == 'RGB':             
            frames.append(frame)
            if frame_count % 5 == 0 and frame_count != 0:
                frames = transform(frames).cuda()
                scores = eval_video(frames)
                scores = softmax(torch.FloatTensor(scores))
                scores = scores.data.cpu().numpy().copy()
                top5_actions.import_scores(scores[0,])
                indices,_,scores = top5_actions.get_top_N_actions()
                send_results.put(status=status,scores=(*indices,*scores))
                
                frames = []
            else:
                send_results.put(status=status)
  except (KeyboardInterrupt,IOError,OSError):
    pass
  finally:
    rcv_frames.close()
    send_results.close()
    conn[0].close()
    conn[1].close()
Example #6
def webcam(ip,port,weight_dir,test_crops,ClassIndDir,arch='BNInception'):
  
  #Prepare the model
  model = TSN_model(num_classes=101, num_segments=1, modality='RGB',
                  consensus_type='avg', base_model_name='BNInception',
                  new_length=None, before_softmax=True, dropout=0.8,
                  crop_num=1, partial_bn=True)
  
  model.eval()
  
  model.cuda()
  
  #Map class idx with class name into dictionary
  idx_to_class = IdxtoClass(ClassIndDir)
  
  #Load Weights
  checkpoint = torch.load(weight_dir)
  print("epoch {}, best acc1@: {}" .format(checkpoint['epoch'], checkpoint['best_acc1']))

  base_dict = {'.'.join(k.split('.')[1:]): v for k,v in list(checkpoint['state_dict'].items())}
  model.load_state_dict(base_dict)
  
  
  if test_crops == 1:
    cropping = torchvision.transforms.Compose([
        GroupScale(model.scale_size),
        GroupCenterCrop(model.input_size),
    ])
  elif test_crops == 10:
    cropping = torchvision.transforms.Compose([
        GroupOverSample(model.input_size, model.scale_size)
    ])
    
  #Required transformations
  transform = torchvision.transforms.Compose([
          Transforms.Resize(model.scale_size, interpolation=Image.BILINEAR),
          Transforms.CenterCrop(model.input_size),
          ToTorchFormatTensor(div=arch != 'BNInception'),
          GroupNormalize(model.input_mean, model.input_std),
      ])
  
  # Start looping on frames received from webcam
  softmax = torch.nn.Softmax(dim=1)
  nn_output = torch.tensor(np.zeros((1, 101)), dtype=torch.float32).cuda()
  frame_count = 0
  try:
    frameobj = Frames_rcv(ip,port)
    frameobj.start()
    while frameobj.is_alive():
        # read each frame and prepare it for the forward pass (resize and type)
        frame = frameobj.get_frame()
        orig_frame = frame.copy()  #keep the raw frame for the display below
        frame = Image.fromarray(frame)
        frame = transform(frame).view(1, 3, 224, 224).cuda()
        #print(frame.size())

        # feed the frame to the neural network  
        nn_output += model(frame)

        # vote for the top classes using the scores accumulated over 50 frames
        if frame_count % 50 == 0:
            nn_output = softmax(nn_output)
            nn_output = nn_output.data.cpu().numpy()
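            # argsort is ascending, so the last five indices reversed give the top-5 classes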
            preds = nn_output.argsort()[0][-5:][::-1]
            pred_classes = [(idx_to_class[str(pred+1)], nn_output[0, pred]) for pred in preds]

            # reset the accumulated scores
            nn_output = torch.tensor(np.zeros((1, 101)), dtype=torch.float32).cuda()

        # Display the resulting frame and the classified action
        font = cv2.FONT_HERSHEY_SIMPLEX
        y0, dy = 300, 40
        for i in range(5):
            y = y0 + i * dy
            cv2.putText(orig_frame, '{} - {:.2f}'.format(pred_classes[i][0], pred_classes[i][1]),
                        (5, y), font, 1, (0, 0, 255), 2)

        cv2.imshow('Webcam', orig_frame)
        frame_count += 1
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    # When everything is done, release the capture
    frameobj.exit()
    cv2.destroyAllWindows()
  except (KeyboardInterrupt, IOError, Exception):
    frameobj.exit()
    cv2.destroyAllWindows()