def __init__(
    self,
    crop_size: Union[int, List[int]] = VideoConstants.CROP_SIZE,
    size_range: List[int] = VideoConstants.SIZE_RANGE,
    mean: List[float] = VideoConstants.MEAN,
    std: List[float] = VideoConstants.STD,
):
    """Build the default augmenting transform for video clips.

    The composed pipeline is stored on ``self._transform`` and applies, in
    order: tensor conversion, random resize + crop, random horizontal flip
    and per-channel normalization.

    Args:
        crop_size: expected output crop_size (height, width)
        size_range: a 2-tuple denoting the min- and max size
        mean: a 3-tuple denoting the pixel RGB mean
        std: a 3-tuple denoting the pixel RGB standard deviation
    """
    to_tensor = transforms_video.ToTensorVideo()
    # TODO(zyan3): migrate VideoClipRandomResizeCrop to TorchVision
    resize_crop = VideoClipRandomResizeCrop(crop_size, size_range)
    flip = transforms_video.RandomHorizontalFlipVideo()
    normalize = transforms_video.NormalizeVideo(mean=mean, std=std)

    stages = [to_tensor, resize_crop, flip, normalize]
    self._transform = transforms.Compose(stages)
def test_center_crop_video(self):
    """Check CenterCropVideo against a clip with a zeroed centre window.

    The clip is all 255 except for a centred ``oheight x owidth`` window
    filled with zeros. Cropping exactly that window must sum to zero;
    growing the crop by one and then two pixels per side must pull in a
    strictly increasing amount of the bright border.
    """
    numFrames = random.randint(4, 128)
    height = random.randint(10, 32) * 2
    width = random.randint(10, 32) * 2
    # Floor division keeps the bounds integral: random.randint() rejects
    # float arguments on Python 3.12+ (and warned about them before that).
    oheight = random.randint(5, (height - 2) // 2) * 2
    owidth = random.randint(5, (width - 2) // 2) * 2

    clip = torch.ones(
        (numFrames, height, width, 3), dtype=torch.uint8) * 255
    oh1 = (height - oheight) // 2
    ow1 = (width - owidth) // 2
    # Slicing yields a view, so fill_ zeroes the centre window of `clip`.
    clipNarrow = clip[:, oh1:oh1 + oheight, ow1:ow1 + owidth, :]
    clipNarrow.fill_(0)

    def center_crop(out_h, out_w):
        # Convert the clip and crop it to the requested output size.
        return Compose([
            transforms.ToTensorVideo(),
            transforms.CenterCropVideo((out_h, out_w)),
        ])(clip)

    def describe(out_h, out_w):
        # Failure message carrying every randomly drawn dimension.
        return "height: {} width: {} oheight: {} owidth: {}".format(
            height, width, out_h, out_w)

    # Crop matching the zeroed window exactly: nothing bright is included.
    result = center_crop(oheight, owidth)
    self.assertEqual(result.sum().item(), 0, describe(oheight, owidth))

    # One pixel larger per dimension: some of the border is included.
    oheight += 1
    owidth += 1
    sum1 = center_crop(oheight, owidth).sum()
    self.assertTrue(sum1.item() > 1, describe(oheight, owidth))

    # Two pixels larger: strictly more border than the previous crop.
    oheight += 1
    owidth += 1
    sum2 = center_crop(oheight, owidth).sum()
    msg = describe(oheight, owidth)
    self.assertTrue(sum2.item() > 1, msg)
    self.assertTrue(sum2.item() > sum1.item(), msg)
def val_transform(s):
    """Compose the validation-time video preprocessing pipeline.

    Stages, in order: tensor conversion, ``RandomResizeVideo(s)``,
    normalization with fixed per-channel mean/std, and a centre crop to
    ``s``.

    Args:
        s: size passed to both the resize and the centre-crop stage.

    Returns:
        A ``transforms.Compose`` wrapping the four stages.
    """
    to_tensor = transforms_video.ToTensorVideo()
    resize = transforms_video.RandomResizeVideo(s)
    normalize = transforms_video.NormalizeVideo(
        mean=[0.43216, 0.394666, 0.37645],
        std=[0.22803, 0.22145, 0.216989],
    )
    center_crop = transforms_video.CenterCropVideo(s)

    pipeline = [to_tensor, resize, normalize, center_crop]
    return transforms.Compose(pipeline)
def train_transform(s):
    """Compose the training-time video augmentation pipeline.

    Stages, in order: tensor conversion, random horizontal flip,
    ``RandomResizeVideo`` over the pair ``(s, round(s * 1.5))``,
    normalization with fixed per-channel mean/std, and a random crop to
    ``s``.

    Args:
        s: target crop size; also the first element of the resize pair.

    Returns:
        A ``transforms.Compose`` wrapping the five stages.
    """
    to_tensor = transforms_video.ToTensorVideo()
    flip = transforms_video.RandomHorizontalFlipVideo()
    resize_bounds = (s, round(s * 1.5))
    resize = transforms_video.RandomResizeVideo(resize_bounds)
    normalize = transforms_video.NormalizeVideo(
        mean=[0.43216, 0.394666, 0.37645],
        std=[0.22803, 0.22145, 0.216989],
    )
    random_crop = transforms_video.RandomCropVideo(s)

    pipeline = [to_tensor, flip, resize, normalize, random_crop]
    return transforms.Compose(pipeline)
def default_transformation_3D(split, size=224):
    """Return the video transform pipeline for the given dataset split.

    Only the pipeline for ``split`` is constructed. "valid" and "test"
    share one definition because their stages are identical.

    Args:
        split: one of ``"train"``, ``"valid"`` or ``"test"``.
        size: size handed to the crop/resize stages (default 224).

    Returns:
        A ``transforms.Compose`` for the requested split.

    Raises:
        KeyError: if ``split`` is not one of the three known names.
    """

    def _normalize():
        # Normalization using the Kinetics-400 statistics dictionary.
        return transforms_video.NormalizeVideo(
            kinetics400_transform_dict["mean"],
            kinetics400_transform_dict["std"])

    def _train_pipeline():
        # Augmenting pipeline: random resized crop plus both flips.
        return transforms.Compose([
            transforms_video.ToTensorVideo(),
            transforms_video.RandomResizedCropVideo(size),
            RandomVerticalFlipVideo(),
            transforms_video.RandomHorizontalFlipVideo(),
            _normalize(),
        ])

    def _eval_pipeline():
        # Non-random pipeline: resize, centre crop, normalize.
        return transforms.Compose([
            transforms_video.ToTensorVideo(),
            VideoClipResize(size),  # not square
            transforms_video.CenterCropVideo(size),
            _normalize(),
        ])

    builders = {
        "train": _train_pipeline,
        "valid": _eval_pipeline,
        "test": _eval_pipeline,
    }
    # The lookup raises KeyError for an unknown split, as before.
    return builders[split]()
def test_random_resized_crop_video(self):
    """Check that RandomResizedCropVideo yields the requested output size.

    A random uint8 clip is converted with ToTensorVideo and cropped; the
    test asserts that dimensions 2 and 3 of the result equal the requested
    output height and width, then smoke-tests the transform's repr.
    """
    numFrames = random.randint(4, 128)
    height = random.randint(10, 32) * 2
    width = random.randint(10, 32) * 2
    # Floor division keeps the bounds integral: random.randint() rejects
    # float arguments on Python 3.12+ (and warned about them before that).
    oheight = random.randint(5, (height - 2) // 2) * 2
    owidth = random.randint(5, (width - 2) // 2) * 2

    clip = torch.randint(
        0, 256, (numFrames, height, width, 3), dtype=torch.uint8)
    result = Compose([
        transforms.ToTensorVideo(),
        transforms.RandomResizedCropVideo((oheight, owidth)),
    ])(clip)

    self.assertEqual(result.size(2), oheight)
    self.assertEqual(result.size(3), owidth)

    # Building the repr must not raise.
    repr(transforms.RandomResizedCropVideo((oheight, owidth)))
def test_to_tensor_video(self):
    """Check that ToTensorVideo rejects invalid inputs.

    Expected failures: a nested Python list and a float tensor raise
    ``TypeError``; uint8 tensors with 5, 3, 2 or 1 dimensions raise
    ``ValueError``.

    Every invalid input gets its own ``assertRaises`` context. Once a
    statement raises inside such a context the remaining statements are
    skipped, so grouping several calls in one block leaves all but the
    first untested.
    """
    numFrames, height, width = 64, 4, 4
    trans = transforms.ToTensorVideo()

    with self.assertRaises(TypeError):
        trans(np.random.rand(numFrames, height, width, 1).tolist())
    with self.assertRaises(TypeError):
        trans(torch.rand((numFrames, height, width, 1), dtype=torch.float))

    bad_shapes = [
        (3, numFrames, height, width, 3),
        (height, width, 3),
        (width, 3),
        (3,),
    ]
    for shape in bad_shapes:
        with self.subTest(shape=shape):
            with self.assertRaises(ValueError):
                trans(torch.ones(shape, dtype=torch.uint8))

    # Building the repr must not raise.
    repr(trans)
def __init__(
    self,
    size: int = VideoConstants.SIZE_RANGE[0],
    mean: List[float] = VideoConstants.MEAN,
    std: List[float] = VideoConstants.STD,
):
    """Build the default non-augmenting transform for video clips.

    The composed pipeline is stored on ``self._transform`` and applies, in
    order: tensor conversion, clip resize and per-channel normalization.

    Args:
        size: the short edge of rescaled video clip
        mean: a 3-tuple denoting the pixel RGB mean
        std: a 3-tuple denoting the pixel RGB standard deviation
    """
    # At testing stage, central cropping is not used because we
    # conduct fully convolutional-style testing
    to_tensor = transforms_video.ToTensorVideo()
    # TODO(zyan3): migrate VideoClipResize to TorchVision
    resize = VideoClipResize(size)
    normalize = transforms_video.NormalizeVideo(mean=mean, std=std)

    stages = [to_tensor, resize, normalize]
    self._transform = transforms.Compose(stages)
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torchvision.transforms._transforms_video as v_transform
import torch

# Batch sizes for the train and test loaders, and frames per sampled clip.
TRAIN_BATCH_SIZE = 128
TEST_BATCH_SIZE = 128
FRAME_LENGTH = 16

# Clip preprocessing: tensor conversion, normalization, random horizontal
# flip, then a random 112-pixel crop.
transform = transforms.Compose([
    v_transform.ToTensorVideo(),
    v_transform.NormalizeVideo(mean=[0.43216, 0.394666, 0.37645],
                               std=[0.22803, 0.22145, 0.216989]),
    v_transform.RandomHorizontalFlipVideo(),
    v_transform.RandomCropVideo(112),
])


def custom_collate(batch):
    """Collate a batch after dropping the middle element of each sample.

    Each sample is unpacked as a 3-tuple; only the first (video) and last
    (label) elements are kept, and the resulting pairs are passed to the
    default collate function.
    """
    filtered_batch = []
    # presumably the discarded middle element is the audio track — TODO confirm
    for video, _, label in batch:
        filtered_batch.append((video, label))
    return torch.utils.data.dataloader.default_collate(filtered_batch)


# NOTE(review): the call below is cut off in this excerpt; its remaining
# arguments and closing parenthesis are not visible here.
trainset = datasets.UCF101(
    root='data/UCF101/UCF-101',
    annotation_path=
    'data/UCF101TrainTestSplits-RecognitionTask/ucfTrainTestlist',
    frames_per_clip=FRAME_LENGTH,
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point for generator/discriminator pre-training.

    Sets up (optional) distributed training, builds the mask generator
    ``netG`` and the MoCo model ``netD``, creates one SGD optimizer per
    network, optionally restores checkpoints, builds the Kinetics400 data
    loader and runs the epoch loop, saving both networks every 10 epochs.

    Args:
        gpu: GPU index for this process, or None.
        ngpus_per_node: number of GPUs on this node; used to derive the
            global rank and to split batch size / workers per process.
        args: parsed options. Mutated in place (``gpu``, ``rank``,
            ``batch_size``, ``workers``, ``start_epoch``).
    """
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        # NOTE: the *args parameter shadows the outer `args` only inside
        # this no-op function.
        def print_pass(*args):
            pass

        # Replaces the built-in print for the whole process.
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    netG = moco.builder.MaskGenerator()
    netD = moco.builder.MoCo(models.__dict__[args.arch], args.moco_dim,
                             args.moco_k, args.moco_m, args.moco_t, args.mlp)
    print(netG)
    print(netD)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            netG.cuda(args.gpu)
            netD.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            # Ceiling division of the worker count across the node's GPUs.
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            netG = torch.nn.parallel.DistributedDataParallel(
                netG, device_ids=[args.gpu], find_unused_parameters=True)
            netD = torch.nn.parallel.DistributedDataParallel(
                netD, device_ids=[args.gpu], find_unused_parameters=True)
        else:
            netG.cuda()
            netD.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            netG = torch.nn.parallel.DistributedDataParallel(netG)
            netD = torch.nn.parallel.DistributedDataParallel(netD)
    elif args.gpu is not None:
        # Single-GPU, non-distributed path (allowed here for debugging).
        torch.cuda.set_device(args.gpu)
        netG = netG.cuda(args.gpu)
        netD = netD.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        pass
        # raise NotImplementedError("Only DistributedDataParallel is supported.") for debug on cpu
    # torch.cuda.synchronize()
    # One SGD optimizer per network, sharing the same hyper-parameters.
    optimizer_g = torch.optim.SGD(netG.parameters(),
                                  args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    optimizer_d = torch.optim.SGD(netD.parameters(),
                                  args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)

    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    G_criterion = nn.L1Loss().cuda(args.gpu)

    # optionally resume from a checkpoint
    if args.resume:
        # Restore netD weights (optimizer state is intentionally not loaded).
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            netD.load_state_dict(checkpoint['state_dict'])
            #optimizer_d.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

        # NOTE(review): nesting of this netG block under `if args.resume`
        # was reconstructed from a collapsed line — confirm against the
        # original file. When both checkpoints load, the start epoch from
        # the netG checkpoint overwrites the one from netD.
        if os.path.isfile(args.resumeG):
            print("=> loading checkpoint '{}'".format(args.resumeG))
            if args.gpu is None:
                checkpoint = torch.load(args.resumeG)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resumeG, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            netG.load_state_dict(checkpoint['state_dict'])
            #optimizer_g.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resumeG, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resumeG))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')

    # Dataset-side video transform: tensor conversion + random resized crop.
    video_augmentation = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size, (0.2, 1)),
    ])
    audio_augmentation = moco.loader.DummyAudioTransform()
    augmentation = {'video': video_augmentation, 'audio': audio_augmentation}

    # Second augmentation object, handed to train() rather than the dataset.
    augmentation_gpu = moco.loader.MoCoAugmentV2(
        args.crop_size) if args.aug_plus else moco.loader.MoCoAugment(
            args.crop_size)

    train_dataset = Kinetics400(traindir,
                                args.frame_per_clip,
                                args.step_between_clips,
                                extensions='mp4',
                                transform=augmentation,
                                num_workers=4)

    train_sampler = RandomClipSampler(train_dataset.video_clips, 1)
    if args.distributed:
        # train_sampler = torch.utils.data.distributed.DistributedSampler(train_sampler)
        train_sampler = DistributedSampler(train_sampler)

    # train_sampler is always set above, so `shuffle` evaluates to False.
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               multiprocessing_context="fork")

    # Only the GPU-0 process in multiprocessing mode writes summaries.
    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir,
                                                       args.batch_size,
                                                       args.lr, args.crop_size,
                                                       args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer_d, epoch, args)
        adjust_learning_rate(optimizer_g, epoch, args)

        # train for one epoch
        train(train_loader, augmentation_gpu, criterion, G_criterion, netG,
              netD, optimizer_g, optimizer_d, epoch, args, writer)

        # Checkpoint every 10th epoch, from one process per node only.
        if (epoch + 1) % 10 == 0 and (not args.multiprocessing_distributed or
                                      (args.multiprocessing_distributed
                                       and args.rank % ngpus_per_node == 0)):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(
                args.ckp_dir, args.batch_size, args.lr, args.crop_size,
                args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': netG.state_dict(),
            },
                            ckp_dir + '/netG',
                            max_save=20,
                            is_best=False)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': netD.state_dict(),
            },
                            ckp_dir + '/netD',
                            max_save=20,
                            is_best=False)
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point for classifier training on UCF101.

    Sets up (optional) distributed training, builds the backbone with a
    freshly initialised ``fc`` layer, optionally loads MoCo pre-trained
    encoder weights, optionally resumes from a checkpoint, builds the
    train/validation loaders and runs the train/validate loop while
    tracking the best top-1 accuracy in the module-level ``best_acc1``.

    Args:
        gpu: GPU index for this process, or None.
        ngpus_per_node: number of GPUs on this node; used to derive the
            global rank and to split batch size / workers per process.
        args: parsed options. Mutated in place (``gpu``, ``rank``,
            ``batch_size``, ``workers``, ``start_epoch``).
    """
    global best_acc1
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        # NOTE: the *args parameter shadows the outer `args` only inside
        # this no-op function.
        def print_pass(*args):
            pass

        # Replaces the built-in print for the whole process.
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=============> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()
    print(model)
    # freeze all layers but the last fc
    # for name, param in model.named_parameters():
    #     if name not in ['fc.weight', 'fc.bias']:
    #         param.requires_grad = False
    # init the fc layer
    # NOTE: the input width of the new fc layer is hard-coded to 512.
    model.fc = nn.Linear(512, args.num_class, bias=True)
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()

    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")

            # rename moco pre-trained keys
            state_dict = checkpoint['state_dict']
            # Iterate over a copy of the keys: the dict is modified in-loop.
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if k.startswith('module.encoder_q'
                                ) and not k.startswith('module.encoder_q.fc'):
                    # remove prefix
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            # Only the new classifier head may be absent from the checkpoint.
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}

            print("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            # Ceiling division of the worker count across the node's GPUs.
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)  #.cuda() for debug on cpu

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optimize only the linear classifier
    # NOTE: the freezing loop above is commented out, so this filter keeps
    # every parameter that currently requires grad, not just the fc layer.
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    # assert len(parameters) == 2  # fc.weight, fc.bias
    optimizer = torch.optim.SGD(parameters,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                # NOTE(review): assumes the stored best_acc1 is a tensor
                # (has .to()) — TODO confirm.
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    normalize_video = transforms_video.NormalizeVideo(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    # Training clips: random resized crop + horizontal flip, then normalize.
    video_augmentation_train = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size),
        transforms_video.RandomHorizontalFlipVideo(),
        normalize_video,
    ])
    # Validation clips: centre crop, then normalize.
    video_augmentation_val = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.CenterCropVideo(args.crop_size),
        normalize_video,
    ])
    data_dir = os.path.join(args.data, 'data')
    anno_dir = os.path.join(args.data, 'anno')
    audio_augmentation = moco.loader.DummyAudioTransform()
    train_augmentation = {
        'video': video_augmentation_train,
        'audio': audio_augmentation
    }
    val_augmentation = {
        'video': video_augmentation_val,
        'audio': audio_augmentation
    }

    train_dataset = UCF101(data_dir,
                           anno_dir,
                           args.frame_per_clip,
                           args.step_between_clips,
                           fold=1,
                           train=True,
                           transform=train_augmentation,
                           num_workers=16)
    train_sampler = RandomClipSampler(train_dataset.video_clips, 10)
    if args.distributed:
        train_sampler = DistributedSampler(train_sampler)

    # train_sampler is always set above, so `shuffle` evaluates to False.
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               multiprocessing_context="fork")

    val_dataset = UCF101(data_dir,
                         anno_dir,
                         args.frame_per_clip,
                         args.step_between_clips,
                         fold=1,
                         train=False,
                         transform=val_augmentation,
                         num_workers=16)
    # Do not use DistributedSampler since it will destroy the testing iteration process
    val_sampler = UniformClipSampler(val_dataset.video_clips,
                                     args.clip_per_video)
    # Batch size equals the number of clips sampled per video.
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.clip_per_video,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             multiprocessing_context="fork")

    # Evaluation-only mode: run validation once and stop.
    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    # Only the GPU-0 process in multiprocessing mode writes summaries.
    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir,
                                                       args.batch_size,
                                                       args.lr, args.crop_size,
                                                       args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # evaluate on validation set
        val_loss, acc1, acc5 = validate(val_loader, model, criterion, args)
        if writer is not None:
            writer.add_scalar('lincls_val/loss', val_loss, epoch)
            writer.add_scalar('lincls_val/acc1', acc1, epoch)
            writer.add_scalar('lincls_val/acc5', acc5, epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # Save from one process per node only.
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(
                args.ckp_dir, args.batch_size, args.lr, args.crop_size,
                args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            },
                            ckp_dir,
                            max_save=1,
                            is_best=is_best)
def test_build_field_transform_default_video(self):
    """Exercise build_video_field_transform_default in four scenarios.

    1. No config, "train" split: output is a CROP_SIZE x CROP_SIZE clip.
    2. No config, "test" split: output height is SIZE_RANGE[0] and the
       width is rescaled by the test video's aspect ratio.
    3. Explicit training-style config with a 64-pixel crop.
    4. Explicit testing-style config with a 64-pixel resize.
    """

    def apply(cfg, split, item):
        # Build the field transform and return the transformed video clip.
        field_transform = build_video_field_transform_default(cfg, split)
        return field_transform(item)["input"]["video"]

    dataset = self.get_test_video_dataset()
    n_frames = self.frames_per_clip

    # transform config is not provided. Use default transforms
    config = None

    # default training data transform
    train_sample = dataset[0]
    train_clip = apply(config, "train", train_sample)
    default_train_shape = torch.Size((
        3,
        n_frames,
        VideoConstants.CROP_SIZE,
        VideoConstants.CROP_SIZE,
    ))
    self.assertEqual(train_clip.size(), default_train_shape)

    # default testing data transform
    test_sample = dataset[1]
    test_sample_copy = copy.deepcopy(test_sample)

    # Reference clip built by hand from the individual transforms; it is
    # computed here but not compared against the builder's output.
    reference = transforms_video.ToTensorVideo()(
        test_sample["input"]["video"])
    reference = transforms_video.CenterCropVideo(
        VideoConstants.CROP_SIZE)(reference)
    reference = transforms_video.NormalizeVideo(
        mean=VideoConstants.MEAN, std=VideoConstants.STD)(reference)

    test_clip = apply(config, "test", test_sample_copy)
    short_edge = VideoConstants.SIZE_RANGE[0]
    default_width = int(VideoConstants.SIZE_RANGE[0] * self.video_width /
                        self.video_height)
    self.assertEqual(
        test_clip.size(),
        torch.Size((3, n_frames, short_edge, default_width)),
    )

    # transform config is provided. Simulate training config
    custom_train_sample = dataset[2]
    train_config = {
        "video": [
            {"name": "ToTensorVideo"},
            {
                "name": "video_clip_random_resize_crop",
                "crop_size": 64,
                "size_range": [256, 320],
            },
            {"name": "RandomHorizontalFlipVideo"},
            {
                "name": "NormalizeVideo",
                "mean": [0.485, 0.456, 0.406],
                "std": [0.229, 0.224, 0.225],
            },
        ]
    }
    custom_train_clip = apply(train_config, "train", custom_train_sample)
    self.assertEqual(custom_train_clip.size(),
                     torch.Size((3, n_frames, 64, 64)))
    self.assertTrue(custom_train_clip.dtype == torch.float)

    # transform config is provided. Simulate testing config
    custom_test_sample = dataset[3]
    test_config = {
        "video": [
            {"name": "ToTensorVideo"},
            {"name": "video_clip_resize", "size": 64},
            {
                "name": "NormalizeVideo",
                "mean": [0.485, 0.456, 0.406],
                "std": [0.229, 0.224, 0.225],
            },
        ]
    }
    # The builder is invoked with the "train" split here as well.
    custom_test_clip = apply(test_config, "train", custom_test_sample)
    custom_width = int(64 * self.video_width / self.video_height)
    self.assertEqual(
        custom_test_clip.size(),
        torch.Size((3, n_frames, 64, custom_width)),
    )
    self.assertTrue(custom_test_clip.dtype == torch.float)