import os
import sys

import torch
import torch.nn as nn
import torch.utils.data as data
from torchvision import transforms

# Project-local helpers (TensorboardVisualizer, get_segmentation_dataset,
# make_data_sampler, make_batch_data_sampler, get_segmentation_model,
# get_segmentation_loss, WarmupPolyLR, SegmentationMetric) are assumed to be
# importable from the surrounding repository.


def __init__(self, args):
    self.args = args
    self.device = torch.device(args.device)

    # Visualizer
    self.visualizer = TensorboardVisualizer(args, sys.argv)

    # image transform
    input_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
    ])

    # dataset and dataloader
    data_kwargs = {
        'transform': input_transform,
        'base_size': args.base_size,
        'crop_size': args.crop_size
    }
    train_dataset = get_segmentation_dataset(args.dataset, split='train', mode='train', **data_kwargs)
    val_dataset = get_segmentation_dataset(args.dataset, split='val', mode='val', **data_kwargs)
    args.iters_per_epoch = len(train_dataset) // (args.num_gpus * args.batch_size)
    args.max_iters = args.epochs * args.iters_per_epoch

    train_sampler = make_data_sampler(train_dataset, shuffle=True, distributed=args.distributed)
    train_batch_sampler = make_batch_data_sampler(train_sampler, args.batch_size, args.max_iters)
    val_sampler = make_data_sampler(val_dataset, False, args.distributed)
    val_batch_sampler = make_batch_data_sampler(val_sampler, args.batch_size)

    self.train_loader = data.DataLoader(dataset=train_dataset,
                                        batch_sampler=train_batch_sampler,
                                        num_workers=args.workers,
                                        pin_memory=True)
    self.val_loader = data.DataLoader(dataset=val_dataset,
                                      batch_sampler=val_batch_sampler,
                                      num_workers=args.workers,
                                      pin_memory=True)

    # create network; SyncBatchNorm is required for correct statistics under DDP
    BatchNorm2d = nn.SyncBatchNorm if args.distributed else nn.BatchNorm2d
    self.model = get_segmentation_model(model=args.model, dataset=args.dataset, backbone=args.backbone,
                                        aux=args.aux, norm_layer=BatchNorm2d).to(self.device)  # jpu=args.jpu

    # resume checkpoint if needed
    if args.resume:
        if os.path.isfile(args.resume):
            name, ext = os.path.splitext(args.resume)
            assert ext in ('.pkl', '.pth'), 'Sorry, only .pth and .pkl files are supported.'
            print('Resuming training, loading {}...'.format(args.resume))
            self.model.load_state_dict(
                torch.load(args.resume, map_location=lambda storage, loc: storage))

    # create criterion
    self.criterion = get_segmentation_loss(args.model, use_ohem=args.use_ohem, aux=args.aux,
                                           aux_weight=args.aux_weight, ignore_index=-1).to(self.device)

    # optimizer: parameter groups cover only the pretrained backbone plus the
    # head and aux layers; the newly initialized layers get a 10x learning rate
    params_list = list()
    if hasattr(self.model, 'pretrained'):
        params_list.append({'params': self.model.pretrained.parameters(), 'lr': args.lr})
    if hasattr(self.model, 'exclusive'):
        for module in self.model.exclusive:
            params_list.append({'params': getattr(self.model, module).parameters(), 'lr': args.lr * 10})
    self.optimizer = torch.optim.SGD(params_list,
                                     lr=args.lr,
                                     momentum=args.momentum,
                                     weight_decay=args.weight_decay)

    # lr scheduling
    self.lr_scheduler = WarmupPolyLR(self.optimizer,
                                     max_iters=args.max_iters,
                                     power=0.9,
                                     warmup_factor=args.warmup_factor,
                                     warmup_iters=args.warmup_iters,
                                     warmup_method=args.warmup_method)

    if args.distributed:
        self.model = nn.parallel.DistributedDataParallel(self.model,
                                                         device_ids=[args.local_rank],
                                                         output_device=args.local_rank)

    # evaluation metrics
    self.metric = SegmentationMetric(train_dataset.num_class)
    self.best_pred = 0.0
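# --- Sketch (not from the original): the batch sampler above is sized by
# --- args.max_iters, so training is iteration-based rather than epoch-based,
# --- and the scheduler must be stepped once per batch. The loop below shows
# --- how the attributes built in __init__ are typically consumed; the loop
# --- structure, the (image, target, filename) tuple, and the criterion
# --- returning a dict of losses are assumptions, not part of the original.
def train(self):
    self.model.train()
    for iteration, (images, targets, _) in enumerate(self.train_loader, 1):
        images = images.to(self.device)
        targets = targets.to(self.device)

        outputs = self.model(images)
        loss_dict = self.criterion(outputs, targets)
        loss = sum(l for l in loss_dict.values())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()  # WarmupPolyLR decays per iteration, not per epoch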
def __init__(self, args, logger):
    self.args = args
    self.logger = logger
    if get_rank() == 0:
        TBWriter.init(
            os.path.join(args.project_dir, args.task_dir, "tbevents")
        )
    self.device = torch.device(args.device)
    self.meters = MetricLogger(delimiter=" ")

    # image transform
    input_transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize(
                [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
            ),
        ]
    )

    # dataset and dataloader
    data_kwargs = {
        "transform": input_transform,
        "base_size": args.base_size,
        "crop_size": args.crop_size,
        "root": args.dataroot,
    }
    train_dataset = get_segmentation_dataset(
        args.dataset, split="train", mode="train", **data_kwargs
    )
    val_dataset = get_segmentation_dataset(
        args.dataset, split="val", mode="val", **data_kwargs
    )
    args.iters_per_epoch = len(train_dataset) // (
        args.num_gpus * args.batch_size
    )
    args.max_iters = args.epochs * args.iters_per_epoch

    train_sampler = make_data_sampler(
        train_dataset, shuffle=True, distributed=args.distributed
    )
    train_batch_sampler = make_batch_data_sampler(
        train_sampler, args.batch_size, args.max_iters
    )
    val_sampler = make_data_sampler(val_dataset, False, args.distributed)
    val_batch_sampler = make_batch_data_sampler(
        val_sampler, args.batch_size
    )

    self.train_loader = data.DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        pin_memory=True,
    )
    self.val_loader = data.DataLoader(
        dataset=val_dataset,
        batch_sampler=val_batch_sampler,
        num_workers=args.workers,
        pin_memory=True,
    )

    # create network; SyncBatchNorm is required for correct statistics under DDP
    BatchNorm2d = nn.SyncBatchNorm if args.distributed else nn.BatchNorm2d
    self.model = get_segmentation_model(
        model=args.model,
        dataset=args.dataset,
        backbone=args.backbone,
        aux=args.aux,
        jpu=args.jpu,
        norm_layer=BatchNorm2d,
    ).to(self.device)

    # resume checkpoint if needed
    if args.resume:
        if os.path.isfile(args.resume):
            name, ext = os.path.splitext(args.resume)
            assert ext in (".pkl", ".pth"), (
                "Sorry, only .pth and .pkl files are supported."
            )
            print("Resuming training, loading {}...".format(args.resume))
            self.model.load_state_dict(
                torch.load(
                    args.resume, map_location=lambda storage, loc: storage
                )
            )

    # create criterion
    self.criterion = get_segmentation_loss(
        args.model,
        use_ohem=args.use_ohem,
        aux=args.aux,
        aux_weight=args.aux_weight,
        ignore_index=-1,
    ).to(self.device)

    # optimizer: parameter groups cover only the pretrained backbone plus the
    # head and aux layers; newly initialized layers get an args.lr_scale'd rate
    params_list = list()
    if hasattr(self.model, "pretrained"):
        params_list.append(
            {"params": self.model.pretrained.parameters(), "lr": args.lr}
        )
    if hasattr(self.model, "exclusive"):
        for module in self.model.exclusive:
            params_list.append(
                {
                    "params": getattr(self.model, module).parameters(),
                    "lr": args.lr * args.lr_scale,
                }
            )
    self.optimizer = torch.optim.SGD(
        params_list,
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    # lr scheduling
    self.lr_scheduler = get_lr_scheduler(self.optimizer, args)

    if args.distributed:
        self.model = nn.parallel.DistributedDataParallel(
            self.model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
        )

    # evaluation metrics
    self.metric = SegmentationMetric(train_dataset.num_class)
    self.best_pred = 0.0
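# --- get_lr_scheduler is not defined in this snippet. A minimal sketch that
# --- stays consistent with the first variant's hard-coded WarmupPolyLR call;
# --- the function body is an assumption, not the original implementation.
def get_lr_scheduler(optimizer, args):
    return WarmupPolyLR(
        optimizer,
        max_iters=args.max_iters,
        power=0.9,
        warmup_factor=args.warmup_factor,
        warmup_iters=args.warmup_iters,
        warmup_method=args.warmup_method,
    )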