def main():
    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    model = models.__dict__[args.arch]().cuda()
    if args.distributed:
        model = DDP(model)

    data, train_sampler = torch_loader(f'{args.data}-sz/160', 128, 256)
    learner = Learner.from_model_data(model, data)
    learner.crit = F.cross_entropy
    learner.metrics = [accuracy, top5]
    if args.fp16:
        learner.half()

    wd = 2e-5
    update_model_dir(learner, args.save_dir)
    fit(learner, '1', 0.03, 1, train_sampler, wd)

    data, train_sampler = torch_loader(f'{args.data}-sz/320', 128, 256)
    learner.set_data(data)
    fit(learner, '3', 1e-1, 1, train_sampler, wd)

    data, train_sampler = torch_loader(args.data, 128, 256)
    learner.set_data(data)
    fit(learner, '3', 1e-1, 1, train_sampler, wd)

    print('Finished!')
def _check_and_init_distributed_model(self):
    if not self.options.use_data_parallel_distributed:
        return

    if not dist.is_initialized():
        world_size = self.options.dist_world_size
        url = self.options.dist_url
        rank = self.options.dist_rank
        # This is for SLURM's special use case
        if rank == -1:
            rank = int(os.environ.get("SLURM_NODEID"))

        print("=> Distributed training: world size: {}, rank: {}, URL: {}".
              format(world_size, rank, url))

        dist.init_process_group(backend="nccl",
                                init_method=url,
                                rank=rank,
                                world_size=world_size)

    # Initialize the distributed data parallel model
    master_gpu = self.options.gpu
    if master_gpu is None or master_gpu < 0:
        raise RuntimeError("Distributed training requires the model to be "
                           "placed on a GPU, but no GPU was given in the "
                           "arguments")

    # Distributed model initialization requires the model to already be on the
    # GPU. Later code will move the same model to self.options.gpu again, so
    # doing it here as well is safe.
    self.resnet.cuda(master_gpu)
    self.resnet = nn.parallel.DistributedDataParallel(
        self.resnet, output_device=master_gpu)
def init_platform():
    config_file = cfg_from_file('config.yml')
    default_file = cfg_from_file('default.yml')
    logger.info(pprint.pformat(default_file))
    logger.info(pprint.pformat(config_file))
    merge_a_into_b(config_file, config)
    merge_a_into_b(default_file, default)

    default.best_model_path = ''
    if default.gpu == '':
        default.gpu = None
    if default.gpu is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = default.gpu

    default.distributed = default.world_size > 1
    if default.distributed:
        dist.init_process_group(backend=default.dist_backend,
                                init_method=default.dist_url,
                                world_size=default.world_size)

    default.lr_epoch = [int(ep) for ep in default.lr_step.split(',')]

    if default.seed is not None:
        seed = default.seed
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)
        cudnn.deterministic = True
def main():
    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()

    if args.distributed:
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    if args.pretrained:
        model = models.__dict__[args.arch](pretrained=True)
    else:
        model = models.__dict__[args.arch]()

    model = model.cuda()
    if args.distributed:
        model = DDP(model)

    data, train_sampler = torch_loader(f'{args.data}-160', 128, 256)
    learner = Learner.from_model_data(model, data)
    learner.crit = F.cross_entropy
    learner.metrics = [accuracy, top5]
    if args.fp16:
        learner.half()

    update_model_dir(learner, args.save_dir)
    wd = 1e-4
    lr = 0.1

    data, train_sampler = torch_loader(args.data, 224, 192)
    learner.set_data(data)
    fit(learner, '1', lr/4, 1, train_sampler, wd)
    fit(learner, '1', lr/2, 1, train_sampler, wd)
    fit(learner, '2', lr, 28, train_sampler, wd)
    # data, train_sampler = torch_loader(args.data, 224, 192)
    # learner.set_data(data)
    # fit(learner, '3', lr, 5, train_sampler, wd)
    fit(learner, '4', lr/10, 25, train_sampler, wd)
    fit(learner, '5', lr/100, 25, train_sampler, wd)

    data, train_sampler = torch_loader(args.data, 288, 128, min_scale=0.5)
    learner.set_data(data)
    fit(learner, '6', lr/500, 10, train_sampler, wd)
    # save_sched(learner.sched, args.save_dir)
    # fit(learner, '7', 1e-4, 10, train_sampler, wd/4)

    # TTA works ~50% of the time. Hoping top5 works better
    print('\n TTA \n')
    log_preds, y = learner.TTA()
    preds = np.mean(np.exp(log_preds), 0)
    acc = accuracy(torch.FloatTensor(preds), torch.LongTensor(y))
    t5 = top5(torch.FloatTensor(preds), torch.LongTensor(y))
    print('TTA acc:', acc)
    print('TTA top5:', t5[0])

    with open(f'{args.save_dir}/tta_accuracy.txt', "a", 1) as f:
        f.write(time.strftime("%Y-%m-%dT%H:%M:%S") + f"\tTTA accuracy: {acc}\tTop5: {t5}")

    print('Finished!')
def __init__(self, party, master_addr='127.0.0.1', master_port='29500'):
    self.party = party
    self.other = 0 if party else 1
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = master_port
    # currently only supports sending between 2 parties over tcp
    dist.init_process_group('tcp', rank=party, world_size=2)
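# Note: the 'tcp' backend used above was removed in PyTorch 1.0. A minimal
# sketch of the same two-party setup on current releases, assuming the gloo
# backend (hypothetical helper, not part of the original source):
def init_two_party_gloo(party, master_addr='127.0.0.1', master_port='29500'):
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = master_port
    # gloo supports the same point-to-point send/recv between the two ranks
    dist.init_process_group('gloo', rank=party, world_size=2)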
def main():
    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()

    if args.distributed:
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "missing cudnn"

    model = cifar10models.__dict__[args.arch] if args.arch in cifar10_names else models.__dict__[args.arch]
    if args.pretrained:
        model = model(pretrained=True)
    else:
        model = model()
    model = model.cuda()
    if args.distributed:
        model = DDP(model)
    if args.data_parallel:
        model = nn.DataParallel(model, [0, 1, 2, 3])

    data, train_sampler = torch_loader(args.data, args.sz)
    learner = Learner.from_model_data(model, data)
    # print(learner.summary()); exit()
    learner.crit = F.cross_entropy
    learner.metrics = [accuracy]
    if args.fp16:
        learner.half()

    if args.prof:
        args.epochs, args.cycle_len = 1, 0.01
    if args.use_clr:
        args.use_clr = tuple(map(float, args.use_clr.split(',')))

    # Full size
    update_model_dir(learner, args.save_dir)
    sargs = save_args('first_run', args.save_dir)
    if args.warmup:
        learner.fit(args.lr / 10, 1, cycle_len=1, sampler=train_sampler,
                    wds=args.weight_decay, use_clr_beta=(100, 1, 0.9, 0.8),
                    loss_scale=args.loss_scale, **sargs)
    learner.fit(args.lr, args.epochs, cycle_len=args.cycle_len,
                sampler=train_sampler, wds=args.weight_decay,
                use_clr_beta=args.use_clr, loss_scale=args.loss_scale, **sargs)
    save_sched(learner.sched, args.save_dir)
    print('Finished!')

    if args.use_tta:
        log_preds, y = learner.TTA()
        preds = np.mean(np.exp(log_preds), 0)
        acc = accuracy(torch.FloatTensor(preds), torch.LongTensor(y))
        print('TTA acc:', acc)

        with open(f'{args.save_dir}/tta_accuracy.txt', "a", 1) as f:
            f.write(time.strftime("%Y-%m-%dT%H:%M:%S") + f"\tTTA accuracy: {acc}\n")
def test_mpi():
    dist.init_process_group('mpi')
    world_size = dist.get_world_size()
    rank = dist.get_rank()

    vector = [0] * world_size
    vector[rank] = 1
    vector = torch.DoubleTensor(vector)

    dist.all_reduce(vector, op=dist.reduce_op.SUM)
    print("Host {} : Rank {} : {}".format(get_hostname(), rank, vector))
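# Usage note: the 'mpi' backend is only available when PyTorch is built from
# source against an MPI implementation, and the script must be started through
# the MPI launcher, e.g. for four ranks:
#
#   mpirun -np 4 python test_mpi.py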
def _run(self, rank):
    self.rank = rank
    try:
        dist.init_process_group(backend=BACKEND)
    except RuntimeError as e:
        if 'recompile' in e.args[0]:
            sys.exit(0)
    # self.id() == e.g. '__main__.TestDistributed.test_get_rank'
    # We're retrieving a corresponding test and executing it.
    getattr(self, self.id().split(".")[2])()
    sys.exit(0)
def initialize(options):
    """Initialize environment and add additional information."""
    if not (hasattr(dist, '_initialized') and dist._initialized):
        dist.init_process_group(options.comm_backend)

    config_logging(options)
    config_pytorch(options)
    config_path(options)
    return options
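# dist._initialized is a private, pre-1.0 attribute; a minimal sketch of the
# same guard spelled with the public API of current PyTorch:
def initialize_current(options):
    if not dist.is_initialized():
        dist.init_process_group(options.comm_backend)
    config_logging(options)
    config_pytorch(options)
    config_path(options)
    return options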
def _run(self, rank):
    self.rank = rank
    try:
        dist.init_process_group(init_method=INIT_METHOD,
                                backend=BACKEND,
                                world_size=int(WORLD_SIZE))
    except RuntimeError as e:
        if 'recompile' in e.args[0]:
            sys.exit(0)
        raise
    # self.id() == e.g. '__main__.TestDistributed.test_get_rank'
    # We're retrieving a corresponding test and executing it.
    getattr(self, self.id().split(".")[2])()
    sys.exit(0)
def _init_dist_pytorch(backend, **kwargs):
    # TODO: use local_rank instead of rank % num_gpus
    rank = int(os.environ["RANK"])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)
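# A minimal sketch of the variant the TODO above asks for, assuming a launcher
# such as torch.distributed.launch or torchrun that exports LOCAL_RANK
# (hypothetical helper, not part of the original source):
def _init_dist_pytorch_local_rank(backend, **kwargs):
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend=backend, **kwargs)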
def test_synchronize_sgd():
    torch.manual_seed(42)
    dist.init_process_group('mpi')
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    device = torch.device('cpu')
    # device = torch.device('cuda')  # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random Tensors to hold input and outputs
    x = torch.randn(N, D_in, device=device)
    y = torch.randn(N, D_out, device=device)
    x = x[rank::world_size]
    y = y[rank::world_size]

    # Create random Tensors for weights; setting requires_grad=True means that we
    # want to compute gradients for these Tensors during the backward pass.
    w1 = torch.randn(D_in, H, device=device, requires_grad=True)
    w2 = torch.randn(H, D_out, device=device, requires_grad=True)

    learning_rate = 1e-6
    for t in range(500):
        # Forward pass: compute predicted y using operations on Tensors. Since
        # w1 and w2 have requires_grad=True, operations involving these Tensors
        # will cause PyTorch to build a computational graph, allowing automatic
        # computation of gradients. Since we are no longer implementing the
        # backward pass by hand we don't need to keep references to
        # intermediate values.
        y_pred = x.mm(w1).clamp(min=0).mm(w2)

        # Compute and print loss. Loss is a Tensor of shape (), and loss.item()
        # is a Python number giving its value.
        loss = (y_pred - y).pow(2).sum()
        if rank == 0:
            print("Iter {} : {:10.3e}".format(t, loss.item()))

        # Use autograd to compute the backward pass. This call will compute the
        # gradient of loss with respect to all Tensors with requires_grad=True.
        # After this call w1.grad and w2.grad will be Tensors holding the
        # gradient of the loss with respect to w1 and w2 respectively.
        loss.backward()

        # Update weights using gradient descent. For this step we just want to
        # mutate the values of w1 and w2 in-place; we don't want to build up a
        # computational graph for the update steps, so we use the
        # torch.no_grad() context manager to prevent PyTorch from building one
        # for the updates.
        with torch.no_grad():
            w1 -= learning_rate * w1.grad
            w2 -= learning_rate * w2.grad

            # Manually zero the gradients after running the backward pass
            w1.grad.zero_()
            w2.grad.zero_()

            # Synchronize weights by averaging them across all workers
            dist.all_reduce(w1, op=dist.reduce_op.SUM)
            dist.all_reduce(w2, op=dist.reduce_op.SUM)
            w1 /= world_size
            w2 /= world_size
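# The loop above synchronizes the *weights* after each local update; the more
# common formulation of synchronous SGD averages the *gradients* before the
# update. A minimal sketch of one such step, assuming an initialized process
# group and leaf tensors with populated .grad as in test_synchronize_sgd
# (hypothetical helper, not part of the original source):
def sgd_step_with_gradient_averaging(weights, learning_rate, world_size):
    with torch.no_grad():
        for w in weights:
            # average gradients across workers, then take the local SGD step
            dist.all_reduce(w.grad, op=dist.reduce_op.SUM)
            w.grad /= world_size
            w -= learning_rate * w.grad
            w.grad.zero_()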
def update(self, val, n=1):
    self.val = val
    self.sum += val * n
    self.count += n
    self.avg = self.sum / self.count


if __name__ == '__main__':
    args = parser.parse_args()
    args.distributed = args.world_size > 1
    main_proc = True
    if args.distributed:
        if args.gpu_rank:
            torch.cuda.set_device(int(args.gpu_rank))
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
        main_proc = args.rank == 0  # Only the first proc should save models
    save_folder = args.save_folder

    loss_results, cer_results, wer_results = torch.Tensor(args.epochs), \
        torch.Tensor(args.epochs), torch.Tensor(args.epochs)
    best_wer = None
    if args.visdom and main_proc:
        from visdom import Visdom
        viz = Visdom()
        opts = dict(title=args.id, ylabel='', xlabel='Epoch', legend=['Loss', 'WER', 'CER'])
        viz_window = None
        epochs = torch.arange(1, args.epochs + 1)
    if args.tensorboard and main_proc:
        os.makedirs(args.log_dir, exist_ok=True)
def main():
    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()

    if args.distributed:
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
    if args.cycle_len > 1:
        args.cycle_len = int(args.cycle_len)

    # create model
    if args.pretrained:
        model = models.__dict__[args.arch](pretrained=True)
    else:
        model = models.__dict__[args.arch]()

    model = model.cuda()
    if args.distributed:
        model = DDP(model)

    if args.train_128:
        data, train_sampler = torch_loader(f'{args.data}-160', 128)
    else:
        data, train_sampler = torch_loader(args.data, args.sz)

    learner = Learner.from_model_data(model, data)
    learner.crit = F.cross_entropy
    learner.metrics = [accuracy, top5]
    if args.fp16:
        learner.half()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.resume.endswith('.h5'):
                args.resume = args.resume[:-len('.h5')]
            learner.load(args.resume)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.prof:
        args.epochs = 1
        args.cycle_len = 1
    if args.use_clr:
        args.use_clr = tuple(map(float, args.use_clr.split(',')))

    # 128x128
    if args.train_128:
        save_dir = f'{args.save_dir}/128'
        update_model_dir(learner, save_dir)
        sargs = save_args('first_run_128', save_dir)
        learner.fit(args.lr, args.epochs, cycle_len=args.cycle_len,
                    sampler=train_sampler, wds=args.weight_decay,
                    use_clr_beta=args.use_clr, loss_scale=args.loss_scale,
                    **sargs)
        save_sched(learner.sched, save_dir)
        data, train_sampler = torch_loader(args.data, args.sz)
        learner.set_data(data)

    # Full size
    update_model_dir(learner, args.save_dir)
    sargs = save_args('first_run', args.save_dir)
    learner.fit(args.lr, args.epochs, cycle_len=args.cycle_len,
                sampler=train_sampler, wds=args.weight_decay,
                use_clr_beta=args.use_clr, loss_scale=args.loss_scale,
                **sargs)
    save_sched(learner.sched, args.save_dir)

    # TTA works ~50% of the time. Hoping top5 works better
    if args.use_tta:
        log_preds, y = learner.TTA()
        preds = np.mean(np.exp(log_preds), 0)
        acc = accuracy(torch.FloatTensor(preds), torch.LongTensor(y))
        t5 = top5(torch.FloatTensor(preds), torch.LongTensor(y))
        print('TTA acc:', acc)
        print('TTA top5:', t5[0])

        with open(f'{args.save_dir}/tta_accuracy.txt', "a", 1) as f:
            f.write(time.strftime("%Y-%m-%dT%H:%M:%S") + f"\tTTA accuracy: {acc}\tTop5: {t5}")

    print('Finished!')
def main():
    global args, best_prec1
    args = parser.parse_args()
    args.distributed = args.world_size > 1

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    if args.scratch:
        checkpoint = torch.load(args.scratch)
        model = resnet34(cfg=checkpoint['cfg'])

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    history_score = np.zeros((args.epochs + 1, 1))
    np.savetxt(os.path.join(args.save, 'record.txt'), history_score, fmt='%10.5f', delimiter=',')
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)
        history_score[epoch] = prec1
        np.savetxt(os.path.join(args.save, 'record.txt'), history_score, fmt='%10.5f', delimiter=',')

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, is_best, args.save)

    history_score[-1] = best_prec1
    np.savetxt(os.path.join(args.save, 'record.txt'), history_score, fmt='%10.5f', delimiter=',')
def main():
    global args, best_prec1

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    # print(args.resume)
    # model = torch.load(args.resume)
    model = ResNetDCT_Upscaled_Static(channels=int(args.subset), input_gate=True, pretrained=False)
    model.fc1 = nn.Linear(model.fc1.in_features, 49)
    model = nn.DataParallel(model)
    cudnn.benchmark = True
    print(args.resume)
    model.load_state_dict(torch.load(args.resume))

    # # define loss function (criterion) and optimizer
    # criterion = nn.CrossEntropyLoss().cuda()
    # optimizer = torch.optim.SGD(model.parameters(),
    #                             lr=0.001,
    #                             momentum=0.9,
    #                             weight_decay=5e-4)

    # # Resume
    # print("1")
    # title = 'ImageNet-' + args.arch
    # if not os.path.isdir(args.checkpoint):
    #     mkdir_p(args.checkpoint)
    # if args.resume:
    #     # Load checkpoint.
    #     print('==> Resuming from checkpoint..')
    #     checkpoint = torch.load(args.resume)
    #     best_prec1 = checkpoint['best_prec1']
    #     model_dict = model.state_dict()
    #     pretrained_dict = {
    #         k: v
    #         for k, v in checkpoint['state_dict'].items() if k in model_dict
    #     }
    #     model_dict.update(pretrained_dict)
    #     model.load_state_dict(model_dict, strict=False)
    #     # model.load_state_dict(checkpoint['state_dict'], strict=False)
    #     args.checkpoint = os.path.dirname(args.resume)

    # if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
    #     model.features = torch.nn.DataParallel(model.features)
    #     model = model.cuda()
    # else:
    #     model = torch.nn.DataParallel(model).cuda()
    # cudnn.benchmark = True

    # print('Total params: %.2fM' %
    #       (sum(p.numel() for p in model.parameters()) / 1000000.0))

    # Data loading code
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
    #                                             step_size=10,
    #                                             gamma=0.1)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [160, 180], 0.1)

    test_loader, test_val = testloader_upscaled_static(args, model='resnet')

    print('Evaluation only')
    test_model(model, test_loader)
    # test1(model, val_loader)
    # test(model)
    return
def main():
    global best_prec1, args

    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    model = model.cuda()
    n_dev = torch.cuda.device_count()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        model = DDP(model)
        # args.lr *= n_dev
    elif args.dp:
        model = nn.DataParallel(model)
        args.batch_size *= n_dev
        # args.lr *= n_dev

    global param_copy
    if args.fp16:
        param_copy = [param.clone().type(torch.cuda.FloatTensor).detach()
                      for param in model.parameters()]
        for param in param_copy:
            param.requires_grad = True
    else:
        param_copy = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(param_copy, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume,
                                    map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(args.sz),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    train_sampler = (torch.utils.data.distributed.DistributedSampler(train_dataset)
                     if args.distributed else None)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(int(args.sz * 1.14)),
            transforms.CenterCrop(args.sz),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)
        if args.prof:
            break

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
# global parameters
num_warmup = 1
num_benchmark = 5

# get env variables
comm_addr = os.getenv("SLURM_SRUN_COMM_HOST")
comm_size = int(os.getenv("SLURM_NTASKS"))
comm_rank = int(os.getenv("PMI_RANK"))
comm_local_rank = comm_rank % torch.cuda.device_count()
comm_port = "29500"
os.environ["MASTER_ADDR"] = comm_addr
os.environ["MASTER_PORT"] = comm_port

# init process group
dist.init_process_group(backend="nccl", rank=comm_rank, world_size=comm_size)

# load parameters
params = YParams("config/UNet_transpose.yaml", "default")
device = torch.device("cuda:{}".format(comm_local_rank))

# get data loader
dist.barrier()
tstart = time.time()
train_data_loader = get_data_loader_distributed(params, comm_rank, comm_local_rank)
dist.barrier()
tend = time.time()
if comm_rank == 0:
    print("Setup: took {}s".format(tend - tstart))
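# num_warmup / num_benchmark are defined above but unused in this excerpt; a
# minimal sketch of the timing loop they presumably drive, assuming the loader
# yields (input, target) pairs (hypothetical helper, not from the original):
def benchmark_loader(loader, device, num_warmup, num_benchmark):
    for _ in range(num_warmup):  # untimed warm-up passes
        for inp, tar in loader:
            inp, tar = inp.to(device), tar.to(device)
    dist.barrier()
    tstart = time.time()
    for _ in range(num_benchmark):  # timed passes
        for inp, tar in loader:
            inp, tar = inp.to(device), tar.to(device)
    dist.barrier()
    return (time.time() - tstart) / num_benchmark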
assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size)))  # extend to 2 sizes (train, test)
log_dir = increment_dir(Path(opt.logdir) / 'exp', opt.name)  # runs/exp1
device = select_device(opt.device, batch_size=opt.batch_size)

# DDP mode
if opt.local_rank != -1:
    assert torch.cuda.device_count() > opt.local_rank
    torch.cuda.set_device(opt.local_rank)
    device = torch.device('cuda', opt.local_rank)
    dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
    assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
    opt.batch_size = opt.total_batch_size // opt.world_size

logger.info(opt)
with open(opt.hyp) as f:
    hyp = yaml.load(f, Loader=yaml.FullLoader)  # load hyps

# Train
if not opt.evolve:
    tb_writer = None
    if opt.global_rank in [-1, 0]:
        logger.info('Start Tensorboard with "tensorboard --logdir %s", view at http://localhost:6006/' % opt.logdir)
        tb_writer = SummaryWriter(log_dir=log_dir)  # runs/exp0
def train300_mlperf_coco(args):
    global torch
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    args.distributed = False
    if use_cuda:
        try:
            from apex.parallel import DistributedDataParallel as DDP
            if 'WORLD_SIZE' in os.environ:
                args.distributed = int(os.environ['WORLD_SIZE']) > 1
        except:
            raise ImportError("Please install APEX from https://github.com/nvidia/apex")

    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist
        # ssd_print(key=mlperf_log.RUN_SET_RANDOM_SEED)
        if args.no_cuda:
            device = torch.device('cpu')
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl', init_method='env://')

            # set seeds properly
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32
            print(dist.get_rank(), "Using seed = {}".format(local_seed))
            torch.manual_seed(local_seed)
            np.random.seed(seed=local_seed)

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data, "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data, "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)
    # print("Number of labels: {}".format(train_coco.labelnum))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_coco)
    else:
        train_sampler = None
    # pass the sampler to the DataLoader (the original created it but never
    # used it, which also made the set_epoch() call below a no-op); shuffle
    # only when no DistributedSampler is used
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=4)

    ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    ssd_print(key=mlperf_log.INPUT_ORDER)
    ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()

    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    # parallelize
    if args.distributed:
        ssd300 = DDP(ssd300)

    global_batch_size = N_gpu * args.batch_size
    current_lr = args.lr * (global_batch_size / 32)
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY, value=current_weight_decay)

    eval_points = np.array(args.evaluation) * 32 / global_batch_size
    eval_points = list(map(int, list(eval_points)))

    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()

    for epoch in range(args.epochs):
        ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader):
            if iter_num == (160000 * 32) // global_batch_size:
                current_lr *= 0.1
                print("")
                print("lr decay step #1")
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            if iter_num == (200000 * 32) // global_batch_size:
                current_lr *= 0.1
                print("")
                print("lr decay step #2")
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)
            trans_bbox = bbox.transpose(1, 2).contiguous()
            if use_cuda:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"
                  .format(iter_num, loss.item(), avg_loss), end="\r")
            optim.zero_grad()
            loss.backward()
            if args.warmup is not None:
                lr_warmup(optim, args.warmup, iter_num, current_lr, args)
            optim.step()

            if iter_num in eval_points:
                rank = dist.get_rank() if args.distributed else args.local_rank
                if rank == 0:
                    if not args.no_save:
                        print("")
                        print("saving model...")
                        torch.save({"model": ssd300.state_dict(),
                                    "label_map": train_coco.label_info},
                                   "./models/iter_{}.pt".format(iter_num))

                    if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                                 args.threshold, epoch, iter_num):
                        success = torch.ones(1)
                        if use_cuda:
                            success = success.cuda()
                if args.distributed:
                    dist.all_reduce(success, op=dist.reduce_op.MAX)
                if success[0]:
                    return True

            iter_num += 1

    return False
def main(cfg, local_rank):
    torch.manual_seed(cfg.SEED)
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
    Dataset = get_dataset(cfg.SAMPLE_METHOD, cfg.TASK)

    print('Creating model...')
    model = create_model(cfg.MODEL.NAME, cfg.MODEL.HEAD_CONV, cfg)

    num_gpus = torch.cuda.device_count()
    if cfg.TRAIN.DISTRIBUTE:
        device = torch.device('cuda:%d' % local_rank)
        torch.cuda.set_device(local_rank)
        dist.init_process_group(backend='nccl', init_method='env://',
                                world_size=num_gpus, rank=local_rank)
    else:
        device = torch.device('cuda')

    logger = Logger(cfg)

    if cfg.TRAIN.OPTIMIZER == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), cfg.TRAIN.LR)
    elif cfg.TRAIN.OPTIMIZER == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=cfg.TRAIN.LR, momentum=0.9)
    else:
        raise NotImplementedError

    start_epoch = 0
    # if cfg.MODEL.INIT_WEIGHTS:
    #     model, optimizer, start_epoch = load_model(
    #         model, cfg.MODEL.PRETRAINED, optimizer, cfg.TRAIN.RESUME, cfg.TRAIN.LR, cfg.TRAIN.LR_STEP)

    Trainer = train_factory[cfg.TASK]
    trainer = Trainer(cfg, local_rank, model, optimizer)

    if cfg.TRAIN.MASTER_BATCH_SIZE == -1:
        master_batch_size = cfg.TRAIN.BATCH_SIZE // len(cfg.GPUS)
    else:
        master_batch_size = cfg.TRAIN.MASTER_BATCH_SIZE
    rest_batch_size = cfg.TRAIN.BATCH_SIZE - master_batch_size
    chunk_sizes = [master_batch_size]
    for i in range(len(cfg.GPUS) - 1):
        slave_chunk_size = rest_batch_size // (len(cfg.GPUS) - 1)
        if i < rest_batch_size % (len(cfg.GPUS) - 1):
            slave_chunk_size += 1
        chunk_sizes.append(slave_chunk_size)
    trainer.set_device(cfg.GPUS, chunk_sizes, device)

    print('Setting up data...')
    val_dataset = Dataset(cfg, 'val')
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True
    )

    train_dataset = Dataset(cfg, 'train')
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=num_gpus, rank=local_rank)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE // num_gpus if cfg.TRAIN.DISTRIBUTE else cfg.TRAIN.BATCH_SIZE,
        shuffle=not cfg.TRAIN.DISTRIBUTE,
        num_workers=cfg.WORKERS,
        pin_memory=True,
        drop_last=True,
        sampler=train_sampler if cfg.TRAIN.DISTRIBUTE else None
    )

    print('Starting training...')
    best = 0.
    for epoch in range(start_epoch + 1, cfg.TRAIN.EPOCHS + 1):
        mark = epoch if cfg.TRAIN.SAVE_ALL_MODEL else 'last'
        train_sampler.set_epoch(epoch)
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))

        if cfg.TRAIN.VAL_INTERVALS > 0 and epoch % cfg.TRAIN.VAL_INTERVALS == 0:
            save_model(os.path.join(cfg.OUTPUT_DIR, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
                mAP = val_dataset.run_eval(preds, cfg.OUTPUT_DIR)
                print('mAP is: ', mAP)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if mAP > best:
                best = mAP
                save_model(os.path.join(cfg.OUTPUT_DIR, 'model_best.pth'),
                           epoch, model)
        else:
            save_model(os.path.join(cfg.OUTPUT_DIR, 'model_last.pth'),
                       epoch, model, optimizer)
        logger.write('\n')

        if epoch in cfg.TRAIN.LR_STEP:
            save_model(os.path.join(cfg.OUTPUT_DIR, 'model_{}.pth'.format(epoch)),
                       epoch, model, optimizer)
            lr = cfg.TRAIN.LR * (0.1 ** (cfg.TRAIN.LR_STEP.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

    logger.close()
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    else:
        print("We should be distributed")
        return

    # create model
    model = None
    if args.arch == "alexnet":
        model = AlexNet()
    if args.arch == "vgg16":
        model = VGG16()
    if args.arch == "vgg19":
        model = VGG19()
    if args.arch == "vgg13":
        model = VGG13()
    if args.arch == "lenet":
        model = LeNet()
    if args.arch == "resnet50":
        model = resnet50()
    if args.arch == "goolenet":
        model = GoogLeNet()

    torch.cuda.set_device(args.gpu)
    model.cuda(args.gpu)
    # When using a single GPU per process and per
    # DistributedDataParallel, we need to divide the batch size
    # ourselves based on the total number of GPUs we have
    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.workers = int(args.workers / ngpus_per_node)
    # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
    model = distributed_layer_sparse_whole_randperm_threshold_mem.DistributedDataParallel(
        model,
        device_id=args.gpu,
        sparse_ratio=args.sparse_ratio,
        sparse_threshold=args.sparse_threshold,
        mem_decay=args.memory_decay)

    args.file_name = (args.method + '_' + args.arch
                      + "batch-size_" + str(args.batch_size)
                      + "_sparse-ratio_" + str(args.sparse_ratio)
                      + "_sparse-threshold_" + str(args.sparse_threshold)
                      + "_memory-decay_" + str(args.memory_decay))

    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    # traindir = os.path.join(args.data, 'train')
    # valdir = os.path.join(args.data, 'val')
    # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
    #                                  std=[0.229, 0.224, 0.225])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    train_dataset = torchvision.datasets.CIFAR10(root=args.data,
                                                 train=True,
                                                 download=True,
                                                 transform=transform_train)
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               sampler=train_sampler)

    testset = torchvision.datasets.CIFAR10(root=args.data,
                                           train=False,
                                           download=True,
                                           transform=transform_test)
    val_loader = torch.utils.data.DataLoader(testset,
                                             batch_size=100,
                                             shuffle=False,
                                             num_workers=2)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    epoch_start = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        adjust_mem_decay(model, epoch, args)

        # train for one epoch
        losses = train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
            f = open(args.data + "/result_" + args.file_name + ".txt", "a+")
            f.write(str(epoch + 1) + '\t' + str(time.time() - epoch_start) + '\t'
                    + str(losses.avg) + '\t' + str(acc1.item()) + '\n')
            f.close()
def __init__(
    self,
    name: str,
    rank: int = -1,
    world_size: int = -1,
    init_dist: bool = True,
    init_rpc: bool = True,
    dist_backend: str = "gloo",
    dist_init_method: str = "tcp://localhost:9100",
    rpc_init_method: str = "tcp://localhost:9101",
    dist_timeout: float = 60,
    rpc_timeout: float = 60,
):
    """
    Args:
        name: A unique name to identify the current process.
        rank: A unique rank of the current process. You do not need to specify
            it if you are using `torch.distributed.launch` or `torchelastic`.
        world_size: Size of the distributed world. You do not need to specify
            it if you are using `torch.distributed.launch` or `torchelastic`.
        dist_timeout: Distributed package timeout in seconds.
        rpc_timeout: Global rpc call timeout in seconds.
    """
    self.world_size = world_size
    self.rank = rank
    self.name = name
    self.groups = {}
    self.group_create_signals = {}

    if init_dist:
        dist.init_process_group(
            backend=dist_backend,
            init_method=dist_init_method,
            timeout=timedelta(seconds=dist_timeout),
            rank=rank,
            world_size=world_size,
        )
    if init_rpc:
        rpc.init_rpc(
            self.name,
            rank=rank,
            world_size=world_size,
            backend=rpc.BackendType.TENSORPIPE,
            rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
                init_method=rpc_init_method, rpc_timeout=rpc_timeout),
        )

    # get rank-name mapping
    self.rank_name_map = {}
    for wi in rpc._get_current_rpc_agent().get_worker_infos():
        self.rank_name_map[wi.id] = wi.name

    # Start role dispatching.
    self.started = True
    self.rpc_timeout = rpc_timeout

    # map for paired values and registered services
    self.value_lut = {}
    self.service_lut = {}
    self.lut_lock = Lock()
    self.lut_manager = self.rank_name_map[0]
                    help='set the inclusive lower limit for tensor size; '
                         'default: 19 (2**19 = 512 KB)')
parser.add_argument('--min-num-tensors',
                    dest='min_num_tensors',
                    action='store',
                    default=2,
                    type=int,
                    help='set the inclusive lower limit for the number of '
                         'tensors to be sent during one test run; '
                         'default: 2 (10**2 = 100)')

args = parser.parse_args()

MIN_NUM_TENSORS = args.min_num_tensors
MIN_BYTES = args.min_bytes
MAX_NUM_TENSORS = args.max_num_tensors + 1
MAX_BYTES = args.max_bytes + 1

dist.init_process_group(backend=os.environ['BACKEND'])
rank = dist.get_rank()
dist.barrier()

if rank == 0:
    print_header("broadcast")
for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
    tensor = torch.ByteTensor(bytes).fill_(42)
    for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
        start = timer()
        for i in range(0, num_tensors):
            dist.broadcast(tensor, 0)
        end = timer()
        print_stats(bytes, num_tensors, end - start)
print()
def __enter__(self):
    dist.init_process_group(*self.args, **self.kwargs)
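# The snippet defines only __enter__; a minimal sketch of the matching
# __exit__, assuming the process group should be torn down when the context
# manager exits:
def __exit__(self, exc_type, exc_value, traceback):
    dist.destroy_process_group()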
def init_processes(rank, size, fn, backend='gloo'):
    """ Initialize the distributed environment. """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)
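# A minimal sketch of how init_processes is typically driven, following the
# PyTorch distributed tutorial pattern; `run` here is a stand-in for whatever
# per-process function you want to execute:
def run(rank, size):
    print('Hello from rank {} of {}'.format(rank, size))

if __name__ == "__main__":
    import torch.multiprocessing as mp
    size = 2
    processes = []
    for rank in range(size):
        p = mp.Process(target=init_processes, args=(rank, size, run))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()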
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
def train(
        cfg,
        data_cfg,
        img_size=1024,
        resume=False,
        epochs=273,  # 500200 batches at bs 64, dataset length 117263
        batch_size=16,
        accumulate=1,
        multi_scale=False,
        freeze_backbone=False,
        transfer=True  # Transfer learning (train only YOLO layers)
):
    init_seeds()
    weights = 'weights' + os.sep
    latest = weights + 'latest.pt'
    best = weights + 'best.pt'
    device = torch_utils.select_device()

    if multi_scale:
        img_size = 1024  # initiate with maximum multi_scale size
        opt.num_workers = 0  # bug https://github.com/ultralytics/yolov3/issues/174
    else:
        torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Configure run
    train_path = parse_data_cfg(data_cfg)['train']

    # Initialize model
    model = Darknet(cfg, img_size).to(device)

    # Optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=hyp['lr0'],
                          momentum=hyp['momentum'],
                          weight_decay=hyp['weight_decay'])

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    best_loss = float('inf')
    nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters'])  # yolo layer size (i.e. 255)
    if resume:  # Load previously saved model
        if transfer:  # Transfer learning
            chkpt = torch.load(weights + 'yolov3-dota.pt', map_location=device)
            model.load_state_dict(
                {
                    k: v
                    for k, v in chkpt['model'].items()
                    if v.numel() > 1 and v.shape[0] != 255
                },
                strict=False)
            for p in model.parameters():
                p.requires_grad = True if p.shape[0] == nf else False
        else:  # resume from latest.pt
            chkpt = torch.load(latest, map_location=device)  # load checkpoint
            model.load_state_dict(chkpt['model'])

        start_epoch = chkpt['epoch'] + 1
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_loss = chkpt['best_loss']
        del chkpt
    else:  # Initialize model with backbone (optional)
        if '-tiny.cfg' in cfg:
            cutoff = load_darknet_weights(model, weights + 'yolov3-tiny.conv.15')
        else:
            cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74')

    # Scheduler (reduce lr at epochs 218, 245, i.e. batches 400k, 450k)
    # lf = lambda x: 1 - x / epochs  # linear ramp to zero
    # lf = lambda x: 10 ** (-2 * x / epochs)  # exp ramp to lr0 * 1e-2
    # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs))  # inv exp ramp to lr0 * 1e-2
    # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf, last_epoch=start_epoch - 1)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=[218, 245],
                                               gamma=0.1,
                                               last_epoch=start_epoch - 1)

    # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y)

    # Dataset
    dataset = LoadImagesAndLabels(train_path, img_size=img_size, augment=True)

    # Initialize distributed training
    if torch.cuda.device_count() > 1:
        dist.init_process_group(backend=opt.backend,
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.rank)
        model = torch.nn.parallel.DistributedDataParallel(model)
        sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    else:
        sampler = None

    # Dataloader
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            num_workers=opt.num_workers,
                            shuffle=False,
                            pin_memory=True,
                            collate_fn=dataset.collate_fn,
                            sampler=sampler)

    # Mixed precision training https://github.com/NVIDIA/apex
    # install help: https://github.com/NVIDIA/apex/issues/259
    mixed_precision = False
    if mixed_precision:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Start training
    t = time.time()
    model.hyp = hyp  # attach hyperparameters to model
    model_info(model)
    nb = len(dataloader)
    results = (0, 0, 0, 0, 0)  # P, R, mAP, F1, test_loss
    n_burnin = min(round(nb / 5 + 1), 1000)  # burn-in batches
    os.remove('train_batch0.jpg') if os.path.exists('train_batch0.jpg') else None
    os.remove('test_batch0.jpg') if os.path.exists('test_batch0.jpg') else None

    for epoch in range(start_epoch, epochs):
        model.train()
        print(('\n%8s%12s' + '%10s' * 7) %
              ('Epoch', 'Batch', 'xy', 'wh', 'conf', 'cls', 'total', 'nTargets', 'time'))

        # Update scheduler
        scheduler.step()

        # Freeze backbone at epoch 0, unfreeze at epoch 1
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        mloss = torch.zeros(5).to(device)  # mean losses
        for i, (imgs, targets, _, _) in enumerate(dataloader):
            imgs = imgs.to(device)
            targets = targets.to(device)
            nt = len(targets)
            # if nt == 0:  # if no targets continue
            #     continue

            # Plot images with bounding boxes
            if epoch == 0 and i == 0:
                plot_images(imgs=imgs, targets=targets, fname='train_batch0.jpg')

            # SGD burn-in
            if epoch == 0 and i <= n_burnin:
                lr = hyp['lr0'] * (i / n_burnin) ** 4
                for x in optimizer.param_groups:
                    x['lr'] = lr

            # Run model
            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model)
            if torch.isnan(loss):
                print('WARNING: nan loss detected, ending training')
                return results

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Accumulate gradient for x batches before optimizing
            if (i + 1) % accumulate == 0 or (i + 1) == nb:
                optimizer.step()
                optimizer.zero_grad()

            # Update running mean of tracked metrics
            mloss = (mloss * i + loss_items) / (i + 1)

            # Print batch results
            s = ('%8s%12s' + '%10.3g' * 7) % (
                '%g/%g' % (epoch, epochs - 1),
                '%g/%g' % (i, nb - 1), *mloss, nt, time.time() - t)
            t = time.time()
            print(s)

            # Multi-Scale training (320 - 608 pixels) every 10 batches
            if multi_scale and (i + 1) % 10 == 0:
                dataset.img_size = random.choice(range(10, 20)) * 32
                print('multi_scale img_size = %g' % dataset.img_size)

        # Calculate mAP (always test final epoch, skip first 5 if opt.nosave)
        if not (opt.notest or (opt.nosave and epoch < 5)) or epoch == epochs - 1:
            with torch.no_grad():
                results = test.test(cfg,
                                    data_cfg,
                                    batch_size=batch_size,
                                    img_size=img_size,
                                    model=model,
                                    conf_thres=0.1)

        # Write epoch results
        with open('results.txt', 'a') as file:
            file.write(s + '%11.3g' * 5 % results + '\n')  # P, R, mAP, F1, test_loss

        # Update best loss
        test_loss = results[4]
        if test_loss < best_loss:
            best_loss = test_loss

        # Save training results
        save = True and not opt.nosave
        if save:
            # Create checkpoint
            chkpt = {
                'epoch': epoch,
                'best_loss': best_loss,
                'model': model.module.state_dict()
                         if type(model) is nn.parallel.DistributedDataParallel
                         else model.state_dict(),
                'optimizer': optimizer.state_dict()
            }

            # Save latest checkpoint
            torch.save(chkpt, latest)

            # Save best checkpoint
            if best_loss == test_loss:
                torch.save(chkpt, best)

            # Save backup every 10 epochs (optional)
            if epoch > 0 and epoch % 10 == 0:
                torch.save(chkpt, weights + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt

    return results
def train_and_eval(tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None, metric='last', save_path=None, only_eval=False, local_rank=-1, evaluation_interval=5): total_batch = C.get()["batch"] if local_rank >= 0: dist.init_process_group(backend='nccl', init_method='env://', world_size=int(os.environ['WORLD_SIZE'])) device = torch.device('cuda', local_rank) torch.cuda.set_device(device) C.get()['lr'] *= dist.get_world_size() logger.info(f'local batch={C.get()["batch"]} world_size={dist.get_world_size()} ----> total batch={C.get()["batch"] * dist.get_world_size()}') total_batch = C.get()["batch"] * dist.get_world_size() is_master = local_rank < 0 or dist.get_rank() == 0 # is_master = true if is_master: add_filehandler(logger, 'lol.log') # 一个新的log文件 #add_filehandler(logger, args.save + '.log') # 无效path if not reporter: reporter = lambda **kwargs: 0 max_epoch = C.get()['epoch'] trainsampler, trainloader, validloader, testloader_ = get_dataloaders(C.get()['dataset'], C.get()['batch'], dataroot, test_ratio, split_idx=cv_fold, multinode=(local_rank >= 0)) # create a model & an optimizer model = get_model(C.get()['model'], num_class(C.get()['dataset']), local_rank=local_rank) model_ema = get_model(C.get()['model'], num_class(C.get()['dataset']), local_rank=-1) # model_ema是干嘛的 model_ema.eval() criterion_ce = criterion = CrossEntropyLabelSmooth(num_class(C.get()['dataset']), C.get().conf.get('lb_smooth', 0)) if C.get().conf.get('mixup', 0.0) > 0.0: criterion = CrossEntropyMixUpLabelSmooth(num_class(C.get()['dataset']), C.get().conf.get('lb_smooth', 0)) if C.get()['optimizer']['type'] == 'sgd': optimizer = optim.SGD( model.parameters(), lr=C.get()['lr'], momentum=C.get()['optimizer'].get('momentum', 0.9), weight_decay=0.0, nesterov=C.get()['optimizer'].get('nesterov', True) ) elif C.get()['optimizer']['type'] == 'rmsprop': optimizer = RMSpropTF( model.parameters(), lr=C.get()['lr'], weight_decay=0.0, alpha=0.9, momentum=0.9, eps=0.001 ) else: raise ValueError('invalid optimizer type=%s' % C.get()['optimizer']['type']) lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine') if lr_scheduler_type == 'cosine': scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=C.get()['epoch'], eta_min=0.) 
elif lr_scheduler_type == 'resnet': scheduler = adjust_learning_rate_resnet(optimizer) elif lr_scheduler_type == 'efficientnet': scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 0.97 ** int((x + C.get()['lr_schedule']['warmup']['epoch']) / 2.4)) else: raise ValueError('invalid lr_schduler=%s' % lr_scheduler_type) if C.get()['lr_schedule'].get('warmup', None) and C.get()['lr_schedule']['warmup']['epoch'] > 0: scheduler = GradualWarmupScheduler( optimizer, multiplier=C.get()['lr_schedule']['warmup']['multiplier'], total_epoch=C.get()['lr_schedule']['warmup']['epoch'], after_scheduler=scheduler ) print('tag is ', tag) if not tag or not is_master: from FastAutoAugment.metrics import SummaryWriterDummy as SummaryWriter logger.warning('tag not provided, no tensorboard log.') else: from tensorboardX import SummaryWriter writers = [SummaryWriter(log_dir='./logs/%s/%s' % (tag, x)) for x in ['train', 'valid', 'test']] if C.get()['optimizer']['ema'] > 0.0 and is_master: # https://discuss.pytorch.org/t/how-to-apply-exponential-moving-average-decay-for-variables/10856/4?u=ildoonet ema = EMA(C.get()['optimizer']['ema']) else: ema = None # 进入none result = OrderedDict() epoch_start = 1 if save_path != 'test.pth': # and is_master: --> should load all data(not able to be broadcasted) if save_path and os.path.exists(save_path): logger.info('%s file found. loading...' % save_path) data = torch.load(save_path) key = 'model' if 'model' in data else 'state_dict' if 'epoch' not in data: model.load_state_dict(data) else: logger.info('checkpoint epoch@%d' % data['epoch']) if not isinstance(model, (DataParallel, DistributedDataParallel)): print('get in not parallel') model.load_state_dict({k.replace('module.', ''): v for k, v in data[key].items()}) else: print('get in parallel') model.load_state_dict({k if 'module.' in k else 'module.'+k: v for k, v in data[key].items()}) logger.info('optimizer.load_state_dict+') optimizer.load_state_dict(data['optimizer']) # optimizer也加载上次状态 if data['epoch'] < C.get()['epoch']: epoch_start = data['epoch'] else: print('only eval true') only_eval = True if ema is not None: ema.shadow = data.get('ema', {}) if isinstance(data.get('ema', {}), dict) else data['ema'].state_dict() del data else: logger.info('"%s" file not found. skip to pretrain weights...' % save_path) if only_eval: logger.warning('model checkpoint not found. only-evaluation mode is off.') only_eval = False if local_rank >= 0: for name, x in model.state_dict().items(): dist.broadcast(x, 0) logger.info(f'multinode init. local_rank={dist.get_rank()} is_master={is_master}') torch.cuda.synchronize() tqdm_disabled = bool(os.environ.get('TASK_NAME', '')) and local_rank != 0 # KakaoBrain Environment #? 
    # in only-eval mode, still run one pass over the data to obtain the loss and metrics
    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, None, desc_default='train', epoch=0, writer=writers[0], is_master=is_master)

        with torch.no_grad():
            rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid', epoch=0, writer=writers[1], is_master=is_master)
            rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test', epoch=0, writer=writers[2], is_master=is_master)
            if ema is not None and len(ema) > 0:
                model_ema.load_state_dict({k.replace('module.', ''): v for k, v in ema.state_dict().items()})
                rs['valid'] = run_epoch(model_ema, validloader, criterion_ce, None, desc_default='valid(EMA)', epoch=0, writer=writers[1], verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs['test'] = run_epoch(model_ema, testloader_, criterion_ce, None, desc_default='*test(EMA)', epoch=0, writer=writers[2], verbose=is_master, tqdm_disabled=tqdm_disabled)
        for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
            if setname not in rs:
                continue
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    logger.debug('get into train stage')

    # train loop
    best_top1 = 0
    for epoch in range(epoch_start, max_epoch + 1):
        if local_rank >= 0:
            trainsampler.set_epoch(epoch)

        model.train()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, optimizer, desc_default='train', epoch=epoch, writer=writers[0], verbose=(is_master and local_rank <= 0), scheduler=scheduler, ema=ema, wd=C.get()['optimizer']['decay'], tqdm_disabled=tqdm_disabled)
        model.eval()

        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        if ema is not None and C.get()['optimizer']['ema_interval'] > 0 and epoch % C.get()['optimizer']['ema_interval'] == 0:
            logger.info(f'ema synced+ rank={dist.get_rank()}')
            if ema is not None:
                model.load_state_dict(ema.state_dict())
            for name, x in model.state_dict().items():
                # print(name)
                dist.broadcast(x, 0)
            torch.cuda.synchronize()
            logger.info(f'ema synced- rank={dist.get_rank()}')

        if is_master and (epoch % evaluation_interval == 0 or epoch == max_epoch):
            with torch.no_grad():
                rs['valid'] = run_epoch(model, validloader, criterion_ce, None, desc_default='valid', epoch=epoch, writer=writers[1], verbose=is_master, tqdm_disabled=tqdm_disabled)
                rs['test'] = run_epoch(model, testloader_, criterion_ce, None, desc_default='*test', epoch=epoch, writer=writers[2], verbose=is_master, tqdm_disabled=tqdm_disabled)

                if ema is not None:
                    model_ema.load_state_dict({k.replace('module.', ''): v for k, v in ema.state_dict().items()})
                    rs['valid'] = run_epoch(model_ema, validloader, criterion_ce, None, desc_default='valid(EMA)', epoch=epoch, writer=writers[1], verbose=is_master, tqdm_disabled=tqdm_disabled)
                    rs['test'] = run_epoch(model_ema, testloader_, criterion_ce, None, desc_default='*test(EMA)', epoch=epoch, writer=writers[2], verbose=is_master, tqdm_disabled=tqdm_disabled)

            logger.info(
                f'epoch={epoch} '
                f'[train] loss={rs["train"]["loss"]:.4f} top1={rs["train"]["top1"]:.4f} '
                f'[valid] loss={rs["valid"]["loss"]:.4f} top1={rs["valid"]["top1"]:.4f} '
                f'[test] loss={rs["test"]["loss"]:.4f} top1={rs["test"]["top1"]:.4f} '
            )

            if metric == 'last' or rs[metric]['top1'] > best_top1:
                if metric != 'last':
                    best_top1 = rs[metric]['top1']
                for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch
                writers[1].add_scalar('valid_top1/best', rs['valid']['top1'], epoch)
                writers[2].add_scalar('test_top1/best', rs['test']['top1'], epoch)

                reporter(
                    loss_valid=rs['valid']['loss'], top1_valid=rs['valid']['top1'],
                    loss_test=rs['test']['loss'], top1_test=rs['test']['top1']
                )

                # save checkpoint
                if is_master and save_path:
                    logger.info('save model@%d to %s, err=%.4f' % (epoch, save_path, 1 - best_top1))
                    torch.save({
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'valid': rs['valid'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict(),
                        'ema': ema.state_dict() if ema is not None else None,
                    }, save_path)

    del model
    result['top1_test'] = best_top1
    return result
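# The EMA helper used above is referenced but not defined in this snippet. A
# minimal sketch consistent with how it is called here (a `shadow` dict,
# len(ema), state_dict(), and per-step updates from run_epoch) might look like
# the following; the actual FastAutoAugment implementation may differ.
import copy

class EMA:
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}  # parameter name -> averaged tensor

    def __len__(self):
        return len(self.shadow)

    def state_dict(self):
        return copy.deepcopy(self.shadow)

    def __call__(self, model, step):
        # shadow <- decay * shadow + (1 - decay) * current weights
        for name, x in model.state_dict().items():
            if name not in self.shadow or not x.dtype.is_floating_point:
                self.shadow[name] = x.clone()
            else:
                self.shadow[name].mul_(self.decay).add_(x, alpha=1 - self.decay)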
def main():
    global args, best_err1, best_err5
    args = parser.parse_args()
    if args.tensorboard:
        configure("runs/%s" % (args.expname))

    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    if args.dataset.startswith('cifar'):
        normalize = transforms.Normalize(
            mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
            std=[x / 255.0 for x in [63.0, 62.1, 66.7]])

        if args.augment:
            transform_train = transforms.Compose([
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ])
        else:
            transform_train = transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ])
        transform_test = transforms.Compose([transforms.ToTensor(), normalize])

        if args.dataset == 'cifar100':
            train_loader = torch.utils.data.DataLoader(
                datasets.CIFAR100('../data', train=True, download=True, transform=transform_train),
                batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
            val_loader = torch.utils.data.DataLoader(
                datasets.CIFAR100('../data', train=False, transform=transform_test),
                batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
            numberofclass = 100
        elif args.dataset == 'cifar10':
            train_loader = torch.utils.data.DataLoader(
                datasets.CIFAR10('../data', train=True, download=True, transform=transform_train),
                batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
            val_loader = torch.utils.data.DataLoader(
                datasets.CIFAR10('../data', train=False, transform=transform_test),
                batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
            numberofclass = 10
        else:
            raise Exception('unknown dataset: {}'.format(args.dataset))
    elif args.dataset == 'imagenet':
        traindir = os.path.join(args.data, 'train')
        valdir = os.path.join(args.data, 'val')
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

        train_dataset = datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))

        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        else:
            train_sampler = None

        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
            num_workers=args.workers, pin_memory=True, sampler=train_sampler)

        val_loader = torch.utils.data.DataLoader(
            datasets.ImageFolder(valdir, transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
            batch_size=args.batch_size, shuffle=False,
            num_workers=args.workers, pin_memory=True)
        numberofclass = 1000
    else:
        raise Exception('unknown dataset: {}'.format(args.dataset))

    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.net_type))
        try:
            model = models.__dict__[str(args.net_type)](pretrained=True)
        except (KeyError, TypeError):
            print('unknown model')
            print('torchvision provides the following pretrained models:', model_names)
            return
    else:
        print("=> creating model '{}'".format(args.net_type))
        if args.net_type == 'resnet':
            model = RN.ResNet(args.dataset, args.depth, numberofclass, args.bottleneck)  # for ResNet
        elif args.net_type == 'preresnet':
            model = PRN.PreResNet(args.dataset, args.depth, numberofclass, args.bottleneck)  # for Pre-activation ResNet
        elif args.net_type == 'pyramidnet':
            model = PYRM.PyramidNet(args.dataset, args.depth, args.alpha, numberofclass, args.bottleneck)  # for PyramidNet
        else:
            raise Exception('unknown network architecture: {}'.format(args.net_type))

    if not args.distributed:
        if args.net_type.startswith('alexnet') or args.net_type.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    print(model)
    print('the number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay, nesterov=True)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_err1 = checkpoint['best_err1']
            best_err5 = checkpoint['best_err5']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs + 1):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        err1, err5 = validate(val_loader, model, criterion, epoch)

        # remember best top-1 error and save checkpoint
        is_best = err1 <= best_err1
        best_err1 = min(err1, best_err1)
        if is_best:
            best_err5 = err5

        print('Current best (top-1 and top-5 error):', best_err1, best_err5)
        save_checkpoint({
            'epoch': epoch,
            'arch': args.net_type,
            'state_dict': model.state_dict(),
            'best_err1': best_err1,
            'best_err5': best_err5,
            'optimizer': optimizer.state_dict(),
        }, is_best)

    print('Best (top-1 and top-5 error):', best_err1, best_err5)
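# save_checkpoint() is not defined in this snippet; the usual helper from the
# PyTorch ImageNet example, which this code appears to follow, is a reasonable
# assumption:
import shutil
import torch

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    # always write the latest state, and keep a separate copy of the best model so far
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')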
def main(): global best_prec1, args args.distributed = args.world_size > 1 args.gpu = 0 if args.distributed: args.gpu = args.rank % torch.cuda.device_count() if args.distributed: torch.cuda.set_device(args.gpu) dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True, num_classes=args.num_classes) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch](num_classes=args.num_classes) model = model.cuda() if args.fp16: model = network_to_half(model) if args.distributed: model = DDP(model) global model_params, master_params if args.fp16: model_params, master_params = prep_param_lists(model) else: master_params = list(model.parameters()) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(master_params, args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') pipe = HybridPipe(batch_size=args.batch_size, num_threads=args.workers, device_id = args.rank, data_dir = traindir) pipe.build() test_run = pipe.run() from nvidia.dali.plugin.pytorch import DALIClassificationIterator train_loader = DALIClassificationIterator(pipe, size = int(1281167 / args.world_size) ) pipe = HybridPipe(batch_size=args.batch_size, num_threads=args.workers, device_id = args.rank, data_dir = valdir) pipe.build() test_run = pipe.run() from nvidia.dali.plugin.pytorch import DALIClassificationIterator val_loader = DALIClassificationIterator(pipe, size = int(50000 / args.world_size) ) if args.evaluate: validate(val_loader, model, criterion) return for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) if args.prof: break # evaluate on validation set prec1 = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint if args.rank == 0: is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint({ 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer' : optimizer.state_dict(), }, is_best)
def train(rank, a, h, resume_run_id=None): if h.num_gpus > 1: init_process_group( backend=h.dist_config["dist_backend"], init_method=h.dist_config["dist_url"], world_size=h.dist_config["world_size"] * h.num_gpus, rank=rank, ) torch.cuda.manual_seed(h.seed) device = torch.device("cuda:{:d}".format(rank)) generator = Generator(h).to(device) mpd = MultiPeriodDiscriminator().to(device) msd = MultiScaleDiscriminator().to(device) if rank == 0: print(generator) os.makedirs(a.checkpoint_path, exist_ok=True) print("checkpoints directory : ", a.checkpoint_path) cp_g = None cp_do = None if resume_run_id: restored_g = wandb.restore("g_latest") cp_g = restored_g.name restored_do = wandb.restore("do_latest") cp_do = restored_do.name steps = 0 if cp_g is None or cp_do is None: state_dict_do = None last_epoch = -1 else: state_dict_g = load_checkpoint(cp_g, device) state_dict_do = load_checkpoint(cp_do, device) generator.load_state_dict(state_dict_g["generator"]) mpd.load_state_dict(state_dict_do["mpd"]) msd.load_state_dict(state_dict_do["msd"]) steps = state_dict_do["steps"] + 1 last_epoch = state_dict_do["epoch"] if h.num_gpus > 1: generator = DistributedDataParallel(generator, device_ids=[rank]).to(device) mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device) msd = DistributedDataParallel(msd, device_ids=[rank]).to(device) optim_g = torch.optim.AdamW(generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2]) optim_d = torch.optim.AdamW( itertools.chain(msd.parameters(), mpd.parameters()), h.learning_rate, betas=[h.adam_b1, h.adam_b2], ) if state_dict_do is not None: optim_g.load_state_dict(state_dict_do["optim_g"]) optim_d.load_state_dict(state_dict_do["optim_d"]) scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=h.lr_decay, last_epoch=last_epoch) scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=h.lr_decay, last_epoch=last_epoch) training_filelist, validation_filelist = get_dataset_filelist(a) trainset = MelDataset( training_filelist, h.segment_size, h.n_fft, h.num_mels, h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, n_cache_reuse=0, shuffle=False if h.num_gpus > 1 else True, fmax_loss=h.fmax_for_loss, device=device, fine_tuning=a.fine_tuning, base_mels_path=a.input_mels_dir, ) print(f"train dataset size:{len(trainset)}") train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None train_loader = DataLoader( trainset, num_workers=h.num_workers, shuffle=False, sampler=train_sampler, batch_size=h.batch_size, pin_memory=True, drop_last=True, ) if rank == 0: validset = MelDataset( validation_filelist, h.segment_size, h.n_fft, h.num_mels, h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, False, False, n_cache_reuse=0, fmax_loss=h.fmax_for_loss, device=device, fine_tuning=a.fine_tuning, base_mels_path=a.input_mels_dir, ) print(f"valid dataset size:{len(validset)}") validation_loader = DataLoader( validset, num_workers=1, shuffle=False, sampler=None, batch_size=1, pin_memory=True, drop_last=True, ) sw = SummaryWriter(os.path.join(a.checkpoint_path, "logs")) generator.train() mpd.train() msd.train() for epoch in range(max(0, last_epoch), a.training_epochs): if rank == 0: start = time.time() print("Epoch: {}".format(epoch + 1)) if h.num_gpus > 1: train_sampler.set_epoch(epoch) for i, batch in enumerate(train_loader): if rank == 0: start_b = time.time() x, y, _, y_mel = batch x = torch.autograd.Variable(x.to(device, non_blocking=True)) y = torch.autograd.Variable(y.to(device, non_blocking=True)) y_mel = 
torch.autograd.Variable(y_mel.to(device, non_blocking=True)) y = y.unsqueeze(1) y_g_hat = generator(x) y_g_hat_mel = mel_spectrogram( y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax_for_loss, ) optim_d.zero_grad() # MPD y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach()) loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss( y_df_hat_r, y_df_hat_g) # MSD y_ds_hat_r, y_ds_hat_g, _, _ = msd(y, y_g_hat.detach()) loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss( y_ds_hat_r, y_ds_hat_g) loss_disc_all = loss_disc_s + loss_disc_f loss_disc_all.backward() optim_d.step() # Generator optim_g.zero_grad() # L1 Mel-Spectrogram Loss loss_mel = F.l1_loss(y_mel, y_g_hat_mel) * 45 y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = mpd(y, y_g_hat) y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = msd(y, y_g_hat) loss_fm_f = feature_loss(fmap_f_r, fmap_f_g) loss_fm_s = feature_loss(fmap_s_r, fmap_s_g) loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g) loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g) loss_gen_all = loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel loss_gen_all.backward() optim_g.step() if rank == 0: # STDOUT logging if steps % a.stdout_interval == 0: with torch.no_grad(): mel_error = F.l1_loss(y_mel, y_g_hat_mel).item() print( "Steps : {:d}, Gen Loss Total : {:4.3f}, Mel-Spec. Error : {:4.3f}, s/b : {:4.3f}" .format(steps, loss_gen_all, mel_error, time.time() - start_b)) wandb.log( { "loss/Gen Loss Total": loss_gen_all, "loss/Mel-Spec. Error": mel_error, }, step=steps, ) # checkpointing if steps % a.checkpoint_interval == 0 and steps != 0: # generator checkpoint_path = "{}/g_{:08d}".format( a.checkpoint_path, steps) save_checkpoint( checkpoint_path, { "generator": (generator.module if h.num_gpus > 1 else generator).state_dict() }, ) checkpoint_name = "g_{:08d}".format(steps) wandb.save(checkpoint_name) # also save as latest checkpoint_path = "{}/g_latest".format(a.checkpoint_path) save_checkpoint( checkpoint_path, { "generator": (generator.module if h.num_gpus > 1 else generator).state_dict() }, ) wandb.save("g_latest") # discriminator checkpoint_path = "{}/do_{:08d}".format( a.checkpoint_path, steps) save_checkpoint( checkpoint_path, { "mpd": (mpd.module if h.num_gpus > 1 else mpd).state_dict(), "msd": (msd.module if h.num_gpus > 1 else msd).state_dict(), "optim_g": optim_g.state_dict(), "optim_d": optim_d.state_dict(), "steps": steps, "epoch": epoch, }, ) checkpoint_name = "do_{:08d}".format(steps) wandb.save(checkpoint_name) # also save as latest checkpoint_path = "{}/do_latest".format(a.checkpoint_path) save_checkpoint( checkpoint_path, { "mpd": (mpd.module if h.num_gpus > 1 else mpd).state_dict(), "msd": (msd.module if h.num_gpus > 1 else msd).state_dict(), "optim_g": optim_g.state_dict(), "optim_d": optim_d.state_dict(), "steps": steps, "epoch": epoch, }, ) wandb.save("do_latest") # Tensorboard summary logging if steps % a.summary_interval == 0: sw.add_scalar("training/gen_loss_total", loss_gen_all, steps) sw.add_scalar("training/mel_spec_error", mel_error, steps) # Validation if steps % a.validation_interval == 0: # and steps != 0: generator.eval() torch.cuda.empty_cache() val_err_tot = 0 with torch.no_grad(): samples_orig = [] samples_pred = [] for j, batch in enumerate(validation_loader): x, y, _, y_mel = batch y_g_hat = generator(x.to(device)) y_mel = torch.autograd.Variable( y_mel.to(device, non_blocking=True)) y_g_hat_mel = mel_spectrogram( y_g_hat.squeeze(1), h.n_fft, h.num_mels, 
                                h.sampling_rate,
                                h.hop_size,
                                h.win_size,
                                h.fmin,
                                h.fmax_for_loss,
                            )
                            val_err_tot += F.l1_loss(y_mel, y_g_hat_mel).item()

                            if j <= 4:
                                if steps == 0:
                                    sw.add_audio("gt/y_{}".format(j), y[0], steps, h.sampling_rate)
                                    sw.add_figure("gt/y_spec_{}".format(j), plot_spectrogram(x[0]), steps)

                                    # log orig audio to wandb
                                    orig_audio = y.squeeze().cpu()
                                    samples_orig.append(
                                        wandb.Audio(
                                            orig_audio,
                                            caption=f"sample {j}",
                                            sample_rate=h.sampling_rate,
                                        ))

                                sw.add_audio("generated/y_hat_{}".format(j), y_g_hat[0], steps, h.sampling_rate)
                                y_hat_spec = mel_spectrogram(
                                    y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate,
                                    h.hop_size, h.win_size, h.fmin, h.fmax,
                                )
                                sw.add_figure(
                                    "generated/y_hat_spec_{}".format(j),
                                    plot_spectrogram(y_hat_spec.squeeze(0).cpu().numpy()),
                                    steps,
                                )

                                # log pred audio to wandb
                                pred_audio = y_g_hat.squeeze().cpu()
                                samples_pred.append(
                                    wandb.Audio(
                                        pred_audio,
                                        caption=f"sample {j}",  # validation batch index, not the train-loop index
                                        sample_rate=h.sampling_rate,
                                    ))

                        val_err = val_err_tot / (j + 1)
                        sw.add_scalar("validation/mel_spec_error", val_err, steps)

                        # log audios to wandb
                        wandb.log({"audio/generated": samples_pred}, step=steps)
                        if steps == 0:
                            wandb.log({"audio/original": samples_orig}, step=steps)

                    generator.train()

                steps += 1

            scheduler_g.step()
            scheduler_d.step()

            if rank == 0:
                print("Time taken for epoch {} is {} sec\n".format(epoch + 1, int(time.time() - start)))
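# This train() worker takes its rank as the first argument, so it is typically
# launched with one process per GPU. A minimal sketch of such a launcher,
# assuming the same `a` (args) and `h` (hparams) objects as above:
import torch.multiprocessing as mp

def launch(a, h):
    if h.num_gpus > 1:
        mp.spawn(train, nprocs=h.num_gpus, args=(a, h))
    else:
        train(0, a, h)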
def train(args): is_distributed = len(args.hosts) > 1 and args.backend is not None logger.debug("Distributed training - {}".format(is_distributed)) use_cuda = args.num_gpus > 0 logger.debug("Number of gpus available - {}".format(args.num_gpus)) kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} device = torch.device("cuda" if use_cuda else "cpu") if is_distributed: # Initialize the distributed environment. world_size = len(args.hosts) os.environ['WORLD_SIZE'] = str(world_size) host_rank = args.hosts.index(args.current_host) dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size) logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format( args.backend, dist.get_world_size()) + 'Current host rank is {}. Number of gpus: {}'.format( dist.get_rank(), args.num_gpus)) # set the seed for generating random numbers torch.manual_seed(args.seed) if use_cuda: torch.cuda.manual_seed(args.seed) train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed, **kwargs) test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs) logger.debug("Processes {}/{} ({:.0f}%) of train data".format( len(train_loader.sampler), len(train_loader.dataset), 100. * len(train_loader.sampler) / len(train_loader.dataset) )) logger.debug("Processes {}/{} ({:.0f}%) of test data".format( len(test_loader.sampler), len(test_loader.dataset), 100. * len(test_loader.sampler) / len(test_loader.dataset) )) model = Net().to(device) if is_distributed and use_cuda: # multi-machine multi-gpu case model = torch.nn.parallel.DistributedDataParallel(model) else: # single-machine multi-gpu case or single-machine or multi-machine cpu case model = torch.nn.DataParallel(model) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) for epoch in range(1, args.epochs + 1): model.train() for batch_idx, (data, target) in enumerate(train_loader, 1): data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) loss.backward() if is_distributed and not use_cuda: # average gradients manually for multi-machine cpu case only _average_gradients(model) optimizer.step() if batch_idx % args.log_interval == 0: logger.info('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_loader.sampler), 100. * batch_idx / len(train_loader), loss.item())) test(model, test_loader, device) save_model(model, args.model_dir)
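# _average_gradients() is referenced above but not shown. The standard pattern
# for the multi-machine CPU case is to all-reduce each gradient and divide by
# the world size; a minimal sketch:
import torch.distributed as dist

def _average_gradients(model):
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is not None:
            # sum gradients across workers, then average
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size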
def init_dist(args):
    # Note: world_size=torch.cuda.device_count() and rank=args.local_rank only
    # hold for single-node training with one process per local GPU; the env://
    # rendezvous additionally expects MASTER_ADDR/MASTER_PORT in the environment.
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank)
def main_worker(process_id, ngpus_per_node, args): args.gpu = process_id model = Net() criterion = nn.CrossEntropyLoss().cuda(args.gpu) optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.1) if args.multiprocessing_distributed: args.rank = args.rank_start + process_id dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) print("Process {} in node {} has been started.".format( args.rank, args.node)) torch.cuda.set_device(args.gpu) model.cuda(args.gpu) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], output_device=args.gpu) train_sampler = torch.utils.data.distributed.DistributedSampler( get_mnist(istrain=True), num_replicas=None, rank=None) test_sampler = torch.utils.data.distributed.DistributedSampler( get_mnist(istrain=False), num_replicas=None, rank=None) #different form train_loader = torch.utils.data.DataLoader(get_mnist(istrain=True), batch_size=args.batch_size, shuffle=None, num_workers=args.workers, pin_memory=True, sampler=train_sampler) test_loader = torch.utils.data.DataLoader(get_mnist(istrain=False), batch_size=args.batch_size, shuffle=None, num_workers=args.workers, pin_memory=True, sampler=test_sampler) acc_train = AverageMeter() acc_test = AverageMeter() best_acc = 0 for epoch in range(5): if args.distributed: train_sampler.set_epoch(epoch) #train epoch for i, (input, target) in enumerate(train_loader): input = input.cuda(args.gpu, non_blocking=True) target = target.cuda(args.gpu, non_blocking=True) output = model(input) loss = criterion(output, target) optimizer.zero_grad() loss.backward() optimizer.step() acc = accuracy(output, target) acc_train.update(acc[0], n=input.size(0)) #test epoch for i, (input, target) in enumerate(test_loader): input = input.cuda(args.gpu, non_blocking=True) target = target.cuda(args.gpu, non_blocking=True) with torch.no_grad(): output = model(input) acc = accuracy(output, target) acc_test.update(acc[0], n=input.size(0)) #the performances differ since different data are used print("rank {}th process after epoch{}: train_acc{:.2f},val_acc{:.2f}". format(args.rank, epoch, acc_train.avg.item(), acc_test.avg.item())) is_best = acc_test.avg.item() > best_acc best_acc = max(acc_test.avg.item(), best_acc) acc_train.reset() acc_test.reset() # save once per node if args.rank % args.gpu_use == 0: save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_acc1': best_acc, 'optimizer': optimizer.state_dict(), }, is_best)
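# AverageMeter and accuracy() are assumed utilities in the worker above;
# AverageMeter is almost certainly the classic helper from the PyTorch ImageNet
# example, sketched here for completeness:
class AverageMeter(object):
    """Computes and stores the average and current value."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count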
def main():
    global args, best_prec1
    args = parser.parse_args()
    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.arch == 'alexnet':
        model = model_list.alexnet(pretrained=args.pretrained)
        input_size = 227
    else:
        raise Exception('Model not supported yet')

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    if not os.path.exists(args.data + '/imagenet_mean.binaryproto'):
        print("==> Data directory " + args.data + " does not exist")
        print("==> Please specify the correct data path by")
        print("==> --data <DATA_PATH>")
        return

    normalize = transforms.Normalize(meanfile=args.data + '/imagenet_mean.binaryproto')

    train_dataset = datasets.ImageFolder(
        args.data,
        transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
            transforms.RandomSizedCrop(input_size),
        ]),
        Train=True)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(args.data, transforms.Compose([
            transforms.ToTensor(),
            normalize,
            transforms.CenterCrop(input_size),
        ]), Train=False),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    print(model)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)
def main_worker(gpu, ngpus_per_node, args): args.gpu = gpu if args.gpu is not None: print("Use GPU: {} for training".format(args.gpu)) if args.distributed: if args.dist_url == "env://" and args.rank == -1: args.rank = int(os.environ["RANK"]) if args.multiprocessing_distributed: # For multiprocessing distributed training, rank needs to be the # global rank among all the processes args.rank = args.rank * ngpus_per_node + gpu dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) # create model if args.network_name == 'vgg19': model = vgg19_bn(pretrained=False) elif args.network_name == 'vgg16': model = vgg16_bn(pretrained=False) elif 'resnet' in args.network_name: model = models.__dict__[args.arch](pretrained=False) else: raise NotImplementedError # Initialize network for layer in model.modules(): if isinstance(layer, nn.Conv2d) or isinstance(layer, nn.Linear): if args.init == 'normal_kaiming': nn.init.kaiming_normal_(layer.weight, nonlinearity='relu') elif args.init == 'normal_kaiming_fout': nn.init.kaiming_normal_(layer.weight, nonlinearity='relu', mode='fan_out') elif args.init == 'normal_xavier': nn.init.xavier_normal_(layer.weight) elif args.init == 'orthogonal': nn.init.orthogonal_(layer.weight) else: raise ValueError( f"Unrecognised initialisation parameter {args.init}") model = torch.nn.DataParallel(model).cuda() # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda(args.gpu) optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) if args.gpu is None: checkpoint = torch.load(args.resume) else: # Map model to be loaded to specified single gpu. 
loc = 'cuda:{}'.format(args.gpu) checkpoint = torch.load(args.resume, map_location=loc) args.start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_dataset = datasets.ImageFolder( traindir, transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ])) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset) else: train_sampler = None train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler) val_loader = torch.utils.data.DataLoader(datasets.ImageFolder( valdir, transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ])), batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) ############################# #### Pruning code #### ############################# pruning_factor = args.pruning_factor keep_masks = [] filename = '' if pruning_factor != 1: print(f'Pruning network iteratively for {args.num_steps} steps') keep_masks = iterative_pruning(model, train_loader, device, pruning_factor, prune_method=args.prune_method, num_steps=args.num_steps, mode=args.mode, num_batches=args.num_batches) apply_prune_mask(model, keep_masks) # File where to save training history run_name = (args.network_name + '_IMAGENET' + '_spars' + str(1 - pruning_factor) + '_variant' + str(args.prune_method) + '_train-frac' + str(args.frac_data_for_train) + f'_steps{args.num_steps}_{args.mode}' + f'_{args.init}' + f'_batch{args.num_batches}' + f'_rseed_{seed}') writer_name = 'runs/' + run_name writer = SummaryWriter(writer_name) if args.evaluate: validate(val_loader, model, criterion, args) return iterations = 0 for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) adjust_learning_rate(optimizer, epoch, args) # Train for one epoch train(train_loader, model, criterion, optimizer, epoch, args, writer) # Evaluate on validation set iterations = epoch * len(train_loader) acc1 = validate(val_loader, model, criterion, args, writer, iterations) # Save checkpoint if not args.multiprocessing_distributed or ( args.multiprocessing_distributed and args.rank % ngpus_per_node == 0): if (epoch + 1) % 5 == 0: if not os.path.exists('saved_models/'): os.makedirs('saved_models/') save_name = 'saved_models/' + run_name + '_cross_entropy_' + str( epoch + 1) + '.model' torch.save(model.state_dict(), save_name) elif (epoch + 1) == args.epochs: if not os.path.exists('saved_models/'): os.makedirs('saved_models/') save_name = 'saved_models/' + run_name + '_cross_entropy_' + str( epoch + 1) + '.model' torch.save(model.state_dict(), save_name)
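# apply_prune_mask() is not shown here. In SNIP-style pruning code it usually
# zeroes the pruned weights and registers gradient hooks so they stay zero
# during training; a sketch under that assumption:
import torch
import torch.nn as nn

def apply_prune_mask(net, keep_masks):
    prunable = [m for m in net.modules() if isinstance(m, (nn.Conv2d, nn.Linear))]
    for layer, mask in zip(prunable, keep_masks):
        mask = mask.to(layer.weight.device)
        layer.weight.data[mask == 0.] = 0.
        # keep pruned weights at zero by masking their gradients
        layer.weight.register_hook(lambda grad, mask=mask: grad * mask)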
raise unittest.SkipTest("Compiled without the " + BACKEND + " backend") if skip_ok: # do this first so we don't give an error message about # mismatched exit codes if the first isn't valid assert first_process.exitcode == 0 \ or first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE \ or first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE \ or first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE if first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE: raise unittest.SkipTest("cuda is not available") if first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE: raise unittest.SkipTest("One unique gpu per process is not available") if first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE: raise unittest.SkipTest("worldsize is too small to run group tests") self.assertEqual(first_process.exitcode, 0) elif BACKEND == 'mpi': WORLD_SIZE = os.environ['WORLD_SIZE'] dist.init_process_group(init_method=INIT_METHOD, backend='mpi') class TestMPI(TestCase, _DistTestBase): pass if __name__ == '__main__': assert not torch.cuda._initialized, "test_distributed must not have initialized CUDA context on main process" unittest.main()
def train(args): world_size = len(args.hosts) is_distributed = world_size > 1 logger.debug('Number of hosts {}. Distributed training - {}'.format(world_size, is_distributed)) use_cuda = args.num_gpus > 0 logger.debug('Number of gpus available - {}'.format(args.num_gpus)) kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} device = torch.device('cuda' if use_cuda else 'cpu') if is_distributed: # Initialize the distributed environment. backend = 'gloo' os.environ['WORLD_SIZE'] = str(world_size) host_rank = args.hosts.index(args.current_host) dist.init_process_group(backend=backend, rank=host_rank, world_size=world_size) logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format( backend, dist.get_world_size()) + 'Current host rank is {}. Is cuda available: {}. Number of gpus: {}'.format( dist.get_rank(), torch.cuda.is_available(), args.num_gpus)) # set the seed for generating random numbers seed = 1 torch.manual_seed(seed) if use_cuda: torch.cuda.manual_seed(seed) train_sampler, train_loader = _get_train_data_loader(args.data_dir, is_distributed, args.batch_size, **kwargs) test_loader = _get_test_data_loader(args.data_dir, **kwargs) logger.debug('Processes {}/{} ({:.0f}%) of train data'.format( len(train_loader.sampler), len(train_loader.dataset), 100. * len(train_loader.sampler) / len(train_loader.dataset) )) logger.debug('Processes {}/{} ({:.0f}%) of test data'.format( len(test_loader.sampler), len(test_loader.dataset), 100. * len(test_loader.sampler) / len(test_loader.dataset) )) model = Net().to(device) if is_distributed and use_cuda: # multi-machine multi-gpu case logger.debug('Multi-machine multi-gpu: using DistributedDataParallel.') model = torch.nn.parallel.DistributedDataParallel(model) elif use_cuda: # single-machine multi-gpu case logger.debug('Single-machine multi-gpu: using DataParallel().cuda().') model = torch.nn.DataParallel(model) else: # single-machine or multi-machine cpu case logger.debug('Single-machine/multi-machine cpu: using DataParallel.') model = torch.nn.DataParallel(model) optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.5) log_interval = 100 for epoch in range(1, args.epochs + 1): if is_distributed: train_sampler.set_epoch(epoch) model.train() for batch_idx, (data, target) in enumerate(train_loader, 1): data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) loss.backward() if is_distributed and not use_cuda: # average gradients manually for multi-machine cpu case only _average_gradients(model) optimizer.step() if batch_idx % log_interval == 0: logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_loader.sampler), 100. * batch_idx / len(train_loader), loss.item())) accuracy = test(model, test_loader, device) save_model(model, args.model_dir) logger.debug('Overall test accuracy: {}'.format(accuracy))
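# The _get_train_data_loader()/_get_test_data_loader() helpers are not shown.
# A sketch consistent with how the train loader is used above (MNIST, returning
# the sampler together with the loader); the actual SageMaker example may differ:
import torch
import torch.utils.data
import torch.utils.data.distributed
from torchvision import datasets, transforms

def _get_train_data_loader(data_dir, is_distributed, batch_size, **kwargs):
    dataset = datasets.MNIST(data_dir, train=True, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]))
    sampler = torch.utils.data.distributed.DistributedSampler(dataset) if is_distributed else None
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                         shuffle=(sampler is None),
                                         sampler=sampler, **kwargs)
    return sampler, loader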
def init_process_group(world_size, rank): dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:12345', world_size=world_size, rank=rank)
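# A helper like this is usually paired with torch.multiprocessing.spawn, one
# process per local GPU (single node assumed; the TCP address above is
# localhost-only). A minimal launcher sketch:
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def _worker(rank, world_size):
    init_process_group(world_size, rank)
    torch.cuda.set_device(rank)
    # ... build the model and run training here ...
    dist.destroy_process_group()

if __name__ == '__main__':
    n_gpus = torch.cuda.device_count()
    mp.spawn(_worker, args=(n_gpus,), nprocs=n_gpus)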
def process(rank, world_size, train_pairs, test_pairs, resume): os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = '12355' dist.init_process_group("nccl", rank=rank, world_size=world_size) device = rank train_sampler = torch.utils.data.distributed.DistributedSampler( train_pairs, num_replicas=world_size, rank=rank, shuffle=False) # dataset_train = DataLoader(train_pairs, batch_size=BATCH_SIZE, # shuffle=True, num_workers=NUM_WORKERS, # collate_fn=collate_function, # pin_memory=True) dataset_train = DataLoader(train_pairs, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, collate_fn=collate_function, pin_memory=True, sampler=train_sampler) dataset_test = DataLoader(test_pairs, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, collate_fn=collate_function, drop_last=True, pin_memory=True) model = TransformerSTT(**model_parameters) # model = nn.DataParallel(model) model = model.to(device) model = DDP(model, find_unused_parameters=True, device_ids=[rank]) # print(str(model)) learning_rate = LR optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) loss_criterion = nn.CTCLoss(zero_infinity=True) train_step = 0 model, optimizer, train_step, writer = resume_training( resume, model, optimizer, train_step, rank) scaler = GradScaler() loss_list = list() wer_list = list() for epoch in range(NUM_EPOCH): model.train() for data in tqdm(dataset_train): mel_tensor, jamo_code_tensor, mel_lengths, jamo_lengths, mel_transformer_mask, speakers = data # speaker_code = speaker_table.speaker_name_to_code(speakers) with autocast(): output_tensor = model(( mel_tensor.to(device), mel_transformer_mask.to(device), )) output_tensor = output_tensor.permute( 1, 0, 2) # (N, S, E) => (T, N, C) loss = loss_criterion(output_tensor, jamo_code_tensor.to(device), (mel_lengths // 8).to(device), jamo_lengths.to(device)) optimizer.zero_grad() # loss.backward() # optimizer.step() scaler.scale(loss).backward() scaler.step(optimizer) scaler.update() train_step += 1 if rank == 0: decoded_input_text = KOREAN_TABLE.decode_jamo_code_tensor( jamo_code_tensor) decoded_input_text = KOREAN_TABLE.decode_ctc_prediction( decoded_input_text) decoded_output_text = KOREAN_TABLE.decode_jamo_prediction_tensor( output_tensor) decoded_output_str = KOREAN_TABLE.decode_ctc_prediction( decoded_output_text) wer = KOREAN_TABLE.caculate_wer(decoded_input_text, decoded_output_str) wer_list.append(wer) loss_list.append(loss.item()) if len(loss_list) >= LOGGING_STEPS: writer.add_scalar('ctc_loss/train', np.mean(loss_list), train_step) decoded_pairs = [f'** {in_text} \n\n -> {out_text} \n\n => {final_output} \n\n' \ for (in_text, out_text, final_output) in zip(decoded_input_text, decoded_output_text, decoded_output_str)] writer.add_text('text_result/train', '\n\n'.join(decoded_pairs), train_step) writer.add_scalar('WER/train', np.mean(wer_list), train_step) logging_image = mel_tensor_to_plt_image( mel_tensor, decoded_input_text, train_step) writer.add_image('input_spectrogram/train', logging_image, train_step) print(f'Train Step {train_step}') loss_list = list() wer_list = list() if train_step % CHECKPOINT_STEPS == 0: save_checkpoint(model, optimizer, train_step, writer.logdir, KEEP_LAST_ONLY) # break if rank == 0: loss_test_list = list() wer_test_list = list() model.eval() for data in tqdm(dataset_test): mel_tensor, jamo_code_tensor, mel_lengths, jamo_lengths, mel_transformer_mask, speakers = data with autocast(): output_tensor = model(( mel_tensor.to(device), mel_transformer_mask.to(device), )) 
output_tensor = output_tensor.permute( 1, 0, 2) # (N, S, E) => (T, N, C) loss = loss_criterion(output_tensor, jamo_code_tensor.to(device), (mel_lengths // 8).to(device), jamo_lengths.to(device)) loss_test_list.append(loss.item()) decoded_input_text = KOREAN_TABLE.decode_jamo_code_tensor( jamo_code_tensor) decoded_input_text = KOREAN_TABLE.decode_ctc_prediction( decoded_input_text) decoded_output_text = KOREAN_TABLE.decode_jamo_prediction_tensor( output_tensor) decoded_output_str = KOREAN_TABLE.decode_ctc_prediction( decoded_output_text) wer = KOREAN_TABLE.caculate_wer(decoded_input_text, decoded_output_str) wer_test_list.append(wer) decoded_pairs = [f'** {in_text} \n\n -> {out_text} \n\n => {final_output} \n\n' \ for (in_text, out_text, final_output) in zip(decoded_input_text, decoded_output_text, decoded_output_str)] writer.add_scalar('ctc_loss/test', np.mean(loss_test_list), train_step) writer.add_scalar('WER/test', np.mean(wer_test_list), train_step) writer.add_text('text_result/test', '\n\n'.join(decoded_pairs), train_step) logging_image = mel_tensor_to_plt_image(mel_tensor, decoded_input_text, train_step) writer.add_image('input_spectrogram/test', logging_image, train_step)
def _train(args): is_distributed = len(args.hosts) > 1 and args.dist_backend is not None logger.debug("Distributed training - {}".format(is_distributed)) if is_distributed: # Initialize the distributed environment. world_size = len(args.hosts) os.environ['WORLD_SIZE'] = str(world_size) host_rank = args.hosts.index(args.current_host) dist.init_process_group(backend=args.dist_backend, rank=host_rank, world_size=world_size) logger.info( 'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format( args.dist_backend, dist.get_world_size()) + 'Current host rank is {}. Using cuda: {}. Number of gpus: {}'.format( dist.get_rank(), torch.cuda.is_available(), args.num_gpus)) device = 'cuda' if torch.cuda.is_available() else 'cpu' logger.info("Device Type: {}".format(device)) logger.info("Loading Cifar10 dataset") transform = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) trainset = torchvision.datasets.CIFAR10(root=args.data_dir, train=True, download=False, transform=transform) train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers) testset = torchvision.datasets.CIFAR10(root=args.data_dir, train=False, download=False, transform=transform) test_loader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers) logger.info("Model loaded") model = Net() if torch.cuda.device_count() > 1: logger.info("Gpu count: {}".format(torch.cuda.device_count())) model = nn.DataParallel(model) model = model.to(device) criterion = nn.CrossEntropyLoss().to(device) optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) for epoch in range(0, args.epochs): running_loss = 0.0 for i, data in enumerate(train_loader): # get the inputs inputs, labels = data inputs, labels = inputs.to(device), labels.to(device) # zero the parameter gradients optimizer.zero_grad() # forward + backward + optimize outputs = model(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() # print statistics running_loss += loss.item() if i % 2000 == 1999: # print every 2000 mini-batches print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000)) running_loss = 0.0 print('Finished Training') return _save_model(model, args.model_dir)
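# _save_model() is assumed here; the SageMaker convention is to write the
# weights under model_dir, e.g. (sketch):
import os
import torch

def _save_model(model, model_dir):
    path = os.path.join(model_dir, 'model.pth')
    # move to CPU first so the checkpoint can be loaded without a GPU
    torch.save(model.cpu().state_dict(), path)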
def main():
    global args, best_prec1
    args = parser.parse_args()
    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()
    print(model)

    ######################
    # replace the final classifier layer with a 30-way output
    if args.arch.startswith('vgg'):
        mod = list(model.classifier.children())
        mod.pop()
        mod.append(torch.nn.Linear(4096, 30))
        new_classifier = torch.nn.Sequential(*mod)
        model.classifier = new_classifier
    elif args.arch.startswith('alexnet'):
        mod = list(model.classifier.children())
        mod.pop()
        mod.append(torch.nn.Linear(4096, 30))
        new_classifier = torch.nn.Sequential(*mod)
        model.classifier = new_classifier
    else:
        model.fc = torch.nn.Linear(2048, 30)
    print(model)
    ########################
    # mod = list(model.children())
    # mod.pop()
    # new_classifier = torch.nn.Sequential(*mod)
    # model = new_classifier
    # model.cuda()

    class MyResNet(nn.Module):
        def __init__(self, model):
            super(MyResNet, self).__init__()
            self.conv1 = model.conv1
            self.bn1 = model.bn1
            self.relu = model.relu
            self.maxpool = model.maxpool
            self.layer1 = model.layer1
            self.layer2 = model.layer2
            self.layer3 = model.layer3
            self.layer4 = model.layer4
            self.avgpool = model.avgpool
            # self.Net_classifier = Net_classifier

        def forward(self, x):
            x = self.conv1(x)
            x = self.bn1(x)
            x = self.relu(x)
            x = self.maxpool(x)
            x = self.layer1(x)
            x = self.layer2(x)
            x = self.layer3(x)
            x = self.layer4(x)
            x = self.avgpool(x)
            # x = x.view(x.size(0), -1)
            # x = self.Net_classifier(x)
            return x

    my_model = MyResNet(model)

    class SiameseNetwork(nn.Module):
        def __init__(self):
            super(SiameseNetwork, self).__init__()
            self.cnn1 = my_model

        def forward_once(self, x):
            output = self.cnn1(x)
            # output = output.view(output.size()[0], -1)
            # output = self.fc1(output)
            return output

        def forward(self, input1):
            output1 = self.forward_once(input1)
            # output2 = self.forward_once(input2)
            # return output1, output2
            return output1

    #############################################
    this_model = SiameseNetwork().cuda()
    model = this_model.cuda()
    print(model)
    model.cuda()

    # define loss function (criterion) and optimizer
    # criterion = nn.CrossEntropyLoss().cuda()
    # optimizer = torch.optim.SGD(model.parameters(), args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            # args.start_epoch = checkpoint['epoch']
            # best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint)
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint successfully")
            # .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    ######################
    # logpath_val = './log/test/'
    # txtname = 'test.csv'
    # if not os.path.exists(logpath_val):
    #     os.makedirs(logpath_val)
    # if os.path.exists(logpath_val + txtname):
    #     os.remove(logpath_val + txtname)
    # f_val = file(logpath_val + txtname, 'a+')
    #################
    testdir = '/home/zq610/WYZ/JD_contest/test/test_B/'
    cudnn.benchmark = True

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([
        transforms.Scale([224, 224]),
        transforms.ToTensor(),
        normalize,
    ])
    ####################################################
#class_index=[1,10,11,12,13,14,15,16,17,18,19,2,20,21,22,23,24,25,26,27,28,29,3,30,4,5,6,7,8,9] model.eval() test_list = get_files(testdir) num=len(test_list) featuresall=np.zeros((num,2048)) images = np.array([]) for i, item in enumerate(test_list): print('Processing %i of %i (%s)' % (i+1, len(test_list), item)) test_image_name=item.split('/')[-1].split('.')[0] test_image = transform(Image.open(item)) input_var = torch.autograd.Variable(test_image, volatile=True).unsqueeze(0).cuda()##[1,3,224,224] output=model(input_var) features=output.data[0][:,:,0].t().cpu().numpy()### feature vector: [1*2048] featuresall[i,:]=features np.save(args.data+'../test_B_resnet50.npy',featuresall) for classes in range(30): valdir = os.path.join(args.data,str(classes+1)) test_list = get_files(valdir) num=len(test_list) featuresall=np.zeros((num,2048)) images = np.array([]) for i, item in enumerate(test_list): print('Processing %i of %i (%s)' % (i+1, len(test_list), item)) test_image_name=item.split('/')[-1].split('.')[0] test_image = transform(Image.open(item)) input_var = torch.autograd.Variable(test_image, volatile=True).unsqueeze(0).cuda()##[1,3,224,224] output=model(input_var) features=output.data[0][:,:,0].t().cpu().numpy()### feature vector: [1*2048] featuresall[i,:]=features np.save(args.data+str(classes+1)+'_resnet50.npy',featuresall)
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
        sys.stdout.flush()

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        sys.stdout.flush()
        if args.arch == 'fc_resnet50':
            model = fc_resnet50(num_classes=20, pretrained=False)
        else:
            model = models.__dict__[args.arch](pretrained=True)
            if args.arch in ['resnet18']:
                model.fc = torch.nn.Linear(512, len(CLASSES))
            elif 'squeezenet' in args.arch:
                model.classifier[1] = nn.Conv2d(512, len(CLASSES), (1, 1), (1, 1))
                model.num_classes = len(CLASSES)
            elif 'resnet' in args.arch:
                model.fc = torch.nn.Linear(2048, len(CLASSES))
    else:
        print("=> creating model '{}'".format(args.arch))
        sys.stdout.flush()
        model = models.__dict__[args.arch](num_classes=len(CLASSES))
    # ipdb.set_trace()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    # criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    criterion = nn.BCEWithLogitsLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
        sys.stdout.flush()

    cudnn.benchmark = True

    # Data loading code
    # traindir = os.path.join(args.data, 'train')
    # valdir = os.path.join(args.data, 'test')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    # train_dataset = datasets.ImageFolder(
    #     traindir,
    #     transforms.Compose([
    #         transforms.RandomResizedCrop(224),
    #         # transforms.Resize((224,224)),
    #         transforms.RandomHorizontalFlip(),
    #         transforms.ToTensor(),
    #         normalize,
    #     ]))
    img_transform = transforms.Compose([
        # transforms.RandomResizedCrop(224),
        transforms.Resize((448, 448)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    train_dataset = Voc2007Classification(args.dir_datasets, 'train', transform=img_transform)
    val_dataset = Voc2007Classification(args.dir_datasets, 'val', transform=img_transform)
    test_dataset = Voc2007Classification(args.dir_datasets, 'test', transform=img_transform)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)
    # use the val/test splits built above rather than re-reading the train split
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True)
    # val_loader = torch.utils.data.DataLoader(
    #     datasets.ImageFolder(valdir, transforms.Compose([
    #         transforms.Resize(256),
    #         transforms.CenterCrop(224),
    #         transforms.ToTensor(),
    #         normalize,
    #     ])),
    #     batch_size=256, shuffle=False,
    #     num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        # validate(val_loader, model, criterion, args, writer=writer)
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        # train(train_loader, model, criterion, optimizer, epoch, args, writer, logdir)
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        # acc1 = validate(val_loader, model, criterion, args, writer, logdir)
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best, name=args.name)
def run_ddp_parity_two_optim(rank, world_size, backend, temp_file_name):
    url = "file://" + temp_file_name
    dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size)
    device = torch.device("cuda")
    torch.cuda.set_device(rank)
    torch.manual_seed(rank)
    np.random.seed(rank)

    # Any model works. Add one different buffer per rank
    model = Sequential(Linear(2, 3), Linear(3, 3), Linear(3, 3),
                       Linear(3, 3), Linear(3, 3), Linear(3, 3))
    model.register_buffer("test_buffer", torch.ones((1)) * rank)
    model.to(device)
    n_half_params = len(list(model.parameters())) // 2

    sharded_optimizer = OSS(params=list(model.parameters())[:n_half_params],
                            optim=torch.optim.SGD, lr=1e-3, momentum=0.99)
    sharded_optimizer_2 = OSS(params=list(model.parameters())[n_half_params:],
                              optim=torch.optim.SGD, lr=1e-3, momentum=0.99)
    sharded_ddp_model = ShardedDataParallel(module=model,
                                            sharded_optimizer=sharded_optimizer,
                                            broadcast_buffers=True)

    ddp_model_single = copy.deepcopy(model)
    ddp_optimizer = torch.optim.SGD(list(ddp_model_single.parameters())[:n_half_params],
                                    lr=1e-3, momentum=0.99)
    ddp_optimizer_2 = torch.optim.SGD(list(ddp_model_single.parameters())[n_half_params:],
                                      lr=1e-3, momentum=0.99)
    ddp_model = DDP(ddp_model_single, device_ids=[rank], broadcast_buffers=True)

    def check_same_model_params():
        for pg, ddp_pg in zip(sharded_optimizer.param_groups, ddp_optimizer.param_groups):
            for p, ddp_p in zip(pg["params"], ddp_pg["params"]):
                assert torch.allclose(
                    p, ddp_p, atol=1e-3
                ), f"Model parameters differ in between DDP and ShardedDDP {p} {ddp_p}"
        for b, ddp_b in zip(sharded_ddp_model.buffers(), ddp_model.buffers()):
            assert torch.allclose(
                b, ddp_b, atol=1e-3
            ), "Model buffers differ in between DDP and ShardedDDP"

    # The models should stay the same in between the ranks
    check_same_model_params()

    for i in range(20):
        input_tensor = torch.rand((64, 2)).to(device)

        # Run DDP
        ddp_optimizer.zero_grad()
        ddp_optimizer_2.zero_grad()
        ddp_loss = ddp_model(input_tensor).abs().sum()
        ddp_loss.backward()
        ddp_optimizer.step()
        ddp_optimizer_2.step()

        # Run Sharded
        sharded_optimizer.zero_grad()
        sharded_optimizer_2.zero_grad()
        sharded_loss = sharded_ddp_model(input_tensor).abs().sum()
        sharded_loss.backward()
        sharded_optimizer.step()
        sharded_optimizer_2.step()

        check_same_model_params()

    dist.destroy_process_group()
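A hedged sketch of driving the parity test above. The driver is not part of the original source; it assumes two visible CUDA devices and uses `tempfile` to provide the rendezvous file for the `file://` init method.

# Hypothetical test driver; not from the original source.
import tempfile
import torch.multiprocessing as mp

if __name__ == "__main__":
    world_size = 2  # assumes at least two CUDA devices
    temp_file = tempfile.mkstemp()[1]
    # mp.spawn passes the process index as the first argument (the rank)
    mp.spawn(run_ddp_parity_two_optim,
             args=(world_size, "nccl", temp_file),
             nprocs=world_size,
             join=True)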
def run(args):
    world_size, rank = None, None
    if args.dist_backend in ['mpi', 'ddl']:
        lrank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
        dist.init_process_group(backend=args.dist_backend, init_method='env://')
        world_size = dist.get_world_size()
        rank = dist.get_rank()
        args.distributed = True
        device = torch.device("cuda:{}".format(lrank))
    # NOTE: the rest of this function assumes an MPI/DDL launch;
    # device and rank are only defined in the branch above.

    global resume_iter

    """model_log"""
    input_size_r = list(args.input_size)
    delta_r = list(args.delta)

    """model_construction"""
    model = FFN(in_channels=4, out_channels=1, input_size=args.input_size,
                delta=args.delta, depth=args.depth).to(device)
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = torch.nn.parallel.DistributedDataParallel(model)
    # Large Model Support (available in IBM PowerAI builds of PyTorch)
    torch.cuda.set_enabled_lms(True)

    """data_load"""
    if args.resume is not None:
        model.load_state_dict(torch.load(args.resume))

    abs_path_training_data = args.train_data_dir
    entries_train_data = Path(abs_path_training_data)
    files_train_data = [entry.name for entry in entries_train_data.iterdir()]
    sorted_files_train_data = natsort.natsorted(files_train_data, reverse=False)
    files_total = len(sorted_files_train_data)

    # one dataset/sampler/loader/batch-iterator per HDF5 training file
    input_h5data_dict = {}
    train_dataset_dict = {}
    train_loader_dict = {}
    batch_it_dict = {}
    train_sampler_dict = {}
    for index in range(files_total):
        input_h5data_dict[index] = [abs_path_training_data + sorted_files_train_data[index]]
        print(input_h5data_dict[index])
        train_dataset_dict[index] = BatchCreator(input_h5data_dict[index], args.input_size,
                                                 delta=args.delta, train=True)
        train_sampler_dict[index] = torch.utils.data.distributed.DistributedSampler(
            train_dataset_dict[index], num_replicas=world_size, rank=rank, shuffle=True)
        train_loader_dict[index] = DataLoader(train_dataset_dict[index], num_workers=0,
                                              sampler=train_sampler_dict[index], pin_memory=True)
        batch_it_dict[index] = get_batch(
            train_loader_dict[index], args.batch_size, args.input_size,
            partial(fixed_offsets, fov_moves=train_dataset_dict[index].shifts))

    best_loss = np.inf

    """optimizer"""
    t_last = time.time()
    cnt = 0
    tp = fp = tn = fn = 0
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    """train_loop"""
    while cnt < args.iter:
        cnt += 1
        # pick a random training file each iteration
        num_train_files = len(input_h5data_dict)
        index_rand = random.randrange(0, num_train_files, 1)
        seeds, images, labels, offsets = next(batch_it_dict[index_rand])
        images = images.to(device)
        labels = labels.to(device)
        t_curr = time.time()

        torch_seed = torch.from_numpy(seeds).to(device)
        input_data = torch.cat([images, torch_seed], dim=1)
        logits = model(input_data)
        updated = torch_seed + logits

        optimizer.zero_grad()
        loss = F.binary_cross_entropy_with_logits(updated, labels)
        loss.backward()
        #torch.nn.utils.clip_grad_value_(model.parameters(), args.clip_grad_thr)
        optimizer.step()

        # write the updated seed back for the next FFN move
        seeds[...] = updated.detach().cpu().numpy()

        pred_mask = (updated >= logit(0.9)).detach().cpu().numpy()
        true_mask = (labels > 0.5).cpu().numpy()
        true_bg = np.logical_not(true_mask)
        pred_bg = np.logical_not(pred_mask)
        tp += (true_mask & pred_mask).sum()
        fp += (true_bg & pred_mask).sum()
        fn += (true_mask & pred_bg).sum()
        tn += (true_bg & pred_bg).sum()
        precision = 1.0 * tp / max(tp + fp, 1)
        recall = 1.0 * tp / max(tp + fn, 1)
        accuracy = 1.0 * (tp + tn) / (tp + tn + fp + fn)
        if rank == 0:
            print('[Iter_{}: loss: {:.4}, Precision: {:.2f}%, Recall: {:.2f}%, Accuracy: {:.2f}%]\r'
                  .format(cnt, loss.item(), precision * 100, recall * 100, accuracy * 100))

        """model_saving_(iter)"""
        if (cnt % args.save_interval) == 0 and rank == 0:
            tp = fp = tn = fn = 0
            input_size_r = list(args.input_size)
            delta_r = list(args.delta)
            torch.save(model.state_dict(),
                       os.path.join(args.save_path,
                                    str(args.stream) +
                                    'ffn_model_fov:{}_delta:{}_depth:{}_recall{}.pth'
                                    .format(input_size_r[0], delta_r[0], args.depth, recall * 100)))
            print('Precision: {:.2f}%, Recall: {:.2f}%, Accuracy: {:.2f}%, Model saved!'
                  .format(precision * 100, recall * 100, accuracy * 100))
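The `logit` call in the thresholding step above is not defined in the snippet; it is presumably the usual inverse sigmoid (e.g. `scipy.special.logit`). A self-contained stand-in for reference:

# Minimal stand-in for the logit() used above, assuming it is the inverse sigmoid.
import numpy as np

def logit(p: float) -> float:
    # log-odds: the inverse of the sigmoid, so (updated >= logit(0.9)) keeps
    # voxels whose predicted foreground probability is at least 0.9
    return float(np.log(p / (1.0 - p)))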
def run_one_step(rank, world_size, backend, device, temp_file_name):
    url = "file://" + temp_file_name
    dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size)
    if device == torch.device("cuda"):
        torch.cuda.set_device(rank)

    torch.manual_seed(rank)
    np.random.seed(rank)

    def check(broadcast_buffers: bool, grad_accumulation: bool = False) -> None:
        # Any model works. Add one different buffer per rank
        model = Sequential(Linear(2, 3), Linear(3, 3), Linear(3, 3),
                           Linear(3, 3), Linear(3, 3), Linear(3, 3))
        model.register_buffer("test_buffer", torch.ones((1)) * rank)
        model.to(device)

        optimizer = OSS(params=model.parameters(), optim=torch.optim.SGD, lr=0.01, momentum=0.99)
        ddp_model = ShardedDataParallel(model, optimizer, broadcast_buffers=broadcast_buffers)

        def check_same_model_params(same_params: bool):
            # Check that all the params are the same on all ranks
            # This should be true with and without broadcast_buffers, we don't have any real buffer here
            receptacle: List[torch.Tensor] = []

            if dist.get_backend() != "nccl":
                for pg in optimizer.param_groups:
                    for p in pg["params"]:
                        # Check the params
                        receptacle = [p.clone() for _ in range(world_size)] if rank == 0 else []
                        dist.gather(p, receptacle, dst=0)
                        if rank == 0:
                            for sync_p in receptacle[1:]:
                                if same_params:
                                    assert torch.all(torch.eq(receptacle[0], sync_p)), \
                                        "Models differ in between ranks"
                                else:
                                    assert not torch.all(torch.eq(receptacle[0], sync_p)), \
                                        "Gradients should not have been synced"

                # Check that all the buffers are in sync (authoritative rank is 0, its buffer is 0)
                if broadcast_buffers:
                    for b in ddp_model.buffers():
                        receptacle = [b.clone() for _ in range(world_size)] if rank == 0 else []
                        dist.gather(b, receptacle, dst=0)
                        if rank == 0:
                            for sync_b in receptacle[1:]:
                                if same_params:
                                    assert torch.all(torch.eq(receptacle[0], sync_b)), \
                                        "Models differ in between ranks"
                                else:
                                    assert not torch.all(torch.eq(receptacle[0], sync_b)), \
                                        "Gradients should not have been synced"
                            assert b.cpu().item() == 0.0

        # The model should be synchronized in between the ranks at
        # ShardedDataParallel construction time, check that
        check_same_model_params(same_params=True)

        # Optim loop
        def closure():
            optimizer.zero_grad()
            with ddp_model.no_sync() if grad_accumulation else suppress():
                input_tensor = torch.rand((64, 2)).to(device)
                loss = ddp_model(input_tensor).abs().sum()
                loss.backward()
            return loss

        # The models should stay the same in between the ranks
        for i in range(5):
            _ = optimizer.step(closure=closure)
            # when running on cpu/gloo the "nodes" are not really different
            same_params = device == torch.device("cpu") or grad_accumulation
            check_same_model_params(same_params=same_params)

    check(broadcast_buffers=False)
    check(broadcast_buffers=True)
    check(broadcast_buffers=False, grad_accumulation=True)
    check(broadcast_buffers=True, grad_accumulation=True)
    dist.destroy_process_group()
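A sketch of how a per-test entry point like this might be exercised; the driver is an assumption, not part of the original tests. It picks gloo for CPU-only machines and nccl when CUDA is available, matching the backend-dependent branches inside the test.

# Hypothetical driver; the launch parameters are assumptions.
import tempfile
import torch
import torch.multiprocessing as mp

if __name__ == "__main__":
    world_size = 2
    backend = "nccl" if torch.cuda.is_available() else "gloo"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mp.spawn(run_one_step,
             args=(world_size, backend, device, tempfile.mkstemp()[1]),
             nprocs=world_size,
             join=True)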
                    help='Initial KL coefficient.')
parser.add_argument('--gamma', type=float, default=0.995,
                    help='Discount factor.')
parser.add_argument('--lam', type=float, default=1,
                    help='Bias-variance tradeoff coefficient.')
args = parser.parse_args()

if not os.path.exists(args.log):
    os.mkdir(args.log)

dist.init_process_group(backend=args.backend)
if dist.get_rank() == 0:
    # only rank 0 persists the run configuration and creates the model dir
    with open(os.path.join(args.log, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))
    if not os.path.exists(os.path.join(args.log, 'models')):
        os.mkdir(os.path.join(args.log, 'models'))

np.random.seed(args.seed)
torch.manual_seed(args.seed)
args.cuda = args.local_rank
device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda)
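A plausible continuation, assuming the script subsequently moves its model onto the selected device; the comment shows the conventional launch, since `--local_rank` is the flag populated by `torch.distributed.launch` and the env:// rendezvous variables it sets are what `dist.init_process_group` consumes here.

# Hedged continuation: turn the device string into a torch.device.
device = torch.device(device_name)
# Typical launch (one process per GPU), which passes --local_rank to each
# process and sets MASTER_ADDR/MASTER_PORT/WORLD_SIZE/RANK in the environment:
#   python -m torch.distributed.launch --nproc_per_node=4 train.py --backend nccl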
                                       args=(rank,))
        process.start()
        return process

    def _run(self, rank):
        self.rank = rank
        try:
            dist.init_process_group(backend=BACKEND)
        except RuntimeError as e:
            if 'recompile' in e.args[0]:
                sys.exit(0)
        # self.id() == e.g. '__main__.TestDistributed.test_get_rank'
        # We're retrieving the corresponding test and executing it.
        getattr(self, self.id().split(".")[2])()
        sys.exit(0)

    def _join_and_reduce(self):
        for p in self.processes:
            p.join(self.JOIN_TIMEOUT)
            self.assertEqual(p.exitcode, 0)

elif BACKEND == 'mpi':
    dist.init_process_group(backend='mpi')

    class TestMPI(TestCase, _DistTestBase):
        pass

if __name__ == '__main__':
    unittest.main()
def get_dist_env():
    world_size = int(os.getenv('SLURM_NTASKS'))
    if 'OMPI_COMM_WORLD_RANK' in os.environ:
        global_rank = int(os.getenv('OMPI_COMM_WORLD_RANK'))
    else:
        global_rank = int(os.getenv('SLURM_PROCID'))
    return global_rank, world_size


if __name__ == "__main__":
    # Don't change the following:
    global_rank, world_size = get_dist_env()
    hostname = socket.gethostname()

    # You have to run dist.init_process_group to initialize the distributed environment.
    # Always use NCCL as the backend. Gloo performance is pretty bad and MPI is currently
    # unsupported (for a number of reasons).
    dist.init_process_group(backend='nccl', rank=global_rank, world_size=world_size)

    # now run your distributed training code
    run(global_rank, world_size, hostname)

    # the following is just for demo purposes (and a sanity check); you don't need it:
    if global_rank > 0:
        print(f'printing NCCL variables on {global_rank} on {hostname}')
        for key in os.environ:
            if key.find('NCCL') > -1:
                print(f'{hostname} {key} : {os.getenv(key)}')
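Because no `init_method` is passed, `init_process_group` falls back to the `env://` rendezvous, so MASTER_ADDR and MASTER_PORT must already be set before the call. A hedged sketch of deriving them under SLURM; the environment variables and `scontrol` invocation are standard SLURM, but the port choice is arbitrary and this helper is not part of the original snippet.

# Hypothetical env:// setup for the NCCL init above; run before init_process_group.
import os
import subprocess

if "MASTER_ADDR" not in os.environ:
    # The first hostname in the job's node list acts as the rendezvous master.
    node_list = os.environ["SLURM_JOB_NODELIST"]
    master = subprocess.run(["scontrol", "show", "hostnames", node_list],
                            capture_output=True, text=True).stdout.splitlines()[0]
    os.environ["MASTER_ADDR"] = master
    os.environ["MASTER_PORT"] = "29500"  # arbitrary free port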
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
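`save_checkpoint` is not shown in this snippet; in the reference ImageNet example this function follows it is a thin wrapper over `torch.save`, roughly:

# Common definition from the reference example this snippet mirrors.
import shutil
import torch

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        # keep a separate copy of the best-scoring checkpoint
        shutil.copyfile(filename, 'model_best.pth.tar')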
def main():
    global args, best_prec1
    args = parser.parse_args()
    print(args)

    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch](low_dim=args.low_dim)

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolderInstance(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolderInstance(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    # define lemniscate and loss function (criterion)
    ndata = train_dataset.__len__()
    if args.nce_k > 0:
        lemniscate = NCEAverage(args.low_dim, ndata, args.nce_k, args.nce_t, args.nce_m).cuda()
        criterion = NCECriterion(ndata).cuda()
    else:
        lemniscate = LinearAverage(args.low_dim, ndata, args.nce_t, args.nce_m).cuda()
        criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint.get('best_prec1', 0)
            model.load_state_dict(checkpoint['state_dict'])
            lemniscate = checkpoint['lemniscate']
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    if args.evaluate:
        kNN(0, model, lemniscate, train_loader, val_loader, 200, args.nce_t, 1)  # recompute memory
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, lemniscate, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = NN(epoch, model, lemniscate, train_loader, val_loader)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'lemniscate': lemniscate,
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)

    # evaluate KNN after last epoch
    kNN(0, model, lemniscate, train_loader, val_loader, 200, args.nce_t)
def main():
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    # CPU variant of the reference script: the GPU transfers are disabled
    # and preserved as ## comments
    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            ## model.cuda()
        else:
            model = torch.nn.DataParallel(model)  ##.cuda()
    else:
        ## model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()  ##.cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)
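`adjust_learning_rate` is left undefined in this snippet and the two above that call it. The usual step schedule from this family of scripts decays the base LR by 10x every 30 epochs; a sketch under that assumption:

# Typical step-decay schedule used by these ImageNet-style scripts (a sketch).
def adjust_learning_rate(optimizer, epoch):
    lr = args.lr * (0.1 ** (epoch // 30))  # decay 10x every 30 epochs
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr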
                       group=node_handle[node_idx])
        for param in model.parameters():
            dist.broadcast(param.grad.data, local_root, group=node_handle[node_idx])
        optim.step()
        total_loss += loss.data.item()
        n_samples += (output.size(0) * data.m)
    return reduce_loss(total_loss, n_samples)


dist.init_process_group('mpi')
rank = dist.get_rank()
wsize = dist.get_world_size()
world_size = wsize

parser = argparse.ArgumentParser(description='PyTorch Time series forecasting')
parser.add_argument('--data', type=str, required=True,
                    help='location of the data file')
parser.add_argument('--model', type=str, default='LSTNet', help='')
parser.add_argument('--hidCNN', type=int, default=100,
                    help='number of CNN hidden units')