def adjust_learning_rate(epoch, batch_idx):
    if epoch < args.warmup_epochs:
        epoch += float(batch_idx + 1) / len(train_loader)
        lr_adj = 1. / bps.size() * (
            epoch * (bps.size() - 1) / args.warmup_epochs + 1)
    elif epoch < 30:
        lr_adj = 1.
    elif epoch < 60:
        lr_adj = 1e-1
    elif epoch < 80:
        lr_adj = 1e-2
    else:
        lr_adj = 1e-3
    for param_group in optimizer.param_groups:
        param_group['lr'] = (args.base_lr * bps.size() *
                             args.batches_per_pushpull * lr_adj)
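# Usage sketch (not part of the original file): the schedule above is meant to
# be called once per batch inside the training loop, e.g.
#
#     for batch_idx, (data, target) in enumerate(train_loader):
#         adjust_learning_rate(epoch, batch_idx)
#         ...
#
# During the first args.warmup_epochs epochs, lr_adj ramps linearly from
# 1/bps.size() to 1, so the effective learning rate grows from roughly
# base_lr * batches_per_pushpull to base_lr * bps.size() * batches_per_pushpull.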
def benchmark(tensor, average, name):
    if not args.no_wait and hvd.rank() == 0:
        # let other workers submit allreduce request first
        time.sleep(0.01)
    start = time.time()
    # do not use allreduce_() as it polls every 1ms
    handle = push_pull_async_inplace(tensor, average, name)
    while True:
        if poll(handle):
            synchronize(handle)
            break
    end = time.time()
    return (end - start) * 1000


log('Number of GPUs: %d' % (hvd.size()))

# Benchmark
log('Running benchmark...')
log('size (Byte) avg. time (ms) std.dev (ms)')
for i in range(10):
    size = 10**i
    data = torch.rand(size, dtype=torch.float32)
    if args.cuda:
        data = data.cuda()
    # warm up
    for j in range(args.num_warmup):
        benchmark(tensor=data, average=True, name=str(i))
    # timeit
    durations = []
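# The timing loop is cut off above; a plausible continuation (an assumption,
# mirroring the warm-up loop and the printed header) would collect repeated
# measurements and report their mean and standard deviation per tensor size:
#
#     for j in range(args.num_iters):
#         t = benchmark(tensor=data, average=True, name=str(i))
#         durations.append(t)
#     avg = np.mean(durations)
#     std = np.std(durations)
#     log('%d \t %.3f \t %.3f' % (size * 4, avg, std))
#
# (args.num_iters and the numpy import are hypothetical here.)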
    args.log_dir) if bps.rank() == 0 else None

kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
train_dataset = \
    datasets.ImageFolder(args.train_dir,
                         transform=transforms.Compose([
                             transforms.RandomResizedCrop(224),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
                         ]))
# BytePS: use DistributedSampler to partition data among workers. Manually specify
# `num_replicas=bps.size()` and `rank=bps.rank()`.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=bps.size(), rank=bps.rank())
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=allreduce_batch_size,
                                           sampler=train_sampler,
                                           **kwargs)

val_dataset = \
    datasets.ImageFolder(args.val_dir,
                         transform=transforms.Compose([
                             transforms.Resize(256),
                             transforms.CenterCrop(224),
                             transforms.ToTensor(),
                             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
                         ]))
val_sampler = torch.utils.data.distributed.DistributedSampler(
    loss = F.cross_entropy(output, target)
    loss.backward()
    optimizer.step()


def log(s, nl=True):
    if bps.local_rank() != 0:
        return
    print(s, end='\n' if nl else '')
    sys.stdout.flush()


log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, bps.size()))

# Warm-up
log('Running warmup...')
timeit.timeit(benchmark_step, number=args.num_warmup_batches)

# Benchmark
log('Running benchmark...')
img_secs = []
enable_profiling = args.profiler & (bps.rank() == 0)
with torch.autograd.profiler.profile(enable_profiling, True) as prof:
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step,
                             number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
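# Sketch of the usual wrap-up for this kind of benchmark (an assumption, not
# shown in the excerpt): inside the loop each iteration's throughput would be
# appended with img_secs.append(img_sec); after the loop the mean and a
# confidence interval are reported, scaled by the number of workers:
#
#     img_sec_mean = np.mean(img_secs)
#     img_sec_conf = 1.96 * np.std(img_secs)
#     log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
#     log('Total img/sec on %d %s(s): %.1f +-%.1f' %
#         (bps.size(), device, bps.size() * img_sec_mean,
#          bps.size() * img_sec_conf))
#
# (The numpy import is assumed here.)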
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = DDP(model,
                        device_ids=[args.gpu],
                        broadcast_buffers=args.broadcast_buffers)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=bps.size(), rank=bps.rank())
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    train_sampler = DistributedSampler(train_dataset,
                                       num_replicas=bps.size(),
                                       rank=bps.rank())
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    optimizer = SGD(optimizer_grouped_parameters,
                    lr=args.learning_rate,
                    momentum=0.9)
    optimizer = bps.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
    bps.broadcast_parameters(model.state_dict(), root_rank=0)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (bps.size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (
                len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                " Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d",
                        global_step)
            logger.info(" Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info(" Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
if args.cuda:
    # BytePS: pin GPU to local rank.
    torch.cuda.set_device(bps.local_rank())
    torch.cuda.manual_seed(args.seed)

kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_dataset = \
    datasets.MNIST('data-%d' % bps.rank(),
                   train=True,
                   download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
# BytePS: use DistributedSampler to partition the training data.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=bps.size(), rank=bps.rank())
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           sampler=train_sampler,
                                           **kwargs)

test_dataset = \
    datasets.MNIST('data-%d' % bps.rank(),
                   train=False,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
# BytePS: use DistributedSampler to partition the test data.
test_sampler = torch.utils.data.distributed.DistributedSampler(
    test_dataset, num_replicas=bps.size(), rank=bps.rank())
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=args.test_batch_size,
def benchmark(tensor, average, name):
    if not args.no_wait and bps.rank() == 0:
        time.sleep(0.01)
    start = time.time()
    handle = push_pull_async_inplace(tensor, average, name)
    while True:
        if poll(handle):
            synchronize(handle)
            break
    end = time.time()
    return (end - start) * 1000


log('Number of GPUs: %d' % (bps.size()))

# Benchmark
log('Running benchmark...')
log('size (Byte) \t avg. time (ms) \t std.dev (ms)')
for i in range(8):
    size = 10**i
    data = torch.rand(size, dtype=torch.float32)
    if args.cuda:
        data = data.cuda()
    # warm up
    for j in range(args.num_warmup):
        benchmark(tensor=data, average=True, name=str(i))
    # timeit
    durations = []
def env_world_size():
    return bps.size()
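# These helpers usually come in pairs; a companion (hypothetical, not shown in
# this excerpt) would expose the process index the same way:
#
#     def env_rank():
#         return bps.rank()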
if args.cuda:
    args.model_device = torch.device('cuda')
else:
    args.model_device = torch.device('cpu')

# Initialize Horovod/Cuda
myrank = 0
mysize = 1
if args.par == "hvd":
    hvd.init()
    myrank = hvd.rank()
    mysize = hvd.size()
elif args.par == "bps":
    bps.init()
    myrank = bps.rank()
    mysize = bps.size()

torch.manual_seed(args.seed)
if args.cuda:
    # Horovod & BytePS: pin GPU to local rank.
    if args.par == "hvd":
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)
    if args.par == "bps":
        torch.cuda.set_device(bps.local_rank())
        torch.cuda.manual_seed(args.seed)

# Model definition
model = MortgageNetwork(
    args.num_features,
    args.embedding_size,
    args.hidden_dims,
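# Sketch (assumption, not part of this excerpt): further down, after the model
# and optimizer are constructed, the same args.par switch is typically reused
# to broadcast the initial state and wrap the optimizer with the matching
# backend:
#
#     if args.par == "hvd":
#         hvd.broadcast_parameters(model.state_dict(), root_rank=0)
#         optimizer = hvd.DistributedOptimizer(
#             optimizer, named_parameters=model.named_parameters())
#     elif args.par == "bps":
#         bps.broadcast_parameters(model.state_dict(), root_rank=0)
#         optimizer = bps.DistributedOptimizer(
#             optimizer, named_parameters=model.named_parameters())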
if args.cuda:
    # BytePS: pin GPU to local rank.
    torch.cuda.set_device(bps.local_rank())
    torch.cuda.manual_seed(args.seed)

kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_dataset = \
    datasets.MNIST('data-%d' % bps.rank(),
                   train=True,
                   download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
# BytePS: use DistributedSampler to partition the training data.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=bps.size(), rank=bps.rank())
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           sampler=train_sampler,
                                           **kwargs)

test_dataset = \
    datasets.MNIST('data-%d' % bps.rank(),
                   train=False,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
# BytePS: use DistributedSampler to partition the test data.
test_sampler = torch.utils.data.distributed.DistributedSampler(
    test_dataset, num_replicas=bps.size(), rank=bps.rank())
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=args.test_batch_size,
                                          sampler=test_sampler,
                                          **kwargs)
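# What typically follows in a BytePS MNIST example (a sketch; Net() and the
# optim import are assumed from elsewhere in the script): scale the learning
# rate by the number of workers, broadcast the initial state from rank 0, and
# wrap the optimizer so gradients are synchronized via push-pull.
#
#     model = Net()
#     if args.cuda:
#         model.cuda()
#     optimizer = optim.SGD(model.parameters(),
#                           lr=args.lr * bps.size(),
#                           momentum=args.momentum)
#     bps.broadcast_parameters(model.state_dict(), root_rank=0)
#     bps.broadcast_optimizer_state(optimizer, root_rank=0)
#     optimizer = bps.DistributedOptimizer(
#         optimizer, named_parameters=model.named_parameters())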
def build_model(self):
    """ DataLoader """
    if self.fix_aug:
        print("FIX AUG ON")
        train_transform = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.Resize((self.img_size, self.img_size)),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])
    else:
        train_transform = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.Resize((self.img_size + 30, self.img_size + 30)),
            transforms.RandomCrop(self.img_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])
    test_transform = transforms.Compose([
        transforms.Resize((self.img_size, self.img_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])

    self.trainA = ImageFolder(os.path.join(self.dataset_dir, self.dataset,
                                           'trainA'),
                              train_transform,
                              list_mode=self.list_mode)
    self.trainB = ImageFolder(os.path.join(self.dataset_dir, self.dataset,
                                           'trainB'),
                              train_transform,
                              list_mode=self.list_mode)
    self.testA = ImageFolder(os.path.join(self.dataset_dir, self.dataset,
                                          'testA'),
                             test_transform,
                             list_mode=self.list_mode)
    self.testB = ImageFolder(os.path.join(self.dataset_dir, self.dataset,
                                          'testB'),
                             test_transform,
                             list_mode=self.list_mode)

    trainA_sampler = torch.utils.data.distributed.DistributedSampler(
        self.trainA, num_replicas=bps.size(), rank=bps.rank())
    trainB_sampler = torch.utils.data.distributed.DistributedSampler(
        self.trainB, num_replicas=bps.size(), rank=bps.rank())
    testA_sampler = torch.utils.data.distributed.DistributedSampler(
        self.testA, num_replicas=bps.size(), rank=bps.rank())
    testB_sampler = torch.utils.data.distributed.DistributedSampler(
        self.testB, num_replicas=bps.size(), rank=bps.rank())

    self.trainA_loader = DataLoader(self.trainA,
                                    batch_size=self.batch_size,
                                    sampler=trainA_sampler,
                                    num_workers=1)
    self.trainB_loader = DataLoader(self.trainB,
                                    batch_size=self.batch_size,
                                    sampler=trainB_sampler,
                                    num_workers=1)
    self.testA_loader = DataLoader(self.testA,
                                   batch_size=1,
                                   sampler=testA_sampler)
    self.testB_loader = DataLoader(self.testB,
                                   batch_size=1,
                                   sampler=testB_sampler)

    """ Define Generator, Discriminator """
    self.genA2B = ResnetGenerator(input_nc=3,
                                  output_nc=3,
                                  ngf=self.ch,
                                  n_blocks=self.n_res,
                                  img_size=self.img_size,
                                  light=self.light).to(self.device)
    self.genB2A = ResnetGenerator(input_nc=3,
                                  output_nc=3,
                                  ngf=self.ch,
                                  n_blocks=self.n_res,
                                  img_size=self.img_size,
                                  light=self.light).to(self.device)
    self.disGA = Discriminator(input_nc=3, ndf=self.ch,
                               n_layers=7).to(self.device)
    self.disGB = Discriminator(input_nc=3, ndf=self.ch,
                               n_layers=7).to(self.device)
    self.disLA = Discriminator(input_nc=3, ndf=self.ch,
                               n_layers=5).to(self.device)
    self.disLB = Discriminator(input_nc=3, ndf=self.ch,
                               n_layers=5).to(self.device)

    """ Define Loss """
    self.L1_loss = nn.L1Loss().to(self.device)
    self.MSE_loss = nn.MSELoss().to(self.device)
    self.BCE_loss = nn.BCEWithLogitsLoss().to(self.device)

    gen_named_parameters = []
    dis_named_parameters = []
    for n, p in (list(self.genA2B.named_parameters(prefix='genA2B')) +
                 list(self.genB2A.named_parameters(prefix='genB2A'))):
        gen_named_parameters.append((n, p))
    for n, p in (list(self.disGA.named_parameters(prefix='disGA')) +
                 list(self.disGB.named_parameters(prefix='disGB')) +
                 list(self.disLA.named_parameters(prefix='disLA')) +
                 list(self.disLB.named_parameters(prefix='disLB'))):
        dis_named_parameters.append((n, p))

    gen_state_dict = OrderedDict(
        [("genA2B." + k, v) for k, v in self.genA2B.state_dict().items()] +
        [("genB2A." + k, v) for k, v in self.genB2A.state_dict().items()])
    dis_state_dict = OrderedDict(
        [("disGA." + k, v) for k, v in self.disGA.state_dict().items()] +
        [("disGB." + k, v) for k, v in self.disGB.state_dict().items()] +
        [("disLA." + k, v) for k, v in self.disLA.state_dict().items()] +
        [("disLB." + k, v) for k, v in self.disLB.state_dict().items()])

    bps.broadcast_parameters(gen_state_dict, root_rank=0)
    bps.broadcast_parameters(dis_state_dict, root_rank=0)

    """ Trainer """
    self.G_optim = torch.optim.Adam(itertools.chain(
        self.genA2B.parameters(), self.genB2A.parameters()),
                                    lr=self.lr,
                                    betas=(0.5, 0.999),
                                    weight_decay=self.weight_decay)
    self.D_optim = torch.optim.Adam(itertools.chain(
        self.disGA.parameters(), self.disGB.parameters(),
        self.disLA.parameters(), self.disLB.parameters()),
                                    lr=self.lr,
                                    betas=(0.5, 0.999),
                                    weight_decay=self.weight_decay)

    named_parameters = []
    for n, p in list(self.genA2B.named_parameters()):
        named_parameters.append(("genA2B." + n, p))
    for n, p in list(self.genB2A.named_parameters()):
        named_parameters.append(("genB2A." + n, p))

    self.G_optim = bps.DistributedOptimizer(
        self.G_optim,
        named_parameters=gen_named_parameters,
        compression=bps.Compression.none)
    self.D_optim = bps.DistributedOptimizer(
        self.D_optim,
        named_parameters=dis_named_parameters,
        compression=bps.Compression.none)

    self.G_optim._handles.clear()
    self.D_optim._handles.clear()

    """ Define Rho clipper to constraint the value of rho in AdaILN and ILN"""
    self.Rho_clipper = RhoClipper(0, 1)
def __init__(self, args):
    self.light = args.light

    if self.light:
        self.model_name = 'UGATIT_light'
    else:
        self.model_name = 'UGATIT'

    self.result_dir = args.result_dir
    self.dataset_dir = args.dataset_dir
    self.dataset = args.dataset

    self.iteration = args.iteration // bps.size()
    self.decay_flag = args.decay_flag

    self.batch_size = args.batch_size
    self.print_freq = args.print_freq
    self.save_freq = args.save_freq

    self.lr = args.lr
    self.weight_decay = args.weight_decay
    self.ch = args.ch

    """ Weight """
    self.adv_weight = args.adv_weight
    self.cycle_weight = args.cycle_weight
    self.identity_weight = args.identity_weight
    self.cam_weight = args.cam_weight

    """ Generator """
    self.n_res = args.n_res

    """ Discriminator """
    self.n_dis = args.n_dis

    self.img_size = args.img_size
    self.img_ch = args.img_ch

    self.device = args.device
    self.benchmark_flag = args.benchmark_flag
    self.resume = args.resume
    self.fix_aug = args.fix_aug
    self.list_mode = args.list_mode

    if torch.backends.cudnn.enabled and self.benchmark_flag:
        print('set benchmark !')
        torch.backends.cudnn.benchmark = True

    print()
    print("##### Information #####")
    print("# light : ", self.light)
    print("# dataset : ", self.dataset)
    print("# batch_size : ", self.batch_size)
    print("# iteration per epoch : ", self.iteration)
    print()
    print("##### Generator #####")
    print("# residual blocks : ", self.n_res)
    print()
    print("##### Discriminator #####")
    print("# discriminator layer : ", self.n_dis)
    print()
    print("##### Weight #####")
    print("# adv_weight : ", self.adv_weight)
    print("# cycle_weight : ", self.cycle_weight)
    print("# identity_weight : ", self.identity_weight)
    print("# cam_weight : ", self.cam_weight)
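# Note on self.iteration = args.iteration // bps.size(): args.iteration is the
# global iteration budget, so each of the bps.size() workers runs its share.
# For example (hypothetical numbers), with args.iteration = 100000 and 8
# workers:
#
#     100000 // 8 == 12500   # iterations executed per worker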
def train(trn_loader, model, criterion, optimizer, scheduler, epoch):
    net_meter = NetworkMeter()
    timer = TimeMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()
    for i, (input, target) in enumerate(trn_loader):
        if args.short_epoch and (i > 10):
            break
        batch_num = i + 1
        timer.batch_start()
        scheduler.update_lr(epoch, i + 1, len(trn_loader))

        # compute output
        output = model(input)
        loss = criterion(output, target)

        should_print = (batch_num % args.print_freq == 0) or (
            batch_num == len(trn_loader))

        # compute gradient and do SGD step
        if args.fp16:
            loss = loss * args.loss_scale
            # zero_grad() and converting fp16/fp32 is handled in optimizer
            loss.backward()
            optimizer.step(wait_for_finish=should_print)
            loss = loss / args.loss_scale
        else:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Train batch done. Logging results
        timer.batch_end()
        if args.local_rank == 0 and should_print:
            corr1, corr5 = correct(output.data, target, topk=(1, 5))
            reduced_loss, batch_total = to_python_float(
                loss.data), to_python_float(input.size(0))
            if args.distributed:
                # Must keep track of global batch size, since not all
                # machines are guaranteed equal batches at the end of an epoch
                validate_tensor[0] = batch_total
                validate_tensor[1] = reduced_loss
                validate_tensor[2] = corr1
                validate_tensor[3] = corr5

                batch_total, reduced_loss, corr1, corr5 = bps.push_pull(
                    validate_tensor, average=False, name="validation_tensor")
                batch_total = batch_total.cpu().numpy()
                reduced_loss = reduced_loss.cpu().numpy()
                corr1 = corr1.cpu().numpy()
                corr5 = corr5.cpu().numpy()
                reduced_loss = reduced_loss / bps.size()

            top1acc = to_python_float(corr1) * (100.0 / batch_total)
            top5acc = to_python_float(corr5) * (100.0 / batch_total)

            losses.update(reduced_loss, batch_total)
            top1.update(top1acc, batch_total)
            top5.update(top5acc, batch_total)

            tb.log_memory()
            tb.log_trn_times(timer.batch_time.val, timer.data_time.val,
                             input.size(0))
            tb.log_trn_loss(losses.val, top1.val, top5.val)

            recv_gbit, transmit_gbit = net_meter.update_bandwidth()
            tb.log("sizes/batch_total", batch_total)
            tb.log('net/recv_gbit', recv_gbit)
            tb.log('net/transmit_gbit', transmit_gbit)

            output = (
                f'Epoch: [{epoch}][{batch_num}/{len(trn_loader)}]\t'
                f'Time {timer.batch_time.val:.3f} ({timer.batch_time.avg:.3f})\t'
                f'Loss {losses.val:.4f} ({losses.avg:.4f})\t'
                f'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                f'Acc@5 {top5.val:.3f} ({top5.avg:.3f})\t'
                f'Data {timer.data_time.val:.3f} ({timer.data_time.avg:.3f})\t'
                f'BW {recv_gbit:.3f} {transmit_gbit:.3f}')
            log.verbose(output)

            tb.update_step_count(batch_total)
def main():
    # os.system('shutdown -c')  # cancel previous shutdown command
    log.console(args)
    tb.log('sizes/world', bps.size())

    # need to index validation directory before we start counting the time
    dataloader.sort_ar(args.data + '/validation')

    # if args.distributed:
    # log.console('Distributed initializing process group')
    torch.cuda.set_device(bps.local_rank())
    print(f'cuda device set to {bps.local_rank()}')
    log.console("cuda initialized (rank=%d)" % (bps.local_rank()))
    # dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=bps.size())
    log.console("Distributed: success (%d/%d)" % (bps.rank(), bps.size()))

    log.console("Loading model (rank=%d)" % (bps.rank()))
    model = resnet.resnet50(bn0=args.init_bn0).cuda()

    # reuse the validate tensor
    global validate_tensor, dist_validate_tensor
    validate_tensor = torch.tensor([0, 0, 0, 0]).float().cuda()
    dist_validate_tensor = torch.tensor([0, 0, 0, 0, 0]).float().cuda()

    if args.fp16:
        model = network_to_half(model)

    best_top5 = 93  # only save models over 93%. Otherwise it stops to save every time

    global model_params, master_params
    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        model_params = master_params = model.parameters()

    optim_params, name_list = experimental_utils.bnwd_optim_params(
        model, model_params,
        master_params) if args.no_bn_wd else master_params

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(
        optim_params, 0, momentum=args.momentum,
        weight_decay=args.weight_decay
    )  # start with 0 lr. Scheduler will change this later

    named_param = []
    for p in optim_params:
        tensors = p['params']
        for tensor in tensors:
            named_param.append(tensor)

    # create bps_param (tuple)
    bps_param = []
    for i, tensor in enumerate(named_param):
        name = name_list[i]
        bps_param.append((name, tensor))

    # wrap with byteps optimizer
    optimizer = DistributedOptimizer(
        optimizer,
        named_parameters=bps_param,
        backward_passes_per_step=args.batches_per_pushpull,
        half=True,
        model=model,
        fp16_params=model_params,
        fp32_params=master_params,
        loss_scale=args.loss_scale)

    if args.resume:
        checkpoint = torch.load(
            args.resume,
            map_location=lambda storage, loc: storage.cuda(args.local_rank))
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_top5 = checkpoint['best_top5']
        optimizer.load_state_dict(checkpoint['optimizer'])

    log.console(
        "Creating data loaders (this could take up to 10 minutes if volume needs to be warmed up)"
    )
    num_machines = (bps.size() - 1) // 8 + 1
    assert (num_machines in schedules)
    phases = schedules[num_machines]
    dm = DataManager([copy.deepcopy(p) for p in phases if 'bs' in p])
    scheduler = Scheduler(optimizer,
                          [copy.deepcopy(p) for p in phases if 'lr' in p])

    # BytePS: broadcast parameters & optimizer state.
    broadcast_parameters([(name, p.detach()) for name, p in bps_param],
                         root_rank=0)
    broadcast_optimizer_state(optimizer, root_rank=0)

    start_time = datetime.now()  # Loading start to after everything is loaded
    if args.evaluate:
        return validate(dm.val_dl, model, criterion, 0, start_time)

    if args.distributed:
        log.console('Global Barrier: Syncing machines before training')
        tensor = torch.tensor([1.0]).float().cuda()
        barrier_handler = push_pull_async_inplace(tensor,
                                                  average=True,
                                                  name="init.barrier")
        while True:
            if poll(barrier_handler):
                synchronize(barrier_handler)
                break
        # do broadcast for validate tensor
        log.console('Broadcasting validate tensor')
        barrier_handler = push_pull_async_inplace(validate_tensor,
                                                  average=True,
                                                  name="validation_tensor")
        while True:
            if poll(barrier_handler):
                synchronize(barrier_handler)
                break
        barrier_handler = push_pull_async_inplace(
            dist_validate_tensor,
            average=True,
            name="distributed_validation_tensor")
        while True:
            if poll(barrier_handler):
                synchronize(barrier_handler)
                break

    log.event("~~epoch\thours\ttop1\ttop5\n")
    for epoch in range(args.start_epoch, scheduler.tot_epochs):
        dm.set_epoch(epoch)

        train(dm.trn_dl, model, criterion, optimizer, scheduler, epoch)
        top1, top5 = validate(dm.val_dl, model, criterion, epoch, start_time)

        time_diff = (datetime.now() - start_time).total_seconds() / 3600.0
        log.event(f'~~{epoch}\t{time_diff:.5f}\t\t{top1:.3f}\t\t{top5:.3f}\n')

        is_best = top5 > best_top5
        best_top5 = max(top5, best_top5)
        if args.local_rank == 0:
            if is_best:
                save_checkpoint(epoch,
                                model,
                                best_top5,
                                optimizer,
                                is_best=True,
                                filename='model_best.pth.tar')
            phase = dm.get_phase(epoch)
            if phase:
                save_checkpoint(
                    epoch,
                    model,
                    best_top5,
                    optimizer,
                    filename=f'sz{phase["bs"]}_checkpoint.path.tar')
kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
train_dataset = \
    datasets.ImageFolder(args.train_dir,
                         transform=transforms.Compose([
                             transforms.RandomResizedCrop(224),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
                         ]))
# BytePS: use DistributedSampler to partition data among workers. Manually specify
# `num_replicas=bps.size()` and `rank=bps.rank()`.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=bps.size(), rank=bps.rank())
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=pushpull_batch_size,
                                           sampler=train_sampler,
                                           **kwargs)

val_dataset = \
    datasets.ImageFolder(args.val_dir,
                         transform=transforms.Compose([
                             transforms.Resize(256),
                             transforms.CenterCrop(224),
                             transforms.ToTensor(),
                             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
                         ]))
val_sampler = torch.utils.data.distributed.DistributedSampler(
    val_dataset, num_replicas=bps.size(), rank=bps.rank())
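# Sketch (assumption): the matching validation loader would be built the same
# way as the training loader, e.g.
#
#     val_loader = torch.utils.data.DataLoader(val_dataset,
#                                              batch_size=args.val_batch_size,
#                                              sampler=val_sampler,
#                                              **kwargs)
#
# (args.val_batch_size is hypothetical here.)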