def testcase_StepLR_fused(B=3, step_size=2, gamma=0.1, last_epoch=-1):
  lr = random.choice([torch.rand((B,)), random.random()])
  net_array = [_TestNet() for _ in range(B)]
  net_fused = _TestNet(B=B)
  optimizer_array = [
      optim.Adadelta(
          net_array[b].parameters(),
          lr=index_array_or_return_scalar(lr, b),
      ) for b in range(B)
  ]
  optimizer_fused = get_hfta_optim_for(optim.Adadelta, B=B)(
      net_fused.parameters(),
      lr=lr,
  )
  if not isinstance(last_epoch, int) or last_epoch != -1:
    _init_initial_lr(optimizer_fused, optimizer_array)
  lr_scheduler_array = [
      lr_scheduler.StepLR(
          optimizer_array[b],
          index_array_or_return_scalar(step_size, b),
          gamma=index_array_or_return_scalar(gamma, b),
          last_epoch=index_array_or_return_scalar(last_epoch, b),
      ) for b in range(B)
  ]
  lr_scheduler_fused = get_hfta_lr_scheduler_for(lr_scheduler.StepLR, B=B)(
      optimizer_fused,
      step_size,
      gamma=gamma,
      last_epoch=last_epoch,
  )
  _lr_scheduler_testing_procedure(net_fused, net_array, optimizer_fused,
                                  optimizer_array, lr_scheduler_fused,
                                  lr_scheduler_array)
def testcase_fused(
    B=3,
    lr=1.0,
    rho=0.9,
    eps=1e-6,
    weight_decay=0,
    device=torch.device('cpu'),
    dtype=torch.float,
):
  if B > 1 and isinstance(lr, (int, float)):
    lr = [random.uniform(0.5, 2.0) for _ in range(B)]
  kwargs = {'device': device, 'dtype': dtype}
  net_array = [_TestNet(**kwargs) for _ in range(B)]
  net_fused = _TestNet(B=B, **kwargs)
  optimizer_array = [
      optim.Adadelta(
          net_array[b].parameters(),
          lr=index_array_or_return_scalar(lr, b),
          rho=index_array_or_return_scalar(rho, b),
          eps=index_array_or_return_scalar(eps, b),
          weight_decay=index_array_or_return_scalar(weight_decay, b),
      ) for b in range(B)
  ]
  optimizer_fused = get_hfta_optim_for(optim.Adadelta, B=B)(
      net_fused.parameters(),
      lr=lr,
      rho=rho,
      eps=eps,
      weight_decay=weight_decay,
  )
  _optim_testing_procedure(net_fused, net_array, optimizer_fused,
                           optimizer_array)
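# The testcases above read per-model hyperparameters through
# index_array_or_return_scalar: a hyperparameter shared by all B fused models
# is passed as a scalar, while a swept hyperparameter is passed as a list (or
# 1-D tensor) indexed by the model slot b. A minimal sketch of the assumed
# behavior follows; the name and exact handling are illustrative, not the
# actual HFTA utility.
def _index_array_or_return_scalar_sketch(value, b):
  if isinstance(value, (list, tuple)):
    # Per-model values: pick the b-th entry.
    return value[b]
  if torch.is_tensor(value):
    # 1-D tensor of per-model values: pick the b-th element as a Python float.
    return value[b].item()
  # Scalar hyperparameter shared by every fused model.
  return value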
def testcase_partially_fused(
    B=3,
    amsgrad=False,
    device=torch.device('cpu'),
    dtype=torch.float,
):
  kwargs = {'device': device, 'dtype': dtype}
  net_array = [_TestNet(**kwargs) for _ in range(B)]
  net_fused = _TestNet(B=B, partially_fused=True, **kwargs)
  lr = [random.uniform(1e-4, 1e-2) for _ in range(B)]
  betas = (
      [random.uniform(0.8, 0.99) for _ in range(B)],
      [random.uniform(0.998, 0.9999) for _ in range(B)],
  )
  eps = [random.uniform(1e-9, 1e-7) for _ in range(B)]
  weight_decay = [random.uniform(0.0, 0.3) for _ in range(B)]
  optimizer_array = [
      optim.Adam(
          net_array[b].parameters(),
          lr=index_array_or_return_scalar(lr, b),
          betas=(
              index_array_or_return_scalar(betas[0], b),
              index_array_or_return_scalar(betas[1], b),
          ),
          eps=index_array_or_return_scalar(eps, b),
          weight_decay=index_array_or_return_scalar(weight_decay, b),
          amsgrad=amsgrad,
      ) for b in range(B)
  ]
  partially_fused_optimizer = get_hfta_optim_for(
      optim.Adam,
      B=B,
      partially_fused=True,
  )(
      net_fused.parameters(),
      net_fused.unfused_parameters(),
      lr=lr,
      betas=betas,
      eps=eps,
      weight_decay=weight_decay,
      amsgrad=amsgrad,
      B=B,
  )
  _optim_testing_procedure(net_fused, net_array, partially_fused_optimizer,
                           optimizer_array)
def testcase_StepLR_partially_fused(B=3):
  net_array = [_TestNet() for _ in range(B)]
  net_fused = _TestNet(B=B, partially_fused=True)
  lr = [random.uniform(0.5, 2.0) for _ in range(B)]
  step_size = [random.randint(2, 8) for _ in range(B)]
  gamma = [random.uniform(0.1, 0.3) for _ in range(B)]
  last_epoch = [random.randint(5, 11) for _ in range(B)]
  optimizer_array = [
      optim.Adadelta(
          net_array[b].parameters(),
          lr=index_array_or_return_scalar(lr, b),
      ) for b in range(B)
  ]
  optimizer_partially_fused = get_hfta_optim_for(
      optim.Adadelta,
      B=B,
      partially_fused=True,
  )(
      net_fused.parameters(),
      net_fused.unfused_parameters(),
      lr=lr,
  )
  _init_initial_lr(optimizer_partially_fused, optimizer_array)
  lr_scheduler_array = [
      lr_scheduler.StepLR(
          optimizer_array[b],
          index_array_or_return_scalar(step_size, b),
          gamma=index_array_or_return_scalar(gamma, b),
          last_epoch=index_array_or_return_scalar(last_epoch, b),
      ) for b in range(B)
  ]
  lr_scheduler_partially_fused = get_hfta_lr_scheduler_for(
      lr_scheduler.StepLR,
      B=B,
      partially_fused=True,
  )(
      optimizer_partially_fused,
      step_size,
      gamma=gamma,
      last_epoch=last_epoch,
  )
  _lr_scheduler_testing_procedure(net_fused, net_array,
                                  optimizer_partially_fused, optimizer_array,
                                  lr_scheduler_partially_fused,
                                  lr_scheduler_array)
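# Both StepLR testcases call _init_initial_lr before building schedulers with
# last_epoch != -1, because PyTorch schedulers require an 'initial_lr' entry in
# every param group when they are constructed as if resuming. A minimal sketch
# of what that helper is assumed to do (illustrative only, not the actual HFTA
# test utility):
def _init_initial_lr_sketch(optimizer_fused, optimizer_array):
  for group in optimizer_fused.param_groups:
    group.setdefault('initial_lr', group['lr'])
  for optimizer in optimizer_array:
    for group in optimizer.param_groups:
      group.setdefault('initial_lr', group['lr'])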
def testcase_fused(
    B=3,
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0,
    amsgrad=False,
    device=torch.device('cpu'),
    dtype=torch.float,
):
  if B > 1 and isinstance(lr, (int, float)):
    lr = [random.uniform(1e-4, 1e-2) for _ in range(B)]
  kwargs = {'device': device, 'dtype': dtype}
  net_array = [_TestNet(**kwargs) for _ in range(B)]
  net_fused = _TestNet(B=B, **kwargs)
  optimizer_array = [
      optim.Adam(
          net_array[b].parameters(),
          lr=index_array_or_return_scalar(lr, b),
          betas=(
              index_array_or_return_scalar(betas[0], b),
              index_array_or_return_scalar(betas[1], b),
          ),
          eps=index_array_or_return_scalar(eps, b),
          weight_decay=index_array_or_return_scalar(weight_decay, b),
          amsgrad=amsgrad,
      ) for b in range(B)
  ]
  optimizer_fused = get_hfta_optim_for(optim.Adam, B=B)(
      net_fused.parameters(),
      lr=lr,
      betas=betas,
      eps=eps,
      weight_decay=weight_decay,
      amsgrad=amsgrad,
  )
  _optim_testing_procedure(net_fused, net_array, optimizer_fused,
                           optimizer_array)
def testcase_partially_fused(
    B=3,
    device=torch.device('cpu'),
    dtype=torch.float,
):
  kwargs = {'device': device, 'dtype': dtype}
  net_array = [_TestNet(**kwargs) for _ in range(B)]
  net_fused = _TestNet(B=B, partially_fused=True, **kwargs)
  lr = [random.uniform(0.5, 2.0) for _ in range(B)]
  rho = [random.uniform(0.7, 0.99) for _ in range(B)]
  eps = [random.uniform(1e-7, 1e-5) for _ in range(B)]
  weight_decay = [random.uniform(0.0, 0.3) for _ in range(B)]
  optimizer_array = [
      optim.Adadelta(
          net_array[b].parameters(),
          lr=index_array_or_return_scalar(lr, b),
          rho=index_array_or_return_scalar(rho, b),
          eps=index_array_or_return_scalar(eps, b),
          weight_decay=index_array_or_return_scalar(weight_decay, b),
      ) for b in range(B)
  ]
  partially_fused_optimizer = get_hfta_optim_for(
      optim.Adadelta,
      B=B,
      partially_fused=True,
  )(
      net_fused.parameters(),
      net_fused.unfused_parameters(),
      lr=lr,
      rho=rho,
      eps=eps,
      weight_decay=weight_decay,
      B=B,
  )
  _optim_testing_procedure(net_fused, net_array, partially_fused_optimizer,
                           optimizer_array)
def main(args):
  print(args)

  random.seed(args.seed)
  np.random.seed(args.seed)
  torch.manual_seed(args.seed)

  track_running_stats = (args.device != 'xla')

  if args.device == 'cuda':
    assert torch.cuda.is_available()
    torch.backends.cudnn.benchmark = True
    print('Enable cuDNN heuristics!')
  device = (torch.device(args.device)
            if args.device in {'cpu', 'cuda'} else xm.xla_device())
  if args.device == 'cuda' and args.amp:
    scaler = amp.GradScaler()
  else:
    scaler = None

  train_loader, test_loader = init_dataloader(args)
  B = len(args.lr) if args.hfta else 0
  model = Resnet18(num_classes=10, B=B,
                   track_running_stats=track_running_stats).to(device)

  if not args.convergence_test:
    if B == 0 and args.save_init_model:
      torch.save(model, args.model_dir)
      print("model saved! exiting...")
      exit(0)
    if args.load_init_model:
      model.init_load([args.model_dir] * max(1, B))

  print('B={} lr={}'.format(B, args.lr))
  optimizer = get_hfta_optim_for(optim.Adadelta, B=B)(
      model.parameters(),
      lr=args.lr if B > 0 else args.lr[0],
  )

  all_losses = []
  epoch_timer = EpochTimer()
  for epoch in range(args.epochs):
    epoch_timer.epoch_start(epoch)
    num_samples_per_epoch, epoch_losses = train(
        args,
        model,
        device,
        train_loader,
        optimizer,
        epoch,
        B,
        save_loss=args.convergence_test,
        scaler=scaler,
    )
    epoch_timer.epoch_stop(num_samples_per_epoch)
    if args.convergence_test:
      all_losses.append(epoch_losses)
    print('Epoch {} took {} s!'.format(epoch,
                                       epoch_timer.epoch_latency(epoch)))

  if args.convergence_test:
    all_losses = torch.cat(all_losses, 0).transpose(0, 1).cpu().numpy()
    print(all_losses.shape)
    loss_dict = {}
    for i, lr in enumerate(args.lr):
      loss_dict[lr] = all_losses[i]
    data = pd.DataFrame(loss_dict)
    data.to_csv(os.path.join(args.outf, "convergence.csv"))
  else:
    if args.device == 'xla':
      print(met.metrics_report())
    if args.outf is not None:
      epoch_timer.to_csv(args.outf)
    if args.eval:
      test(model, device, test_loader, B)
  print('All jobs Finished!')
model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid,
                               args.nlayers, args.dropout, B=B).to(device)

if args.device == 'cuda' and args.amp:
  scaler = amp.GradScaler()
else:
  scaler = None

# Loop over epochs.
optimizer = get_hfta_optim_for(optim.Adadelta, B=B)(
    model.parameters(),
    lr=args.lr if B > 0 else args.lr[0],
)
scheduler = get_hfta_lr_scheduler_for(optim.lr_scheduler.StepLR, B=B)(
    optimizer,
    step_size=args.step_size if B > 0 else args.step_size[0],
    gamma=args.gamma if B > 0 else args.gamma[0],
)
print("NVIDIA_TF32_OVERRIDE: {}".format(
    os.environ.get('NVIDIA_TF32_OVERRIDE')))

epoch_timer = EpochTimer()
print("start training!")
for epoch in range(1, args.epochs + 1):
  epoch_timer.epoch_start(epoch)
netD.apply(weights_init)
if args.netD != '':
  netD.load_state_dict(torch.load(args.netD))
print(netD)

criterion = nn.BCEWithLogitsLoss()

if B > 0:
  fixed_noise = torch.randn(args.batchSize, B, nz, 1, 1, device=device)
else:
  fixed_noise = torch.randn(args.batchSize, nz, 1, 1, device=device)
real_label = 1
fake_label = 0

# setup optimizer
Adam = get_hfta_optim_for(optim.Adam, B=B)
optimizerD = Adam(netD.parameters(), lr=args.lr, betas=(args.beta1, 0.999))
optimizerG = Adam(netG.parameters(), lr=args.lr, betas=(args.beta1, 0.999))

if args.device == 'cuda' and args.amp:
  scaler = amp.GradScaler()
else:
  scaler = None

if args.dry_run:
  args.epochs = 1

def loss_fn(output, label, batch_size):
  if B > 0:
    return B * criterion(output.view(batch_size * B), label)
  else:
    return criterion(output, label)
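# Why loss_fn scales by B: with its default 'mean' reduction,
# nn.BCEWithLogitsLoss averages over all B * batch_size fused elements, which
# divides each model's contribution by an extra factor of B. Multiplying by B
# restores the sum of the B per-model mean losses, so every fused model sees
# the same gradient scale as a standalone run. A small self-contained check of
# that identity (the function name and shapes are illustrative):
def _check_fused_loss_scaling(B=4, batch_size=8):
  criterion = nn.BCEWithLogitsLoss()
  output = torch.randn(B, batch_size)  # per-model logits
  label = torch.rand(B, batch_size)    # per-model targets in [0, 1]
  fused = B * criterion(output.view(B * batch_size),
                        label.view(B * batch_size))
  per_model = sum(criterion(output[b], label[b]) for b in range(B))
  assert torch.allclose(fused, per_model)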
blue = lambda x: '\033[94m' + x + '\033[0m'

classifier = PointNetDenseCls(
    k=num_classes,
    feature_transform=opt.feature_transform,
    B=B,
    track_running_stats=(opt.device != 'xla'),
)

if opt.model != '':
  classifier.load_state_dict(torch.load(opt.model))

optimizer = get_hfta_optim_for(optim.Adam, B=B)(
    classifier.parameters(),
    lr=opt.lr,
    betas=(opt.beta1, opt.beta2),
    weight_decay=opt.weight_decay,
)
scheduler = get_hfta_lr_scheduler_for(optim.lr_scheduler.StepLR, B=B)(
    optimizer,
    step_size=opt.step_size,
    gamma=opt.gamma,
)
scaler = amp.GradScaler(enabled=(opt.device == 'cuda' and opt.amp))

classifier.to(device)

num_batch = len(dataloader)
def main(args):
  print(args)

  random.seed(args.seed)
  np.random.seed(args.seed)
  torch.manual_seed(args.seed)

  track_running_stats = (args.device != 'xla')

  if args.device == 'cuda':
    assert torch.cuda.is_available()
    torch.backends.cudnn.benchmark = True
    print('Enable cuDNN heuristics!')
  device = (torch.device(args.device)
            if args.device in {'cpu', 'cuda'} else xm.xla_device())
  if args.device == 'cuda' and args.amp:
    scaler = amp.GradScaler()
  else:
    scaler = None

  train_loader, test_loader = init_dataloader(args)
  B = len(args.lr) if args.hfta else 0

  model_config = generate_partially_fused_config(args.serial_num)
  print("Model config:", model_config)
  normal_block = str_to_class(model_config["normal_block"])
  serial_block = str_to_class(model_config["serial_block"])
  model = PartiallyFusedResNet(
      model_config["arch"],
      normal_block,
      serial_block,
      num_classes=10,
      B=B,
      track_running_stats=track_running_stats,
  ).to(device)

  if len(model.unfused_layers) > 0:
    model.unfused_to(device)
    optimizer = get_hfta_optim_for(optim.Adadelta, B=B, partially_fused=True)(
        model.parameters(),
        model.get_unfused_parameters(),
        lr=args.lr if B > 0 else args.lr[0],
    )
  else:
    optimizer = get_hfta_optim_for(optim.Adadelta, B=B)(
        model.parameters(),
        lr=args.lr if B > 0 else args.lr[0],
    )

  epoch_timer = EpochTimer()
  for epoch in range(args.epochs):
    epoch_timer.epoch_start(epoch)
    num_samples_per_epoch, _ = train(args, model, device, train_loader,
                                     optimizer, epoch, B, scaler=scaler)
    epoch_timer.epoch_stop(num_samples_per_epoch)
    print('Epoch {} took {} s!'.format(epoch,
                                       epoch_timer.epoch_latency(epoch)))

  if args.device == 'xla':
    print(met.metrics_report())
  if args.outf is not None:
    epoch_timer.to_csv(args.outf)
  if args.eval:
    test(model, device, test_loader, B)
  print('All jobs Finished!')
def main(args):
  _seeding(args)
  _mkdir_outf(args)
  device = _create_device_handle(args)
  scaler = _create_scaler(args)
  train_loader, test_loader, num_classes = _create_dataloaders(args)

  if args.hfta:
    B = consolidate_hyperparams_and_determine_B(
        args,
        ['lr', 'beta1', 'beta2', 'weight_decay', 'gamma', 'step_size'],
    )
  else:
    B = 0
    (args.lr, args.beta1, args.beta2, args.weight_decay, args.gamma,
     args.step_size) = (args.lr[0], args.beta1[0], args.beta2[0],
                        args.weight_decay[0], args.gamma[0],
                        args.step_size[0])

  model = _get_model_constructor(args)(
      num_classes=num_classes,
      B=B,
      track_running_stats=(args.device != 'xla'),
  ).to(device)
  criterion = nn.CrossEntropyLoss()
  optimizer = get_hfta_optim_for(optim.Adam, B=B)(
      model.parameters(),
      lr=args.lr,
      betas=(args.beta1, args.beta2),
      weight_decay=args.weight_decay,
  )
  scheduler = get_hfta_lr_scheduler_for(optim.lr_scheduler.StepLR, B=B)(
      optimizer,
      step_size=args.step_size,
      gamma=args.gamma,
  )

  epoch_timer = EpochTimer()
  for epoch in range(args.epochs):
    epoch_timer.epoch_start(epoch)
    num_samples_done = train(args, model, criterion, optimizer, scaler,
                             device, train_loader, epoch, B)
    scheduler.step()
    epoch_timer.epoch_stop(num_samples_done)
    print('Epoch {} took {} s!'.format(epoch,
                                       epoch_timer.epoch_latency(epoch)))

  if args.device == 'xla':
    print(met.metrics_report())
  if args.outf is not None:
    epoch_timer.to_csv(args.outf)

  if args.eval:
    acc_top1, acc_top5 = test(args, model, device, test_loader, B)
    if args.outf is not None:
      pd.DataFrame({
          'acc:top1': acc_top1,
          'acc:top5': acc_top5,
      }).to_csv(os.path.join(args.outf, 'eval.csv'))
    return acc_top1, acc_top5
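# consolidate_hyperparams_and_determine_B (used above and in the PointNet
# script below) is assumed to inspect the listed argparse fields, each parsed
# as a list, broadcast length-1 lists, and return the common length as B. A
# hypothetical re-implementation under that assumption, not the HFTA helper
# itself:
def _consolidate_and_determine_B_sketch(args, names):
  B = max(len(getattr(args, name)) for name in names)
  for name in names:
    values = getattr(args, name)
    if len(values) == 1:
      # Broadcast a shared value to all B fused jobs.
      setattr(args, name, values * B)
    elif len(values) != B:
      raise ValueError('{} needs 1 or {} values, got {}'.format(
          name, B, len(values)))
  return B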
def main(args):
  blue = lambda x: '\033[94m' + x + '\033[0m'

  seeding(args.seed)

  if args.hfta:
    B = consolidate_hyperparams_and_determine_B(
        args,
        ['lr', 'beta1', 'beta2', 'weight_decay', 'gamma', 'step_size'],
    )
  else:
    B = 0
    (args.lr, args.beta1, args.beta2, args.weight_decay, args.gamma,
     args.step_size) = (args.lr[0], args.beta1[0], args.beta2[0],
                        args.weight_decay[0], args.gamma[0],
                        args.step_size[0])

  if args.device == 'cuda':
    assert torch.cuda.is_available()
    torch.backends.cudnn.benchmark = True
    print('Enable cuDNN heuristics!')
  device = (xm.xla_device()
            if args.device == 'xla' else torch.device(args.device))

  dataset, test_dataset = build_dataset(args)
  dataloader, testdataloader = build_dataloader(args, dataset, test_dataset)
  print('len(dataset)={}'.format(len(dataset)),
        'len(test_dataset)={}'.format(len(test_dataset)))
  num_classes = len(dataset.classes)
  print('classes', num_classes)

  if args.outf is not None:
    try:
      os.makedirs(args.outf)
    except OSError:
      pass

  classifier = PointNetCls(
      k=num_classes,
      feature_transform=args.feature_transform,
      B=B,
      track_running_stats=(args.device != 'xla'),
  )
  if args.model != '':
    classifier.load_state_dict(torch.load(args.model))

  optimizer = get_hfta_optim_for(optim.Adam, B=B)(
      classifier.parameters(),
      lr=args.lr,
      betas=(args.beta1, args.beta2),
      weight_decay=args.weight_decay,
  )
  scheduler = get_hfta_lr_scheduler_for(optim.lr_scheduler.StepLR, B=B)(
      optimizer,
      step_size=args.step_size,
      gamma=args.gamma,
  )
  scaler = amp.GradScaler(enabled=(args.device == 'cuda' and args.amp))

  classifier.to(device)
  num_batch = len(dataloader)

  def loss_fn(output, label, batch_size, trans_feat):
    if B > 0:
      loss = B * F.nll_loss(output.view(B * batch_size, -1), label)
    else:
      loss = F.nll_loss(output, label)
    if args.feature_transform:
      loss += feature_transform_regularizer(trans_feat) * 0.001
    return loss

  classifier = classifier.train()
  epoch_timer = EpochTimer()

  # Training loop
  for epoch in range(args.epochs):
    num_samples_per_epoch = 0
    epoch_timer.epoch_start(epoch)
    for i, data in enumerate(dataloader, 0):
      if i > args.iters_per_epoch:
        break
      if args.warmup_data_loading:
        continue

      points, target = data
      target = target[:, 0]
      points, target = points.to(device), target.to(device)
      N = points.size(0)
      if B > 0:
        points = points.unsqueeze(0).expand(B, -1, -1, -1).contiguous()
        target = target.repeat(B)

      optimizer.zero_grad(set_to_none=True)
      if args.device == 'cuda':
        with amp.autocast(enabled=args.amp):
          pred, trans, trans_feat = classifier(points)
          loss = loss_fn(pred, target, N, trans_feat)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
      else:
        pred, trans, trans_feat = classifier(points)
        loss = loss_fn(pred, target, N, trans_feat)
        loss.backward()
        if args.device == 'xla':
          xm.optimizer_step(optimizer, barrier=True)
        else:
          optimizer.step()

      print('[{}: {}/{}] train loss: {}'.format(epoch, i, num_batch,
                                                loss.item()))
      num_samples_per_epoch += N * max(B, 1)
      scaler.update()

    scheduler.step()
    epoch_timer.epoch_stop(num_samples_per_epoch)
    print('Epoch {} took {} s!'.format(epoch,
                                       epoch_timer.epoch_latency(epoch)))

  if args.device == 'xla' and not args.eval:
    print(met.metrics_report())
  if args.outf is not None:
    epoch_timer.to_csv(args.outf)

  if args.eval:
    # Run validation loop.
print("Running validation loop ...") classifier = classifier.eval() with torch.no_grad(): total_correct = torch.zeros(max(B, 1), device=device) total_testset = 0 for data in testdataloader: if args.warmup_data_loading: continue points, target = data target = target[:, 0] points, target = points.to(device), target.to(device) N = points.size(0) if B > 0: points = points.unsqueeze(0).expand(B, -1, -1, -1).contiguous() target = target.repeat(B) pred, _, _ = classifier(points) pred_choice = pred.argmax(-1) correct = pred_choice.eq( target.view(B, N) if B > 0 else target).sum(-1) total_correct.add_(correct) total_testset += N final_accuracy = total_correct / total_testset final_accuracy = final_accuracy.cpu().tolist() if args.outf is not None: pd.DataFrame({ 'acc': final_accuracy }).to_csv(os.path.join(args.outf, 'eval.csv')) # Return test_accuracy return final_accuracy