def set_ppe_manager(self):
    """Create the IgniteExtensionsManager for this trainer.

    Registers a per-iteration loss reporter on the ignite trainer, builds
    the list of ppe extensions (statistics, reports, plots, snapshots and
    the validation evaluator), and stores the resulting manager on
    ``self.ppe_manager``.
    """

    # Report the training loss every ``eval_interval`` iterations so the
    # LogReport/PrintReport extensions can pick it up.
    @self.trainer.on(Events.ITERATION_COMPLETED(every=self.eval_interval))
    def report_loss(engine):
        ppe.reporting.report({'train/loss': engine.state.output})

    ext_list = [
        extensions.VariableStatisticsPlot(self.model),
        extensions.ParameterStatistics(self.model, prefix='model'),
        extensions.observe_lr(optimizer=self.optimizer),
        extensions.PrintReport([
            'epoch', 'elapsed_time', 'lr', 'train/loss', 'val/loss',
            'val/accuracy'
        ]),
        extensions.LogReport(trigger=(self.log_interval, 'epoch')),
        extensions.PlotReport(['train/loss', 'val/loss'], 'epoch',
                              filename='loss.png'),
        extensions.PlotReport(['val/accuracy'], 'epoch',
                              filename='accuracy.png'),
        extensions.ProgressBar(),
        # Keep only the most recent ``retain_num`` snapshots on disk.
        extensions.snapshot(n_retains=self.retain_num),
    ]
    # Validation runs through the ignite evaluator wrapper.
    ext_list += [
        extensions.IgniteEvaluator(self.evaluator, self.valid_loader,
                                   self.model, progress_bar=True)
    ]

    self.ppe_manager = ppe.training.IgniteExtensionsManager(
        self.trainer,
        {'main': self.model},
        {'main': self.optimizer},
        self.max_epochs,
        extensions=ext_list,
        out_dir=self.out)
def set_extensions(manager, args, model, device, test_loader, optimizer,
                   loss_func, eval_func_dict=None):
    """Set extensions for PPE.

    Registers observation/report extensions, a per-epoch Evaluator and a
    best-val-loss model snapshot on *manager* and returns it.

    Args:
        manager: ppe ExtensionsManager to extend.
        args: parsed CLI arguments, forwarded to ``eval_for_batch``.
        model: the model under training.
        device: torch device specifier, forwarded to ``eval_for_batch``.
        test_loader: DataLoader used by the Evaluator.
        optimizer: optimizer observed by ``observe_lr``.
        loss_func: loss callable forwarded to ``eval_for_batch``.
        eval_func_dict: optional mapping of extra metric callables;
            defaults to an empty dict.

    Returns:
        The same *manager*, with all extensions registered.
    """
    # FIX: the previous ``eval_func_dict={}`` default was a shared mutable
    # object, silently carried over between calls.  Use None as sentinel.
    if eval_func_dict is None:
        eval_func_dict = {}

    my_extensions = [
        # # observe, report
        ppe_extensions.observe_lr(optimizer=optimizer),
        ppe_extensions.LogReport(),
        ppe_extensions.PlotReport(['train/loss', 'val/loss'], 'epoch',
                                  filename='loss.png'),
        ppe_extensions.PlotReport(['lr'], 'epoch', filename='lr.png'),
        ppe_extensions.PrintReport([
            'epoch', 'iteration', 'lr', 'train/loss', 'val/loss',
            "elapsed_time"
        ]),
        # # evaluation: a (extension, trigger) pair, run once per epoch.
        (
            ppe_extensions.Evaluator(
                test_loader, model,
                eval_func=lambda data, target: eval_for_batch(
                    args, model, device, data, target, loss_func,
                    eval_func_dict),
                progress_bar=True),
            (1, "epoch"),
        ),
        # # save model snapshot whenever val/loss reaches a new minimum.
        (
            ppe_extensions.snapshot(
                target=model,
                filename="snapshot_epoch_{.updater.epoch}.pth"),
            ppe.training.triggers.MinValueTrigger(key="val/loss",
                                                  trigger=(1, 'epoch')),
        ),
    ]

    # # set extensions to manager; tuples carry an explicit trigger.
    for ext in my_extensions:
        if isinstance(ext, tuple):
            manager.extend(ext[0], trigger=ext[1])
        else:
            manager.extend(ext)
    return manager
def main():
    """Train IKNet on the kinematics dataset and optionally save weights."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--kinematics-pose-csv",
        type=str,
        default="./dataset/train/kinematics_pose.csv",
    )
    parser.add_argument("--joint-states-csv", type=str,
                        default="./dataset/train/joint_states.csv")
    parser.add_argument("--train-val-ratio", type=float, default=0.8)
    parser.add_argument("--batch-size", type=int, default=10000)
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--save-model", action="store_true", default=False)
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = IKNet()
    model.to(device)

    train_loader, val_loader = get_data_loaders(args)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Stop training once val/loss stops improving (checked every 3 epochs).
    trigger = ppe.training.triggers.EarlyStoppingTrigger(
        check_trigger=(3, "epoch"), monitor="val/loss")

    my_extensions = [
        extensions.LogReport(),
        extensions.ProgressBar(),
        extensions.observe_lr(optimizer=optimizer),
        extensions.ParameterStatistics(model, prefix="model"),
        extensions.VariableStatisticsPlot(model),
        extensions.Evaluator(
            val_loader,
            model,
            eval_func=lambda data, target: validate(args, model, device,
                                                    data, target),
            progress_bar=True,
        ),
        extensions.PlotReport(["train/loss", "val/loss"], "epoch",
                              filename="loss.png"),
        extensions.PrintReport([
            "epoch",
            "iteration",
            "train/loss",
            "lr",
            "val/loss",
        ]),
    ]
    manager = ppe.training.ExtensionsManager(
        model,
        optimizer,
        args.epochs,
        extensions=my_extensions,
        iters_per_epoch=len(train_loader),
        stop_trigger=trigger,
    )

    train(manager, args, model, device, train_loader)

    if args.save_model:
        torch.save(model.state_dict(), "iknet.pt")
def objective(trial):
    """Optuna objective: train an IKNet sampled from *trial*, return val metric.

    NOTE(review): this reads module-level ``args`` (it is not a parameter) —
    assumes the caller parsed CLI arguments beforehand; confirm at call site.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = IKNet(trial)
    model.to(device)

    train_loader, val_loader = get_data_loaders(args)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Early-stop on stagnating validation loss, checked every 3 epochs.
    trigger = ppe.training.triggers.EarlyStoppingTrigger(
        check_trigger=(3, "epoch"), monitor="val/loss")

    my_extensions = [
        extensions.LogReport(),
        extensions.ProgressBar(),
        extensions.observe_lr(optimizer=optimizer),
        extensions.ParameterStatistics(model, prefix="model"),
        extensions.VariableStatisticsPlot(model),
        extensions.Evaluator(
            val_loader,
            model,
            eval_func=lambda data, target: validate(args, model, device,
                                                    data, target),
            progress_bar=True,
        ),
        extensions.PlotReport(["train/loss", "val/loss"], "epoch",
                              filename="loss.png"),
        extensions.PrintReport([
            "epoch",
            "iteration",
            "train/loss",
            "lr",
            "val/loss",
        ]),
    ]
    manager = ppe.training.ExtensionsManager(
        model,
        optimizer,
        args.epochs,
        extensions=my_extensions,
        iters_per_epoch=len(train_loader),
        stop_trigger=trigger,
    )

    # The train() return value is the value Optuna optimizes.
    return train(manager, args, model, device, train_loader)
def main():
    """MNIST example: ExtensionsManager training loop with lazy modules."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', dest='cuda', action='store_false',
                        default=True, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--snapshot', type=str, default=None,
                        help='path to snapshot file')
    parser.add_argument('--no-lazy', dest='lazy', action='store_false',
                        default=True, help='do not use lazy modules')
    args = parser.parse_args()

    use_cuda = args.cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307, ), (0.3081, ))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307, ), (0.3081, ))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net(args.lazy)
    model.to(device)
    if args.lazy:
        # You need to run a dummy forward to initialize parameters.
        # This should be done before passing parameter list to optimizers.
        # The dummy input can be generated from the loader's first batch
        # (trim off the data to batch size = 1 for performance).
        dummy_input = train_loader.dataset[0][0].unsqueeze(0).to(device)
        model(dummy_input)

    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum)

    # manager.extend(...) also works
    my_extensions = [
        extensions.LogReport(),
        extensions.ProgressBar(),
        extensions.observe_lr(optimizer=optimizer),
        extensions.ParameterStatistics(model, prefix='model'),
        extensions.VariableStatisticsPlot(model),
        extensions.Evaluator(
            test_loader, model,
            eval_func=lambda data, target: test(args, model, device, data,
                                                target),
            progress_bar=True),
        extensions.PlotReport(['train/loss', 'val/loss'], 'epoch',
                              filename='loss.png'),
        extensions.PrintReport([
            'epoch', 'iteration', 'train/loss', 'lr',
            'model/fc2.bias/grad/min', 'val/loss', 'val/acc'
        ]),
        extensions.snapshot(),
    ]

    # Custom stop triggers can be added to the manager and
    # their status accessed through `manager.stop_trigger`
    trigger = None
    # trigger = ppe.training.triggers.EarlyStoppingTrigger(
    #     check_trigger=(1, 'epoch'), monitor='val/loss')
    manager = ppe.training.ExtensionsManager(
        model, optimizer, args.epochs,
        extensions=my_extensions,
        iters_per_epoch=len(train_loader),
        stop_trigger=trigger)

    # Lets load the snapshot
    if args.snapshot is not None:
        state = torch.load(args.snapshot)
        manager.load_state_dict(state)

    train(manager, args, model, device, train_loader)
    # Test function is called from the evaluator extension
    # to get access to the reporter and other facilities
    # test(args, model, device, test_loader)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
def main():
    """Distributed MNIST example using ppe DistributedDataParallel."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', dest='cuda', action='store_false',
                        default=True, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--snapshot', type=str, default=None,
                        help='path to snapshot file')
    args = parser.parse_args()

    use_cuda = args.cuda and torch.cuda.is_available()
    # torch.backends.cudnn.benchmark = False
    # torch.backends.cudnn.deterministic = True
    torch.manual_seed(args.seed)

    comm_world_size, comm_rank, comm_local_rank, device = init_distributed(
        use_cuda)
    if comm_rank == 0:
        print("World size = {}".format(comm_world_size))
    # Per-rank information is printed by every process.
    print("Rank = {}, Local Rank = {}".format(comm_rank, comm_local_rank))
    print("Device = {}".format(device))

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    dataset_root = '../data'
    if comm_local_rank == 0:
        # Only one process per node downloads MNIST; others wait at the
        # barrier below.
        datasets.MNIST(dataset_root, download=True)
    torch.distributed.barrier()

    train_dataset = datasets.MNIST(
        dataset_root, train=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ]))
    test_dataset = datasets.MNIST(
        dataset_root, train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ]))

    train_sampler = torch.utils.data.DistributedSampler[int](
        train_dataset, num_replicas=comm_world_size, rank=comm_rank)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        sampler=train_sampler, **kwargs)  # type: ignore[arg-type]

    # Shard the test set across ranks with a strided slice so each rank
    # evaluates a disjoint subset.
    test_dataset_indices = list(range(len(test_dataset)))
    local_test_dataset_indices = test_dataset_indices[
        comm_rank:len(test_dataset_indices):comm_world_size]
    local_test_dataset = torch.utils.data.Subset(
        test_dataset, local_test_dataset_indices)
    test_loader = torch.utils.data.DataLoader(
        local_test_dataset, batch_size=args.test_batch_size,
        shuffle=True, **kwargs)  # type: ignore[arg-type]

    model = ppe.nn.parallel.DistributedDataParallel(Net().to(device))
    optimizer = optim.SGD(
        model.parameters(), lr=args.lr, momentum=args.momentum)

    # manager.extend(...) also works.  Only one process per node keeps the
    # reporting extensions; the others run with an empty list.
    if comm_local_rank == 0:
        my_extensions = [
            extensions.LogReport(),
            extensions.ProgressBar(),
            extensions.observe_lr(optimizer=optimizer),
            extensions.ParameterStatistics(model, prefix='model'),
            extensions.VariableStatisticsPlot(model),
            extensions.Evaluator(
                test_loader, model,
                eval_func=lambda data, target: test(args, model, device,
                                                    data, target),
                progress_bar=True),
            extensions.PlotReport(
                ['train/loss', 'val/loss'], 'epoch', filename='loss.png'),
            extensions.PrintReport(['epoch', 'iteration', 'train/loss',
                                    'lr', 'model/fc2.bias/grad/min',
                                    'val/loss', 'val/acc']),
            extensions.snapshot(),
        ]
    else:
        my_extensions = []

    # Custom stop triggers can be added to the manager and
    # their status accessed through `manager.stop_trigger`
    trigger = None
    # trigger = ppe.training.triggers.EarlyStoppingTrigger(
    #     check_trigger=(1, 'epoch'), monitor='val/loss')
    manager = ppe.training.ExtensionsManager(
        model, optimizer, args.epochs,
        extensions=my_extensions,
        iters_per_epoch=len(train_loader),
        stop_trigger=trigger)

    # Lets load the snapshot
    if args.snapshot is not None:
        state = torch.load(args.snapshot)
        manager.load_state_dict(state)

    train(manager, args, model, device, train_loader)
    # Test function is called from the evaluator extension
    # to get access to the reporter and other facilities
    # test(args, model, device, test_loader)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")

    # Wait for all processes to finish to complete successfully
    torch.distributed.barrier()
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    """Train an ignite-supervised model wrapped in an IgniteExtensionsManager.

    NOTE(review): ``args`` (used for ``args.epochs``/``args.snapshot``) is a
    module-level global, not a parameter, and the manager epoch count comes
    from ``args.epochs`` while ``trainer.run`` uses the ``epochs`` argument —
    confirm these agree at the call site.
    """
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    # NOTE(review): this step() runs before any backward pass, so it has no
    # gradients to apply — presumably a warm-up to silence a scheduler
    # warning; verify it is intentional.
    optimizer.step()

    trainer = create_supervised_trainer(model, optimizer, F.nll_loss,
                                        device=device)
    evaluator = create_supervised_evaluator(
        model,
        metrics={'acc': Accuracy(), 'loss': Loss(F.nll_loss)},
        device=device)

    # manager.extend(...) also works
    my_extensions = [
        extensions.LogReport(),
        extensions.ProgressBar(),
        extensions.observe_lr(optimizer=optimizer),
        extensions.ParameterStatistics(model, prefix='model'),
        extensions.VariableStatisticsPlot(model),
        extensions.snapshot(),
        extensions.IgniteEvaluator(evaluator, val_loader, model,
                                   progress_bar=True),
        extensions.PlotReport(['train/loss'], 'epoch', filename='loss.png'),
        extensions.PrintReport([
            'epoch',
            'iteration',
            'train/loss',
            'lr',
            'model/fc2.bias/grad/min',
            'val/loss',
            'val/acc',
        ]),
    ]
    manager = ppe.training.IgniteExtensionsManager(
        trainer,
        {'main': model},
        {'main': optimizer},
        args.epochs,
        extensions=my_extensions)

    # Lets load the snapshot
    if args.snapshot is not None:
        state = torch.load(args.snapshot)
        manager.load_state_dict(state)

    @trainer.on(Events.ITERATION_COMPLETED)
    def report_loss(engine):
        ppe.reporting.report({'train/loss': engine.state.output})

    trainer.run(train_loader, max_epochs=epochs)
def main():
    """Train a ConvLSTM encoder/predictor on Moving MNIST."""
    args = argument_paser()

    # Fix seed
    torch.manual_seed(77)

    # Config gpu
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # Prepare data: 70/30 split of the full dataset by index.
    dataset = MovingMnistDataset()
    train_index, valid_index = train_test_split(range(len(dataset)),
                                                test_size=0.3)
    train_loader = DataLoader(Subset(dataset, train_index),
                              batch_size=args.batch_size,
                              shuffle=True, **kwargs)
    valid_loader = DataLoader(Subset(dataset, valid_index),
                              batch_size=args.test_batch_size,
                              shuffle=False, **kwargs)

    # Prepare model
    net = ConvLSTMEncoderPredictor(image_size=(64, 64)).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr,
                                 betas=(0.9, 0.999))
    criterion = nn.MSELoss()

    # manager.extend(...) also works
    my_extensions = [
        extensions.LogReport(),
        extensions.ProgressBar(),
        extensions.observe_lr(optimizer=optimizer),
        extensions.ParameterStatistics(net, prefix='model'),
        extensions.VariableStatisticsPlot(net),
        extensions.Evaluator(
            valid_loader, net,
            eval_func=lambda data, target: eval_net(net, criterion, data,
                                                    target, device),
            progress_bar=True),
        extensions.PlotReport(['train/loss', 'val/loss'], 'epoch',
                              filename='loss.png'),
        extensions.PrintReport(
            ['epoch', 'iteration', 'train/loss', 'val/loss', 'lr']),
        extensions.snapshot(),
    ]

    # Custom stop triggers can be added to the manager and
    # their status accessed through `manager.stop_trigger`
    trigger = None
    # trigger = ppe.training.triggers.EarlyStoppingTrigger(
    #     check_trigger=(1, 'epoch'), monitor='val/loss')

    # Define manager
    manager = ppe.training.ExtensionsManager(
        net, optimizer, args.epochs,
        extensions=my_extensions,
        iters_per_epoch=len(train_loader),
        stop_trigger=trigger)

    # Lets load the snapshot
    if args.snapshot is not None:
        state = torch.load(args.snapshot)
        manager.load_state_dict(state)

    # Execute train
    train(manager, net, criterion, train_loader, device)
    # Test function is called from the evaluator extension
    # to get access to the reporter and other facilities
    # test(args, model, device, test_loader)

    if args.save_model:
        torch.save(net.state_dict(), "mnist_cnn.pt")
def set_extensions(manager, args, model, device, test_loader, optimizer,
                   loss_func, eval_func_dict=None):
    """Set extensions for PPE.

    Registers reporting extensions (loss/AUROC/F1/lr plots), a per-epoch
    Evaluator and a per-epoch model snapshot on *manager* and returns it.

    Args:
        manager: ppe ExtensionsManager to extend.
        args: parsed CLI arguments, forwarded to ``eval_for_batch``.
        model: the model under training (also the snapshot target).
        device: torch device specifier, forwarded to ``eval_for_batch``.
        test_loader: DataLoader used by the Evaluator.
        optimizer: optimizer observed by ``observe_lr``.
        loss_func: loss callable forwarded to ``eval_for_batch``.
        eval_func_dict: optional mapping of extra metric callables;
            defaults to an empty dict.

    Returns:
        The same *manager*, with all extensions registered.
    """
    # FIX: the previous ``eval_func_dict={}`` default was a shared mutable
    # object, silently carried over between calls.  Use None as sentinel.
    if eval_func_dict is None:
        eval_func_dict = {}

    my_extensions = [
        # # observe, report
        ppe_extensions.observe_lr(optimizer=optimizer),
        ppe_extensions.LogReport(),
        ppe_extensions.PlotReport(["train/loss", "val/loss"], "epoch",
                                  filename="loss.png"),
        ppe_extensions.PlotReport(["val/AUROC"], "epoch",
                                  filename="auroc.png"),
        ppe_extensions.PlotReport(["val/F1"], "epoch", filename="F1.png"),
        ppe_extensions.PlotReport(["lr"], "epoch", filename="lr.png"),
        ppe_extensions.PrintReport([
            "epoch",
            "iteration",
            "lr",
            "train/loss",
            "val/loss",
            "val/AUROC",
            "val/F1",
            "elapsed_time",
        ]),
        # # evaluation: a (extension, trigger) pair, run once per epoch.
        (
            ppe_extensions.Evaluator(
                test_loader, model,
                eval_func=lambda data, target: eval_for_batch(
                    args, model, device, data, target, loss_func,
                    eval_func_dict),
                progress_bar=True,
            ),
            (1, "epoch"),
        ),
        # # save model snapshot with the manager's default trigger.
        # NOTE: unlike the loss-triggered variant elsewhere in this file,
        # this snapshot deliberately has no (extension, trigger) pairing —
        # the parentheses around it in the old code did NOT make a tuple.
        ppe_extensions.snapshot(
            target=model, filename="snapshot_epoch_{.updater.epoch}.pth"),
    ]

    # # set extensions to manager; tuples carry an explicit trigger.
    for ext in my_extensions:
        if isinstance(ext, tuple):
            manager.extend(ext[0], trigger=ext[1])
        else:
            manager.extend(ext)
    return manager
def main():
    """MNIST example using ppe.engine Trainer/Evaluator with optional
    profiling and cross-device comparison.

    Profiler modes (--profiler): tensorboard, export_chrome_trace,
    export_stacks, to_pickle, print.  Comparison (--compare-dump /
    --compare-with) dumps or checks engine outputs instead of training.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--device', type=str, default='cuda',
                        help='PyTorch device specifier')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--deterministic', action='store_true', default=False,
                        help='make the behavior deterministic')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--snapshot', type=str, default=None,
                        help='path to snapshot file')
    parser.add_argument('--compare-dump', type=str, default=None,
                        help='directory to save comparer dump to')
    parser.add_argument('--compare-with', type=str, default=None,
                        help='directory to load comparer dump from')
    parser.add_argument('--profiler', type=str, default=None,
                        help='output mode for profiler results')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    numpy.random.seed(args.seed)
    torch.use_deterministic_algorithms(args.deterministic)

    use_cuda = args.device.startswith('cuda')
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307, ), (0.3081, ))
                       ])),
        batch_size=args.batch_size, shuffle=True,
        collate_fn=ppe.dataloaders.utils.CollateAsDict(
            names=['data', 'target']),
        **kwargs)  # type: ignore[arg-type]
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307, ), (0.3081, ))
                       ])),
        batch_size=args.test_batch_size, shuffle=True,
        collate_fn=ppe.dataloaders.utils.CollateAsDict(
            names=['data', 'target']),
        **kwargs)  # type: ignore[arg-type]

    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum)

    my_extensions = [
        extensions.LogReport(),
        extensions.ProgressBar(),
        extensions.observe_lr(optimizer=optimizer),
        extensions.ParameterStatistics(model, prefix='model'),
        extensions.VariableStatisticsPlot(model),
        extensions.PlotReport(['train/loss', 'val/loss'], 'epoch',
                              filename='loss.png'),
        extensions.PrintReport([
            'epoch', 'iteration', 'train/loss', 'lr',
            'model/fc2.bias/grad/min', 'val/loss', 'val/accuracy'
        ]),
        extensions.snapshot(),
    ]

    # Custom stop triggers can be added to the manager and
    # their status accessed through `manager.stop_trigger`
    trigger = None
    # trigger = ppe.training.triggers.EarlyStoppingTrigger(
    #     check_trigger=(1, 'epoch'), monitor='val/loss')

    profile = None
    if args.profiler is not None:
        if args.profiler == 'tensorboard':
            # FIX: tensorboard_trace_handler *returns* the on_trace_ready
            # callback.  The previous code wrapped it in a function that
            # created the handler and discarded it, so no trace was written.
            callback = torch.profiler.tensorboard_trace_handler(
                './prof')  # type: ignore[attr-defined]
        elif args.profiler == 'export_chrome_trace':
            def callback(prof):
                prof.export_chrome_trace('./prof')
        elif args.profiler == 'export_stacks':
            def callback(prof):
                prof.export_stacks('./prof')
        elif args.profiler == 'to_pickle':
            def callback(prof):
                import pandas as pd
                # `trainer` is resolved at call time, after it is created.
                df = pd.DataFrame([e.__dict__ for e in prof.events()])
                df.to_pickle(f"{trainer.epoch}.pkl")
        elif args.profiler == 'print':
            def callback(prof):
                table = prof.key_averages().table(
                    sort_by="self_cuda_time_total", row_limit=-1)
                print(table)
        else:
            # FIX: `assert False` is stripped under `python -O`; raise an
            # explicit error for unknown modes instead.
            raise ValueError(
                'unknown --profiler mode: {}'.format(args.profiler))
        profile = torch.profiler.profile(  # type: ignore[attr-defined]
            activities=[
                torch.profiler.ProfilerActivity.
                CPU,  # type: ignore[attr-defined]
                torch.profiler.ProfilerActivity.
                CUDA,  # type: ignore[attr-defined]
            ],
            schedule=torch.profiler.schedule(  # type: ignore[attr-defined]
                wait=0, warmup=0, active=len(train_loader)),
            on_trace_ready=callback,
        )

    model_with_loss = ModelWithLoss(model)
    trainer = ppe.engine.create_trainer(
        model_with_loss,
        optimizer,
        args.epochs,
        device=args.device,
        extensions=my_extensions,
        stop_trigger=trigger,
        evaluator=ppe.engine.create_evaluator(
            model_with_loss,
            device=args.device,
            progress_bar=True,
            metrics=[ppe.training.metrics.AccuracyMetric('target', 'output')],
            options={'eval_report_keys': ['loss', 'accuracy']},
        ),
        options={'train_report_keys': ['loss']},
        profile=profile,
    )
    ppe.to(model_with_loss, args.device)

    # Lets load the snapshot
    if args.snapshot is not None:
        state = torch.load(args.snapshot)
        trainer.load_state_dict(state)

    # Run comparison between devices when requested (no training happens).
    if args.compare_dump is not None or args.compare_with is not None:
        comp = ppe.utils.comparer.Comparer(
            compare_fn=ppe.utils.comparer.get_default_comparer(rtol=1e-2),
            outputs=['loss'],
        )
        if args.compare_dump is None:
            # Compare the engine with an existing dump directory.
            comp.add_dump('baseline', args.compare_with)
            comp.add_engine(args.device, trainer, train_loader, test_loader)
            comp.compare()
        else:
            # Create a dump for comparison.
            assert args.compare_with is None
            comp.dump(trainer, args.compare_dump, train_loader, test_loader)
        return

    trainer.run(train_loader, test_loader)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
def main():
    """Alaska2 training entry point: build model/data/manager and train."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--snapshot', type=str)
    parser.add_argument('--snapmodel', type=str)
    args = parser.parse_args()

    # get config
    config = Config()

    # set seed
    random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed(config.seed)
    torch.backends.cudnn.deterministic = True

    # create model
    model = utils.create_model(config)
    device = 'cuda'
    model.cuda()

    # define transforms
    train_trans = transforms.train_transform(
        resize=(config.input_size_h, config.input_size_w),
        normalize=config.normalize)
    val_trans = transforms.eval_transform(
        resize=(config.input_size_h, config.input_size_w),
        normalize=config.normalize)

    # copy config and src into the result dir for reproducibility
    src_dir = os.path.join(config.result, 'src')
    if not os.path.exists(src_dir):
        os.makedirs(src_dir, exist_ok=True)
    for src_file in glob.glob('/work/*.py') + glob.glob('/work/*/*.py'):
        shutil.copy(src_file,
                    os.path.join(src_dir, os.path.basename(src_file)))

    # create dataset
    train_dataset = dataset.Alaska2Dataset(
        root=config.data,
        transforms=train_trans,
        train=True,
        batchsize=config.batchsize,
        uniform=config.batch_uniform,
    )
    val_dataset = dataset.Alaska2Dataset(root=config.data,
                                         transforms=val_trans,
                                         train=False,
                                         uniform=False)

    # create data loader
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.batchsize,
        num_workers=config.num_workers,
        shuffle=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=config.batchsize,
        num_workers=config.num_workers,
        shuffle=False)

    # set optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr)

    # Initialize Amp for mixed precision when requested.
    if config.fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # set scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=0.5,
        patience=2,
        threshold_mode='abs',
        min_lr=1e-8,
        eps=1e-08)

    # set criterion
    criterion = LabelSmoothing().cuda()
    num_epochs = config.num_epochs

    # set manager
    iters_per_epoch = len(train_loader)
    manager = ppe.training.ExtensionsManager(
        model, optimizer, num_epochs,
        iters_per_epoch=iters_per_epoch,
        out_dir=config.result,
        stop_trigger=None)

    log_interval = (100, 'iteration')
    eval_interval = (1, 'epoch')

    # Keep the snapshot from the epoch with the best validation AUC.
    manager.extend(
        extensions.snapshot(filename='best_snapshot'),
        trigger=MaxValueTrigger('validation/auc', trigger=eval_interval))
    if config.fp16:
        manager.extend(
            extensions.snapshot_object(amp, filename='amp.ckpt'),
            trigger=MaxValueTrigger('validation/auc', trigger=eval_interval))
    manager.extend(extensions.LogReport(trigger=log_interval))
    manager.extend(
        extensions.PlotReport(['train/loss', 'validation/loss'], 'epoch',
                              filename='loss.png'),
        trigger=(1, 'epoch'))
    manager.extend(
        extensions.PrintReport([
            'epoch', 'iteration', 'train/loss', 'validation/loss',
            'validation/auc', 'lr', 'elapsed_time'
        ]),
        trigger=log_interval)
    manager.extend(extensions.ProgressBar(update_interval=100))
    manager.extend(extensions.observe_lr(optimizer=optimizer),
                   trigger=log_interval)
    manager.extend(
        ALASKAEvaluator(val_loader,
                        model,
                        eval_hook=None,
                        eval_func=None,
                        loss_criterion=criterion,
                        auc_criterion=auc_eval_func,
                        device=device,
                        scheduler=scheduler,
                        metric_learning=config.metric_learning),
        trigger=eval_interval)

    # Lets load the snapshot (full manager state), or just the model weights.
    if args.snapshot is not None:
        state = torch.load(args.snapshot)
        manager.load_state_dict(state)
    elif args.snapmodel is not None:
        print('load snapshot model {}'.format(args.snapmodel))
        state = torch.load(args.snapmodel)
        manager._models['main'].load_state_dict(state['models']['main'])

    train_func(manager,
               model,
               criterion,
               optimizer,
               train_loader,
               device,
               metric_learning=config.metric_learning,
               fp16=config.fp16)
def main():
    """MNIST example: ppe.engine trainer with lazy-module initialization."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--device', type=str, default='cuda',
                        help='PyTorch device specifier')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--snapshot', type=str, default=None,
                        help='path to snapshot file')
    parser.add_argument('--no-lazy', dest='lazy', action='store_false',
                        default=True, help='do not use lazy modules')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    use_cuda = args.device.startswith('cuda')

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True,
        collate_fn=ppe.dataloaders.utils.CollateAsDict(
            names=['data', 'target']),
        **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True,
        collate_fn=ppe.dataloaders.utils.CollateAsDict(
            names=['data', 'target']),
        **kwargs)

    model = Net(args.lazy)
    if args.lazy:
        # You need to run a dummy forward to initialize parameters.
        # This should be done before passing parameter list to optimizers.
        dummy_input = train_loader.dataset[0][0].unsqueeze(0)
        model(dummy_input)

    optimizer = optim.SGD(
        model.parameters(), lr=args.lr, momentum=args.momentum)

    my_extensions = [
        extensions.LogReport(),
        extensions.ProgressBar(),
        extensions.observe_lr(optimizer=optimizer),
        extensions.ParameterStatistics(model, prefix='model'),
        extensions.VariableStatisticsPlot(model),
        extensions.PlotReport(
            ['train/loss', 'val/loss'], 'epoch', filename='loss.png'),
        extensions.PrintReport(['epoch', 'iteration', 'train/loss', 'lr',
                                'model/fc2.bias/grad/min', 'val/loss',
                                'val/accuracy']),
        extensions.snapshot(),
    ]

    # Custom stop triggers can be added to the manager and
    # their status accessed through `manager.stop_trigger`
    trigger = None
    # trigger = ppe.training.triggers.EarlyStoppingTrigger(
    #     check_trigger=(1, 'epoch'), monitor='val/loss')

    model_with_loss = ModelWithLoss(model)
    trainer = ppe.engine.create_trainer(
        model_with_loss,
        optimizer,
        args.epochs,
        device=args.device,
        extensions=my_extensions,
        stop_trigger=trigger,
        evaluator=ppe.engine.create_evaluator(
            model_with_loss,
            device=args.device,
            progress_bar=True,
            metrics=[ppe.training.metrics.AccuracyMetric('target', 'output')],
            options={'eval_report_keys': ['loss', 'accuracy']}),
        options={'train_report_keys': ['loss']}
    )
    if use_cuda:
        ppe.to(model_with_loss, args.device)

    # Lets load the snapshot
    if args.snapshot is not None:
        state = torch.load(args.snapshot)
        trainer.load_state_dict(state)

    trainer.run(train_loader, test_loader)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
def main():
    """Entry point: train MNIST with a custom PPE training logic.

    Same structure as the standard example, but wraps the network in a
    locally-defined ``ModelWithLoss`` and passes ``logic=CustomLogic(3)``
    to ``ppe.engine.create_trainer``.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--device', type=str, default='cuda',
                        help='PyTorch device specifier')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--snapshot', type=str, default=None,
                        help='path to snapshot file')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    use_cuda = args.device.startswith('cuda')
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True,
        collate_fn=ppe.dataloaders.utils.CollateAsDict(
            names=['data', 'target']),
        **kwargs)  # type: ignore[arg-type]
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True,
        collate_fn=ppe.dataloaders.utils.CollateAsDict(
            names=['data', 'target']),
        **kwargs)  # type: ignore[arg-type]

    model = Net()
    optimizer = optim.SGD(
        model.parameters(), lr=args.lr, momentum=args.momentum)

    my_extensions = [
        extensions.LogReport(),
        extensions.ProgressBar(),
        extensions.observe_lr(optimizer=optimizer),
        extensions.ParameterStatistics(model, prefix='model'),
        extensions.VariableStatisticsPlot(model),
        extensions.PlotReport(
            ['train/loss', 'val/loss'], 'epoch', filename='loss.png'),
        extensions.PrintReport(
            ['epoch', 'iteration', 'train/loss', 'lr',
             'model/fc2.bias/grad/min', 'val/loss', 'val/accuracy']),
        extensions.snapshot(),
    ]

    # Custom stop triggers can be added to the manager and
    # their status accessed through `manager.stop_trigger`
    trigger = None
    # trigger = ppe.training.triggers.EarlyStoppingTrigger(
    #     check_trigger=(1, 'epoch'), monitor='val/loss')

    class ModelWithLoss(torch.nn.Module):
        """Wrapper that computes the NLL loss in training mode and returns
        predictions (argmax) plus mean loss in eval mode."""

        def __init__(self, model):
            super().__init__()
            self.model = model

        def forward(self, data, target):
            # Fix: use the stored submodule rather than the ``model``
            # variable closed over from main(), so the wrapper is
            # self-contained and safe to use independently of this scope.
            output = self.model(data)
            if self.model.training:
                loss = F.nll_loss(output, target)
                ppe.reporting.report({'train/loss': loss.item()})
                return {'loss': loss}
            # Final result will be average of averages of the same size
            test_loss = F.nll_loss(output, target, reduction='mean').item()
            pred = output.argmax(dim=1, keepdim=True)
            return {'loss': test_loss, 'output': pred}

    model_with_loss = ModelWithLoss(model)
    trainer = ppe.engine.create_trainer(
        model_with_loss, optimizer, args.epochs,
        device=args.device,
        extensions=my_extensions,
        stop_trigger=trigger,
        evaluator=ppe.engine.create_evaluator(
            model_with_loss,
            device=args.device,
            progress_bar=True,
            metrics=[ppe.training.metrics.AccuracyMetric('target', 'output')],
            options={'eval_report_keys': ['loss', 'accuracy']}),
        options={'train_report_keys': ['loss']},
        logic=CustomLogic(3),
    )
    if use_cuda:
        ppe.to(model_with_loss, args.device)

    # Lets load the snapshot
    if args.snapshot is not None:
        state = torch.load(args.snapshot)
        trainer.load_state_dict(state)

    trainer.run(train_loader, test_loader)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
def main():
    """Entry point: train KineNet on robot joint data with a PPE manager.

    Builds loaders via ``get_data_loaders``, trains with early stopping on
    ``val/loss``, plots the collected results, and optionally saves weights.
    """
    # Defaults arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-type", type=str, default="synth")
    parser.add_argument("--joint-coord", type=str,
                        default="../data/train/synth/")
    parser.add_argument("--joint-states", type=str,
                        default="../data/train/synth/")
    parser.add_argument("--robot-path", type=str,
                        default="../data/urdf/mh5l.urdf")
    parser.add_argument("--robot", type=str, default="mh5l")
    parser.add_argument("--train-val-ratio", type=float, default=0.8)
    parser.add_argument("--batch-size", type=int, default=1000)
    parser.add_argument("--epochs", type=int, default=20)
    parser.add_argument("--lr", type=float, default=0.01)
    # NOTE(review): ``store_true`` with ``default=True`` means this flag can
    # never be False — passing --save-model is a no-op. Keeping the behavior
    # (always save) unchanged; consider BooleanOptionalAction to fix.
    parser.add_argument("--save-model", action="store_true", default=True)
    # Unknown CLI args are tolerated and intentionally discarded.
    args, _ = parser.parse_known_args()

    # Define torch device based upon GPU availability
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = KineNet()
    # Assign device to model
    model.to(device)

    train_loader, val_loader = get_data_loaders(args)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Stop early once val/loss plateaus, checked every 3 epochs.
    trigger = ppe.training.triggers.EarlyStoppingTrigger(
        check_trigger=(3, "epoch"), monitor="val/loss")

    # Configure extensions; ``results`` is filled by validate() via closure.
    results = []
    my_extensions = [
        extensions.LogReport(),
        extensions.ProgressBar(),
        extensions.observe_lr(optimizer=optimizer),
        extensions.ParameterStatistics(model, prefix="model"),
        extensions.VariableStatisticsPlot(model),
        extensions.Evaluator(
            val_loader, model,
            eval_func=lambda data, target: validate(
                args, model, device, data, target, results),
            progress_bar=True),
        extensions.PlotReport(
            ["train/loss", "val/loss"], "epoch", filename="loss.png"),
        extensions.PrintReport([
            "epoch",
            "iteration",
            "train/loss",
            "lr",
            "val/loss",
        ]),
    ]

    # Setup pfn extensions manager
    manager = ppe.training.ExtensionsManager(
        model, optimizer, args.epochs,
        extensions=my_extensions,
        iters_per_epoch=len(train_loader),
        stop_trigger=trigger,
    )

    train(manager, args, model, device, train_loader, results)
    plot_loss(pd.DataFrame(results, columns=['S', 'L', 'U', 'B', 'R']))

    if args.save_model:
        torch.save(model.state_dict(), "../model/mh5l_kinenet.pt")