def test_save_and_load_config(self):
    c1 = Config(self.data)
    f = os.path.join(self.temp_dir, 'test_save.yaml')
    c1.save(f)
    c2 = mlconfig.load(f)
    self.assertDictEqual(c1.to_dict(), c2.to_dict())
def main():
    torch.backends.cudnn.benchmark = True

    args = parse_args()
    config = mlconfig.load(args.config)
    print(config)

    if args.world_size > 1:
        init_process(args.backend, args.init_method, args.world_size, args.rank)

    device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')

    model = config.model()
    if distributed_is_initialized():
        # Multi-process training: move to the device before wrapping in DDP.
        model.to(device)
        model = nn.parallel.DistributedDataParallel(model)
    else:
        if args.data_parallel:
            model = nn.DataParallel(model)
        model.to(device)

    optimizer = config.optimizer(model.parameters())
    scheduler = config.scheduler(optimizer)
    train_loader = config.dataset(train=True)
    valid_loader = config.dataset(train=False)

    trainer = config.trainer(model, optimizer, train_loader, valid_loader, scheduler, device)

    if args.resume is not None:
        trainer.resume(args.resume)

    trainer.fit()
def mlConfigBasedMain(configPath):
    setLoggingEnabled(True)
    registerClasses()
    logMsg("Running", *sys.argv)
    config = mlconfig.load(configPath)
    return config
def main():
    f = 'config.yaml'
    config = mlconfig.load(f)
    print(config)

    a = config.a()
    print('a = {}'.format(a))

    b = config.b()
    print('b = {}'.format(b))

    c = config.op(a, b)
    print('c = {}'.format(c))
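For context, a minimal sketch of how the a, b, and op entries used above could be wired up. The callables make_a, make_b, and add and their values are assumptions chosen to match the attribute access in main(); each name: key in the YAML must refer to a callable previously registered with mlconfig.register.

import mlconfig

def make_a(value):
    # config.a() resolves `name: make_a` and passes the section's
    # remaining keys (here `value`) as keyword arguments.
    return value

def make_b(value):
    return value

def add(x, y):
    # config.op(a, b) forwards the positional arguments on to `add`.
    return x + y

mlconfig.register(make_a)
mlconfig.register(make_b)
mlconfig.register(add)

# Matching (hypothetical) config.yaml:
#
# a:
#   name: make_a
#   value: 3
# b:
#   name: make_b
#   value: 4
# op:
#   name: add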
def main():
    torch.backends.cudnn.benchmark = True

    args = parse_args()
    config = mlconfig.load(args.config)
    print(config)

    if args.world_size > 1:
        init_process(args.backend, args.init_method, args.world_size, args.rank)

    device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')

    model = config.model()
    if distributed_is_initialized():
        model.to(device)
        model = nn.parallel.DistributedDataParallel(model)
    else:
        if args.data_parallel:
            num_gpus = torch.cuda.device_count()
            model = nn.DataParallel(model, device_ids=list(range(num_gpus)))
        model.to(device)

    # Scale the base learning rate with the batch size, then anneal it
    # with a cosine schedule that decays from 1.0 down to 0.01 over
    # num_epochs.
    lr0 = 0.0001 * config.dataset.batch_size
    lf = lambda x: ((1 + math.cos(x * math.pi / config.trainer.num_epochs)) / 2) * (1 - 0.01) + 0.01  # cosine
    # optimizer = config.optimizer(model.parameters(), lr=lr0)
    optimizer = config.optimizer(model.parameters(), lr=lr0, betas=(0.937, 0.999))
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    train_loader = config.dataset(train=True)
    valid_loader = config.dataset(train=False)

    trainer = config.trainer(model, optimizer, train_loader, valid_loader, scheduler, device)

    if args.resume is not None:
        trainer.resume(args.resume)

    trainer.fit()
def main():
    args = parse_args()
    config = mlconfig.load(args.config)

    # Log the raw config file and its flattened key/value pairs to MLflow.
    mlflow.log_artifact(args.config)
    mlflow.log_params(config.flat())

    manual_seed()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = config.model().to(device)
    optimizer = config.optimizer(model.parameters())
    scheduler = config.scheduler(optimizer)
    train_loader = config.dataset(train=True)
    test_loader = config.dataset(train=False)

    trainer = config.trainer(device, model, optimizer, scheduler, train_loader, test_loader)

    if args.resume is not None:
        trainer.resume(args.resume)

    trainer.fit()
def train(cfg_path,
          backend='nccl',
          init_method='tcp://127.0.0.1:23456',
          world_size=1,
          rank=0,
          no_cuda=False,
          resume=None,
          data_parallel=False,
          epoch_cb=None):
    t_start = time.time()
    torch.backends.cudnn.benchmark = True

    cfg = mlconfig.load(cfg_path)
    print(cfg)

    if world_size > 1:
        init_process(backend, init_method, world_size, rank)

    device = torch.device('cuda' if torch.cuda.is_available() and not no_cuda else 'cpu')

    model = cfg.model()
    if distributed_is_initialized():
        model.to(device)
        model = nn.parallel.DistributedDataParallel(model)
    else:
        if data_parallel:
            model = nn.DataParallel(model)
        model.to(device)

    optimizer = cfg.optimizer(model.parameters())
    scheduler = cfg.scheduler(optimizer)
    train_loader = cfg.dataset(train=True)
    valid_loader = cfg.dataset(train=False)

    trainer = cfg.trainer(model, optimizer, train_loader, valid_loader, scheduler, device)

    if resume is not None:
        trainer.resume(resume)

    trainer.fit(epoch_cb)

    elapsed = time.time() - t_start
    print("Total training time: {:.1f}s".format(elapsed))
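A hypothetical single-node invocation of the helper above. The config path is an assumption, and epoch_cb is left as None because its expected signature depends on what trainer.fit(epoch_cb) does in this project.

if __name__ == '__main__':
    train('configs/mnist.yaml',  # hypothetical config path
          world_size=1,
          data_parallel=False,
          epoch_cb=None)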
def loadMlConfig(path):
    registerClasses()
    return mlconfig.load(path)
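A minimal sketch of what a registerClasses helper like the one assumed above might do, using mlconfig's register API on stock PyTorch classes; the exact set of classes a real project registers is an assumption.

import mlconfig
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

def registerClasses():
    # Make these callables resolvable by the `name:` keys in the YAML,
    # so config.optimizer(...) / config.scheduler(...) can instantiate them.
    mlconfig.register(optim.SGD)
    mlconfig.register(optim.Adam)
    mlconfig.register(lr_scheduler.StepLR)
    mlconfig.register(nn.CrossEntropyLoss)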
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    device = torch.device('cuda')
    device_list = [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]
    logger.info("GPU List: %s" % (device_list))
else:
    device = torch.device('cpu')

# Load Exp Configs
config_file = os.path.join(args.config_path, args.version) + '.yaml'
config = mlconfig.load(config_file)
config.set_immutable()
for key in config:
    logger.info("%s: %s" % (key, config[key]))
shutil.copyfile(config_file, os.path.join(exp_path, args.version + '.yaml'))


def train(starting_epoch, model, optimizer, scheduler, criterion, trainer, evaluator, ENV, data_loader):
    for epoch in range(starting_epoch, config.epochs):
        logger.info("")
        logger.info("=" * 20 + "Training Epoch %d" % (epoch) + "=" * 20)

        # Train
        ENV['global_step'] = trainer.train(epoch, model, criterion, optimizer)
        ENV['train_history'].append(trainer.acc_meters.avg * 100)
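The set_immutable() call above freezes the loaded config for the rest of the run. A small illustration of the intent; the exact error raised on a frozen config is an assumption.

config = mlconfig.load(config_file)
config.set_immutable()
# Any later accidental write should now fail rather than silently
# change the experiment setup, e.g.:
# config.epochs = 100  # expected to raise once the config is immutable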