def main(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    rank,
    backend,
    hosts,
    validation_only=False,
    gpu=False,
    light_target=False,
):
    r"""Main logic.

    Brings up the communication/logging backends for the duration of the
    run, then hands off to ``train_loop``.
    """
    # Collect backend configuration in one place before entering the
    # context manager; existing checkpoints are only wiped on a fresh
    # training run, never when we are merely validating.
    backend_config = dict(
        comm_backend=backend,
        hosts=hosts,
        rank=rank,
        logging_level="INFO",
        logging_file=os.path.join(output_dir, "mlbench.log"),
        use_cuda=gpu,
        seed=42,
        cudnn_deterministic=False,
        ckpt_run_dir=ckpt_run_dir,
        delete_existing_ckpts=not validation_only,
    )
    with initialize_backends(**backend_config):
        train_loop(
            run_id,
            dataset_dir,
            ckpt_run_dir,
            output_dir,
            validation_only,
            use_cuda=gpu,
            light_target=light_target,
        )
def main(run_id, dataset_dir, ckpt_run_dir, output_dir,
         validation_only=False, gpu=False, light_target=False):
    r"""Main logic.

    Initializes the MPI communication backend and logging, then runs the
    training loop inside that context.
    """
    log_path = os.path.join(output_dir, 'mlbench.log')
    # Checkpoints from a previous run are removed only when training;
    # validation-only runs need them intact.
    with initialize_backends(
            comm_backend='mpi',
            logging_level='INFO',
            logging_file=log_path,
            use_cuda=gpu,
            seed=42,
            cudnn_deterministic=False,
            ckpt_run_dir=ckpt_run_dir,
            delete_existing_ckpts=not validation_only):
        train_loop(
            run_id,
            dataset_dir,
            ckpt_run_dir,
            output_dir,
            validation_only,
            use_cuda=gpu,
            light_target=light_target,
        )
def main(run_id, output_dir, rank, backend, hosts,
         use_horovod=False, gpu=False):
    r"""Main logic.

    Sets up the distributed backend (optionally driven by Horovod in the
    training loop) and then runs ``train_loop``.
    """
    # Any pre-existing checkpoints are always discarded for this task.
    backend_config = dict(
        comm_backend=backend,
        hosts=hosts,
        rank=rank,
        logging_level="INFO",
        logging_file=os.path.join(output_dir, "mlbench.log"),
        use_cuda=gpu,
        seed=42,
        cudnn_deterministic=False,
        delete_existing_ckpts=True,
    )
    with initialize_backends(**backend_config):
        train_loop(run_id=run_id, use_horovod=use_horovod, gpu=gpu)
def main(run_id, dataset_dir, ckpt_run_dir, output_dir,
         validation_only=False):
    r"""Main logic.

    Trains ResNet-20 on CIFAR-10 with the SSGDWM optimizer, or — when
    ``validation_only`` is set — re-evaluates previously written
    checkpoints epoch by epoch and dumps the statistics as JSON.
    """
    # Fixed hyper-parameters for this benchmark task.
    num_parallel_workers = 2
    use_cuda = True
    max_batch_per_epoch = None
    train_epochs = 164
    batch_size = 128

    initialize_backends(
        comm_backend='mpi',
        logging_level='INFO',
        logging_file=os.path.join(output_dir, 'mlbench.log'),
        use_cuda=use_cuda,
        seed=42,
        cudnn_deterministic=False,
        ckpt_run_dir=ckpt_run_dir,
        delete_existing_ckpts=not validation_only)

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    model = ResNetCIFAR(
        resnet_size=20, bottleneck=False, num_classes=10, version=1)

    optimizer = SSGDWM(
        model,
        world_size=world_size,
        num_coordinates=1,
        lr=0.1,
        weight_decay=0)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        optimizer = optimizer.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    # Both splits are sharded across ranks.
    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)
    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.NONE)

    if not validation_only:
        train_flow = TrainValidation(
            model=model,
            optimizer=optimizer,
            loss_function=loss_function,
            metrics=metrics,
            scheduler=scheduler,
            batch_size=batch_size,
            train_epochs=train_epochs,
            rank=rank,
            world_size=world_size,
            run_id=run_id,
            dtype='fp32',
            validate=True,
            schedule_per='epoch',
            checkpoint=checkpointer,
            transform_target_type=None,
            average_models=True,
            use_cuda=use_cuda,
            max_batch_per_epoch=max_batch_per_epoch)
        train_flow.run(
            dataloader_train=train_loader,
            dataloader_val=val_loader,
            dataloader_train_fn=None,
            dataloader_val_fn=None,
            resume=False,
            repartition_per_epoch=False)
    else:
        # Replay every saved checkpoint against both splits and record
        # the per-epoch statistics.
        eval_flow = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype='fp32',
            max_batch_per_epoch=None)

        train_stats = eval_flow.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), 'w') as fh:
            json.dump(train_stats, fh)

        val_stats = eval_flow.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), 'w') as fh:
            json.dump(val_stats, fh)
def main(run_id):
    """Train ResNet-20 on CIFAR-10, driven entirely by the module-level
    ``config`` dict.

    Sets up the backends, builds sharded data loaders, and runs a
    warmup-scaled multi-step SGD training schedule.
    """
    checkpoint_dir = os.path.join(config['checkpoint_root'], run_id)

    rank, world_size, _ = initialize_backends(
        comm_backend=config['comm_backend'],
        logging_level=config['logging_level'],
        logging_file=config['logging_file'],
        use_cuda=config['use_cuda'],
        seed=config['seed'],
        ckpt_run_dir=checkpoint_dir)

    os.makedirs(config['dataset_root'], exist_ok=True)

    # Only the training split is sharded across ranks; every rank sees
    # the full validation split.
    train_set = CIFAR10V1(config['dataset_root'], train=True, download=True)
    val_set = CIFAR10V1(config['dataset_root'], train=False, download=True)
    train_set = partition_dataset_by_rank(train_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=config['num_parallel_workers'],
        pin_memory=config['use_cuda'],
        drop_last=False)
    val_loader = DataLoader(
        val_set,
        batch_size=config['batch_size'],
        shuffle=False,
        num_workers=config['num_parallel_workers'],
        pin_memory=config['use_cuda'],
        drop_last=False)

    model = get_resnet_model(
        'resnet20', 2, 'fp32', num_classes=config['num_classes'],
        use_cuda=True)
    if config['use_cuda']:
        model.cuda()

    # Linear scaling rule: learning rate grows with the global batch size.
    lr = config['lr_per_sample'] * config['batch_size']

    optimizer = optim.SGD(
        model.parameters(),
        lr=lr,
        momentum=config['momentum'],
        weight_decay=config['weight_decay'],
        nesterov=config['nesterov'])

    scheduler = multistep_learning_rates_with_warmup(
        optimizer,
        world_size,
        lr,
        config['multisteplr_gamma'],
        config['multisteplr_milestones'],
        warmup_duration=config['warmup_duration'],
        warmup_linear_scaling=config['warmup_linear_scaling'],
        warmup_lr=lr)

    loss_function = CrossEntropyLoss()
    if config['use_cuda']:
        loss_function.cuda()

    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    checkpointer = Checkpointer(checkpoint_dir, rank)

    # NOTE: TrainValidation takes its leading arguments positionally here.
    cflow = TrainValidation(
        model,
        optimizer,
        loss_function,
        metrics,
        scheduler,
        config['batch_size'],
        config['train_epochs'],
        rank,
        world_size,
        run_id,
        dtype=config['dtype'],
        checkpoint=checkpointer,
        use_cuda=config['use_cuda'])

    cflow.run(dataloader_train=train_loader, dataloader_val=val_loader)
def main(run_id, validation_only=False):
    r"""Main logic.

    Trains ResNet-20 on CIFAR-10 with decentralized ring aggregation
    (each rank averages with its two ring neighbours), or — when
    ``validation_only`` is set — re-evaluates saved checkpoints and
    dumps per-epoch statistics as JSON.
    """
    # Hard-coded paths and hyper-parameters for this benchmark task.
    num_parallel_workers = 2
    dataset_root = '/datasets/torch/cifar10'
    ckpt_run_dir = '/checkpoints/decentralized/cifar_resnet20'
    use_cuda = True
    train_epochs = 164

    initialize_backends(
        comm_backend='mpi',
        logging_level='INFO',
        logging_file='/mlbench.log',
        use_cuda=use_cuda,
        seed=42,
        cudnn_deterministic=False,
        ckpt_run_dir=ckpt_run_dir,
        delete_existing_ckpts=not validation_only)

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Keep the global batch size at 256 regardless of the worker count.
    batch_size = 256 // world_size

    model = ResNetCIFAR(
        resnet_size=20, bottleneck=False, num_classes=10, version=1)

    optimizer = optim.SGD(
        model.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=True)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    # Both splits are sharded across ranks.
    train_set = CIFAR10V1(dataset_root, train=True, download=True)
    val_set = CIFAR10V1(dataset_root, train=False, download=True)
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)
    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, checkpoint_all=True)

    if not validation_only:
        # Aggregation: each worker averages only with its ring neighbours.
        neighbors = [(rank + 1) % world_size, (rank - 1) % world_size]
        agg_fn = DecentralizedAggregation(
            rank=rank, neighbors=neighbors).agg_model

        train_flow = TrainValidation(
            model=model,
            optimizer=optimizer,
            loss_function=loss_function,
            metrics=metrics,
            scheduler=scheduler,
            batch_size=batch_size,
            train_epochs=train_epochs,
            rank=rank,
            world_size=world_size,
            run_id=run_id,
            dtype='fp32',
            validate=True,
            schedule_per='epoch',
            checkpoint=checkpointer,
            transform_target_type=None,
            average_models=True,
            use_cuda=use_cuda,
            max_batch_per_epoch=None,
            agg_fn=agg_fn)
        train_flow.run(
            dataloader_train=train_loader,
            dataloader_val=val_loader,
            dataloader_train_fn=None,
            dataloader_val_fn=None,
            resume=False,
            repartition_per_epoch=False)
    else:
        # Replay every saved checkpoint against both splits and record
        # the per-epoch statistics.
        eval_flow = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype='fp32',
            max_batch_per_epoch=None)

        train_stats = eval_flow.evaluate_by_epochs(train_loader)
        with open(os.path.join(ckpt_run_dir, "train_stats.json"), 'w') as fh:
            json.dump(train_stats, fh)

        val_stats = eval_flow.evaluate_by_epochs(val_loader)
        with open(os.path.join(ckpt_run_dir, "val_stats.json"), 'w') as fh:
            json.dump(val_stats, fh)
def main(run_id):
    r"""Main logic.

    Runs a full ResNet-20 / CIFAR-10 training-with-validation schedule
    using SGD with momentum and a multi-step learning-rate decay.
    """
    # Hard-coded paths and hyper-parameters for this benchmark task.
    num_parallel_workers = 2
    dataset_root = '/datasets/torch/cifar10'
    use_cuda = True
    batch_size = 128

    initialize_backends(
        comm_backend='mpi',
        logging_level='INFO',
        logging_file='/mlbench.log',
        use_cuda=use_cuda,
        seed=42,
        cudnn_deterministic=False,
        ckpt_run_dir='/checkpoints',
        delete_existing_ckpts=False)

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    model = ResNetCIFAR(
        resnet_size=20, bottleneck=False, num_classes=10, version=1)

    optimizer = optim.SGD(
        model.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=True)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    # Only the training split is sharded across ranks; every rank sees
    # the full validation split.
    train_set = CIFAR10V1(dataset_root, train=True, download=True)
    val_set = CIFAR10V1(dataset_root, train=False, download=True)
    train_set = partition_dataset_by_rank(train_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)
    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    checkpointer = Checkpointer(
        ckpt_run_dir='/checkpoints', rank=rank, checkpoint_all=True)

    cflow = TrainValidation(
        model=model,
        optimizer=optimizer,
        loss_function=loss_function,
        metrics=metrics,
        scheduler=scheduler,
        batch_size=batch_size,
        train_epochs=164,
        rank=rank,
        world_size=world_size,
        run_id=run_id,
        dtype='fp32',
        validate=True,
        schedule_per='epoch',
        checkpoint=checkpointer,
        transform_target_type=None,
        average_models=True,
        use_cuda=True,
        max_batch_per_epoch=None)

    cflow.run(
        dataloader_train=train_loader,
        dataloader_val=val_loader,
        dataloader_train_fn=None,
        dataloader_val_fn=None,
        resume=False,
        repartition_per_epoch=False)