def test_top1_accuracy():
    output_1 = torch.tensor([[0, 1], [0, 1], [1, 0], [0, 1], [1, 0]]).reshape(5, 2)
    output_2 = torch.tensor([1, 1, 0, 1, 0]).reshape(5, 1)
    target = torch.tensor([0, 1, 0, 0, 1]).reshape(5, 1)

    acc = TopKAccuracy()

    expected_score = (2 / 5) * 100
    actual_score_1 = acc(None, output_1, target)
    actual_score_2 = acc(None, output_2, target)

    assert actual_score_1 == expected_score
    assert actual_score_2 == expected_score
def test_top3_accuracy():
    output_1 = torch.tensor(
        [
            [0.2, 0.2, 0.3, 0.1],
            [0.15, 0.2, 0.05, 0.6],
            [0.25, 0.3, 0.15, 0.3],
            [0.3, 0.1, 0.2, 0.2],
            [0.15, 0.15, 0.2, 0.5],
        ]
    ).reshape(5, 4)
    target = torch.tensor([3, 1, 0, 2, 1]).reshape(5, 1)

    acc = TopKAccuracy(topk=3)

    expected_score = (3 / 5) * 100
    actual_score_1 = acc(output_1, target)

    assert actual_score_1 == expected_score
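# The two tests above assume a metric that thresholds a single-column output at 0.5
# (binary case) and otherwise counts a sample as correct if the target is among the
# top-k scores. Below is a minimal sketch of that computation, consistent with the
# expected values in the tests; it is an illustration, not the actual mlbench_core
# TopKAccuracy implementation (whose call signature even differs between the two tests).
def topk_accuracy_sketch(output, target, topk=1):
    """Return top-k accuracy in percent for `output` of shape (N, C) and `target` of shape (N, 1)."""
    batch_size = target.size(0)
    if output.size(1) == 1:
        # Binary case: threshold the single score at 0.5.
        pred = (output >= 0.5).long()
        correct = pred.eq(target.view_as(pred)).sum().item()
    else:
        # Multi-class case: count targets that appear among the k highest-scoring classes.
        _, pred = output.topk(topk, dim=1, largest=True, sorted=True)
        correct = pred.eq(target.view(-1, 1).expand_as(pred)).sum().item()
    return 100.0 * correct / batch_size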
def test_update_best_runtime_metric(mocker):
    tracker = Tracker([TopKAccuracy(5)], 1, 0)
    # tracker = mocker.patch('mlbench_core.utils.pytorch.helpers.Tracker')

    is_best, best_metric_name = update_best_runtime_metric(tracker, 10.0, "prec")

    assert is_best
    assert best_metric_name == "best_prec"

    is_best, best_metric_name = update_best_runtime_metric(tracker, 11.0, "prec")

    assert is_best
    assert best_metric_name == "best_prec"

    is_best, best_metric_name = update_best_runtime_metric(tracker, 9.0, "prec")

    assert not is_best
    assert best_metric_name == "best_prec"
def metrics():
    return [TopKAccuracy(topk=1)]
def main(run_id, dataset_dir, ckpt_run_dir, output_dir, validation_only=False):
    r"""Main logic."""
    num_parallel_workers = 2
    use_cuda = True
    max_batch_per_epoch = None
    train_epochs = 164
    batch_size = 128

    initialize_backends(
        comm_backend='mpi',
        logging_level='INFO',
        logging_file=os.path.join(output_dir, 'mlbench.log'),
        use_cuda=use_cuda,
        seed=42,
        cudnn_deterministic=False,
        ckpt_run_dir=ckpt_run_dir,
        delete_existing_ckpts=not validation_only)

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    model = ResNetCIFAR(
        resnet_size=20,
        bottleneck=False,
        num_classes=10,
        version=1)

    optimizer = SSGDWM(
        model,
        world_size=world_size,
        num_coordinates=1,
        lr=0.1,
        weight_decay=0)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        optimizer = optimizer.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir,
        rank=rank,
        freq=CheckpointFreq.NONE)

    if not validation_only:
        controlflow = TrainValidation(
            model=model,
            optimizer=optimizer,
            loss_function=loss_function,
            metrics=metrics,
            scheduler=scheduler,
            batch_size=batch_size,
            train_epochs=train_epochs,
            rank=rank,
            world_size=world_size,
            run_id=run_id,
            dtype='fp32',
            validate=True,
            schedule_per='epoch',
            checkpoint=checkpointer,
            transform_target_type=None,
            average_models=True,
            use_cuda=use_cuda,
            max_batch_per_epoch=max_batch_per_epoch)

        controlflow.run(
            dataloader_train=train_loader,
            dataloader_val=val_loader,
            dataloader_train_fn=None,
            dataloader_val_fn=None,
            resume=False,
            repartition_per_epoch=False)
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype='fp32',
            max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), 'w') as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), 'w') as f:
            json.dump(val_stats, f)
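# `partition_dataset_by_rank` is used throughout these scripts to give each worker a
# disjoint shard of the data. A simplified stand-in (an assumption-based sketch, not the
# mlbench_core implementation) is to shuffle the indices with a fixed seed so that all
# ranks agree on the order, then keep this rank's slice:
import random
from torch.utils.data import Subset

def partition_dataset_by_rank_sketch(dataset, rank, world_size, seed=42):
    """Return the shard of `dataset` that belongs to `rank` out of `world_size` workers."""
    indices = list(range(len(dataset)))
    random.Random(seed).shuffle(indices)  # identical order on every rank
    shard_size = len(indices) // world_size
    start = rank * shard_size
    return Subset(dataset, indices[start:start + shard_size])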
def test_tracker():
    tracker = Tracker([TopKAccuracy(5)], 1, 0)

    assert tracker is not None
def run(rank, size, run_id):
    """ Distributed Synchronous SGD Example """
    torch.manual_seed(1234)

    logging.info("Loading Dataset")
    train_set, bsz = partition_dataset_train()
    val_set, bsz_val = partition_dataset_val()

    logging.info("Setting up models and training")
    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    metrics = [
        TopKAccuracy(topk=1),
        TopKAccuracy(topk=5)
    ]

    loss_func = nn.NLLLoss()

    goal = task1_time_to_accuracy_goal()

    tracker = Tracker(metrics, run_id, rank, goal=goal)

    num_batches = ceil(len(train_set.dataset) / float(bsz))
    num_batches_val = ceil(len(val_set.dataset) / float(bsz_val))

    tracker.start()

    logging.info("Starting train loop")
    for epoch in range(10):
        tracker.train()

        epoch_loss = 0.0

        for i, (data, target) in enumerate(train_set):
            tracker.batch_start()

            optimizer.zero_grad()
            output = model(data)
            tracker.record_batch_step('forward')

            loss = loss_func(output, target)
            epoch_loss += loss.data.item()
            tracker.record_batch_step('loss')

            loss.backward()
            tracker.record_batch_step('backward')

            average_gradients(model)
            optimizer.step()
            tracker.batch_end()

            logging.info("Batch: {}, Loss: {}".format(i, loss.item()))

        tracker.record_loss(epoch_loss, num_batches, log_to_api=True)

        logging.debug('Rank %s, epoch %s: %s',
                      dist.get_rank(), epoch,
                      epoch_loss / num_batches)

        metrics, loss = validation_round(
            val_set,
            model,
            loss_func,
            metrics,
            "fp32",
            tracker=tracker,
            transform_target_type=False,
            use_cuda=False,
            max_batches=num_batches_val)

        record_validation_stats(metrics, loss, tracker=tracker, rank=rank)

        tracker.epoch_end()

        if tracker.goal_reached:
            logging.debug("Goal Reached!")
            return
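# `average_gradients(model)` above is the synchronous-SGD step that averages the local
# gradients across all workers before the optimizer update. A minimal sketch of such a
# helper, assuming the default process group is already initialized (this mirrors the
# standard PyTorch distributed tutorial pattern rather than any specific mlbench helper):
import torch.distributed as dist

def average_gradients_sketch(model):
    """All-reduce and average the gradient of every parameter across all workers."""
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is None:
            continue
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data /= world_size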
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
):
    """Main logic."""
    num_parallel_workers = 2
    max_batch_per_epoch = None
    train_epochs = 20
    batch_size = 100

    n_features = 2000

    l1_coef = 0.0
    l2_coef = 0.0000025  # Regularization 1 / train_size (1 / 400,000)

    dtype = "fp32"

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    lr = 4
    scaled_lr = lr * min(16, world_size)
    by_layer = False
    agg_grad = False  # According to paper, we aggregate weights after update

    model = LogisticRegression(n_features)

    # A loss_function for computing the loss
    loss_function = BCELossRegularized(l1=l1_coef, l2=l2_coef, model=model)

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    optimizer = CentralizedSGD(
        world_size=world_size,
        model=model,
        lr=scaled_lr,
        use_cuda=use_cuda,
        by_layer=by_layer,
        agg_grad=agg_grad,
    )

    metrics = [
        TopKAccuracy(),  # Binary accuracy with threshold 0.5
        F1Score(),
        DiceCoefficient(),
    ]

    train_set = LMDBDataset(name="epsilon", data_type="train", root=dataset_dir)
    val_set = LMDBDataset(name="epsilon", data_type="test", root=dataset_dir)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    num_batches_per_device_train = len(train_loader)

    scheduler = ReduceLROnPlateau(
        optimizer.optimizer,
        factor=0.75,
        patience=0,
        verbose=True,
        threshold_mode="abs",
        threshold=0.01,
        min_lr=lr,
    )

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.NONE
    )

    if not validation_only:
        if light_target:
            goal = task2_time_to_accuracy_light_goal()
        else:
            goal = task2_time_to_accuracy_goal()

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            # Set tracker and model in training mode
            model.train()
            tracker.train()

            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(
                    data,
                    target,
                    dtype=dtype,
                    transform_target_dtype=False,
                    use_cuda=use_cuda,
                )
                tracker.record_batch_load()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply updates to model
                optimizer.step(tracker=tracker)

                metrics_results = compute_train_batch_metrics(
                    output,
                    target,
                    metrics,
                )
                tracker.record_batch_comp_metrics()

                # scheduler.batch_step()
                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx,
                    loss.item(),
                    output,
                    metrics_results,
                    tracker,
                    num_batches_per_device_train,
                )

            tracker.epoch_end()

            # Perform validation and gather results
            metrics_values, loss = validation_round(
                val_loader,
                model=model,
                loss_function=loss_function,
                metrics=metrics,
                dtype=dtype,
                tracker=tracker,
                transform_target_type=False,
                use_cuda=use_cuda,
                max_batches=max_batch_per_epoch,
            )

            # Scheduler per epoch
            scheduler.step(loss)

            # Record validation stats
            is_best = record_validation_stats(
                metrics_values=metrics_values, loss=loss, tracker=tracker, rank=rank
            )

            checkpointer.save(
                tracker, model, optimizer, scheduler, tracker.current_epoch, is_best
            )

            if tracker.goal_reached:
                print("Goal Reached!")
                dist.barrier()
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
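# `BCELossRegularized` above combines the binary cross-entropy term with the l1/l2
# penalties configured for the epsilon logistic-regression task. A hedged sketch of what
# such a loss could look like (class name and structure are assumptions for
# illustration, not the mlbench_core implementation):
import torch.nn as nn

class BCELossRegularizedSketch(nn.Module):
    """Binary cross-entropy plus L1/L2 penalties on the model's parameters."""

    def __init__(self, l1, l2, model):
        super().__init__()
        self.l1 = l1
        self.l2 = l2
        self.model = model
        self.bce = nn.BCELoss()

    def forward(self, output, target):
        loss = self.bce(output, target)
        l1_term = sum(p.abs().sum() for p in self.model.parameters())
        l2_term = sum((p ** 2).sum() for p in self.model.parameters())
        return loss + self.l1 * l1_term + self.l2 * l2_term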
def train_loop(run_id, dataset_dir, ckpt_run_dir, output_dir,
               validation_only=False, use_cuda=False, light_target=False):
    r"""Main logic."""
    num_parallel_workers = 2
    max_batch_per_epoch = None
    train_epochs = 164
    batch_size = 128

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    model = ResNetCIFAR(
        resnet_size=20,
        bottleneck=False,
        num_classes=10,
        version=1)

    optimizer = CentralizedSGD(
        world_size=world_size,
        model=model,
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=False)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir,
        rank=rank,
        freq=CheckpointFreq.NONE)

    if not validation_only:
        if light_target:
            goal = task1_time_to_accuracy_light_goal
        else:
            goal = task1_time_to_accuracy_goal

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            train_round(
                train_loader,
                model,
                optimizer,
                loss_function,
                metrics,
                scheduler,
                'fp32',
                schedule_per='epoch',
                transform_target_type=None,
                use_cuda=use_cuda,
                max_batch_per_epoch=max_batch_per_epoch,
                tracker=tracker)

            is_best = validation_round(
                val_loader,
                model,
                loss_function,
                metrics,
                run_id,
                rank,
                'fp32',
                transform_target_type=None,
                use_cuda=use_cuda,
                max_batch_per_epoch=max_batch_per_epoch,
                tracker=tracker)

            checkpointer.save(
                tracker,
                model,
                optimizer,
                scheduler,
                tracker.current_epoch,
                is_best)

            tracker.epoch_end()

            if tracker.goal_reached:
                print("Goal Reached!")
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype='fp32',
            max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), 'w') as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), 'w') as f:
            json.dump(val_stats, f)
def main(run_id):
    checkpoint_dir = os.path.join(config['checkpoint_root'], run_id)

    rank, world_size, _ = initialize_backends(
        comm_backend=config['comm_backend'],
        logging_level=config['logging_level'],
        logging_file=config['logging_file'],
        use_cuda=config['use_cuda'],
        seed=config['seed'],
        ckpt_run_dir=checkpoint_dir)

    os.makedirs(config['dataset_root'], exist_ok=True)

    train_set = CIFAR10V1(config['dataset_root'], train=True, download=True)
    val_set = CIFAR10V1(config['dataset_root'], train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=config['num_parallel_workers'],
        pin_memory=config['use_cuda'],
        drop_last=False)

    val_loader = DataLoader(
        val_set,
        batch_size=config['batch_size'],
        shuffle=False,
        num_workers=config['num_parallel_workers'],
        pin_memory=config['use_cuda'],
        drop_last=False)

    model = get_resnet_model(
        'resnet20',
        2,
        'fp32',
        num_classes=config['num_classes'],
        use_cuda=True)

    if config['use_cuda']:
        model.cuda()

    lr = config['lr_per_sample'] * config['batch_size']

    optimizer = optim.SGD(
        model.parameters(),
        lr=lr,
        momentum=config['momentum'],
        weight_decay=config['weight_decay'],
        nesterov=config['nesterov'])

    scheduler = multistep_learning_rates_with_warmup(
        optimizer,
        world_size,
        lr,
        config['multisteplr_gamma'],
        config['multisteplr_milestones'],
        warmup_duration=config['warmup_duration'],
        warmup_linear_scaling=config['warmup_linear_scaling'],
        warmup_lr=lr)

    loss_function = CrossEntropyLoss()

    if config['use_cuda']:
        loss_function.cuda()

    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    checkpointer = Checkpointer(checkpoint_dir, rank)

    controlflow = TrainValidation(
        model,
        optimizer,
        loss_function,
        metrics,
        scheduler,
        config['batch_size'],
        config['train_epochs'],
        rank,
        world_size,
        run_id,
        dtype=config['dtype'],
        checkpoint=checkpointer,
        use_cuda=config['use_cuda'])

    controlflow.run(
        dataloader_train=train_loader,
        dataloader_val=val_loader)
def main(run_id, validation_only=False):
    r"""Main logic."""
    num_parallel_workers = 2
    dataset_root = '/datasets/torch/cifar10'
    ckpt_run_dir = '/checkpoints/decentralized/cifar_resnet20'
    use_cuda = True
    train_epochs = 164

    initialize_backends(
        comm_backend='mpi',
        logging_level='INFO',
        logging_file='/mlbench.log',
        use_cuda=use_cuda,
        seed=42,
        cudnn_deterministic=False,
        ckpt_run_dir=ckpt_run_dir,
        delete_existing_ckpts=not validation_only)

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    batch_size = 256 // world_size

    model = ResNetCIFAR(
        resnet_size=20,
        bottleneck=False,
        num_classes=10,
        version=1)

    optimizer = optim.SGD(
        model.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=True)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(
        optimizer,
        milestones=[82, 109],
        gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [
        TopKAccuracy(topk=1),
        TopKAccuracy(topk=5)
    ]

    train_set = CIFAR10V1(dataset_root, train=True, download=True)
    val_set = CIFAR10V1(dataset_root, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir,
        rank=rank,
        checkpoint_all=True)

    if not validation_only:
        # Aggregation
        ring_neighbors = [(rank + 1) % world_size, (rank - 1) % world_size]
        agg_fn = DecentralizedAggregation(
            rank=rank,
            neighbors=ring_neighbors).agg_model

        controlflow = TrainValidation(
            model=model,
            optimizer=optimizer,
            loss_function=loss_function,
            metrics=metrics,
            scheduler=scheduler,
            batch_size=batch_size,
            train_epochs=train_epochs,
            rank=rank,
            world_size=world_size,
            run_id=run_id,
            dtype='fp32',
            validate=True,
            schedule_per='epoch',
            checkpoint=checkpointer,
            transform_target_type=None,
            average_models=True,
            use_cuda=use_cuda,
            max_batch_per_epoch=None,
            agg_fn=agg_fn)

        controlflow.run(
            dataloader_train=train_loader,
            dataloader_val=val_loader,
            dataloader_train_fn=None,
            dataloader_val_fn=None,
            resume=False,
            repartition_per_epoch=False)
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype='fp32',
            max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(ckpt_run_dir, "train_stats.json"), 'w') as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(ckpt_run_dir, "val_stats.json"), 'w') as f:
            json.dump(val_stats, f)
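# `DecentralizedAggregation(...).agg_model` above averages the model only with the two
# ring neighbors instead of all-reducing across every worker. A simplified sketch of
# that idea, built on torch.distributed point-to-point ops (an illustration under the
# assumption that the process group is initialized, not the mlbench_core aggregation class):
import torch
import torch.distributed as dist

def ring_average_model_sketch(model, rank, world_size):
    """Average every parameter with the copies held by the two ring neighbors."""
    neighbors = [(rank + 1) % world_size, (rank - 1) % world_size]
    for param in model.parameters():
        buffers = [torch.empty_like(param.data) for _ in neighbors]
        requests = []
        for buf, neighbor in zip(buffers, neighbors):
            requests.append(dist.isend(param.data, dst=neighbor))
            requests.append(dist.irecv(buf, src=neighbor))
        for req in requests:
            req.wait()
        # Average the local copy with the two received neighbor copies.
        param.data = (param.data + sum(buffers)) / (len(neighbors) + 1)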
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
):
    """Train loop"""
    num_parallel_workers = 2
    max_batch_per_epoch = None
    train_epochs = 164
    batch_size = 128
    dtype = "fp32"

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # LR = 0.1 / 256 / sample
    lr = 0.02
    scaled_lr = lr * world_size
    by_layer = False

    # Create Model
    model = ResNetCIFAR(resnet_size=20, bottleneck=False, num_classes=10, version=1)

    # Create optimizer
    optimizer = CentralizedSGD(
        world_size=world_size,
        model=model,
        lr=lr,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=False,
        use_cuda=use_cuda,
        by_layer=by_layer,
    )

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    # Create train/validation sets and loaders
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    # Create a learning rate scheduler for an optimizer
    scheduler = ReduceLROnPlateauWithWarmup(
        optimizer.optimizer,
        warmup_init_lr=lr,
        scaled_lr=scaled_lr,
        warmup_epochs=int(math.log(world_size, 2)),  # Adaptive warmup period
        factor=0.5,
        threshold_mode="abs",
        threshold=0.01,
        patience=1,
        verbose=True,
        min_lr=lr,
    )

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.NONE
    )

    if not validation_only:
        if light_target:
            goal = task1_time_to_accuracy_light_goal()
        else:
            goal = task1_time_to_accuracy_goal()

        num_batches_per_device_train = len(train_loader)

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            # Set tracker and model in training mode
            model.train()
            tracker.train()

            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(
                    data,
                    target,
                    dtype=dtype,
                    transform_target_dtype=False,
                    use_cuda=use_cuda,
                )
                tracker.record_batch_load()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply updates to model
                optimizer.step(tracker=tracker)

                metrics_results = compute_train_batch_metrics(
                    output,
                    target,
                    metrics,
                )
                tracker.record_batch_comp_metrics()
                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx,
                    loss.item(),
                    output,
                    metrics_results,
                    tracker,
                    num_batches_per_device_train,
                )

            # Scheduler per epoch
            tracker.epoch_end()

            # Perform validation and gather results
            metrics_values, loss = validation_round(
                val_loader,
                model=model,
                loss_function=loss_function,
                metrics=metrics,
                dtype=dtype,
                tracker=tracker,
                transform_target_type=False,
                use_cuda=use_cuda,
                max_batches=max_batch_per_epoch,
            )

            scheduler.step(loss)

            # Record validation stats
            is_best = record_validation_stats(
                metrics_values=metrics_values, loss=loss, tracker=tracker, rank=rank
            )

            checkpointer.save(
                tracker, model, optimizer, scheduler, tracker.current_epoch, is_best
            )

            if tracker.goal_reached:
                print("Goal Reached!")
                dist.barrier()
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
    by_layer=False,
):
    r"""Main logic."""
    num_parallel_workers = 2
    train_epochs = 164
    batch_size = 128

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    current_device = cuda.current_device()
    local_model = ResNetCIFAR(
        resnet_size=20, bottleneck=False, num_classes=10, version=1
    ).to(current_device)
    model = DDP(local_model, device_ids=[current_device])

    optimizer = SGD(
        model.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
    )

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.NONE
    )

    if not validation_only:
        if light_target:
            goal = task1_time_to_accuracy_light_goal()
        else:
            goal = task1_time_to_accuracy_goal()

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            model.train()
            tracker.train()

            data_iter = iterate_dataloader(train_loader, dtype="fp32", use_cuda=use_cuda)
            num_batches_per_device_train = len(train_loader)

            for batch_idx, (data, target) in enumerate(data_iter):
                tracker.batch_start()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply updates to model
                optimizer.step()
                tracker.record_batch_opt_step()

                metrics_results = compute_train_batch_metrics(
                    output,
                    target,
                    metrics,
                )
                tracker.record_batch_comp_metrics()
                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx,
                    loss.item(),
                    output,
                    metrics_results,
                    tracker,
                    num_batches_per_device_train,
                )

            tracker.epoch_end()

            metrics_values, loss = validation_round(
                val_loader,
                model=model,
                loss_function=loss_function,
                metrics=metrics,
                dtype="fp32",
                tracker=tracker,
                use_cuda=use_cuda,
            )

            scheduler.step()

            # Record validation stats
            is_best = record_validation_stats(
                metrics_values=metrics_values, loss=loss, tracker=tracker, rank=rank
            )

            checkpointer.save(
                tracker, model, optimizer, scheduler, tracker.current_epoch, is_best
            )

            if tracker.goal_reached:
                print("Goal Reached!")
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
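# The DDP-based `train_loop` above assumes torch.distributed is already initialized and
# that each process has selected its own GPU. A minimal launch sketch under those
# assumptions; the backend choice, environment variables, run id, and directory paths
# below are placeholders for illustration, not values mandated by mlbench:
import torch
import torch.distributed as dist

def launch_example():
    # Expects the usual launcher environment (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT).
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
    train_loop(
        run_id="example-run",
        dataset_dir="/datasets/torch/cifar10",
        ckpt_run_dir="/checkpoints/example",
        output_dir="/output",
        use_cuda=True,
    )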
def main(run_id):
    r"""Main logic."""
    num_parallel_workers = 2
    dataset_root = '/datasets/torch/cifar10'
    use_cuda = True
    batch_size = 128

    initialize_backends(
        comm_backend='mpi',
        logging_level='INFO',
        logging_file='/mlbench.log',
        use_cuda=use_cuda,
        seed=42,
        cudnn_deterministic=False,
        ckpt_run_dir='/checkpoints',
        delete_existing_ckpts=False)

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    model = ResNetCIFAR(
        resnet_size=20,
        bottleneck=False,
        num_classes=10,
        version=1)

    optimizer = optim.SGD(
        model.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=True)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_root, train=True, download=True)
    val_set = CIFAR10V1(dataset_root, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    checkpointer = Checkpointer(
        ckpt_run_dir='/checkpoints',
        rank=rank,
        checkpoint_all=True)

    controlflow = TrainValidation(
        model=model,
        optimizer=optimizer,
        loss_function=loss_function,
        metrics=metrics,
        scheduler=scheduler,
        batch_size=batch_size,
        train_epochs=164,
        rank=rank,
        world_size=world_size,
        run_id=run_id,
        dtype='fp32',
        validate=True,
        schedule_per='epoch',
        checkpoint=checkpointer,
        transform_target_type=None,
        average_models=True,
        use_cuda=True,
        max_batch_per_epoch=None)

    controlflow.run(
        dataloader_train=train_loader,
        dataloader_val=val_loader,
        dataloader_train_fn=None,
        dataloader_val_fn=None,
        resume=False,
        repartition_per_epoch=False)