def train_loop_per_worker():
    import pandas as pd

    rank = train.world_rank()
    data_shard = train.get_dataset_shard("train")
    start = time.perf_counter()
    num_epochs, num_batches, num_bytes = 0, 0, 0
    batch_delays = []

    print("Starting train loop on worker", rank)
    while time.perf_counter() - start < runtime_seconds:
        num_epochs += 1
        batch_start = time.perf_counter()
        for batch in data_shard.iter_batches(
            prefetch_blocks=prefetch_blocks, batch_size=batch_size
        ):
            batch_delay = time.perf_counter() - batch_start
            batch_delays.append(batch_delay)
            num_batches += 1
            if isinstance(batch, pd.DataFrame):
                num_bytes += int(
                    batch.memory_usage(index=True, deep=True).sum()
                )
            elif isinstance(batch, np.ndarray):
                num_bytes += batch.nbytes
            else:
                # NOTE: This isn't recursive and will just return the size of
                # the object pointers if the batch is a list of non-primitive
                # types.
                num_bytes += sys.getsizeof(batch)
            train.report(
                bytes_read=num_bytes,
                num_batches=num_batches,
                num_epochs=num_epochs,
                batch_delay=batch_delay,
            )
            batch_start = time.perf_counter()

    delta = time.perf_counter() - start
    print("Time to read all data", delta, "seconds")
    print(
        "P50/P95/Max batch delay (s)",
        np.quantile(batch_delays, 0.5),
        np.quantile(batch_delays, 0.95),
        np.max(batch_delays),
    )
    print("Num epochs read", num_epochs)
    print("Num batches read", num_batches)
    print("Num bytes read", round(num_bytes / (1024 * 1024), 2), "MiB")
    print(
        "Mean throughput", round(num_bytes / (1024 * 1024) / delta, 2), "MiB/s"
    )

    if rank == 0:
        print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))
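# Hypothetical launch sketch for the ingest benchmark above. It assumes the
# legacy ray.train.Trainer API that matches the train.report /
# train.get_dataset_shard calls used in the loop (exact signatures may differ
# between Ray versions), and that `runtime_seconds`, `prefetch_blocks`, and
# `batch_size` are defined in the same script so the loop can read them as
# globals. The dataset is a synthetic placeholder.
import ray
from ray.train import Trainer

runtime_seconds, prefetch_blocks, batch_size = 30, 1, 4096
dataset = ray.data.range(1_000_000)

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
trainer.run(train_loop_per_worker, dataset={"train": dataset})
trainer.shutdown()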
def with_sampler(loader):
    # Automatically set the DistributedSampler
    # If you're using a sampler, the DataLoader shuffle flag must be set to
    # False. Shuffling is instead determined by the shuffle argument passed
    # to the DistributedSampler constructor.
    # If no sampler is passed to the DataLoader constructor, Torch
    # constructs a default sampler. The default sampler is a RandomSampler
    # if shuffling is enabled and a SequentialSampler otherwise. DataLoader
    # does not have a shuffle attribute, so we instead identify whether
    # shuffling is enabled by checking the default sampler type.
    shuffle = not isinstance(loader.sampler, SequentialSampler)

    def seeded_worker_init_fn(worker_init_fn):
        def wrapper(worker_id):
            worker_seed = torch.initial_seed() % 2**32
            np.random.seed(worker_seed)
            random.seed(worker_seed)
            worker_init_fn(worker_id)

        return wrapper

    worker_init_fn = loader.worker_init_fn
    generator = loader.generator
    if self._seed is not None:
        worker_init_fn = seeded_worker_init_fn(loader.worker_init_fn)
        generator = torch.Generator()
        generator.manual_seed(self._seed)

    using_default_sampler = isinstance(
        loader.sampler, (SequentialSampler, RandomSampler))
    if not using_default_sampler and train.world_rank() == 0:
        logger.warn(
            f"The {loader.sampler.__class__.__name__} will be overwritten "
            "with a DistributedSampler. You can disable this by setting "
            "`with_sampler` to False in `prepare_data_loader`.")

    data_loader_args = {
        "dataset": loader.dataset,
        "batch_size": loader.batch_size,
        "shuffle": False,
        "num_workers": loader.num_workers,
        "collate_fn": loader.collate_fn,
        "pin_memory": loader.pin_memory,
        "drop_last": loader.drop_last,
        "timeout": loader.timeout,
        "worker_init_fn": worker_init_fn,
        "generator": generator,
        "sampler": DistributedSampler(loader.dataset, shuffle=shuffle),
    }

    return DataLoader(**data_loader_args)
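# Hypothetical usage note: because `with_sampler` replaces the loader's
# sampler with a DistributedSampler, per-epoch reshuffling follows the
# standard PyTorch pattern of calling `set_epoch` before each pass.
# `prepared_loader` and `num_epochs` are illustrative names, not part of the
# snippet above.
for epoch in range(num_epochs):
    prepared_loader.sampler.set_epoch(epoch)  # vary the shuffle seed per epoch
    for batch in prepared_loader:
        ...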
def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_shard = train.get_dataset_shard("train")
    validation_dataset = train.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []
    for _ in range(epochs):
        train_torch_dataset = train_dataset_shard.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        if train.world_rank() == 0:
            result = validate_epoch(validation_torch_dataset, model, loss_fn,
                                    device)
        else:
            result = {}
        train.report(**result)
        results.append(result)
        train.save_checkpoint(model=model)

    return results
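# Hypothetical launch sketch for the regression training function above. It
# assumes the legacy ray.train.Trainer API matching the train.report /
# train.save_checkpoint calls in the function, and small synthetic Ray
# Datasets for a y = 2x task; names, data, and config values are illustrative
# only, and exact signatures may differ between Ray versions.
import ray
from ray.train import Trainer

train_ds = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(200)])
val_ds = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(20)])

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
results = trainer.run(
    train_func,
    config={"batch_size": 32, "epochs": 3},
    dataset={"train": train_ds, "validation": val_ds},
)
trainer.shutdown()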
def train_loop_per_worker():
    rank = train.world_rank()
    data_shard = train.get_dataset_shard("train")
    start = time.perf_counter()
    num_epochs, num_batches, num_bytes = 0, 0, 0
    batch_delays = []

    print("Starting train loop on worker", rank)
    while time.perf_counter() - start < runtime_seconds:
        num_epochs += 1
        batch_start = time.perf_counter()
        for batch in data_shard.iter_batches(
                prefetch_blocks=prefetch_blocks, batch_size=batch_size):
            batch_delay = time.perf_counter() - batch_start
            batch_delays.append(batch_delay)
            num_batches += 1
            num_bytes += int(
                batch.memory_usage(index=True, deep=True).sum())
            train.report(
                bytes_read=num_bytes,
                num_batches=num_batches,
                num_epochs=num_epochs,
                batch_delay=batch_delay,
            )
            batch_start = time.perf_counter()

    delta = time.perf_counter() - start
    print("Time to read all data", delta, "seconds")
    print(
        "P50/P95/Max batch delay (s)",
        np.quantile(batch_delays, 0.5),
        np.quantile(batch_delays, 0.95),
        np.max(batch_delays),
    )
    print("Num epochs read", num_epochs)
    print("Num batches read", num_batches)
    print("Num bytes read", round(num_bytes / (1024 * 1024), 2), "MiB")
    print("Mean throughput",
          round(num_bytes / (1024 * 1024) / delta, 2), "MiB/s")

    if rank == 0:
        print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))
def _huggingface_train_loop_per_worker(config):
    """Per-worker training loop for HuggingFace Transformers."""
    trainer_init_per_worker = config.pop("_trainer_init_per_worker")

    # Env vars necessary for HF to set up DDP
    os.environ["RANK"] = str(train.world_rank())
    os.environ["WORLD_SIZE"] = str(train.world_size())
    os.environ["LOCAL_RANK"] = str(train.local_rank())

    train_dataset = train.get_dataset_shard(TRAIN_DATASET_KEY)
    eval_dataset = train.get_dataset_shard(EVALUATION_DATASET_KEY)

    train_torch_dataset, eval_torch_dataset = process_datasets(
        train_dataset,
        eval_dataset,
    )

    trainer: transformers.trainer.Trainer = trainer_init_per_worker(
        train_torch_dataset, eval_torch_dataset, **config)

    if trainer.args.push_to_hub and not trainer.args.hub_token:
        warnings.warn(
            "You have set `push_to_hub=True` but didn't specify `hub_token`. "
            "Pushing to hub will most likely fail, as the credentials will not "
            "be automatically propagated from the local environment to the Ray Actors. "
            "If that happens, specify `hub_token` in `TrainingArguments`.")

    if (trainer.args.evaluation_strategy == "steps"
            or trainer.args.save_strategy == "steps"
            or trainer.args.logging_strategy == "steps"):
        raise ValueError(
            "'steps' value for `evaluation_strategy`, `logging_strategy` "
            "or `save_strategy` is not yet supported.")

    trainer = wrap_transformers_trainer(trainer)

    # Ensure no HF logging callbacks are added.
    # Aside from duplicating functionality with our callbacks,
    # the Wandb callback causes training to freeze.
    integration_callbacks = transformers.trainer.get_reporting_integration_callbacks(
        trainer.args.report_to)
    for callback in integration_callbacks:
        trainer.pop_callback(callback)

    trainer.add_callback(TrainReportCallback)

    checkpoint = session.get_checkpoint()
    checkpoint_path = None
    remove_checkpoint_path = False
    if checkpoint:
        assert isinstance(checkpoint, Checkpoint)
        checkpoint_dict = checkpoint.to_dict()
        source_ip = checkpoint_dict[NODE_IP_KEY]
        source_path = checkpoint_dict[CHECKPOINT_PATH_ON_NODE_KEY]
        target_ip = get_node_ip_address()
        if source_ip == target_ip:
            checkpoint_path = source_path
        else:
            checkpoint_path = tempfile.mkdtemp(
                suffix=Path(trainer.args.output_dir).name)
            remove_checkpoint_path = True
            sync_dir_between_nodes(
                source_ip=source_ip,
                source_path=source_path,
                target_ip=target_ip,
                target_path=checkpoint_path,
                return_futures=False,
                max_size_bytes=None,
            )

    trainer.train(resume_from_checkpoint=checkpoint_path)

    if remove_checkpoint_path:
        shutil.rmtree(checkpoint_path, ignore_errors=True)
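# Hypothetical example of the `trainer_init_per_worker` function the loop
# above expects: it receives the processed torch datasets plus any remaining
# config and must return a transformers.Trainer. Model name, arguments, and
# config keys are illustrative only; note that epoch-based strategies are used
# because the loop rejects "steps".
import transformers


def trainer_init_per_worker(train_torch_dataset, eval_torch_dataset, **config):
    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2)
    args = transformers.TrainingArguments(
        output_dir="hf_output",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        num_train_epochs=config.get("epochs", 1),
    )
    return transformers.Trainer(
        model=model,
        args=args,
        train_dataset=train_torch_dataset,
        eval_dataset=eval_torch_dataset,
    )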
def train_func():
    train.report(rank=train.world_rank())
def train_func():
    return train.world_rank()
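# Hypothetical usage note on the two minimal functions above: under the legacy
# Ray Train API, `train.report(...)` streams metrics to the driver and any
# callbacks during the run, while a plain `return` value is collected into the
# list returned by `Trainer.run`, one entry per worker. A minimal sketch,
# assuming a 2-worker legacy Trainer (names illustrative only):
from ray.train import Trainer

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
ranks = trainer.run(train_func)  # e.g. [0, 1]: one return value per worker
trainer.shutdown()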
def train_loop_per_worker(train_loop_config):
    dataset = train_loop_config["dataset_fn"]()
    batch_size = train_loop_config["batch_size"]
    num_epochs = train_loop_config["num_epochs"]

    data = dataset[0]
    train_idx = data.train_mask.nonzero(as_tuple=False).view(-1)
    train_idx = train_idx.split(train_idx.size(0) // train.world_size())[
        train.world_rank()
    ]

    train_loader = NeighborSampler(
        data.edge_index,
        node_idx=train_idx,
        sizes=[25, 10],
        batch_size=batch_size,
        shuffle=True,
    )

    # Disable the distributed sampler since the train_loader has already been
    # split above.
    train_loader = train.torch.prepare_data_loader(
        train_loader, add_dist_sampler=False
    )

    # Do validation on rank 0 worker only.
    if train.world_rank() == 0:
        subgraph_loader = NeighborSampler(
            data.edge_index, node_idx=None, sizes=[-1], batch_size=2048, shuffle=False
        )
        subgraph_loader = train.torch.prepare_data_loader(
            subgraph_loader, add_dist_sampler=False
        )

    model = SAGE(dataset.num_features, 256, dataset.num_classes)
    model = train.torch.prepare_model(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    x, y = data.x.to(train.torch.get_device()), data.y.to(train.torch.get_device())

    for epoch in range(num_epochs):
        model.train()

        # ``batch_size`` is the number of samples in the current batch.
        # ``n_id`` are the ids of all the nodes used in the computation. This
        # is needed to pull in the necessary features just for the current
        # batch that is being trained on.
        # ``adjs`` is a list of 3-element tuples of ``(edge_index, e_id,
        # size)`` for each sample in the batch, where ``edge_index``
        # represents the edges of the sampled subgraph, ``e_id`` are the ids
        # of the edges in the sample, and ``size`` holds the shape of the
        # subgraph.
        # See ``torch_geometric.loader.neighbor_sampler.NeighborSampler`` for
        # more info.
        for batch_size, n_id, adjs in train_loader:
            optimizer.zero_grad()
            out = model(x[n_id], adjs)
            loss = F.nll_loss(out, y[n_id[:batch_size]])
            loss.backward()
            optimizer.step()

        if train.world_rank() == 0:
            print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}")

        train_accuracy = validation_accuracy = test_accuracy = None

        # Do validation on rank 0 worker only.
        if train.world_rank() == 0:
            model.eval()
            with torch.no_grad():
                out = model.module.test(x, subgraph_loader)
            res = out.argmax(dim=-1) == data.y
            train_accuracy = int(res[data.train_mask].sum()) / int(
                data.train_mask.sum()
            )
            validation_accuracy = int(res[data.val_mask].sum()) / int(
                data.val_mask.sum()
            )
            test_accuracy = int(res[data.test_mask].sum()) / int(data.test_mask.sum())

        train.report(
            train_accuracy=train_accuracy,
            validation_accuracy=validation_accuracy,
            test_accuracy=test_accuracy,
        )
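# Hypothetical launch sketch for the GNN loop above: it assumes the legacy
# ray.train.Trainer API and a PyTorch Geometric node-classification dataset
# with train/val/test masks (Reddit is used purely as an example). The `SAGE`
# model referenced in the loop is assumed to be defined elsewhere; all names
# and config values here are illustrative only.
from ray.train import Trainer
from torch_geometric.datasets import Reddit


def dataset_fn():
    return Reddit("./data/Reddit")


trainer = Trainer(backend="torch", num_workers=2, use_gpu=False)
trainer.start()
trainer.run(
    train_loop_per_worker,
    config={"dataset_fn": dataset_fn, "batch_size": 1024, "num_epochs": 3},
)
trainer.shutdown()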
def train_func(config):
    use_gpu = config["use_gpu"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_layers = config["num_layers"]
    num_hidden = config["num_hidden"]
    dropout_every = config["dropout_every"]
    dropout_prob = config["dropout_prob"]
    num_features = config["num_features"]

    print("Defining model, loss, and optimizer...")

    # Setup device.
    device = torch.device(f"cuda:{train.local_rank()}"
                          if use_gpu and torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Setup data.
    train_dataset_pipeline = train.get_dataset_shard("train_dataset")
    train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs()
    test_dataset = train.get_dataset_shard("test_dataset")
    test_torch_dataset = test_dataset.to_torch(label_column="label",
                                               batch_size=batch_size)

    net = Net(
        n_layers=num_layers,
        n_features=num_features,
        num_hidden=num_hidden,
        dropout_every=dropout_every,
        drop_prob=dropout_prob,
    ).to(device)
    print(net.parameters)

    net = train.torch.prepare_model(net)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001)

    print("Starting training...")
    for epoch in range(num_epochs):
        train_dataset = next(train_dataset_epoch_iterator)

        train_torch_dataset = train_dataset.to_torch(label_column="label",
                                                     batch_size=batch_size)

        train_running_loss, train_num_correct, train_num_total = train_epoch(
            train_torch_dataset, net, device, criterion, optimizer,
            num_features)
        train_acc = train_num_correct / train_num_total
        print(f"epoch [{epoch + 1}]: training accuracy: "
              f"{train_num_correct} / {train_num_total} = {train_acc:.4f}")

        test_running_loss, test_num_correct, test_num_total = test_epoch(
            test_torch_dataset, net, device, criterion)
        test_acc = test_num_correct / test_num_total
        print(f"epoch [{epoch + 1}]: testing accuracy: "
              f"{test_num_correct} / {test_num_total} = {test_acc:.4f}")

        # Record and log stats.
        train.report(
            train_acc=train_acc,
            train_loss=train_running_loss,
            test_acc=test_acc,
            test_loss=test_running_loss,
        )

        # Checkpoint model.
        module = net.module if isinstance(net, DistributedDataParallel) else net
        train.save_checkpoint(model_state_dict=module.state_dict())

    if train.world_rank() == 0:
        return module.cpu()
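# Hypothetical data-setup sketch for the function above: it calls
# `.iter_epochs()` on its "train_dataset" shard, so the object handed to the
# trainer is assumed to be a Ray DatasetPipeline built by repeating a Dataset
# once per epoch. Column names, sizes, and values are illustrative only; the
# exact DatasetPipeline API may differ between Ray versions.
import ray

num_epochs = 3
train_dataset = ray.data.from_items(
    [{"feat_0": float(i), "label": float(i % 2)} for i in range(1000)]
)
train_dataset_pipeline = train_dataset.repeat(num_epochs)
test_dataset = ray.data.from_items(
    [{"feat_0": float(i), "label": float(i % 2)} for i in range(100)]
)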
def train_func():
    if train.world_rank() == 0:
        train.save_checkpoint(epoch=0)
    else:
        train.report(iter=0)
def train_func(config):
    is_distributed = config.get("is_distributed", False)
    use_gpu = config["use_gpu"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_layers = config["num_layers"]
    num_hidden = config["num_hidden"]
    dropout_every = config["dropout_every"]
    dropout_prob = config["dropout_prob"]
    num_features = config["num_features"]

    print("Defining model, loss, and optimizer...")

    # Setup device.
    if is_distributed:
        device = torch.device(f"cuda:{train.local_rank()}"
                              if use_gpu and torch.cuda.is_available()
                              else "cpu")
    else:
        device = torch.device(
            "cuda:0" if use_gpu and torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Setup data.
    if is_distributed:
        train_dataset_pipeline = train.get_dataset_shard("train_dataset")
        train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs()
        test_dataset = train.get_dataset_shard("test_dataset")
    else:
        train_dataset_epoch_iterator = config["train_dataset"].iter_epochs()
        test_dataset = config["test_dataset"]

    test_torch_dataset = test_dataset.to_torch(label_column="label",
                                               batch_size=batch_size)

    # Setup Tensorboard and MLflow.
    if is_distributed:
        # Setup is done through Callback.
        pass
    else:
        writer = SummaryWriter()
        mlflow.start_run()
        mlflow_config = config.copy()
        mlflow_config.pop("test_dataset")
        mlflow_config.pop("train_dataset")
        mlflow.log_params(mlflow_config)

    net = Net(
        n_layers=num_layers,
        n_features=num_features,
        num_hidden=num_hidden,
        dropout_every=dropout_every,
        drop_prob=dropout_prob,
    ).to(device)
    print(net.parameters)

    if is_distributed:
        net = DistributedDataParallel(net)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), weight_decay=0.0001)

    print("Starting training...")
    for epoch in range(num_epochs):
        train_dataset = next(train_dataset_epoch_iterator)

        train_torch_dataset = train_dataset.to_torch(label_column="label",
                                                     batch_size=batch_size)

        train_running_loss, train_num_correct, train_num_total = train_epoch(
            train_torch_dataset, net, device, criterion, optimizer)
        train_acc = train_num_correct / train_num_total
        print(f"epoch [{epoch + 1}]: training accuracy: "
              f"{train_num_correct} / {train_num_total} = {train_acc:.4f}")

        test_running_loss, test_num_correct, test_num_total = test_epoch(
            test_torch_dataset, net, device, criterion)
        test_acc = test_num_correct / test_num_total
        print(f"epoch [{epoch + 1}]: testing accuracy: "
              f"{test_num_correct} / {test_num_total} = {test_acc:.4f}")

        # Record and log stats.
        if is_distributed:
            train.report(train_acc=train_acc,
                         train_loss=train_running_loss,
                         test_acc=test_acc,
                         test_loss=test_running_loss)
        else:
            writer.add_scalar("Accuracy/train", train_acc, epoch)
            writer.add_scalar("Loss/train", train_running_loss, epoch)
            writer.add_scalar("Accuracy/test", test_acc, epoch)
            writer.add_scalar("Loss/test", test_running_loss, epoch)
            writer.flush()

            mlflow.log_metrics({
                "train_acc": train_acc,
                "train_loss": train_running_loss,
                "test_acc": test_acc,
                "test_loss": test_running_loss
            })

        # Checkpoint model.
        if is_distributed:
            import copy
            model_copy = copy.deepcopy(net.module)
            train.save_checkpoint(
                model_state_dict=model_copy.cpu().state_dict())
        else:
            torch.save(net.state_dict(), f"models/model-epoch-{epoch}.torch")

    # Shutdown Tensorboard and MLflow.
    if is_distributed:
        pass
    else:
        writer.close()
        # mlflow.end_run()

    if is_distributed:
        if train.world_rank() == 0:
            return net.module.cpu()
        else:
            return None
    else:
        return net
def prepare_data_loader(
    self,
    data_loader: torch.utils.data.DataLoader,
    add_dist_sampler: bool = True,
    move_to_device: bool = True,
    auto_transfer: bool = True,
) -> torch.utils.data.DataLoader:
    """Prepares DataLoader for distributed execution.

    This allows you to use the same exact code regardless of the number of
    workers or the device type being used (CPU, GPU).

    Args:
        data_loader (torch.utils.data.DataLoader): The DataLoader to
            prepare.
        add_dist_sampler: Whether to add a DistributedSampler to
            the provided DataLoader.
        move_to_device: If set, automatically move the data
            returned by the data loader to the correct device.
        auto_transfer: If set and device is GPU, another CUDA stream
            is created to automatically copy data from host (CPU) memory
            to device (GPU) memory (the default CUDA stream still runs the
            training procedure). If device is CPU, it will be disabled
            regardless of the setting. This configuration will be ignored
            if ``move_to_device`` is False.
    """
    # Backwards compatibility
    try:
        world_size = session.get_world_size()
        world_rank = session.get_world_rank()
    except Exception:
        world_size = train.world_size()
        world_rank = train.world_rank()

    # Only add a DistributedSampler if the following conditions hold:
    # 1. More than one training worker is being used.
    # 2. A DistributedSampler has not already been added by the user.
    # 3. The dataset is not an IterableDataset. Samplers do not work with
    #    IterableDatasets.
    if (world_size > 1
            and not isinstance(data_loader.sampler, DistributedSampler)
            and not (hasattr(data_loader, "dataset")
                     and isinstance(data_loader.dataset, IterableDataset))
            and add_dist_sampler):

        def with_sampler(loader):
            # Automatically set the DistributedSampler
            # If you're using a sampler, the DataLoader shuffle flag must be
            # set to False. Shuffling is instead determined by the shuffle
            # argument passed to the DistributedSampler constructor.
            # If no sampler is passed to the DataLoader constructor, Torch
            # constructs a default sampler. The default sampler is a
            # RandomSampler if shuffling is enabled and a SequentialSampler
            # otherwise. DataLoader does not have a shuffle attribute, so we
            # instead identify whether shuffling is enabled by checking the
            # default sampler type.
            shuffle = not isinstance(loader.sampler, SequentialSampler)

            def seeded_worker_init_fn(worker_init_fn):
                def wrapper(worker_id):
                    worker_seed = torch.initial_seed() % 2**32
                    np.random.seed(worker_seed)
                    random.seed(worker_seed)
                    worker_init_fn(worker_id)

                return wrapper

            worker_init_fn = loader.worker_init_fn
            generator = loader.generator
            if self._seed is not None:
                worker_init_fn = seeded_worker_init_fn(loader.worker_init_fn)
                generator = torch.Generator()
                generator.manual_seed(self._seed)

            using_default_sampler = isinstance(
                loader.sampler, (SequentialSampler, RandomSampler))
            if not using_default_sampler and world_rank == 0:
                logger.warn(
                    f"The {loader.sampler.__class__.__name__} will be overwritten "
                    "with a DistributedSampler. You can disable this by setting "
                    "`with_sampler` to False in `prepare_data_loader`.")

            data_loader_args = {
                "dataset": loader.dataset,
                "batch_size": loader.batch_size,
                "shuffle": False,
                "num_workers": loader.num_workers,
                "collate_fn": loader.collate_fn,
                "pin_memory": loader.pin_memory,
                "drop_last": loader.drop_last,
                "timeout": loader.timeout,
                "worker_init_fn": worker_init_fn,
                "generator": generator,
                "sampler": DistributedSampler(loader.dataset, shuffle=shuffle),
            }

            return DataLoader(**data_loader_args)

        data_loader = with_sampler(data_loader)

    if move_to_device:
        device = self.get_device()
        data_loader = _WrappedDataLoader(data_loader, device, auto_transfer)

    return data_loader
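# Hypothetical usage sketch: from user code inside a training function, the
# public helper `ray.train.torch.prepare_data_loader` provides the behavior
# implemented above, adding a DistributedSampler and moving batches to the
# right device. The dataset and names below are illustrative only.
import torch
from torch.utils.data import DataLoader, TensorDataset

from ray import train

dataset = TensorDataset(torch.randn(256, 4), torch.randn(256, 1))
loader = DataLoader(dataset, batch_size=32, shuffle=True)
loader = train.torch.prepare_data_loader(loader)

for X, y in loader:
    ...  # batches are already on the correct device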