def build_iterator(self, is_train=True):
    """Create the poptorch data loaders used by this object.

    Builds the IPU options first if they are missing, builds the datasets,
    then stores a training loader in ``self.train_iterator`` and, unless
    ``self.use_generate`` is set, a validation loader in
    ``self.val_iterator``. Both loaders share the same collate function.

    Args:
        is_train: when true, spec-augment and the collate dtype are read
            from the ``train_dataset`` config; otherwise spec-augment is
            disabled and the dtype comes from the ``val_dataset`` config.
    """
    if not self.ipu_options:
        self.build_ipu_options()
    self.build_dataset()

    # Pick the config section that drives the collate function.
    dataset_key = 'train_dataset' if is_train else 'val_dataset'
    self.is_spec_aug = (
        self.args['train_dataset']['is_spec_aug'] if is_train else False
    )
    collate_fn = CollateFn(
        self.vocab.sos_id,
        self.vocab.eos_id,
        self.is_spec_aug,
        self.args[dataset_key]['dtype'],
    )

    # Keyword arguments common to both loaders.
    common_kwargs = dict(
        collate_fn=collate_fn,
        mode=poptorch.DataLoaderMode.Async,
        shuffle=True,
    )
    self.train_iterator = poptorch.DataLoader(
        self.ipu_options,
        dataset=self.train_dataset,
        **common_kwargs,
        **self.args['train_iterator'],
    )
    if self.use_generate:
        return
    self.val_iterator = poptorch.DataLoader(
        self.ipu_options,
        dataset=self.val_dataset,
        **common_kwargs,
        **self.args['val_iterator'],
    )
def test_reuse_workers(DatasetType):
    """Time several epochs of two asynchronous loaders and print the results.

    One loader is created with the default worker settings, the other with
    ``persistent_workers=False``. The test only prints timings and running
    element counts; it makes no assertions.
    """
    shape, num_tensors = [2, 3], 10
    opts = poptorch.Options()

    data = poptorch.DataLoader(opts,
                               DatasetType(shape, num_tensors),
                               batch_size=1,
                               num_workers=2)
    data_no_reuse = poptorch.DataLoader(opts,
                                        DatasetType(shape, num_tensors),
                                        batch_size=1,
                                        persistent_workers=False,
                                        num_workers=2)
    loader = poptorch.AsynchronousDataAccessor(data)
    loader_no_reuse = poptorch.AsynchronousDataAccessor(data_no_reuse)

    # Workers are created while the first element is fetched, so the
    # timer only starts once that first element has arrived.
    num_tensors = 0
    start = None
    for _ in loader_no_reuse:
        num_tensors += 1
        start = time.perf_counter() if start is None else start
    end = time.perf_counter()
    print(f"First epoch no reuse: {end - start} {num_tensors}")

    for _ in range(3):
        start = time.perf_counter()
        num_tensors += sum(1 for _ in loader_no_reuse)
        end = time.perf_counter()
        print(f"Other epoch no reuse: {end - start} {num_tensors}")

    # Same measurement for the loader created with default worker settings.
    num_tensors_reuse = 0
    start = None
    for _ in loader:
        num_tensors_reuse += 1
        start = time.perf_counter() if start is None else start
    end = time.perf_counter()
    print(f"First epoch: {end - start} {num_tensors_reuse}")

    for _ in range(3):
        start = time.perf_counter()
        num_tensors_reuse += sum(1 for _ in loader)
        end = time.perf_counter()
        print(f"Other epoch: {end - start} {num_tensors_reuse}")
def _run_dataset_test(shape=None,
                      num_tensors=100,
                      batch_size=1,
                      num_workers=0,
                      device_iterations=1,
                      replication_factor=1,
                      host_id=0,
                      num_hosts=1):
    """Check the values an async loader yields for one host of a multi-host run.

    Builds a loader over ``IncrementDataset`` configured for ``host_id`` out
    of ``num_hosts``, checks its length, and compares every yielded batch
    against constant tensors whose values start at this host's offset.
    """
    if not shape:
        shape = [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)
    opts.Distributed.configureProcessId(host_id, num_hosts)

    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)
    loader = poptorch.AsynchronousDataAccessor(data)

    # First dataset value expected on this host.
    offset = host_id * (num_tensors // num_hosts)

    per_step = (device_iterations * batch_size * replication_factor *
                num_hosts)
    assert len(data) == num_tensors // per_step

    for step, batch in enumerate(loader):
        first = data.combinedBatchSize * step
        last = data.combinedBatchSize * (step + 1)
        planes = [
            numpy.full(shape, offset + value, dtype=numpy.float32)
            for value in range(first, last)
        ]
        expected = torch.from_numpy(numpy.stack(planes))

        diff = torch.sum(torch.sum(batch - expected))
        numpy.testing.assert_array_equal(diff.numpy(), [0.])
def _run_process_test(shape=None,
                      num_tensors=100,
                      batch_size=1,
                      num_workers=0,
                      device_iterations=1,
                      replication_factor=1,
                      num_runs=1):
    """Feed an async loader through ``DoubleData`` and check the outputs.

    The loader is iterated ``num_runs`` times; for each batch the model
    output must equal stacked constant tensors holding twice the running
    element index.
    """
    if not shape:
        shape = [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)
    loader = poptorch.AsynchronousDataAccessor(data)

    per_step = device_iterations * batch_size * replication_factor
    assert len(loader) == num_tensors // per_step

    model = poptorch.inferenceModel(DoubleData(), opts)

    for _ in range(num_runs):
        for step, batch in enumerate(loader):
            out = model(batch)

            first = data.combinedBatchSize * step
            last = data.combinedBatchSize * (step + 1)
            expected = torch.stack([
                torch.full(shape, value * 2, dtype=torch.float32)
                for value in range(first, last)
            ])
            assert torch.equal(expected, out)
def _convert_to_poptorch_loader(
        self, dataloader: Union[Iterable, DataLoader],
        opts: 'poptorch.Options') -> Union[Iterable, DataLoader]:
    """Rebuild ``dataloader`` as a ``poptorch.DataLoader`` using ``opts``.

    Constructor arguments are recovered from the loader's public attributes
    that match parameter names of its ``__init__`` (plus those of
    ``DataLoader.__init__`` for subclasses). ``drop_last`` is forced to
    ``True`` and the original ``multiprocessing_context`` is carried over.
    """
    skip_keys = ('sampler', 'batch_sampler', 'dataset_kind')

    public_attrs = {
        key: value
        for key, value in vars(dataloader).items()
        if not key.startswith("_")
    }

    init_params = set(inspect.signature(dataloader.__init__).parameters)
    is_plain_loader = type(dataloader) is DataLoader
    # A custom loader may not accept ``dataset`` in its own constructor.
    contains_dataset = is_plain_loader or "dataset" in init_params
    if not is_plain_loader:
        init_params.update(
            inspect.signature(DataLoader.__init__).parameters)

    dl_args = {
        key: value
        for key, value in public_attrs.items()
        if key in init_params and key not in skip_keys
    }

    mp_context = dataloader.multiprocessing_context
    dl_args['multiprocessing_context'] = mp_context
    if not contains_dataset:
        dl_args.pop('dataset')

    # Override to drop last uneven batch, as IPUs does not support uneven inputs.
    dl_args['drop_last'] = True

    converted = poptorch.DataLoader(**dl_args, options=opts)
    converted.multiprocessing_context = mp_context
    return converted
def setupTraining(model, args):
    """Prepare a training model and data loader for CIFAR-10.

    The loader is a ``poptorch.DataLoader`` built from the training options,
    so that each training iteration executed on the IPU will incorporate:

        * (mini-)batch size
        * device iterations
        * replica factor
        * gradient accumulation factor

    ``poptorch.DataLoaderMode.Async`` is used so the dataset is loaded on a
    separate thread, which reduces the host/IPU communication overhead by
    preparing the next batch on the CPU while the IPU is running.

    Returns:
        A ``(training_model, loader)`` tuple.
    """
    opts = setupOptions(args, train=True)

    sgd = optim.SGD(model.parameters(), lr=args.lr)
    training_model = poptorch.trainingModel(model, opts, sgd)

    train_set = cifar10(args.data_dir, train=True)
    loader = poptorch.DataLoader(
        opts,
        train_set,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=8,
        mode=poptorch.DataLoaderMode.Async,
    )
    return training_model, loader
def setupInference(model, args):
    """Prepare an inference model and data loader for CIFAR-10.

    The dataset is requested with ``train=False`` and wrapped in a
    ``poptorch.DataLoader`` built from the inference options, so each
    iteration executed on the IPU incorporates the batch size and the
    device-iteration / replica settings carried by those options.

    The loader is then wrapped in ``poptorch.AsynchronousDataAccessor`` so
    the dataset is loaded on a separate thread, which reduces the host/IPU
    communication overhead by preparing the next batch on the CPU while the
    IPU is running.

    Returns:
        An ``(inference_model, loader)`` tuple.
    """
    opts = setupOptions(args, train=False)
    inference_model = poptorch.inferenceModel(model, opts)

    test_set = cifar10(args.data_dir, train=False)
    sync_loader = poptorch.DataLoader(
        opts,
        test_set,
        batch_size=args.test_batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=8,
    )
    loader = poptorch.AsynchronousDataAccessor(sync_loader)
    return inference_model, loader
def _run_test(shape=None,
              num_tensors=100,
              batch_size=1,
              num_workers=0,
              device_iterations=1,
              replication_factor=1):
    """Check length and element order of a synchronous ``poptorch.DataLoader``.

    Every batch is passed to ``CheckOrderModel`` together with the expected
    constant tensors; the summed model output must be zero.
    """
    if not shape:
        shape = [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               IncrementDataset(shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)

    per_step = device_iterations * batch_size * replication_factor
    assert len(data) == num_tensors // per_step

    model = poptorch.inferenceModel(CheckOrderModel(), opts)

    for step, batch in enumerate(data):
        first = data.combinedBatchSize * step
        last = data.combinedBatchSize * (step + 1)
        planes = [
            numpy.full(shape, value, dtype=numpy.float32)
            for value in range(first, last)
        ]
        expected = torch.from_numpy(numpy.stack(planes))

        diff = torch.sum(model(batch, expected))
        numpy.testing.assert_array_equal(diff.numpy(), [0.])
def get_dataloader(batch_size, opts, num_iterations, synthetic=False):
    """Build the dataloader that feeds data to the IPU device.

    The dataset holds exactly enough samples for ``num_iterations`` steps
    (batch size x device iterations x replication factor x iterations). It
    is a ``SynthDataset`` when ``synthetic`` is true, otherwise a
    ``SampleDataset`` reading from ``./images`` with a
    resize / centre-crop / to-tensor transform.
    """
    samples_per_step = (batch_size * opts.device_iterations *
                        opts.replication_factor)
    dataset_size = samples_per_step * num_iterations

    if synthetic:
        dataset = SynthDataset(size=dataset_size)
    else:
        preprocessing = [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
        ]
        dataset = SampleDataset(img_dir='./images',
                                transform=transforms.Compose(preprocessing),
                                size=dataset_size)

    return poptorch.DataLoader(opts,
                               dataset,
                               batch_size=batch_size,
                               shuffle=False,
                               drop_last=True)
def train_dataloader(self):
    """Return a ``poptorch.DataLoader`` over the parent loader's dataset.

    The loader is also stored on ``self.poptorch_dataloader`` so that it can
    be compared by reference later.
    """
    parent_loader = super().train_dataloader()
    self.poptorch_dataloader = poptorch.DataLoader(
        model_options,
        parent_loader.dataset,
        drop_last=True,
    )
    return self.poptorch_dataloader
def test_broken_dataset():
    """Wrapping a loader over ``BrokenDataset`` must raise a RuntimeError."""
    opts = poptorch.Options()
    broken = BrokenDataset(100)
    data = poptorch.DataLoader(opts, broken, batch_size=1, num_workers=32)

    expected_message = "worker thread failed to start"
    with pytest.raises(RuntimeError, match=expected_message):
        poptorch.AsynchronousDataAccessor(data)
def _convert_to_poptorch_loader(
        self,
        dataloader: DataLoader,
        sampler,
        mode: Optional[RunningStage] = None) -> "poptorch.DataLoader":
    """Re-create ``dataloader`` as a ``poptorch.DataLoader``.

    Training options are used when ``mode`` is ``RunningStage.TRAINING``,
    inference options otherwise. ``drop_last`` is always forced on.
    """
    # use full path to avoid circular imports
    mixin = pl.trainer.trainer.TrainerDataLoadingMixin
    init_kwargs = mixin._get_dataloader_init_kwargs(dataloader, sampler)

    # Override to drop last uneven batch, as IPUs does not support uneven inputs.
    init_kwargs["drop_last"] = True

    if mode == RunningStage.TRAINING:
        options = self.training_opts
    else:
        options = self.inference_opts
    return poptorch.DataLoader(**init_kwargs, options=options)
def _convert_to_poptorch_loader(
        self,
        dataloader: DataLoader,
        sampler,
        mode: Optional[RunningStage] = None) -> "poptorch.DataLoader":
    """Return a ``poptorch.DataLoader`` equivalent of ``dataloader``.

    A loader that is already a ``poptorch.DataLoader`` is returned untouched.
    Otherwise a new one is built from the original loader's init kwargs,
    with training options when ``mode`` is ``RunningStage.TRAINING`` and
    inference options in every other case.
    """
    # The user supplied a poptorch loader directly: leave it alone.
    if isinstance(dataloader, poptorch.DataLoader):
        return dataloader

    init_kwargs = _get_dataloader_init_kwargs(dataloader, sampler)
    if mode == RunningStage.TRAINING:
        options = self.training_opts
    else:
        options = self.inference_opts
    return poptorch.DataLoader(options, **init_kwargs)
def test_len():
    """``len()`` of an async accessor fails without, and works with, a dataset length."""
    shape, num_tensors = [2, 3], 10
    opts = poptorch.Options()

    def wrap(dataset):
        # Build the loader and its asynchronous accessor for ``dataset``.
        data = poptorch.DataLoader(opts,
                                   dataset,
                                   batch_size=None,
                                   drop_last=False,
                                   num_workers=1)
        return poptorch.AsynchronousDataAccessor(data)

    loader = wrap(IncrementIterableDataset(shape, num_tensors))
    with pytest.raises(TypeError,
                       match="'IncrementIterableDataset' has no len()"):
        len(loader)

    loader = wrap(IncrementIterableDatasetWithLen(shape, num_tensors))
    len(loader)
def test_single_epoch():
    """Iterate an async loader backed by 32 workers through one full epoch."""
    num_tensors = 100
    dataset = IncrementDataset([2, 3], num_tensors)

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts, dataset, batch_size=1, num_workers=32)
    loader = poptorch.AsynchronousDataAccessor(data)

    assert len(loader) == num_tensors

    # Drain the loader; the values themselves are not checked.
    for _ in loader:
        pass
def test_interrupt_async_loader():
    """Make sure the worker processes are stopped cleanly even when the end
    of the dataset is not reached."""
    num_tensors = 100
    dataset = IncrementDataset([2, 3], num_tensors)

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts, dataset, batch_size=1, num_workers=1)
    loader = poptorch.AsynchronousDataAccessor(data)

    assert len(loader) == num_tensors

    # Stop right after the first element to interrupt the epoch early.
    for _ in loader:
        break
def test_iterable_dataloader():
    """An async loader over an iterable dataset yields [1, 2, 3] tensors, twice over."""
    dataset = IncrementIterableDataset([2, 3], 100)

    opts = poptorch.Options()
    data = poptorch.DataLoader(opts, dataset, batch_size=1, num_workers=1)
    loader = poptorch.AsynchronousDataAccessor(data)

    expected_shape = torch.Size([1, 2, 3])
    for tensor in loader:
        assert tensor.shape == expected_shape

    # Make sure it works for more than 1 epoch
    for _ in loader:
        pass
def test_random_raw(random_generator, instances):
    """
    Tests whether all the augmentations are unique.
    """
    class DummyDataset(torch.utils.data.Dataset):
        """Dataset returning ``index`` plus a random fraction from the chosen generator."""

        def __init__(self, size=10, transform=None):
            self.size = size
            self.transform = transform

        def __len__(self):
            return self.size

        def __getitem__(self, index):
            kind = self.transform
            if kind == "numpy":
                return float(index) + np.random.random(1)[0]
            if kind == "torch":
                return float(index) + torch.rand(1)[0]
            if kind == "python":
                return float(index) + random.random()
            return float(index) + 0.0

    ds = DummyDataset(transform=random_generator)
    augments, elements = [], []

    for instance_id in range(instances):
        opts = poptorch.Options()
        worker_init = _WorkerInit(42, instance_id, 5)
        if instances > 1:
            opts.Distributed.configureProcessId(instance_id, instances)
        opts = opts.randomSeed(42)

        data_loader = poptorch.DataLoader(opts,
                                          ds,
                                          batch_size=1,
                                          num_workers=5,
                                          shuffle=True,
                                          worker_init_fn=worker_init)
        for item in data_loader:
            # The fractional part of the value is the augmentation.
            fraction = item[0].numpy().tolist() % 1
            # Scale to an integer to avoid rounding error.
            augments.append(int(10000 * fraction))
            elements.append(int(item))

    assert len(elements) == len(set(elements))
    # all augmentations must be unique
    assert len(augments) == len(set(augments))
def _run_process_label_test(shape=None,
                            num_tensors=100,
                            batch_size=1,
                            num_workers=0,
                            device_iterations=1,
                            replication_factor=1):
    """Run data and labels from an async loader through ``DoubleDataLabel``.

    The summed outputs and summed labels over the whole loader must both
    equal the sum of ``2 * i`` for every ``i`` below ``num_tensors``.
    """
    if not shape:
        shape = [2, 3]

    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               IncrementDatasetWithLabels(shape, num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)
    loader = poptorch.AsynchronousDataAccessor(data)

    per_step = device_iterations * batch_size * replication_factor
    assert len(loader) == num_tensors // per_step

    model = poptorch.inferenceModel(DoubleDataLabel(), opts)

    total = torch.zeros(shape)
    label_out = torch.zeros(1, dtype=torch.int)
    for batch, labels in loader:
        doubled, doubled_labels = model(batch, labels)
        total += torch.sum(doubled, dim=0)
        label_out += torch.sum(doubled_labels, dim=0)

    actual = sum(index * 2 for index in range(num_tensors))

    numpy.testing.assert_array_equal(total[0][0].numpy(), [actual])
    numpy.testing.assert_array_equal(label_out[0].item(), [actual])
def process(process_id=0, num_processes=1):
    """Train the example model as one process of a distributed run.

    Configures options for this process, builds a shuffled
    ``poptorch.DataLoader`` over ``ExampleDataset`` and runs the training
    model over every batch, printing the batch number, last label, output
    and loss. ``model`` and ``model_batch_size`` are taken from the
    enclosing scope.
    """
    options = poptorch.Options()
    # 400 device iterations per call, fetching a new batch each time.
    options.deviceIterations(400)
    # Replicate the graph across 2 IPUs in each process.
    options.replicationFactor(2)
    # Identify this process among all the processes taking part.
    options.Distributed.configureProcessId(process_id, num_processes)
    # Accumulate the gradient 8 times before applying it.
    options.Training.gradientAccumulation(8)
    # Optional: All the processes must use the same seed if shuffle=True
    # is used for the DataLoader.
    options.randomSeed(42)

    example_set = ExampleDataset(shape=[3, 2], length=100000)
    training_data = poptorch.DataLoader(options,
                                        dataset=example_set,
                                        batch_size=model_batch_size,
                                        shuffle=True,
                                        drop_last=True)

    # Wrap the model in a PopTorch training wrapper
    wrapped_model = poptorch.trainingModel(model, options=options)

    for batch_number, (data, labels) in enumerate(training_data):
        # "output" and "loss" will be the respective output and loss of the
        # final batch of each replica (the default AnchorMode).
        output, loss = wrapped_model(data, labels)
        print(f"{batch_number} {labels[-1]}, {output}, {loss}")
def run_data_loader_example():
    """Run the DataLoader usage examples back to back.

    The function is split into sections delimited by ``# <name>_start`` /
    ``# <name>_end`` marker comments (replication, gradient accumulation,
    asynchronous data accessor, distributed execution). The markers look
    like anchors for documentation extraction - TODO confirm - so they are
    kept verbatim.
    """
    model_batch_size = 2
    # replication_start
    # Create a poptorch.Options instance to override default options
    opts = poptorch.Options()

    # Run a 100 iteration loop on the IPU, fetching a new batch each time
    opts.deviceIterations(100)

    # Duplicate the model over 4 replicas.
    opts.replicationFactor(4)

    training_data = poptorch.DataLoader(opts,
                                        dataset=ExampleDataset(shape=[3, 2],
                                                               length=100000),
                                        batch_size=model_batch_size,
                                        shuffle=True,
                                        drop_last=True)

    model = ExampleModelWithLoss(data_shape=[3, 2], num_classes=2)
    # Wrap the model in a PopTorch training wrapper
    poptorch_model = poptorch.trainingModel(model, options=opts)

    # Run over the training data. Each item yielded by the loader bundles
    # the model batches for every device iteration and replica.
    for batch_number, (data, labels) in enumerate(training_data):
        # Execute the device with a 100 iteration loop of batchsize 2 across
        # 4 IPUs. "output" and "loss" will be the respective output and loss of the
        # final batch of each replica (the default AnchorMode).
        output, loss = poptorch_model(data, labels)
        print(f"{labels[-1]}, {output}, {loss}")
    # replication_end
    # gradient_acc_start
    # Create a poptorch.Options instance to override default options
    opts = poptorch.Options()

    # Run a 400 iteration loop on the IPU, fetching a new batch each time
    opts.deviceIterations(400)

    # Accumulate the gradient 8 times before applying it.
    opts.Training.gradientAccumulation(8)

    training_data = poptorch.DataLoader(opts,
                                        dataset=ExampleDataset(shape=[3, 2],
                                                               length=100000),
                                        batch_size=model_batch_size,
                                        shuffle=True,
                                        drop_last=True)

    # Wrap the model in a PopTorch training wrapper
    poptorch_model = poptorch.trainingModel(model, options=opts)

    # Run over the training data. Each item yielded by the loader bundles
    # the model batches for every device iteration and accumulation step.
    for batch_number, (data, labels) in enumerate(training_data):
        # Execute the device with a 400 iteration loop of batchsize 2
        # (no replication is configured in this section).
        # "output" and "loss" will be the respective output and loss of the
        # final batch of each replica (the default AnchorMode).
        output, loss = poptorch_model(data, labels)
        print(f"{labels[-1]}, {output}, {loss}")
    # gradient_acc_end

    # Not displayed: just to keep the linter happy
    shape = [3, 2]
    num_tensors = 100
    batch_size = 1
    num_workers = 0
    device_iterations = 1
    replication_factor = 1
    # Example starts here:
    # data_accessor_start
    opts = poptorch.Options()
    opts.deviceIterations(device_iterations)
    opts.replicationFactor(replication_factor)

    data = poptorch.DataLoader(opts,
                               ExampleDataset(shape=shape, length=num_tensors),
                               batch_size=batch_size,
                               num_workers=num_workers)

    # Wrap the loader so batches are fetched asynchronously.
    loader = poptorch.AsynchronousDataAccessor(data)

    poptorch_model = poptorch.inferenceModel(model, opts)

    # Only the data half of each (data, label) pair is fed to the model.
    for it, (data, _) in enumerate(loader):
        out = poptorch_model(data)
    # data_accessor_end

    # distributed_execution_start
    def process(process_id=0, num_processes=1):
        # Create a poptorch.Options instance to override default options
        opts = poptorch.Options()

        # Run a 400 iteration loop on the IPU, fetching a new batch each time
        opts.deviceIterations(400)

        # Replicate the graph across 2 IPUs in each process.
        opts.replicationFactor(2)

        # Set the id of the current process and the total number of processes.
        opts.Distributed.configureProcessId(process_id, num_processes)

        # Accumulate the gradient 8 times before applying it.
        opts.Training.gradientAccumulation(8)

        # Optional: All the processes must use the same seed if shuffle=True is used for the DataLoader.
        opts.randomSeed(42)

        training_data = poptorch.DataLoader(opts,
                                            dataset=ExampleDataset(
                                                shape=[3, 2], length=100000),
                                            batch_size=model_batch_size,
                                            shuffle=True,
                                            drop_last=True)

        # Wrap the model in a PopTorch training wrapper
        poptorch_model = poptorch.trainingModel(model, options=opts)

        # Run over the training data. Each item yielded by the loader bundles
        # the model batches for this process's device iterations, replicas
        # and accumulation steps.
        for batch_number, (data, labels) in enumerate(training_data):
            # Execute the device with a 400 iteration loop of batchsize 2
            # across the 2 replicas of this process.
            # "output" and "loss" will be the respective output and loss of the
            # final batch of each replica (the default AnchorMode).
            output, loss = poptorch_model(data, labels)
            print(f"{batch_number} {labels[-1]}, {output}, {loss}")
# Set the batch size in the conventional sense of being the size that # runs through an operation in the model at any given time model_batch_size = 2 # Create a poptorch.Options instance to override default options opts = poptorch.Options() # Run a 100 iteration loop on the IPU, fetching a new batch each time opts.deviceIterations(100) # Set up the DataLoader to load that much data at each iteration training_data = poptorch.DataLoader(opts, dataset=ExampleDataset(shape=[3, 2], length=10000), batch_size=model_batch_size, shuffle=True, drop_last=True) model = ExampleModelWithLoss(data_shape=[3, 2], num_classes=2) # Wrap the model in a PopTorch training wrapper poptorch_model = poptorch.trainingModel(model, options=opts) # Run over the training data with "batch_size" 200 essentially. for batch_number, (data, labels) in enumerate(training_data): # Execute the device with a 100 iteration loop of batchsize 2. # "output" and "loss" will be the respective output and loss of the final # batch (the default AnchorMode). output, loss = poptorch_model(data, labels) print(f"{labels[-1]}, {output}, {loss}")
self._all_labels.append(label) def __len__(self): return self._length def __getitem__(self, index): return self._all_data[index], self._all_labels[index] # simple_ipu_start # Set up the PyTorch DataLoader to load that much data at each iteration opts = poptorch.Options() opts.deviceIterations(10) training_data = poptorch.DataLoader(options=opts, dataset=ExampleDataset(shape=[1], length=20000), batch_size=10, shuffle=True, drop_last=True) model = ExampleModelWithLoss() model.train() optimizer = torch.optim.AdamW(model.parameters(), lr=0.001) # Wrap the model in a PopTorch training wrapper poptorch_model = poptorch.trainingModel(model, options=opts, optimizer=optimizer) momentum_loss = None
opts = poptorch.Options() # Device "step" opts.deviceIterations(20) # How many IPUs to replicate over. opts.replicationFactor(4) opts.randomSeed(42) # Load MNIST normally. training_data = poptorch.DataLoader( opts, torchvision.datasets.MNIST('mnist_data/', train=True, download=True, transform=torchvision.transforms.Compose([ torchvision.transforms.ToTensor(), torchvision.transforms.Normalize((0.1307, ), (0.3081, )) ])), batch_size=training_batch_size, shuffle=True) # Load MNIST normally. val_options = poptorch.Options() validation_data = poptorch.DataLoader( val_options, torchvision.datasets.MNIST('mnist_data/', train=True, download=True, transform=torchvision.transforms.Compose([ torchvision.transforms.ToTensor(),
def get_data(configs, model_opts, train=True, async_dataloader=False):
    """Build the dataloader that sends data to the IPU device.

    Selects the dataset named by ``configs.dataset`` and wraps it in a
    ``poptorch.DataLoader``.

    Args:
        configs: run configuration. Reads ``precision``,
            ``normalization_location``, ``dataset``, ``micro_batch_size``,
            ``dataloader_workers`` and, depending on the dataset,
            ``dataset_path`` and the optional ``iterations``.
        model_opts: poptorch options handed to the dataloader; also used to
            size the generated dataset when ``configs.iterations`` exists.
        train: selects the training split and enables shuffling.
        async_dataloader: use ``DataLoaderMode.AsyncRebatched`` instead of
            ``DataLoaderMode.Sync``.

    Returns:
        The configured ``poptorch.DataLoader``.

    Raises:
        ValueError: if ``configs.precision`` starts with neither ``"16."``
            nor ``"32."``.
        Exception: if ``configs.dataset`` is not a recognised dataset name.
    """
    # Validate the precision first: without the final ``else`` the flag
    # would be left unbound and fail later with an obscure UnboundLocalError.
    if configs.precision.startswith("16."):
        half_precision = True
    elif configs.precision.startswith("32."):
        half_precision = False
    else:
        raise ValueError(
            f"Unsupported precision {configs.precision!r}: "
            "expected a value starting with '16.' or '32.'")

    transform = get_preprocessing_pipeline(
        train, 224, half_precision, configs.normalization_location == "host")

    # Determine the size of the small datasets
    has_iterations = hasattr(configs, "iterations")
    if has_iterations:
        dataset_size = configs.micro_batch_size * \
                       model_opts.device_iterations * \
                       model_opts.replication_factor * \
                       model_opts.Training.gradient_accumulation * \
                       configs.iterations

    rebatched_worker_size = None

    # Select the right dataset
    if configs.dataset in ["synthetic", "generated"]:
        if has_iterations:
            dataset = GeneratedDataset((3, 224, 224),
                                       size=dataset_size,
                                       half_precision=half_precision)
        else:
            dataset = GeneratedDataset((3, 224, 224),
                                       half_precision=half_precision)
    elif configs.dataset in ["imagenet1k", "imagenet21k"]:
        dataset = torchvision.datasets.ImageFolder(os.path.join(
            configs.dataset_path, "train" if train else "validation"),
                                                   transform=transform)
        if train:
            rebatched_worker_size = 128
    elif configs.dataset == "cifar10":
        dataset = torchvision.datasets.CIFAR10(root=configs.dataset_path,
                                               train=train,
                                               download=True,
                                               transform=transform)
        if train:
            rebatched_worker_size = 256
    else:
        raise Exception('Dataset type not recognized: %s' % configs.dataset)

    if async_dataloader:
        mode = poptorch.DataLoaderMode.AsyncRebatched
    else:
        mode = poptorch.DataLoaderMode.Sync

    # Batching, shuffling, drop_last and distributed partitioning are all
    # switched off for iterable datasets; evaluate the check once.
    is_iterable = isinstance(dataset, IterableDataset)

    dataloader = poptorch.DataLoader(
        model_opts,
        dataset,
        batch_size=None if is_iterable else configs.micro_batch_size,
        num_workers=configs.dataloader_workers,
        shuffle=train and not is_iterable,
        drop_last=not is_iterable,
        persistent_workers=True,
        auto_distributed_partitioning=not is_iterable,
        worker_init_fn=None,
        mode=mode,
        rebatched_worker_size=rebatched_worker_size,
        async_options={
            'load_indefinitely': True,
            "buffer_size": 8
        })
    return dataloader
def get_data(args, opts, train=True, async_dataloader=False, return_remaining=False, fine_tuning=False):
    """Build the dataloader that sends data to the IPU device.

    Selects the dataset named by ``args.data`` and wraps it in a
    ``poptorch.DataLoader``; iterable datasets are additionally wrapped in
    ``DatasetRebatch``.

    Args:
        args: run arguments. Reads ``precision``, ``data``, ``batch_size``,
            ``dataloader_worker``, ``eight_bit_io`` and, depending on the
            dataset, ``normalization_location``, ``imagenet_data_path`` and
            the optional ``iterations``, ``seed``, ``use_bbox_info`` and
            ``dataloader_rebatch_size``.
        opts: poptorch options handed to the dataloader and used to compute
            the global batch size.
        train: selects the training split and enables shuffling.
        async_dataloader: use an asynchronous dataloader mode.
        return_remaining: when true, the last incomplete batch is kept.
        fine_tuning: forwarded to the preprocessing pipeline.

    Returns:
        The configured dataloader.

    Raises:
        ValueError: if ``args.precision`` starts with neither ``"16."`` nor
            ``"32."``, or if ``args.data`` is not a recognised dataset name.
        AssertionError: if ``args.data`` is ``"imagenet"`` and
            ``args.imagenet_data_path`` does not exist.
    """
    logging.info("Loading the data")

    # Validate the precision up front: without the final ``else`` the flag
    # would be left unbound and fail later with an obscure UnboundLocalError.
    if args.precision.startswith("16."):
        half_precision = True
    elif args.precision.startswith("32."):
        half_precision = False
    else:
        raise ValueError(
            f"Unsupported precision {args.precision!r}: "
            "expected a value starting with '16.' or '32.'")

    input_shape = models.model_input_shape(args, train)
    use_bbox_info = getattr(args, "use_bbox_info", False)

    if args.data in ["real", "imagenet", "cifar10"]:
        transform = get_preprocessing_pipeline(train, input_shape[-1],
                                               half_precision, args.normalization_location == "host",
                                               eightbit=args.eight_bit_io,
                                               use_bbox_info=use_bbox_info,
                                               fine_tuning=fine_tuning)

    # Determine the size of the small datasets
    has_iterations = hasattr(args, "iterations")
    if has_iterations:
        dataset_size = args.batch_size * \
                       opts.device_iterations * \
                       opts.replication_factor * \
                       opts.Training.gradient_accumulation * \
                       args.iterations

    # Select the right dataset
    if args.data in ["synthetic", "generated"]:
        if has_iterations:
            dataset = GeneratedDataset(input_shape, size=dataset_size, half_precision=half_precision, eightbit=args.eight_bit_io)
        else:
            dataset = GeneratedDataset(input_shape, half_precision=half_precision, eightbit=args.eight_bit_io)
    elif args.data == "real":
        data_path = Path(__file__).parent.parent.absolute().joinpath("data").joinpath("images")
        if has_iterations:
            dataset = SampleDataset(img_dir=data_path, transform=transform, size=dataset_size)
        else:
            dataset = SampleDataset(img_dir=data_path, transform=transform)
    elif args.data == "imagenet":
        assert os.path.exists(args.imagenet_data_path), f"{args.imagenet_data_path} does not exist!"
        if os.path.exists(os.path.join(args.imagenet_data_path, 'metadata.json')):
            # WebDataset format
            dataset = get_webdataset(args, opts, train, transform=transform, use_bbox_info=use_bbox_info)
        else:
            data_folder = 'train' if train else 'validation'
            data_folder = os.path.join(args.imagenet_data_path, data_folder)
            if os.path.exists(data_folder):
                # Original ImageNet format
                # use bboxes only for training
                bboxes = os.path.join(args.imagenet_data_path, 'imagenet_2012_bounding_boxes.csv') if use_bbox_info and train else None
                dataset = ImageNetDataset(data_folder, transform=transform, bbox_file=bboxes)
            else:
                # TFRecord format
                dataset = get_tfrecord(args, opts, train, transform=transform, use_bbox_info=use_bbox_info)
    elif args.data == "cifar10":
        data_path = Path(__file__).parent.parent.absolute().joinpath("data").joinpath("cifar10")
        dataset = torchvision.datasets.CIFAR10(root=data_path, train=train, download=True, transform=transform)
    else:
        # Without this branch ``dataset`` would be unbound further down.
        raise ValueError(f"Dataset type not recognized: {args.data}")

    global_batch_size = args.batch_size * opts.device_iterations * opts.replication_factor * opts.Training.gradient_accumulation

    if async_dataloader:
        if global_batch_size == 1:
            # Avoid rebatch overhead
            mode = poptorch.DataLoaderMode.Async
        else:
            mode = poptorch.DataLoaderMode.AsyncRebatched
    else:
        mode = poptorch.DataLoaderMode.Sync

    worker_initialization = _WorkerInit(args.seed, opts.Distributed.processId, args.dataloader_worker) if hasattr(args, 'seed') else None

    rebatch_size = getattr(args, "dataloader_rebatch_size", None)
    if rebatch_size is None:
        rebatch_size = min(1024, global_batch_size) // opts.Distributed.numProcesses
    # Make sure rebatch size is smaller than global batch size
    rebatch_size = min(rebatch_size, global_batch_size)

    # Shuffling, drop_last and distributed partitioning are all switched
    # off for iterable datasets; evaluate the check once.
    is_iterable = isinstance(dataset, torch.utils.data.IterableDataset)

    dataloader = poptorch.DataLoader(opts,
                                     dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.dataloader_worker,
                                     shuffle=train and not is_iterable,
                                     drop_last=not return_remaining and not is_iterable,
                                     persistent_workers=True,
                                     auto_distributed_partitioning=not is_iterable,
                                     worker_init_fn=worker_initialization,
                                     mode=mode,
                                     rebatched_worker_size=rebatch_size,
                                     async_options={'load_indefinitely': True})
    if is_iterable:
        dataloader = DatasetRebatch(dataloader, global_batch_size, len(dataset), not return_remaining)
    return dataloader
def main():
    """Fine-tune and/or evaluate pipelined BERT on SQuAD using PopTorch.

    The configuration is built from ``parse_bert_args()``. Depending on
    ``config.squad_do_training`` and ``config.squad_do_validation`` the
    function:

    * tokenizes the SQuAD train/validation splits,
    * compiles and trains the model for ``config.num_epochs`` epochs,
    * compiles an inference model, runs it over the validation features and
      logs the metrics computed by the "squad" metric.

    When ``config.compile_only`` is set the process exits (``sys.exit()``)
    right after the first compilation that is reached.

    Weights & Biases reporting is enabled only when ``config.wandb`` is set
    and the process is either not using popdist or is popdist rank 0. The
    same condition gates ``wandb.init`` and every later ``wandb`` call so
    that other ranks never use an uninitialised run.
    """
    config = transformers.BertConfig(**(vars(parse_bert_args())))
    if not config.pretrained_checkpoint:
        logger(
            "[warning] --pretrained-checkpoint was not specified; training with uninitialized BERT..."
        )
    # Warnings for configs where embeddings may not fit
    if config.embedding_serialization_factor == 1:
        if config.replication_factor == 1:
            logger(
                "[warning] With replication_factor == 1 you may need to set "
                "embedding_serialization_factor > 1 for the model to fit")
        elif not config.replicated_tensor_sharding:
            logger(
                "[warning] With replicated_tensor_sharding=False you may need to set "
                "embedding_serialization_factor > 1 for the model to fit")

    # Number of samples consumed by one call of the training model.
    samples_per_step = config.batches_per_step * config.micro_batch_size * \
        config.gradient_accumulation * config.replication_factor
    do_training = config.squad_do_training
    do_validation = config.squad_do_validation
    opts = get_options(config)
    opts.outputMode(poptorch.OutputMode.All)

    logger("Loading Dataset...")
    datasets = load_dataset("squad")
    train_dataset = datasets["train"]

    # Create train features from dataset
    logger("Tokenizing Train Dataset...")
    train_dataset = train_dataset.map(
        prepare_train_features,
        batched=True,
        num_proc=1,
        remove_columns=train_dataset.column_names,
        load_from_cache_file=True,
    )

    # Create validation features from dataset
    logger("Tokenizing Validation Dataset...")
    validation_features = datasets["validation"].map(
        prepare_validation_features,
        batched=True,
        num_proc=1,
        remove_columns=datasets["validation"].column_names,
        load_from_cache_file=True,
    )

    # W&B: decide once whether this process reports, and reuse the decision
    # everywhere. Previously only wandb.init() checked the popdist rank while
    # wandb.log() / wandb.run.summary only checked config.wandb.
    use_wandb = config.wandb and (not config.use_popdist
                                  or config.popdist_rank == 0)
    if use_wandb:
        wandb.init(project="torch-bert",
                   settings=wandb.Settings(console="wrap"))
        # NOTE: vars() exposes the config object's own attribute dict, so the
        # assignment below also adds `sdk_version` to `config` itself.
        wandb_config = vars(config)
        wandb_config['sdk_version'] = get_sdk_version()
        wandb.config.update(wandb_config)

    # Create the model
    if config.pretrained_checkpoint:
        model_ipu = PipelinedBertForQuestionAnswering.from_pretrained(
            config.pretrained_checkpoint, config=config).parallelize().half()
    else:
        model_ipu = PipelinedBertForQuestionAnswering(
            config).parallelize().half()

    if do_training:
        train_dl = poptorch.DataLoader(
            opts,
            train_dataset,
            batch_size=config.micro_batch_size,
            shuffle=True,
            drop_last=False,
            collate_fn=PadCollate(
                samples_per_step, {
                    "input_ids": 0,
                    "attention_mask": 0,
                    "token_type_ids": 0,
                    "start_positions": config.sequence_length,
                    "end_positions": config.sequence_length
                }))
        optimizer = get_optimizer(config, model_ipu)
        model_ipu.train()
        training_model = poptorch.trainingModel(model_ipu, opts, optimizer)

        # A single batch is enough to trigger (and time) compilation.
        sample_batch = next(iter(train_dl))
        logger("Compiling Model...")
        start_compile = time.perf_counter()
        training_model.compile(sample_batch["input_ids"],
                               sample_batch["attention_mask"],
                               sample_batch["token_type_ids"],
                               sample_batch["start_positions"],
                               sample_batch["end_positions"])
        duration_compilation = time.perf_counter() - start_compile
        logger(f"Compiled/Loaded model in {duration_compilation} secs")

        if config.compile_only:
            sys.exit()

        # Train
        scheduler = get_lr_scheduler(optimizer, "linear", config.lr_warmup,
                                     config.num_epochs * len(train_dl))
        logger("Training...")
        for epoch in range(config.num_epochs):
            for step, batch in enumerate(train_dl):
                start_step = time.perf_counter()
                outputs = training_model(batch["input_ids"],
                                         batch["attention_mask"],
                                         batch["token_type_ids"],
                                         batch["start_positions"],
                                         batch["end_positions"])

                # Advance the schedule, then hand the updated optimizer
                # state back to the compiled training model.
                scheduler.step()
                training_model.setOptimizer(optimizer)
                step_length = time.perf_counter() - start_step
                step_throughput = samples_per_step / step_length
                loss = outputs[0].mean().item()
                logger(
                    f"Epoch: {epoch}, Step:{step}, LR={scheduler.get_last_lr()[0]:.2e}, loss={loss:3.3f}, throughput={step_throughput:3.3f} samples/s"
                )

                if use_wandb:
                    wandb.log({
                        "Loss": loss,
                        "LR": scheduler.get_last_lr()[0],
                        "Step": step,
                        "Throughput": step_throughput
                    })
        training_model.detachFromDevice()

    if do_validation:
        # Validation uses its own fixed batching configuration; the options
        # are rebuilt from the modified config.
        config.micro_batch_size = 2
        config.batches_per_step = 16
        config.gradient_accumulation = 1
        config.replication_factor = 1
        samples_per_step = config.batches_per_step * config.micro_batch_size * \
            config.gradient_accumulation * config.replication_factor
        opts = get_options(config)
        opts.outputMode(poptorch.OutputMode.All)
        val_dl = poptorch.DataLoader(opts,
                                     validation_features.remove_columns(
                                         ['example_id', 'offset_mapping']),
                                     batch_size=config.micro_batch_size,
                                     shuffle=False,
                                     drop_last=False,
                                     collate_fn=default_data_collator)
        # Two parallel lists: outputs[0] and outputs[1] of every step.
        raw_predictions = [[], []]
        model_ipu.eval()
        inference_model = poptorch.inferenceModel(model_ipu, opts)
        sample_batch = next(iter(val_dl))
        logger("Compiling Inference Model...")
        inference_model.compile(sample_batch["input_ids"],
                                sample_batch["attention_mask"],
                                sample_batch["token_type_ids"])

        if config.compile_only:
            sys.exit()

        logger("Validating...")
        for step, batch in enumerate(val_dl):
            start_step = time.perf_counter()
            outputs = inference_model(batch["input_ids"],
                                      batch["attention_mask"],
                                      batch["token_type_ids"])
            step_length = time.perf_counter() - start_step
            step_throughput = samples_per_step / step_length
            raw_predictions[0].append(outputs[0])
            raw_predictions[1].append(outputs[1])
            logger(f"Step:{step}, throughput={step_throughput} samples/s")

        # Stack the per-step outputs and convert them to float numpy arrays
        # for post-processing.
        raw_predictions[0] = torch.vstack(raw_predictions[0]).float().numpy()
        raw_predictions[1] = torch.vstack(raw_predictions[1]).float().numpy()
        final_predictions = postprocess_qa_predictions(datasets["validation"],
                                                       validation_features,
                                                       raw_predictions)
        metric = load_metric("squad")
        formatted_predictions = [{
            "id": k,
            "prediction_text": v
        } for k, v in final_predictions.items()]
        references = [{
            "id": ex["id"],
            "answers": ex["answers"]
        } for ex in datasets["validation"]]
        metrics = metric.compute(predictions=formatted_predictions,
                                 references=references)
        logger(metrics)
        if use_wandb:
            for k, v in metrics.items():
                wandb.run.summary[k] = v
# Setup a Poptorch training model training_model = poptorch.trainingModel( model, opts, poptorch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)) # Create a dataset from random data features = torch.randn([10000, 1, 128, 128]) labels = torch.empty([10000], dtype=torch.long).random_(10) dataset = torch.utils.data.TensorDataset(features, labels) print("Dataset size: ", len(dataset)) # Poptorch Dataloader training_data = poptorch.DataLoader(opts, dataset=dataset, batch_size=bs, shuffle=True, drop_last=True, num_workers=num_workers, mode=poptorch.DataLoaderMode.Async, async_options={"early_preload": True}) # Number of steps necessary to consume the whole dataset steps = len(training_data) # Assess asynchronous dataloader throughput on CPU print("Evaluating Dataloader: ", steps, "steps") t0 = time.time() for data, labels in training_data: pass t1 = time.time() total_time = t1 - t0 print("Total execution Time:", total_time, "s")
model = ClassificationModel() # **NOTE**: `self.training` is inherited from `torch.nn.Module` which # initialises its value to `True`. Use `model.eval()` to set it to `False` and # `model.train()` to switch it back to `True`. # ### Prepare training for IPUs # The compilation and execution on the IPU can be controlled using `poptorch. # Options`. These options are used by PopTorch's wrappers such as `poptorch. # DataLoader` and `poptorch.trainingModel`. opts = poptorch.Options() train_dataloader = poptorch.DataLoader(opts, train_dataset, batch_size=16, shuffle=True, num_workers=20) # ### Train the model # We will need another component in order to train our model: an optimizer. Its # role is to apply the computed gradients to the model's weights to optimize # (usually, minimize) the loss function using a specific algorithm. Not all # PyTorch's ops are available at the moment, and for optimizers there are 4 # choices already: SGD, AdamW, LAMB and RMSProp. # We'll use SGD as it's a very popular algorithm. optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) # We now introduce the `poptorch.trainingModel` wrapper, which will handle the # training. It takes an instance of a `torch.nn.Module`, such as our custom
def get_data(opts, model_opts, train=True, async_dataloader=False):
    """
    A factory method to create a dataloader responsible for sending data
    to the IPU device. This builds the appropriate dataset and wraps it in
    a dataloader.

    Args:
        opts: Run configuration. Reads ``precision`` (a string starting with
            ``"16."`` or ``"32."``), ``data`` (one of ``"synthetic"``,
            ``"real"``, ``"imagenet"``, ``"cifar10"``), ``model``,
            ``batch_size``, optionally ``iterations`` and, for ImageNet,
            ``imagenet_data_path``.
        model_opts: Options object passed to ``poptorch.DataLoader``; also
            used to size the dataset when ``opts.iterations`` is present.
        train: Selects the training split/pipeline and enables shuffling.
        async_dataloader: When true, the dataloader is wrapped in a
            ``poptorch.AsynchronousDataAccessor`` with
            ``load_indefinitely=True``.

    Returns:
        The ``poptorch.DataLoader``, or the asynchronous accessor wrapping it.

    Raises:
        ValueError: If ``opts.precision`` or ``opts.data`` is not one of the
            supported values.
    """
    # Validate the configuration up front so a typo fails with a clear
    # message instead of an unbound local variable further down.
    precision_prefix = opts.precision[:3]
    if precision_prefix == "16.":
        half_precision = True
    elif precision_prefix == "32.":
        half_precision = False
    else:
        raise ValueError(
            f"Unsupported precision {opts.precision!r}; expected a value "
            "starting with '16.' or '32.'")

    supported_data = ("synthetic", "real", "imagenet", "cifar10")
    if opts.data not in supported_data:
        raise ValueError(
            f"Unsupported data source {opts.data!r}; expected one of "
            f"{', '.join(supported_data)}")

    input_shape = models.available_models[opts.model]["input_shape"]
    transform = get_preprocessing_pipeline(train, input_shape, half_precision)

    # Determine the size of the small datasets. The size is only passed on
    # when `opts.iterations` exists; otherwise the dataset's own default
    # applies.
    size_kwargs = {}
    if hasattr(opts, "iterations"):
        size_kwargs["size"] = opts.batch_size * \
            model_opts.device_iterations * \
            model_opts.replication_factor * \
            model_opts.Training.gradient_accumulation * \
            opts.iterations

    # Select the right dataset
    if opts.data == "synthetic":
        dataset = SynthDataset(input_shape,
                               half_precision=half_precision,
                               **size_kwargs)
    elif opts.data == "real":
        data_path = Path(__file__).parent.absolute().joinpath("images")
        dataset = SampleDataset(img_dir=data_path,
                                transform=transform,
                                **size_kwargs)
    elif opts.data == "imagenet":
        data_folder = 'train' if train else 'validation'
        dataset = torchvision.datasets.ImageFolder(os.path.join(
            opts.imagenet_data_path, data_folder),
                                                   transform=transform)
    else:  # "cifar10" -- the only remaining supported value
        data_path = Path(__file__).parent.absolute().joinpath("cifar10")
        dataset = torchvision.datasets.CIFAR10(root=data_path,
                                               train=train,
                                               download=True,
                                               transform=transform)

    # Cap the number of loader workers at 32.
    num_loader_workers = min(32, multiprocessing.cpu_count())
    dataloader = poptorch.DataLoader(model_opts,
                                     dataset,
                                     batch_size=opts.batch_size,
                                     num_workers=num_loader_workers,
                                     shuffle=train,
                                     drop_last=True)

    if async_dataloader:
        return poptorch.AsynchronousDataAccessor(dataloader,
                                                 load_indefinitely=True)
    return dataloader