def test_training_attributes():
    def custom_loss(output, target):
        # Mean squared error with a scale
        loss = output - target
        loss = loss * loss * 5
        return poptorch.identity_loss(loss, reduction="mean")

    class Model(torch.nn.Module):
        def __init__(self, attr):
            super().__init__()
            self.bias = torch.nn.Parameter(torch.zeros(()))
            self.attr = attr

        def getAttr(self):
            return self.attr

        def forward(self, x, target):
            x += 1
            x = poptorch.ipu_print_tensor(x) + self.bias
            return x, custom_loss(x, target)

    model = Model("MyAttr")
    input = torch.tensor([1.0, 2.0, 3.0])
    target = torch.tensor([30.0, 40.0, 50.0])
    poptorch_model = poptorch.trainingModel(model)
    poptorch_model(input, target)

    assert poptorch_model.getAttr() == poptorch_model.attr
    assert poptorch_model.attr == "MyAttr"
def setupTraining(model, args):
    """
    Set up a training run using the CIFAR-10 training dataset.

    Uses poptorch.DataLoader so that each training iteration executed on
    the IPU will incorporate:
        * (mini-)batch size
        * device iterations
        * replica factor
        * gradient accumulation factor

    Using poptorch.DataLoaderMode.Async allows loading the dataset on a
    separate thread. This reduces the host/IPU communication overhead by
    using the time that the IPU is running to load the next batch on the
    CPU.
    """
    opts = setupOptions(args, train=True)
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    training_model = poptorch.trainingModel(model, opts, optimizer)

    dataset = cifar10(args.data_dir, train=True)
    loader = poptorch.DataLoader(opts,
                                 dataset,
                                 batch_size=args.batch_size,
                                 shuffle=True,
                                 drop_last=True,
                                 num_workers=8,
                                 mode=poptorch.DataLoaderMode.Async)
    return training_model, loader
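# A minimal driver sketch for setupTraining() above (illustrative only, not
# part of the source). It assumes the wrapped model returns (output, loss)
# when called with data and labels, as poptorch.trainingModel requires, and
# that `args` carries a hypothetical `epochs` field.
def runTraining(model, args):
    training_model, loader = setupTraining(model, args)
    for _ in range(args.epochs):  # `args.epochs` is an assumed field
        for data, labels in loader:
            # One call runs forward, loss, and backward for the whole
            # combined batch (device iterations x replicas x accumulation).
            _, loss = training_model(data, labels)
    return training_model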
def pre_dispatch(self) -> None:
    precision = self.lightning_module.trainer.precision
    model = LightningIPUModule(self.lightning_module, precision)
    self.model = model

    # Reset the previously created models, if any.
    self.poptorch_models = {}

    # Separate models are instantiated for different stages, but they share the same weights on host.
    # When validation/test models are run, weights are synced first.
    trainer_fn = self.lightning_module.trainer.state.fn
    if trainer_fn in (TrainerFn.FITTING, TrainerFn.TUNING):
        # Create the training and validation models used during fitting.
        training_opts = self.training_opts
        inference_opts = self.inference_opts
        optimizer = self.lightning_module.trainer.optimizers[0]
        model = poptorch.trainingModel(model=model,
                                       options=training_opts,
                                       optimizer=optimizer)
        self.poptorch_models[RunningStage.TRAINING] = model

        if self.lightning_module.trainer.enable_validation:
            model = poptorch.inferenceModel(model=model,
                                            options=inference_opts)
            self.poptorch_models[RunningStage.VALIDATING] = model
    elif trainer_fn == TrainerFn.VALIDATING:
        model = poptorch.inferenceModel(model=model,
                                        options=self.inference_opts)
        self.poptorch_models[RunningStage.VALIDATING] = model
    elif trainer_fn == TrainerFn.TESTING:
        model = poptorch.inferenceModel(model=model,
                                        options=self.inference_opts)
        self.poptorch_models[RunningStage.TESTING] = model
    elif trainer_fn == TrainerFn.PREDICTING:
        model = poptorch.inferenceModel(model=model,
                                        options=self.inference_opts)
        self.poptorch_models[RunningStage.PREDICTING] = model
def convert_to_ipu_model(model, opts, optimizer):
    model_opts = create_model_opts(opts)
    # PopART settings
    if opts.enable_stochastic_rounding:
        model_opts.Popart.set("enableStochasticRounding", True)
    if opts.data == "synthetic":
        # 2 == popart.SyntheticDataMode.RandomNormal
        model_opts.Popart.set("syntheticDataMode", 2)
    if opts.half_partial:
        model_opts.Popart.set("partialsTypeMatMuls", "half")
        model_opts.Popart.set("convolutionOptions", {'partialsType': 'half'})
    if opts.enable_pipeline_recompute and len(opts.pipeline_splits) > 0:
        # 3 == popart.RecomputationType.Pipeline
        model_opts.Popart.set("autoRecomputation", 3)
    # disable prefetch to save memory
    if opts.replicas > 1:
        model_opts.Popart.set("enablePrefetchDatastreams", False)
    model_opts.Popart.set("disableGradAccumulationTensorStreams", True)
    num_stages = len(opts.pipeline_splits) + 1
    if len(opts.available_memory_proportion) == 1:
        model_opts.setAvailableMemoryProportion({
            f'IPU{i}': opts.available_memory_proportion[0]
            for i in range(num_stages)
        })
    elif len(opts.available_memory_proportion) > 1:
        model_opts.setAvailableMemoryProportion({
            f'IPU{i}': amp
            for i, amp in enumerate(opts.available_memory_proportion)
        })
    # Scale the loss to be the same as bs=1 on a single IPU training.
    loss_scaling_factor = 1.0 / opts.batch_size
    model_with_loss = TrainingModelWithLoss(model, loss_scaling_factor)
    training_model = poptorch.trainingModel(model_with_loss,
                                            model_opts,
                                            optimizer=optimizer)
    return training_model
def pre_dispatch(self) -> None:
    self._handle_gradient_accumulation_steps()
    if self.convert_model_to_half:
        log.info('Using full 16bit precision, converting LightningModule weights to FP16.')
        self.model = self.model.half()
    precision = self.lightning_module.trainer.precision
    precision = 16 if self.convert_model_to_half else precision

    model = LightningIPUModule(self.lightning_module, precision)
    self.model = model

    # Separate models are instantiated for different stages, but they share the same weights on host.
    # When validation/test models are run, weights are synced first.
    if self.lightning_module.trainer.state.stage is RunningStage.TRAINING:
        # Create the model that will run the training stage.
        optimizer = self.lightning_module.trainer.optimizers[0]
        model = poptorch.trainingModel(model=model,
                                       options=self.training_opts,
                                       optimizer=optimizer)
        self.poptorch_models[RunningStage.TRAINING] = model
    for x in (RunningStage.VALIDATING, RunningStage.TESTING,
              RunningStage.PREDICTING):
        model = poptorch.inferenceModel(
            model=model,
            options=self.inference_opts,
        )
        self.poptorch_models[x] = model
def test_training(self):
    model = Yolov4Head(self.anchors,
                       num_input_channels=32,
                       num_classes=3,
                       stride=8,
                       calculate_loss=True,
                       precision=torch.half)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.01,
                                momentum=0.9,
                                nesterov=False)
    model = trainingModel(model.half(), optimizer=optimizer)
    loss = model(self.input_tensor)
    assert torch.numel(loss) == 1
def test_2x2_parallel_phased_execution_opts(capfd):
    poptorch.setLogLevel(1)  # Force debug logging
    N = 3
    size = 10

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.weights = []
            for n in range(N * 6):
                weight = torch.nn.Parameter(torch.rand(size, size),
                                            requires_grad=True)
                self.register_parameter(f"w{n}", weight)
                self.weights.append(weight)

        def forward(self, in0, target=None):
            phase = 0
            weight = iter(self.weights)
            with poptorch.Block("phase0_ipu0"):
                ins = torch.split(in0, size)
            for n in range(N * 3):
                out = []
                for ipu in range(2):
                    x = ins[ipu]
                    with poptorch.Block(f"phase{phase}_ipu{ipu}"):
                        x = torch.matmul(next(weight), x)
                        out.append(F.relu(x))
                ins = out[1], out[0]
                # We want 2 matmuls in the same phase
                if n % 3 != 1:
                    phase += 1
            with poptorch.Block(f"phase{N*2-1}_ipu1"):
                res = ins[0] + ins[1]
                if target is None:
                    return res
                return res, torch.nn.L1Loss(reduction="mean")(res, target)

    input = torch.rand(size * 2, 1)
    target = torch.rand(size, 1)
    model = Model()
    opts = poptorch.Options()
    phases = []
    # Alternate between 0-2 and 1-3
    for n in range(N):
        phases.append([
            poptorch.Stage(f"phase{2*n}_ipu0").ipu(0),
            poptorch.Stage(f"phase{2*n}_ipu1").ipu(2)
        ])
        phases.append([
            poptorch.Stage(f"phase{2*n+1}_ipu0").ipu(1),
            poptorch.Stage(f"phase{2*n+1}_ipu1").ipu(3)
        ])
    opts.setExecutionStrategy(poptorch.ParallelPhasedExecution(*phases))
    poptorch_model = poptorch.trainingModel(model, opts)
    poptorch_model.compile(input, target)

    testlog = LogChecker(capfd)
    testlog.validate_2x2_parallel_phased_execution()
def test_training(self):
    model = Yolov4P5BackBone(3, nn.ReLU(), calculate_loss=True)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.01,
                                momentum=0.9,
                                nesterov=False)
    model = trainingModel(model.half(), optimizer=optimizer)
    _, _, _, loss = model(torch.Tensor(np.random.randn(1, 3, 64, 64)))
    assert torch.numel(loss) == 1
def convert_to_ipu_model(model, opts, optimizer):
    model_opts = create_model_opts(opts)
    model_opts = utils.train_settings(opts, model_opts)
    replica_count = opts.replicas * (opts.popdist_size if opts.use_popdist else 1)
    model_with_loss = TrainingModelWithLoss(
        model, replicas=replica_count, label_smoothing=opts.label_smoothing)
    training_model = poptorch.trainingModel(model_with_loss,
                                            model_opts,
                                            optimizer=optimizer)
    return training_model
def convert_to_ipu_model(model, opts, optimizer):
    model_opts = create_model_opts(opts)
    # PopART settings
    model_opts.Popart.set("enableStochasticRounding",
                          opts.enable_stochastic_rounding)
    if opts.data == "synthetic":
        model_opts.Popart.set("syntheticDataMode",
                              int(popart.SyntheticDataMode.RandomNormal))
    if opts.half_partial:
        model_opts.Popart.set("partialsTypeMatMuls", "half")
        model_opts.Popart.set("convolutionOptions", {'partialsType': 'half'})
    if opts.enable_pipeline_recompute and len(opts.pipeline_splits) > 0:
        model_opts.Popart.set("autoRecomputation",
                              int(popart.RecomputationType.Pipeline))
    # disable prefetch to save memory
    if opts.replicas > 1:
        model_opts.Popart.set("enablePrefetchDatastreams", False)
    model_opts.Popart.set("disableGradAccumulationTensorStreams", True)
    num_stages = len(opts.pipeline_splits) + 1
    if len(opts.available_memory_proportion) == 1:
        model_opts.setAvailableMemoryProportion({
            f'IPU{i}': opts.available_memory_proportion[0]
            for i in range(num_stages)
        })
    elif len(opts.available_memory_proportion) > 1:
        model_opts.setAvailableMemoryProportion({
            f'IPU{i}': amp
            for i, amp in enumerate(opts.available_memory_proportion)
        })
    if opts.reduction == 'mean':
        model_opts.Popart.set('accumulationReductionType',
                              int(popart.ReductionType.Mean))
    if opts.disable_metrics:
        # if not interested in accurate metrics, return only a subset of the
        # predictions
        model_opts.anchorMode(poptorch.AnchorMode.Final)
    else:
        model_opts.anchorMode(poptorch.AnchorMode.All)
    # Scale the loss to be the same as bs=1 on a single IPU training.
    loss_scaling = 1.0 / opts.batch_size if opts.reduction == 'sum' else 1.0
    model_with_loss = TrainingModelWithLoss(
        model,
        loss_scaling=loss_scaling,
        label_smoothing=opts.label_smoothing,
        reduction=opts.reduction)
    training_model = poptorch.trainingModel(model_with_loss,
                                            model_opts,
                                            optimizer=optimizer)
    return training_model
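# For context, a hedged sketch of what a helper like create_model_opts()
# (called above but defined elsewhere in the repo) might configure before
# the PopART settings are applied. This is an assumption for illustration,
# not the repository's actual implementation; it only uses poptorch.Options
# calls that appear elsewhere in these snippets. The `opts` fields read here
# (device_iterations, gradient_accumulation, seed) are hypothetical names.
def create_model_opts(opts):
    model_opts = poptorch.Options()
    model_opts.deviceIterations(opts.device_iterations)  # assumed field
    model_opts.replicationFactor(opts.replicas)
    model_opts.Training.gradientAccumulation(opts.gradient_accumulation)  # assumed field
    model_opts.randomSeed(opts.seed)  # assumed field
    return model_opts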
def convert_to_ipu_model(model, args, optimizer):
    opts = create_training_opts(args)
    model_with_loss = TrainingModelWithLoss(
        model,
        label_smoothing=args.label_smoothing,
        use_mixup=args.mixup_enabled,
        use_cutmix=args.cutmix_enabled)
    training_model = poptorch.trainingModel(model_with_loss,
                                            opts,
                                            optimizer=optimizer)
    return training_model
def run_model(opts):
    input_data = torch.ones(4, 1)
    labels_data = torch.ones(4).long()
    model = torch.nn.Linear(1, 2, bias=False)
    model_with_loss = TrainingModelWithLoss(model, 0.1)
    optimizer = SGD(model_with_loss.parameters(),
                    lr=0.1,
                    momentum=0.,
                    use_combined_accum=True)
    training_model = poptorch.trainingModel(model_with_loss,
                                            opts,
                                            optimizer=optimizer)
    for _ in range(3):
        preds, loss, _ = training_model(input_data, labels_data)
    # return the weights of the model
    return list(model_with_loss.model.named_parameters())[0][1], loss
def test_training(self):
    model = Yolov4P5Neck(nn.ReLU(), calculate_loss=True)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.01,
                                momentum=0.9,
                                nesterov=False)
    model = trainingModel(model.half(), optimizer=optimizer)
    x = (torch.Tensor(np.random.randn(1, 1024, 2, 2)),
         torch.Tensor(np.random.randn(1, 512, 4, 4)),
         torch.Tensor(np.random.randn(1, 256, 8, 8)))
    _, _, _, loss = model(x)
    assert torch.numel(loss) == 1
def setup(self, trainer: "pl.Trainer") -> None:
    # set the `accumulate_grad_batches` property as early as possible
    self._handle_gradient_accumulation_steps()

    # patch the dataloader creation function with the custom `poptorch.DataLoader`.
    # this violates the intended control flow for the plugins, but since this is experimental, we have chosen
    # to use the simpler solution before adding abstractions to override the `DataLoader` class
    self._update_dataloader_original = pl.trainer.connectors.data_connector._update_dataloader
    pl.trainer.connectors.data_connector._update_dataloader = self._convert_to_poptorch_loader

    super().setup(trainer)

    # disable the `optimizer_zero_grad` function by setting it to `None`.
    # this is because the IPU zeros the gradients internally
    self._optimizer_zero_grad_original = self.lightning_module.optimizer_zero_grad
    self._disable_zero_grad()

    model = LightningIPUModule(self.lightning_module,
                               self.precision_plugin.precision)
    self.model = model

    # Reset the previously created models, if any.
    self.poptorch_models = {}

    # Separate models are instantiated for different stages, but they share the same weights on host.
    # When validation/test models are run, weights are synced first.
    trainer_fn = self.lightning_module.trainer.state.fn
    if trainer_fn in (TrainerFn.FITTING, TrainerFn.TUNING):
        # Create the training and validation models used during fitting.
        training_opts = self.training_opts
        inference_opts = self.inference_opts
        optimizer = self.lightning_module.trainer.optimizers[0]
        model = poptorch.trainingModel(model=model,
                                       options=training_opts,
                                       optimizer=optimizer)
        self.poptorch_models[RunningStage.TRAINING] = model

        if self.lightning_module.trainer.enable_validation:
            model = poptorch.inferenceModel(model=model,
                                            options=inference_opts)
            self.poptorch_models[RunningStage.VALIDATING] = model
    elif trainer_fn == TrainerFn.VALIDATING:
        model = poptorch.inferenceModel(model=model,
                                        options=self.inference_opts)
        self.poptorch_models[RunningStage.VALIDATING] = model
    elif trainer_fn == TrainerFn.TESTING:
        model = poptorch.inferenceModel(model=model,
                                        options=self.inference_opts)
        self.poptorch_models[RunningStage.TESTING] = model
    elif trainer_fn == TrainerFn.PREDICTING:
        model = poptorch.inferenceModel(model=model,
                                        options=self.inference_opts)
        self.poptorch_models[RunningStage.PREDICTING] = model
def test_optimizer_groups_none_args():
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = torch.nn.Sequential(torch.nn.Linear(10, 10),
                                             torch.nn.Linear(10, 10))
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, X, Y, Z, B=None):  # pylint: disable=unused-argument
            fwd = self.model(X)
            return fwd, self.loss(fwd, Y)

    model = Model()
    input = torch.randn(1, 10)
    target = torch.randint(0, 10, [1])

    # Start with a learning rate of zero for both parameter groups.
    poptorch_model = poptorch.trainingModel(
        model,
        optimizer=optim.AdamW([{
            'params': model.model[0].parameters(),
            "lr": 0.0
        }, {
            'params': model.model[1].parameters(),
            "lr": 0.0
        }],
                              lr=0.1))
    poptorch_model.compile(input, target, target)

    # Parameters are returned by reference, so clone them to snapshot the
    # starting values.
    weight1 = model.model[0].weight.clone()
    bias1 = model.model[0].bias.clone()
    weight2 = model.model[1].weight.clone()
    bias2 = model.model[1].bias.clone()

    _, _ = poptorch_model(input, target, target)
    for _ in range(0, 100):
        _, _ = poptorch_model(input, target, target)

    weight1_post, bias1_post = model.model[0].parameters()
    weight2_post, bias2_post = model.model[1].parameters()

    # Nothing should have changed.
    assert torch.equal(weight1, weight1_post)
    assert torch.equal(weight2, weight2_post)
    assert torch.equal(bias1, bias1_post)
    assert torch.equal(bias2, bias2_post)
def test_access_scalar_parameter(use_half):
    class ExampleModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bias = torch.nn.Parameter(torch.zeros(()))

        def forward(self, x):
            x += 1
            # It is important to make sure the result of the print is used.
            x = poptorch.ipu_print_tensor(x)
            return x + self.bias

    def custom_loss(output, target):
        # Mean squared error with a scale
        loss = output - target
        loss = loss * loss * 5
        return poptorch.identity_loss(loss, reduction="mean")

    class ExampleModelWithCustomLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = ExampleModel()

        def forward(self, input, target=None):
            out = self.model(input)
            if target is not None:
                return out, custom_loss(out, target)
            return out

    model = ExampleModelWithCustomLoss()
    input = torch.tensor([1.0, 2.0, 3.0])
    target = torch.tensor([30.0, 40.0, 50.0])
    if use_half:
        model.half()
        input = input.half()
        target = target.half()
    poptorch_model = poptorch.trainingModel(model)

    original_bias = str(poptorch_model.model.model.bias)
    for _ in range(10):
        poptorch_model(input=input, target=target)

    updated_bias = str(poptorch_model.model.model.bias)
    assert original_bias != updated_bias

    poptorch_model.copyWeightsToHost()
    # Bias should already be up to date
    assert updated_bias == str(poptorch_model.model.model.bias)
def test_training_model(conv_mode):
    model = ClassificationModel(conv_mode)
    # N, C, H, W
    x = torch.randn(5, 3, 32, 32)
    labels = torch.randint(low=1, high=10, size=(5, ))
    out, loss = model(x, labels)
    pop_model = poptorch.trainingModel(
        model, poptorch.Options(),
        torch.optim.SGD(model.parameters(), lr=0.01))
    pop_out, pop_loss = pop_model(x, labels)
    torch.testing.assert_allclose(out, pop_out)
    torch.testing.assert_allclose(loss, pop_loss)
def profile(model, args):
    """
    Profile a single training iteration on the IPU using synthetic data
    """
    opts = setupOptions(args)
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    training_model = poptorch.trainingModel(model, opts, optimizer)

    # Generate a random dataset for profiling
    device_batch_size = args.batch_size * args.batches_per_step
    torch.manual_seed(0)
    data = torch.randn(device_batch_size, 3, 32, 32)
    labels = torch.randint(0, 10, (device_batch_size, ))
    _, _ = training_model(data, labels)
def test_optimizer_SGD_nesterov():
    torch.manual_seed(42)

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = torch.nn.Sequential(torch.nn.Linear(10, 10),
                                             torch.nn.Linear(10, 10))
            self.loss = torch.nn.CrossEntropyLoss()

        def forward(self, X, Y):
            fwd = self.model(X)
            return fwd, self.loss(fwd, Y)

    model = Model()
    with pytest.raises(ValueError,
                       match="Nesterov momentum is currently not supported"):
        poptorch.trainingModel(model,
                               optimizer=optim.SGD(model.parameters(),
                                                   nesterov=True,
                                                   momentum=0.1,
                                                   lr=0.001))
def test_explicit_deletion(use_half):
    class ExampleModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bias = torch.nn.Parameter(torch.zeros(()))

        def forward(self, x):
            x += 1
            # It is important to make sure the result of the print is used.
            x = poptorch.ipu_print_tensor(x)
            return x + self.bias

    def custom_loss(output, target):
        # Mean squared error with a scale
        loss = output - target
        loss = loss * loss * 5
        return poptorch.identity_loss(loss, reduction="mean")

    class ExampleModelWithCustomLoss(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.model = ExampleModel()

        def forward(self, input, target=None):
            out = self.model(input)
            if target is not None:
                return out, custom_loss(out, target)
            return out

    opts = poptorch.Options()
    # Both models will use the same IPU device.
    opts.useIpuId(1)

    model = ExampleModelWithCustomLoss()
    input = torch.tensor([1.0, 2.0, 3.0])
    target = torch.tensor([30.0, 40.0, 50.0])
    if use_half:
        model.half()
        input = input.half()
        target = target.half()

    training_model = poptorch.trainingModel(model, opts)
    inference_model = poptorch.inferenceModel(model, opts)

    training_model(input=input, target=target)
    training_model.destroy()
    inference_model(input)
def get_model_and_loader(opt: argparse.ArgumentParser, cfg: yacs.config.CfgNode):
    """Prepare the model and get a new loader for it.

    Parameters:
        opt: object containing the options introduced in the command line
        cfg: yacs object containing the config

    Returns:
        model[Detector]: a torch Detector model
        loader[DataLoader]: a torch or poptorch DataLoader containing the
            dataset specified in "cfg"
    """
    # Create model
    model = Yolov4P5(cfg)

    if cfg.model.mode == "train":
        model.train()
    else:
        model.eval()

    # Load the weights and fuse batch normalization layers with their
    # preceding convolutions
    if cfg.model.normalization == 'batch':
        if opt.weights:
            print("loading pretrained weights")
            model = load_and_fuse_pretrained_weights(model, opt)
        model.optimize_for_inference()

    # Create the IPU-specific options if cfg.model.ipu is set
    ipu_opts = ipu_options(opt, cfg, model) if cfg.model.ipu else None

    # Create the loader
    loader = get_loader(opt, cfg, ipu_opts)

    # Call the poptorch wrapper and compile the model
    if cfg.model.ipu:
        if cfg.model.mode == "train":
            model = trainingModel(model, ipu_opts)
        else:
            model = inferenceModel(model, ipu_opts)
        try:
            img, _, _, _ = next(iter(loader))
            model.compile(img)
            warm_up_iterations = 100
            for _ in range(warm_up_iterations):
                _ = model(img)
        except Exception as e:
            print(e.args)
            exit(0)

    return model, loader
def _wrap_model(self, type):
    self.logger.info('wrapping model.')
    if type == 'train':
        self.torch_model.train()
        self.training_model = trainingModel(
            model=self.torch_model,
            options=self.ipu_options,
            optimizer=self.optimizer,
        )
        self.logger.info('wrapped training model.')
    elif type == 'val':
        self.torch_model.eval()
        self.val_model = inferenceModel(model=self.torch_model,
                                        options=self.ipu_options)
        self.logger.info('wrapped inference model.')
def train(model, recompute):
    input_data = torch.ones(1, 3, 224, 224)
    labels_data = torch.ones(1).long()
    opts = poptorch.Options()
    if recompute:
        opts._Popart.set("autoRecomputation",
                         int(popart.RecomputationType.Standard))
    opts.outputMode(poptorch.OutputMode.All)
    opts.randomSeed(0)
    opts.Training.gradientAccumulation(1)
    opts.Precision.enableStochasticRounding(False)
    model_with_loss = TrainingModelWithLoss(model)
    optimizer = SGD(model_with_loss.parameters(),
                    lr=0.01,
                    momentum=0.,
                    use_combined_accum=True)
    training_model = poptorch.trainingModel(model_with_loss,
                                            opts,
                                            optimizer=optimizer)
    predictions = []
    for _ in range(3):
        preds, _, _ = training_model(input_data, labels_data)
        predictions.append(preds)
    training_model.destroy()
    return predictions
def test_matmul_training():
    N, M, K, C = 100, 9, 7, 5

    class Net(torch.nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            torch.manual_seed(42)
            self.linear = torch.nn.Linear(K, K)
            self.softmax = torch.nn.LogSoftmax(dim=1)
            self.loss = torch.nn.L1Loss(reduction="mean")

        def forward(self, x, y, target):
            x = self.linear(x)
            x = torch.matmul(x, y)
            return x, self.loss(x, target)

    torch.manual_seed(42)
    model = Net()
    opts = poptorch.Options()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    torch.manual_seed(42)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer)
    x = torch.randn(N, M, K)
    y = torch.randn(K, K)
    target = torch.empty(N, M, K, dtype=torch.long).random_(0, C)

    for _ in range(0, 400):
        optimizer.zero_grad()
        poptorch_output, poptorch_loss = poptorch_model(x, y, target)
        native_output, native_loss = model(x, y, target)
        native_loss.backward(retain_graph=True)
        optimizer.step()

    torch.testing.assert_allclose(poptorch_output,
                                  native_output,
                                  rtol=1e-02,
                                  atol=1e-02)
    torch.testing.assert_allclose(poptorch_loss,
                                  native_loss,
                                  rtol=1e-03,
                                  atol=1e-03)
def train(model, recompute):
    input_data = torch.ones(1, 3, 224, 224)
    labels_data = torch.ones(1).long()
    model_opts = poptorch.Options()
    if recompute:
        model_opts.Popart.set("autoRecomputation",
                              int(popart.RecomputationType.Standard))
    model_opts.anchorMode(poptorch.AnchorMode.All)
    model_opts.randomSeed(0)
    model_opts.Training.gradientAccumulation(1)
    model_with_loss = TrainingModelWithLoss(model)
    optimizer = SGD(model_with_loss.parameters(), lr=0.01, momentum=0.)
    training_model = poptorch.trainingModel(model_with_loss,
                                            model_opts,
                                            optimizer=optimizer)
    predictions = []
    for _ in range(3):
        preds, loss = training_model(input_data, labels_data)
        predictions.append(preds)
    return predictions
def test_constant_lrschedule():
    """
    Test that lr schedule "constant" results in unchanging LR
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    args = """
    --config unit_test
    --lr-schedule constant
    """.split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)

    # IPU Model and Optimizer
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    scheduler = get_lr_scheduler(optimizer, "constant")
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    def mock_data():
        return get_generated_datum(config)

    # Compile the model
    poptorch_model.compile(*mock_data())

    # The starting LR should match the configured learning rate
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate

    # Run for some steps
    for _ in range(5):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)

    # LR should be unchanged
    assert poptorch_model._dict_new_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate
def test_recompute_checkpoint_not_in_ir():
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
    --config unit_test
    --lr-schedule constant
    --layers-per-ipu 0 3
    --vocab-size 30400
    --weight-decay 0.0
    --recompute-checkpoint-every-layer False
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.recompute_checkpoint_every_layer is False

    # Execution parameters
    opts = get_options(config)
    model = PipelinedBertForPretraining(config).parallelize().half().train()
    optimizer = get_optimizer(config, model)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    # Compile model
    datum = get_generated_datum(config)
    poptorch_model.compile(*datum)
    ir = json.loads(poptorch_model._debugGetPopartIR())
    assert not any("Checkpoint" in node["name"] for node in ir["maingraph"]), \
        "PopART IR should not contain a recompute checkpoint"

    # Stash: 5 inputs, and 1 stash for transformers on ipu1
    exp_num_stash = 5 + 1
    assert sum("Stash" in node["type"] for node in ir["maingraph"]) == exp_num_stash, \
        "Both the graph inputs and the checkpoint(s) should be stashed"
    print(sum("Stash" in node["type"] for node in ir["maingraph"]))
def process(process_id=0, num_processes=1):
    # Create a poptorch.Options instance to override default options
    opts = poptorch.Options()

    # Run a 400 iteration loop on the IPU, fetching a new batch each time
    opts.deviceIterations(400)

    # Replicate the graph across 2 IPUs in each process.
    opts.replicationFactor(2)

    # Set the id of the current process and the total number of processes.
    opts.Distributed.configureProcessId(process_id, num_processes)

    # Accumulate the gradient 8 times before applying it.
    opts.Training.gradientAccumulation(8)

    # Optional: All the processes must use the same seed if shuffle=True is
    # used for the DataLoader.
    opts.randomSeed(42)

    training_data = poptorch.DataLoader(opts,
                                        dataset=ExampleDataset(shape=[3, 2],
                                                               length=100000),
                                        batch_size=model_batch_size,
                                        shuffle=True,
                                        drop_last=True)

    # Wrap the model in a PopTorch training wrapper
    poptorch_model = poptorch.trainingModel(model, options=opts)

    # Each step consumes a combined batch of deviceIterations x
    # replicationFactor x gradientAccumulation x model_batch_size samples
    # per process.
    for batch_number, (data, labels) in enumerate(training_data):
        # Each call executes the 400 iteration device loop across the two
        # local replicas. "output" and "loss" will be the respective output
        # and loss of the final batch of each replica (the default
        # AnchorMode).
        output, loss = poptorch_model(data, labels)
        print(f"{batch_number} {labels[-1]}, {output}, {loss}")
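# One possible way to launch the two processes from Python. This is an
# illustrative sketch only: it assumes two host processes and relies on the
# module-level `model`, `model_batch_size` and `ExampleDataset` being
# importable in the children; in practice Graphcore's poprun tool is the
# usual launcher for distributed PopTorch jobs.
if __name__ == "__main__":
    from multiprocessing import Process

    num_processes = 2
    workers = [
        Process(target=process, args=(i, num_processes))
        for i in range(num_processes)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()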
def build_model(self):
    self.get_xs_mask()
    model_ipu = self.model[0]
    model_cpu = self.model[1]
    tensor_ = self.tensor_
    tensors = []
    for t in tensor_:
        self.opts.anchorTensor('model.' + t, 'model.' + t)
        self.opts.anchorTensor('Gradient___model.' + t,
                               'Gradient___model.' + t)

    optimizer = poptorch.optim.SGD(model_ipu.parameters(), lr=0.001)
    training_model = poptorch.trainingModel(model_ipu,
                                            options=self.opts,
                                            optimizer=optimizer)

    model_dict = model_cpu.state_dict()
    model_one_iter = training_model.model.state_dict()
    pretrained_dict_model_one_iter = {
        k: v
        for k, v in model_one_iter.items() if k in model_dict
    }
    model_dict.update(pretrained_dict_model_one_iter)
    model_cpu.load_state_dict(model_dict)
    self.model_cpu = model_cpu
    self.training_model = training_model
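# A brief sketch of how the anchors registered in build_model() can be read
# back after a training step. This assumes the getAnchoredTensor() method on
# the wrapped model, available in recent PopTorch releases; `name` must match
# one of the short names passed to anchorTensor() above (e.g.
# 'Gradient___model.' + t for some t in self.tensor_).
def fetch_anchor(training_model, name):
    # Returns the value captured for the anchored tensor during the last run.
    return training_model.getAnchoredTensor(name)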
class ExampleModelWithLoss(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(10, 10)
        self.loss = torch.nn.MSELoss()

    def forward(self, x, target=None):
        fc = self.fc(x)
        if self.training:
            return fc, self.loss(fc, target)
        return fc


torch.manual_seed(0)
model = ExampleModelWithLoss()

# Wrap the model in our PopTorch annotation wrapper.
poptorch_model = poptorch.trainingModel(model)

# Some dummy inputs.
input = torch.randn(10)
target = torch.randn(10)

# Train on IPU.
for i in range(0, 100):
    # Each call here executes the forward pass, loss calculation, and
    # backward pass in one step.
    # Model input and loss function input are provided together.
    poptorch_out, loss = poptorch_model(input, target)
    print(f"{i}: {loss}")

# Copy the trained weights from the IPU back into the host model.
poptorch_model.copyWeightsToHost()
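# A small follow-up sketch (not part of the source): after training, the same
# host model can be wrapped for inference, reusing the weights that
# copyWeightsToHost() brought back. eval() makes forward() skip the loss
# branch, so only the prediction is returned.
model.eval()
poptorch_inference = poptorch.inferenceModel(model)
prediction = poptorch_inference(input)
print(prediction)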