def getOptimizers():
    optimizers = []

    # SGD
    sgd0 = popart.SGD({
        "lossScaling": (10.0, False),
        "defaultMomentum": (0.5, False),
        "defaultVelocityScaling": (0.5, False),
        "defaultDampening": (0.5, False),
        "defaultWeightDecay": (0.5, False)
    })
    sgd1 = popart.SGD({
        "lossScaling": (0.2, False),
        "defaultMomentum": (0.2, False),
        "defaultVelocityScaling": (0.2, False),
        "defaultDampening": (0.2, False),
        "defaultWeightDecay": (0.2, False)
    })
    optimizers.append([sgd0, sgd1])

    # Adam
    adam0 = popart.Adam({
        "lossScaling": (10.0, False),
        "defaultLearningRate": (0.5, False),
        "defaultWeightDecay": (0.5, False),
        "defaultBeta1": (0.5, False),
        "defaultBeta2": (0.5, False),
        "defaultEps": (0.5, False)
    })
    adam1 = popart.Adam({
        "lossScaling": (0.2, False),
        "defaultLearningRate": (0.2, False),
        "defaultWeightDecay": (0.2, False),
        "defaultBeta1": (0.2, False),
        "defaultBeta2": (0.2, False),
        "defaultEps": (0.2, False)
    })
    optimizers.append([adam0, adam1])

    # Adaptive
    adaptive0 = popart.Adaptive({
        "lossScaling": (10.0, False),
        "defaultLearningRate": (0.5, False),
        "defaultAlpha": (0.5, False),
        "defaultMomentum": (0.5, False),
        "defaultWeightDecay": (0.5, False),
        "defaultEps": (0.5, False)
    })
    adaptive1 = popart.Adaptive({
        "lossScaling": (0.2, False),
        "defaultLearningRate": (0.2, False),
        "defaultAlpha": (0.2, False),
        "defaultMomentum": (0.2, False),
        "defaultWeightDecay": (0.2, False),
        "defaultEps": (0.2, False)
    })
    optimizers.append([adaptive0, adaptive1])
    return optimizers
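# Hedged usage sketch (assumed, not from the original source): each entry
# returned by getOptimizers() pairs an initial optimizer with a replacement,
# which suggests a mid-training optimizer swap. Since every hyperparameter
# above is marked non-const (the False flag), the swap needs no engine
# recompilation. `make_session` and `stepio` are hypothetical stand-ins for
# a prepared popart.TrainingSession and its PyStepIO.
for opt0, opt1 in getOptimizers():
    session = make_session(optimizer=opt0)  # hypothetical helper
    session.run(stepio)
    # PopART API for replacing non-const optimizer values on the device.
    session.updateOptimizerFromHost(opt1)
    session.run(stepio)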
def create(self):
    self.iteration.learning_rate = self.option_values["defaultLearningRate"]

    if self.opt_type == "SGD":
        optimizer = popart.SGD(self.optimizer_options)
    elif self.opt_type == "ADAM":
        optimizer = popart.Adam(
            self.optimizer_options,
            accl1_type=self.accl1_type,
            scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)
    elif self.opt_type == "ADAM_NO_BIAS":
        optimizer = popart.Adam(
            self.optimizer_options,
            mode=popart.AdamMode.AdamNoBias,
            accl1_type=self.accl1_type,
            scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)
    elif self.opt_type == "LAMB":
        optimizer = popart.Adam(
            self.optimizer_options,
            mode=popart.AdamMode.Lamb,
            accl1_type=self.accl1_type,
            scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)
    elif self.opt_type == "LAMB_NO_BIAS":
        optimizer = popart.Adam(
            self.optimizer_options,
            mode=popart.AdamMode.LambNoBias,
            accl1_type=self.accl1_type,
            scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)

    weight_decay_tensor_list = []

    for stage, tensors in self.tensors.items():
        for tensor_id in tensors:
            params = self.option_values.copy()

            if self.include_for_weight_decay(tensor_id):
                params["weightDecay"] = self.weight_decay
                weight_decay_tensor_list.append(tensor_id)
            else:
                params["weightDecay"] = 0

            if self.disable_lamb(tensor_id):
                params["maxWeightNorm"] = 0

            for transform in self.transforms:
                params = transform(tensor_id, params, stage)

            specific_params = {
                k: v
                for k, v in params.items() if k not in self.option_values
            }
            if specific_params:
                p = self._make_tuple_options(specific_params)
                optimizer.insertSpecific(tensor_id, p)

    if len(weight_decay_tensor_list) != 0:
        logger.debug(
            f" Weight decay of {self.weight_decay} applied to: {weight_decay_tensor_list}"
        )

    return optimizer
def test_replicated_adam_weight_update(tmpdir):
    optimizer_dict = {
        "defaultLearningRate": (0.005, True),
        "defaultBeta1": (0.7, True),
        "defaultBeta2": (0.8, True),
        "defaultWeightDecay": (0.1, True),
        "defaultEps": (1e-6, True),
        "lossScaling": (10.0, True),
    }

    run_model(tmpdir,
              'phased.onnx',
              execution_mode="phased",
              batch_size=2,
              num_replicas=1,
              num_iterations=5,
              optimizer=popart.Adam(optimizer_dict),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation)
    run_model(tmpdir,
              'phased_replicated.onnx',
              execution_mode="phased",
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              optimizer=popart.Adam(optimizer_dict),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation)
    run_model(tmpdir,
              'phased_replicated_rws.onnx',
              execution_mode="phased",
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              optimizer=popart.Adam(optimizer_dict),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipRtsLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=offChipRtsLocation)

    phased = onnx.load(str(tmpdir / 'phased.onnx'))
    phased_replicated = onnx.load(str(tmpdir / 'phased_replicated.onnx'))
    phased_replicated_rws = onnx.load(
        str(tmpdir / 'phased_replicated_rws.onnx'))

    check_model(phased, phased_replicated)
    check_model(phased, phased_replicated_rws)
def test_adam_mixed_mode_0(tmpdir):
    # Optimizer parameters
    defaultLearningRate = 0.005
    defaultBeta1 = 0.7
    defaultBeta2 = 0.8
    defaultWeightDecay = 0.1
    defaultEps = 1e-6
    lossScaling = 10.0

    optMaps = [{
        0:
        popart.Adam({
            "defaultLearningRate": (defaultLearningRate, True),
            "defaultBeta1": (defaultBeta1, True),
            "defaultBeta2": (defaultBeta2, True),
            "defaultWeightDecay": (defaultWeightDecay, True),
            "defaultEps": (defaultEps, True),
            "lossScaling": (lossScaling, True),
        })
    }]
    outlining = [False]

    # One optimizer per hyperparameter: the (value, isConst) flag is False
    # only when i matches, so exactly one parameter is non-const each time.
    # First sweep without outlining.
    for i in range(6):
        optMap = {
            "defaultLearningRate": (defaultLearningRate, i != 0),
            "defaultBeta1": (defaultBeta1, i != 1),
            "defaultBeta2": (defaultBeta2, i != 2),
            "defaultWeightDecay": (defaultWeightDecay, i != 3),
            "defaultEps": (defaultEps, i != 4),
            "lossScaling": (lossScaling, i != 5),
        }
        optMaps = optMaps + [{0: popart.Adam(optMap)}]
        outlining = outlining + [False]

    # Same sweep again, this time with outlining enabled.
    for i in range(6):
        optMap = {
            "defaultLearningRate": (defaultLearningRate, i != 0),
            "defaultBeta1": (defaultBeta1, i != 1),
            "defaultBeta2": (defaultBeta2, i != 2),
            "defaultWeightDecay": (defaultWeightDecay, i != 3),
            "defaultEps": (defaultEps, i != 4),
            "lossScaling": (lossScaling, i != 5),
        }
        optMaps = optMaps + [{0: popart.Adam(optMap)}]
        outlining = outlining + [True]

    run_adam_mixed_mode(10, optMaps, outlining, tmpdir, np.float32)
    run_adam_mixed_mode(10, optMaps, outlining, tmpdir, np.float16)
def session(splits=1):
    proto, data, x, loss = model(splits)

    user_options = {
        "enableOutlining": False,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
        "optimizerStateTensorLocationSettings":
        popart.TensorLocationSettings(popart.TensorStorage.OffChip, 0)
    }

    optimizer = popart.Adam(
        {
            "defaultLearningRate": (0.1, True),
            "defaultBeta1": (0.1, True),
            "defaultBeta2": (0.1, True)
        },
        mode=popart.AdamMode.LambNoBias
    )  # NoBias to increase the error of incorrect gradients

    return run_py(proto,
                  data=data,
                  outputs=x,
                  loss=loss,
                  optimizer=optimizer,
                  patterns=popart.Patterns(),
                  user_options=user_options,
                  skip_execution=False)
def session(train=False,
            skip_execution=False,
            include_patterns=True,
            splits=1,
            outline=False,
            optim="Sgd"):
    proto, data, x, loss = model(splits=splits)

    patterns = popart.Patterns()
    patterns.enablePattern("TiedGatherPattern", include_patterns)
    patterns.enablePattern("TiedGatherAccumulatePattern", include_patterns)

    user_options = {
        "enableOutlining": outline,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
        "accumulationAndReplicationReductionType": popart.ReductionType.Mean,
        "meanAccumulationAndReplicationReductionStrategy":
        popart.MeanReductionStrategy.Running
    }

    if optim == "Lamb":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.1, False),
                "defaultWeightDecay": (0.1, True),
                "defaultBeta1": (0.1, True),
                "defaultBeta2": (0.1, True),
                "lossScaling": (20, True),
            },
            mode=popart.AdamMode.LambNoBias
        )  # NoBias to increase the error of incorrect gradients
        user_options[
            "optimizerStateTensorLocationSettings"] = popart.TensorLocationSettings(
                popart.TensorLocation(popart.TensorStorage.OffChip,
                                      popart.ReplicatedTensorSharding.On),
                0, 0)
        user_options["enableReplicatedGraphs"] = True
        user_options["replicatedGraphCount"] = 2
        ipus = 2
    else:
        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            # 0 dampening to increase the error of incorrect gradients
            "defaultDampening": (0, True),
            "lossScaling": (20, True)
        })
        ipus = 1

    if train:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      loss=loss,
                      optimizer=optimizer,
                      patterns=patterns,
                      user_options=user_options,
                      skip_execution=skip_execution)
    else:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      patterns=patterns,
                      user_options={
                          "enableOutlining": outline,
                          "constantWeights": False
                      },
                      skip_execution=skip_execution)
def _get_popart_optimizer(optType, clipNormSettings):
    if optType == 'sgd':
        return popart.SGD({"defaultLearningRate": (0.1, True)},
                          clipNormSettings)
    elif optType == 'adam':
        return popart.Adam(
            {
                "defaultLearningRate": (0.1, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "lossScaling": (20, False),
            },
            weight_decay_mode=popart.WeightDecayMode.L2Regularization,
            mode=popart.AdamMode.Adam,
            clip_norm_settings=clipNormSettings)
    elif optType == 'lamb':
        return popart.Adam(
            {
                "defaultLearningRate": (0.01, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "defaultEps": (1e-06, False),
                "defaultWeightDecay": (0.1, False),
                "lossScaling": (10, False),
            },
            weight_decay_mode=popart.WeightDecayMode.Decay,
            mode=popart.AdamMode.Lamb,
            clip_norm_settings=clipNormSettings)
    elif optType == 'lambnobias':
        return popart.Adam(
            {
                "defaultLearningRate": (0.01, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "defaultEps": (1e-06, False),
                "defaultWeightDecay": (0.1, False),
                "lossScaling": (10, True),
            },
            scaled_optimizer_state=False,
            weight_decay_mode=popart.WeightDecayMode.Decay,
            mode=popart.AdamMode.LambNoBias,
            clip_norm_settings=clipNormSettings)
    else:
        raise Exception(f"Unrecognized optimizer type: '{optType}'")
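# Hedged usage sketch (assumed, not from the original source):
# `clipNormSettings` is a list of popart.ClipNormSettings entries. Clipping
# the norm of all weights to 1.0 might look like this;
# ClipNormSettings.clipAllWeights is the same constructor used elsewhere in
# this suite.
clip_norms = [popart.ClipNormSettings.clipAllWeights(1.0)]
optimizer = _get_popart_optimizer('lamb', clip_norms)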
def update_and_create(self, step, epoch):
    """
    Updates the learning rate and returns a new popart optimizer object.
    The learning-rate schedule used here is the same as for the RNN-T
    reference model.
    """
    new_lr = self.get_new_lr(step, epoch)
    logger.info("Setting learning-rate to {}".format(new_lr))
    self.optimizer_options["defaultLearningRate"] = (new_lr, False)
    if self.optimizer_type == 'SGD':
        optimizer = popart.SGD(self.optimizer_options)
    elif self.optimizer_type == 'LAMB':
        if self.gradient_clipping_norm is None:
            optimizer = popart.Adam(self.optimizer_options,
                                    mode=popart.AdamMode.Lamb)
        else:
            optimizer = popart.Adam(
                self.optimizer_options,
                mode=popart.AdamMode.Lamb,
                clip_norm_settings=[
                    popart.ClipNormSettings.clipAllWeights(
                        self.gradient_clipping_norm)
                ])
    self.current_lr = new_lr
    return optimizer
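# Hedged driver sketch (assumed, not from the original source): because
# defaultLearningRate is inserted as non-const (the False flag), the rebuilt
# optimizer can be pushed to an already-prepared session without
# recompilation. `lr_schedule`, `session`, `stepio`, `steps_per_epoch` and
# `epoch` are assumed to exist in the caller.
for step in range(steps_per_epoch):
    optimizer = lr_schedule.update_and_create(step, epoch)
    session.updateOptimizerFromHost(optimizer)
    session.run(stepio)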
def test_final_stage_recompute_0():
    np.random.seed(0)

    gradient_accumulation = 5
    batch_size = 1
    hidden_size = 16

    input_shape = [batch_size, hidden_size]

    weight_data = np.random.normal(
        0, 0.02, [hidden_size, hidden_size]).astype(np.float32)
    input_data = np.random.normal(
        0, 0.02, [gradient_accumulation] + input_shape).astype(np.float32)

    builder = popart.Builder()
    x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                  "x_in")

    with builder.virtualGraph(0), builder.pipelineStage(0):
        weight_1 = builder.addInitializedInputTensor(weight_data, "weight_1")
        x = builder.aiOnnx.matmul([x_in, weight_1])

    with builder.virtualGraph(1), builder.pipelineStage(1):
        weight_2 = builder.addInitializedInputTensor(weight_data, "weight_2")
        # This MatMul should be recomputed
        x_recomp = builder.aiOnnx.matmul([x, weight_2])
        x = builder.checkpointOutput([x_recomp])[0]

        weight_3 = builder.addInitializedInputTensor(weight_data, "weight_3")
        # This MatMul should not be recomputed
        x_no_recomp = builder.aiOnnx.matmul([x, weight_3])
        l1 = builder.aiGraphcore.l1loss([x_no_recomp], 0.1)

    proto = builder.getModelProto()
    dataFlow = popart.DataFlow(1, [l1])

    opts = popart.SessionOptions()
    opts.enableOutlining = False
    opts.enablePipelining = True
    opts.enableGradientAccumulation = True
    opts.accumulationFactor = gradient_accumulation
    opts.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
    opts.autoRecomputation = popart.RecomputationType.Pipeline
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     loss=l1,
                                     optimizer=popart.Adam({}),
                                     deviceInfo=tu.create_test_device(
                                         numIpus=2,
                                         opts={"compileIPUCode": False}))

    # Verify that the recompute attributes of the matmuls in the main graph
    # are correct.
    ir = json.loads(session._serializeIr(popart.IrSerializationFormat.JSON))
    for op in ir["maingraph"]:
        if x_recomp in map(lambda out: out["name"], op["outputs"]):
            assert op["attributes"]["recompute"] == "YES"
        elif x_no_recomp in map(lambda out: out["name"], op["outputs"]):
            assert op["attributes"]["recompute"] == "NO"
def run_test(mode=None, verify=None):
    builder = popart.Builder()

    x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                  "x_in")
    weight_1 = builder.addInitializedInputTensor(weight_data, "weight_1")

    # We want a bwd pass that looks like:
    #
    #   restore, op1, restore, op2, restore, op3
    #
    # where op1, op2 & op3 are gradient operations that
    # have implicit recompute inputs.
    with builder.virtualGraph(0), builder.pipelineStage(0):
        x = builder.aiOnnx.matmul([x_in, weight_1])
        x = builder.checkpointOutput([x])[0]

        x = builder.aiOnnx.add([x, x])
        # Gelu is a unary operation that takes the fwd input
        # activation. This satisfies our requirement above
        # of needing an implicit recompute input.
        x = builder.aiGraphcore.gelu([x])
        x = builder.checkpointOutput([x])[0]

        x = builder.aiOnnx.add([x, x])
        x = builder.aiGraphcore.gelu([x])
        x = builder.checkpointOutput([x])[0]
        o = x

    with builder.virtualGraph(1), builder.pipelineStage(1):
        l1 = builder.aiGraphcore.l1loss([o], 0.1)

    proto = builder.getModelProto()
    dataFlow = popart.DataFlow(1, [
        o,
        popart.reservedGradientPrefix() + weight_1,
    ])

    opts = popart.SessionOptions()
    opts.enableOutlining = False
    opts.enablePipelining = True
    opts.enableGradientAccumulation = True
    opts.accumulationFactor = gradient_accumulation
    opts.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
    if mode is not None:
        opts.autoRecomputation = mode
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     loss=l1,
                                     optimizer=popart.Adam({}),
                                     deviceInfo=tu.create_test_device(
                                         numIpus=2,
                                         opts={"compileIPUCode": False}))
    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()
    inputs = {x_in: input_data}
    stepio = popart.PyStepIO(inputs, anchors)

    for _ in range(10):
        session.run(stepio)

    if verify is not None:
        verify(session)

    return anchors
def test_adam_mixed_mode_1(tmpdir):
    # Optimizer parameters
    defaultLearningRate0 = 0.005
    defaultLearningRate5 = 0.0025

    defaultBeta1 = 0.7
    defaultBeta2 = 0.8
    defaultWeightDecay = 0.1
    defaultEps = 1e-6
    lossScaling = 10.0

    adam00 = popart.Adam({
        "defaultLearningRate": (defaultLearningRate0, False),
        "defaultBeta1": (defaultBeta1, True),
        "defaultBeta2": (defaultBeta2, True),
        "defaultWeightDecay": (defaultWeightDecay, True),
        "defaultEps": (defaultEps, True),
        "lossScaling": (lossScaling, True),
    })
    adam00.insertSpecific("w_0", {"beta1": (0.9, True), "beta2": (0.99, True)})
    adam00.insertSpecific("b_0", {"beta1": (0.9, True), "beta2": (0.99, True)})

    adam05 = popart.Adam({
        "defaultLearningRate": (defaultLearningRate5, False),
        "defaultBeta1": (defaultBeta1, True),
        "defaultBeta2": (defaultBeta2, True),
        "defaultWeightDecay": (defaultWeightDecay, True),
        "defaultEps": (defaultEps, True),
        "lossScaling": (lossScaling, True),
    })
    adam05.insertSpecific("w_0", {"beta1": (0.9, True), "beta2": (0.99, True)})
    adam05.insertSpecific("b_0", {"beta1": (0.9, True), "beta2": (0.99, True)})

    adam10 = popart.Adam({
        "defaultLearningRate": (defaultLearningRate0, False),
        "defaultBeta1": (defaultBeta1, False),
        "defaultBeta2": (defaultBeta2, False),
        "defaultWeightDecay": (defaultWeightDecay, False),
        "defaultEps": (defaultEps, False),
        "lossScaling": (lossScaling, False),
    })
    adam10.insertSpecific("w_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, False)
    })
    adam10.insertSpecific("b_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, False)
    })

    adam15 = popart.Adam({
        "defaultLearningRate": (defaultLearningRate5, False),
        "defaultBeta1": (defaultBeta1, False),
        "defaultBeta2": (defaultBeta2, False),
        "defaultWeightDecay": (defaultWeightDecay, False),
        "defaultEps": (defaultEps, False),
        "lossScaling": (lossScaling, False),
    })
    adam15.insertSpecific("w_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, False)
    })
    adam15.insertSpecific("b_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, False)
    })

    adam20 = popart.Adam({
        "defaultLearningRate": (defaultLearningRate0, False),
        "defaultBeta1": (defaultBeta1, True),
        "defaultBeta2": (defaultBeta2, False),
        "defaultWeightDecay": (defaultWeightDecay, False),
        "defaultEps": (defaultEps, False),
        "lossScaling": (lossScaling, False),
    })
    adam20.insertSpecific("w_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, True)
    })
    adam20.insertSpecific("b_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, True)
    })

    adam25 = popart.Adam({
        "defaultLearningRate": (defaultLearningRate5, False),
        "defaultBeta1": (defaultBeta1, True),
        "defaultBeta2": (defaultBeta2, False),
        "defaultWeightDecay": (defaultWeightDecay, False),
        "defaultEps": (defaultEps, False),
        "lossScaling": (lossScaling, False),
    })
    adam25.insertSpecific("w_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, True)
    })
    adam25.insertSpecific("b_0", {
        "beta1": (0.9, False),
        "beta2": (0.99, True)
    })

    # Change Adam optimizer after 0 and 5 steps
    optMaps = [{
        0: adam00,
        5: adam05
    }, {
        0: adam10,
        5: adam15
    }, {
        0: adam20,
        5: adam25
    }]

    outlining = [True, True, True]

    run_adam_mixed_mode(10, optMaps, outlining, tmpdir, np.float32)
    run_adam_mixed_mode(10, optMaps, outlining, tmpdir, np.float16)
            offset=layer * elms * 4)
        assert np.array_equal(anchors[weightsIds[layer]].flatten(),
                              saved_weights)


optimizerInfos = []
# 1. SGD with momentum
optimizerInfos.append((popart.SGD({
    "defaultLearningRate": (0.2, True),
    "defaultMomentum": (0.5, True)
}), [popart.reservedAcclPrefix()]))
# 2. Adam
optimizerInfos.append((popart.Adam({
    "defaultLearningRate": (0.2, True),
    "defaultBeta1": (0.1, True),
    "defaultBeta2": (0.1, True),
    "defaultWeightDecay": (0.5, True),
    "defaultEps": (1e-5, True),
    "lossScaling": (2, True)
}), [
    popart.reservedAccl1Prefix(),
    popart.reservedAccl2Prefix(),
    popart.reservedStepPrefix()
]))
# 3. Adaptive
optimizerInfos.append(
    (popart.Adaptive({"defaultLearningRate": (0.2, True)},
                     mode=popart.AdaptiveMode.CenteredRMSProp),
     [popart.reservedAccl1Prefix(),
      popart.reservedAccl2Prefix()]))
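# Hedged sketch (an assumption, not from the original source): each reserved
# prefix is prepended to a weight tensor's id to name the matching optimizer
# state tensor, so the expected state tensor ids can be enumerated from the
# (optimizer, prefixes) pairs above. `weightsIds` is the list of weight
# tensor ids used earlier in this test.
for optimizer, state_prefixes in optimizerInfos:
    expected_state_ids = [
        prefix + weight_id for prefix in state_prefixes
        for weight_id in weightsIds
    ]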
def run_model(tmpdir,
              model_file_name,
              schedule=popart.ExecutionPhaseSchedule.Interleaving,
              enable_outlining=False,
              stride=1,
              num_layers=5,
              dsize=128,
              batch_size=4,
              batch_serialize=1,
              batch_schedule=popart.BatchSerializationBatchSchedule.Isomorphic,
              num_iterations=5,
              num_replicas=2,
              optimizer=popart.Adam({"defaultLearningRate": (0.1, False)})):
    np.random.seed(52125)

    builder = popart.Builder()
    ip = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [batch_size, dsize, dsize]))

    def add_layer(index, in_id):
        w = builder.addInitializedInputTensor(
            np.random.rand(dsize, dsize).astype(np.float32), f"W{index}")
        matmul_id = builder.aiOnnx.matmul([in_id, w])
        return matmul_id

    out = ip
    l1 = ""
    final_loss = ""

    for i in range(num_layers):
        vgid = 0
        with builder.executionPhase(i * stride), builder.virtualGraph(vgid):
            for j in range(3):
                out = add_layer(i, out)

        if i == num_layers - 1:
            with builder.executionPhase(i * stride), \
                    builder.virtualGraph(vgid):
                l1 = builder.aiGraphcore.l1loss([out], 0.1,
                                                popart.ReductionType.Sum)
                final_loss = builder.aiGraphcore.identityloss([l1])

    anchorIds = []

    builder.addOutputTensor(out)

    num_ipus = 1

    dfAnchors = {}
    for anchorId in anchorIds:
        dfAnchors.update({anchorId: popart.AnchorReturnType("All")})

    opts = popart.SessionOptions()
    # Cycle counting
    opts.instrumentWithHardwareCycleCounter = True
    # Outlining
    opts.enableOutlining = enable_outlining
    opts.enableOutliningCopyCostPruning = False
    opts.outlineThreshold = -np.inf
    opts.aliasZeroCopy = enable_outlining
    # Replicated graphs
    opts.replicatedGraphCount = num_replicas
    opts.enableReplicatedGraphs = True if num_replicas > 1 else False
    # IO tiles
    opts.numIOTiles = 192
    # Phased execution
    opts.executionPhaseSettings.phases = num_layers * stride
    opts.executionPhaseSettings.stages = 1
    opts.executionPhaseSettings.schedule = schedule
    opts.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
    # Recomputation
    opts.autoRecomputation = popart.RecomputationType.Standard
    opts.explicitRecomputation = True
    # Batch serialization
    if batch_serialize > 1:
        opts.batchSerializationSettings.factor = batch_serialize
        opts.batchSerializationSettings.concatOnVirtualGraphChange = False
        opts.batchSerializationSettings.concatOnExecutionPhaseChange = False
        opts.batchSerializationSettings.concatOnPipelineStageChange = False
        opts.batchSerializationSettings.batchSchedule = batch_schedule
        # Related execution phase setting
        opts.executionPhaseSettings.activationIOSchedule = popart.ExecutionPhaseIOSchedule.OnDemand
    # Streaming memory
    offChipLocation = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.IO,
            storageTileSet=popart.TileSet.IO,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)
    offChipRtsLocation = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.IO,
            storageTileSet=popart.TileSet.IO,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.On),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    opts.activationTensorLocationSettings = offChipLocation
    opts.weightTensorLocationSettings = offChipRtsLocation
    opts.optimizerStateTensorLocationSettings = offChipRtsLocation

    proto = builder.getModelProto()

    with tu.create_test_device(num_replicas * num_ipus,
                               pattern=popart.SyncPattern.Full) as device:
        session = popart.TrainingSession(
            fnModel=proto,
            dataFlow=popart.DataFlow(1, dfAnchors),
            optimizer=optimizer,
            loss=final_loss,
            patterns=popart.Patterns(popart.PatternsLevel.All),
            userOptions=opts,
            deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()

        for i in range(num_iterations):
            ip_data = np.random.rand(num_replicas, batch_size, dsize,
                                     dsize).astype(np.float32)
            stepio = popart.PyStepIO({ip: ip_data}, anchors)
            session.run(stepio)

        cycles = session.getCycleCount()

        print("anchors:")
        print(anchors)
        session.modelToHost(str(tmpdir / model_file_name))

    return cycles
def run_test(compute_batch, batch_serialization_factor,
             accumulation_factor, replication_factor, explicit_loops):
    proto, data, xs, loss = model(compute_batch,
                                  batch_serialization_factor,
                                  accumulation_factor, replication_factor)

    options = popart.SessionOptions()
    patterns = popart.Patterns(popart.PatternsLevel.All)

    if optim == "SGD":
        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, False),
            "lossScaling": (20, False)
        })
    elif optim == "SGDM1":
        optimizer = popart.SGD(
            {
                "defaultLearningRate": (0.1, False),
                "defaultMomentum": (0.9, False),
                "defaultDampening": (0.1, False),  # to increase errors
                "lossScaling": (20, False),
            },
            accumulatorAndMomentum=popart.SGDAccumulatorAndMomentum.Combined)
    elif optim == "SGDM2":
        optimizer = popart.SGD(
            {
                "defaultLearningRate": (0.1, False),
                "defaultMomentum": (0.9, False),
                "defaultDampening": (0.1, False),  # to increase errors
                "lossScaling": (20, False),
            },
            accumulatorAndMomentum=popart.SGDAccumulatorAndMomentum.Separate)
    elif optim == "ADAM":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.1, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "lossScaling": (20, False),
            },
            mode=popart.AdamMode.AdamNoBias)  # to increase errors

    if explicit_loops:
        options.enableExplicitMainLoops = True
        options.aliasZeroCopy = True
        options.explicitRecomputation = True
        options.useHostCopyOps = True

    options.batchSerializationSettings.factor = batch_serialization_factor
    if batch_serialization_factor > 1 and batchserial == "Loop":
        options.batchSerializationSettings.method = popart.BatchSerializationMethod.Loop
        options.batchSerializationSettings.transformContext = popart.BatchSerializationTransformContext.Bwd
    options.accumulationAndReplicationReductionType = reduction
    if accumulation_factor > 1:
        options.enableGradientAccumulation = True
        options.accumulationFactor = accumulation_factor
    if reduction_type == "MeanRunning":
        options.meanAccumulationAndReplicationReductionStrategy = popart.MeanReductionStrategy.Running
    if reduction_type == "MeanPost":
        options.meanAccumulationAndReplicationReductionStrategy = popart.MeanReductionStrategy.Post
    if replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = replication_factor
    device = tu.create_test_device(replication_factor,
                                   pattern=popart.SyncPattern.Full)

    dataFlow = popart.DataFlow(batches_per_step,
                               {x: popart.AnchorReturnType("ALL")
                                for x in xs})

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=options,
                                     loss=loss,
                                     optimizer=optimizer,
                                     patterns=patterns,
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()

    stepio = popart.PyStepIO(data, anchors)
    session.run(stepio)

    file_path = str(tmpdir / "model_test.onnx")
    session.modelToHost(file_path)
    post_proto = onnx.load(file_path)

    device.detach()

    return [anchors[x] for x in xs], post_proto
def replicated_tensor_sharding_core():
    parser = argparse.ArgumentParser(description="Parse launch parameters.")
    parser.add_argument("--tensors", nargs="*")
    parser.add_argument("--optim", nargs="?")
    parser.add_argument("--tmpdir", nargs="?")
    parser.add_argument("--filename", nargs="?")
    parser.add_argument("--compute_batch", nargs="?")
    args = parser.parse_args(sys.argv[2:])

    ipus_per_replica = 1
    batches_per_step = 10
    accumulation_factor = 4
    compute_batch = int(args.compute_batch)
    hidden_size = 4
    reduction = popart.ReductionType.Sum

    deviceInfo = popdist.popart.getDevice(ipus_per_replica)
    num_local_replicas = popdist.getNumLocalReplicas()
    num_total_replicas = popdist.getNumTotalReplicas()

    np.random.seed(12321)
    weight_data = np.random.rand(hidden_size, hidden_size).astype(np.float32)

    input_data = []
    label_data = []

    for i in range(0, batches_per_step * num_local_replicas *
                   accumulation_factor * compute_batch):
        np.random.seed(popdist.getInstanceIndex() +
                       i * popdist.getNumInstances())
        input_data += [np.random.rand(hidden_size).astype(np.float32)]
        label_data += [np.random.randint(0, hidden_size, size=1)]

    input_data = np.concatenate(input_data)
    label_data = np.concatenate(label_data)

    builder = popart.Builder()

    d0 = builder.addInputTensor(
        popart.TensorInfo("FLOAT", (compute_batch, hidden_size)), "d0")
    l0 = builder.addInputTensor(
        popart.TensorInfo("UINT32", (compute_batch, )), "l0")

    data = {}
    data[d0] = input_data.reshape(
        (batches_per_step, num_local_replicas, accumulation_factor,
         compute_batch, -1))

    w0 = builder.addInitializedInputTensor(weight_data, 'weight0')
    x = builder.aiOnnx.matmul([d0, w0])
    x = builder.aiOnnx.softmax([x])

    data[l0] = label_data.reshape(
        (batches_per_step, num_local_replicas, accumulation_factor,
         compute_batch, -1)).astype(np.uint32)

    loss = builder.aiGraphcore.nllloss([x, l0],
                                       reduction=reduction,
                                       debugContext='loss')

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(
        batches_per_step,
        {av: popart.AnchorReturnType("ALL") for av in [x, loss]})

    opts = popart.SessionOptions()
    if accumulation_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accumulation_factor
    opts.explicitRecomputation = True
    opts.enableExplicitMainLoops = True
    opts.useHostCopyOps = True

    # Let popdist handle distributed settings, such as:
    # opts.enableDistributedReplicatedGraphs
    # opts.globalReplicaOffset
    # opts.globalReplicationFactor
    popdist.popart.configureSessionOptions(opts)

    for tensor in ["weight", "optimizerState", "accumulator"]:
        userOption = tensor + "TensorLocationSettings"
        print(f"Setting RTS: {userOption}, "
              f"num_total_replicas: {num_total_replicas} "
              f"num_local_replicas: {num_local_replicas}")
        locationSetting = getattr(opts, userOption)
        locationSetting.minElementsForOffChip = 0
        locationSetting.minElementsForReplicatedTensorSharding = num_total_replicas
        if tensor in args.tensors:
            locationSetting.location.replicatedTensorSharding = popart.ReplicatedTensorSharding.On
            if num_total_replicas > num_local_replicas:
                locationSetting.location.shardingDomain = popart.CommGroup(
                    popart.CommGroupType.Consecutive, num_local_replicas)
        setattr(opts, userOption, locationSetting)

    if args.optim == "Adam":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.01, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "defaultEps": (1e-06, False),
                "defaultWeightDecay": (0.1, False),
                "lossScaling": (10, False),
            },
            weight_decay_mode=popart.WeightDecayMode.Decay,
            mode=popart.AdamMode.LambNoBias)
    if args.optim == "SGD":
        optimizer = popart.ConstSGD(0.01)

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     deviceInfo=deviceInfo,
                                     userOptions=opts,
                                     loss=loss,
                                     optimizer=optimizer)
    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()

    stepio = popart.PyStepIO(data, anchors)
    session.run(stepio)

    tmp_path = Path(args.tmpdir)
    tmp_path.mkdir(parents=True, exist_ok=True)
    file_path = str(tmp_path / args.filename)
    session.modelToHost(file_path)
    post_proto = onnx.load(file_path)
# beta 2 (Adam)
b2 = 0.999

# If this is set to, say, 100, the test fails; see T24563.
testTilesPerIPU = 1216

sgd_optimizer = popart.SGD({
    "defaultLearningRate": (lr, False),
    "defaultWeightDecay": (wd, False)
})

adam_optimizer = popart.Adam({
    "defaultLearningRate": (lr, False),
    "defaultBeta1": (b1, False),
    "defaultBeta2": (b2, False),
    "defaultWeightDecay": (wd, False),
    "defaultEps": (1e-6, True),
})

grad_accl_prefix = popart.reservedAcclPrefix()


def get_micro_batch_size(accum_factor):
    """
    No data replication, so micro batch size = batch size / accumulation
    factor.
    """
    if batch_size % accum_factor != 0:
        raise RuntimeError("accum_factor is not a factor of batch_size")
    micro_batch_size = batch_size // accum_factor
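    # Hedged worked example (assumed numbers, not from the source): with
    # batch_size = 16 and accum_factor = 4, micro_batch_size = 16 // 4 = 4,
    # i.e. gradients are accumulated over 4 micro batches of 4 samples per
    # weight update; accum_factor = 5 would raise above, since 5 does not
    # divide 16.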
def run_test(mode=None, verify=None):
    builder = popart.Builder()

    def norm(input_x):
        gamma = builder.addInitializedInputTensor(
            np.ones(hidden_size, np.float32), "Gamma")
        beta = builder.addInitializedInputTensor(
            np.zeros(hidden_size, np.float32), "Beta")
        return builder.aiGraphcore.groupnormalization(
            [input_x, gamma, beta], 1)[0]

    x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                  "x_in")

    weight_1 = builder.addInitializedInputTensor(weight_data, "weight_1")
    weight_2 = builder.addInitializedInputTensor(weight_data, "weight_2")
    weight_3 = builder.addInitializedInputTensor(weight_data, "weight_3")

    with builder.virtualGraph(0), builder.pipelineStage(0):
        x_0 = builder.aiOnnx.matmul([x_in, weight_1])
        x_0 = norm(x_0)
        # If recomputeOutputs were used directly on `x_0`, all 3 outputs
        # of groupnormalization would be stashed. By using a
        # checkpointOutput, only 1 output will be stashed and the rest
        # will be recomputed.
        x_0 = builder.checkpointOutput([x_0])[0]

        x_1 = builder.aiOnnx.matmul([x_0, weight_2])
        x_1 = norm(x_1)
        x_1 = builder.aiOnnx.add([x_0, x_1])
        # This checkpoint should be redundant, as x_1 will be stashed
        # at the start of stage 1 on ipu 1.
        x_1 = builder.checkpointOutput([x_1])[0]

    with builder.virtualGraph(1), builder.pipelineStage(1):
        o = builder.aiOnnx.matmul([x_1, weight_3])
        l1 = builder.aiGraphcore.l1loss([o], 0.1)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, [
        o,
        popart.reservedGradientPrefix() + weight_1,
        popart.reservedGradientPrefix() + weight_2,
        popart.reservedGradientPrefix() + weight_3,
    ])

    opts = popart.SessionOptions()
    opts.enableOutlining = False
    opts.enablePipelining = True
    opts.enableGradientAccumulation = True
    opts.accumulationFactor = gradient_accumulation
    opts.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
    if mode is not None:
        opts.autoRecomputation = mode
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     loss=l1,
                                     optimizer=popart.Adam({}),
                                     deviceInfo=tu.create_test_device(
                                         numIpus=2,
                                         opts={"compileIPUCode": False}))
    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()
    inputs = {x_in: input_data}
    stepio = popart.PyStepIO(inputs, anchors)

    for _ in range(10):
        session.run(stepio)

    if verify is not None:
        verify(session, x_0)

    return anchors
def create(self):
    self.iteration.learning_rate = self.optimizer_options[
        "defaultLearningRate"][0]

    if self.opt_type == "SGD":
        optimizer = popart.SGD(self.optimizer_options)
    elif self.opt_type == "ADAM":
        optimizer = popart.Adam(self.optimizer_options)
    elif self.opt_type == "ADAM_NO_BIAS":
        optimizer = popart.Adam(self.optimizer_options,
                                mode=popart.AdamMode.AdamNoBias)
    elif self.opt_type == "LAMB":
        optimizer = popart.Adam(self.optimizer_options,
                                mode=popart.AdamMode.Lamb)
    elif self.opt_type == "LAMB_NO_BIAS":
        optimizer = popart.Adam(self.optimizer_options,
                                mode=popart.AdamMode.LambNoBias)

    projection_scale_added = False
    weight_decay_tensor_list = []

    if self.execution_mode == "PIPELINE":
        for stage in self.tensors:
            specific_parameters = {}

            if self.lr_scaling:
                default_lr, lr_is_const = self.optimizer_options[
                    "defaultLearningRate"]
                specific_parameters["learningRate"] = (
                    default_lr * self.pipeline_stage_lr_scaling[stage],
                    lr_is_const)

            if self.momentum_scaling:
                # Momentum values are scaled inversely with the pipeline
                # stage.
                if self.option_values["defaultMomentum"] != 0:
                    # This arithmetic will create FP rounding errors
                    # if momentum == 0.
                    momentum = 1 - (
                        (1 - self.option_values["defaultMomentum"]) *
                        self.pipeline_stage_momentum_scaling[stage])
                else:
                    momentum = 0
                specific_parameters["momentum"] = (momentum, True)

                if self.option_values["defaultDampening"] != 0:
                    dampening = 1 - (
                        (1 - self.option_values["defaultDampening"]) *
                        self.pipeline_stage_dampening_scaling[stage])
                else:
                    dampening = 0
                specific_parameters["dampening"] = (dampening, True)

            for tensor_id in self.tensors[stage]:
                if self.include_for_weight_decay(tensor_id):
                    specific_parameters["weightDecay"] = (self.weight_decay,
                                                          True)
                    weight_decay_tensor_list.append(tensor_id)

                if self.squad_lr_scaling and "Squad" in tensor_id:
                    logger.debug(
                        f"Setting SQuAD LR scaling for tensor [{tensor_id}]: {self.squad_lr_scale}"
                    )
                    lr = specific_parameters.get(
                        "learningRate",
                        self.optimizer_options["defaultLearningRate"])
                    params = specific_parameters.copy()
                    params["learningRate"] = (lr[0] * self.squad_lr_scale,
                                              lr[1])
                    optimizer.insertSpecific(tensor_id, params)
                else:
                    optimizer.insertSpecific(tensor_id, specific_parameters)
    else:
        for tensor_id in self.tensors[0]:
            if self.include_for_weight_decay(tensor_id):
                specific_parameters = {
                    "weightDecay": (self.weight_decay, True)
                }
                weight_decay_tensor_list.append(tensor_id)
                optimizer.insertSpecific(tensor_id, specific_parameters)

    if len(weight_decay_tensor_list) != 0:
        logger.debug(
            f" Weight decay of {self.weight_decay} applied to: {weight_decay_tensor_list}"
        )

    return optimizer
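# Hedged worked example (assumed values, not from the source): with
# defaultMomentum = 0.9 and a pipeline_stage_momentum_scaling of 0.5 for a
# stage, the momentum arithmetic above gives
#   1 - ((1 - 0.9) * 0.5) = 0.95
# i.e. the scaling shrinks the distance from 1.0 rather than the momentum
# itself, so the result stays in (0, 1) for any positive scale <= 1.
default_momentum = 0.9
stage_scale = 0.5
stage_momentum = 1 - ((1 - default_momentum) * stage_scale)
assert abs(stage_momentum - 0.95) < 1e-12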
def test_replicated_lamb_weight_update(tmpdir, isConst, reduction):
    # Test both const & non-const optimizer parameters
    optimizer_dict = {
        "defaultLearningRate": (0.005, isConst),
        "defaultBeta1": (0.7, isConst),
        "defaultBeta2": (0.8, isConst),
        "defaultWeightDecay": (0.1, isConst),
        "defaultEps": (1e-6, isConst),
        "lossScaling": (10.0, isConst),
    }

    # Off-chip, but no RTS (1x replica)
    run_model(tmpdir,
              'phased.onnx',
              execution_mode="phased",
              batch_size=4,
              num_replicas=1,
              num_iterations=5,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation,
              reduction=reduction)
    # Off-chip, but no RTS (2x replicas)
    run_model(tmpdir,
              'phased_replicated.onnx',
              execution_mode="phased",
              batch_size=2,
              num_replicas=2,
              num_iterations=5,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation,
              reduction=reduction)
    # Weights and optimizer off-chip, RTS
    run_model(tmpdir,
              'phased_replicated_rws.onnx',
              execution_mode="phased",
              batch_size=2,
              num_replicas=2,
              num_iterations=5,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipRtsLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=offChipLocation,
              reduction=reduction)
    # Weights and optimizer off-chip, accumulator off-chip, RTS
    run_model(tmpdir,
              'phased_replicated_rws_acc.onnx',
              execution_mode="phased",
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              enable_accum=True,
              accum_factor=2,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipRtsLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=offChipLocation,
              reduction=reduction)
    # Weights on-chip, non-RTS, optimizer state off-chip, RTS
    run_model(tmpdir,
              'phased_replicated_rws_acc_nw.onnx',
              execution_mode="phased",
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              enable_accum=True,
              accum_factor=2,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=onChipLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=onChipLocation,
              reduction=reduction)

    phased = onnx.load(str(tmpdir / 'phased.onnx'))
    phased_replicated = onnx.load(str(tmpdir / 'phased_replicated.onnx'))
    phased_replicated_rws = onnx.load(
        str(tmpdir / 'phased_replicated_rws.onnx'))
    phased_replicated_rws_acc = onnx.load(
        str(tmpdir / 'phased_replicated_rws_acc.onnx'))
    phased_replicated_rws_acc_nw = onnx.load(
        str(tmpdir / 'phased_replicated_rws_acc_nw.onnx'))

    check_model(phased, phased_replicated)
    check_model(phased, phased_replicated_rws)
    check_model(phased, phased_replicated_rws_acc)
    check_model(phased, phased_replicated_rws_acc_nw)
def run_test(compute_batch, batch_serialization_factor,
             accumulation_factor, replication_factor):
    proto, data, xs, loss = model(compute_batch,
                                  batch_serialization_factor,
                                  accumulation_factor, replication_factor)

    options = popart.SessionOptions()
    patterns = popart.Patterns()

    if optim == "SGD":
        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, False),
            "lossScaling": (20, False)
        })
    elif optim == "SGDM":
        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, False),
            "defaultMomentum": (0.9, False),
            "defaultDampening": (0.1, False),  # to increase errors
            "lossScaling": (20, False),
        })
    elif optim == "ADAM":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.1, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "lossScaling": (20, False),
            },
            mode=popart.AdamMode.AdamNoBias)  # to increase errors
    elif optim == "LAMB":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.1, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "lossScaling": (20, False),
            },
            mode=popart.AdamMode.LambNoBias)  # to increase errors

    options.batchSerializationSettings.factor = batch_serialization_factor
    if accumulation_factor > 1:
        options.enableGradientAccumulation = True
        options.accumulationFactor = accumulation_factor
        options.accumulationReductionType = reduction
    if replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = replication_factor
    device = tu.create_test_device(replication_factor,
                                   pattern=popart.SyncPattern.Full)

    dataFlow = popart.DataFlow(batches_per_step,
                               {x: popart.AnchorReturnType("ALL")
                                for x in xs})

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=options,
                                     loss=loss,
                                     optimizer=optimizer,
                                     patterns=patterns,
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()

    stepio = popart.PyStepIO(data, anchors)
    session.run(stepio)

    file_path = str(tmpdir / "model_test.onnx")
    session.modelToHost(file_path)
    post_proto = onnx.load(file_path)

    device.detach()

    return [anchors[x] for x in xs], post_proto
def test_pipelined_streaming_lamb(tmpdir):
    optimizer_dict = {
        "defaultLearningRate": (0.005, True),
        "defaultBeta1": (0.7, True),
        "defaultBeta2": (0.8, True),
        "defaultWeightDecay": (0.1, True),
        "defaultEps": (1e-6, True),
        "lossScaling": (10.0, True),
    }

    run_model(tmpdir,
              'normal.onnx',
              execution_mode="normal",
              num_layers=2,
              batch_size=12,
              num_replicas=1,
              num_iterations=5,
              enable_accum=False,
              accum_factor=1,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=onChipLocation,
              weight_tensor_location_settings=onChipLocation,
              optimizer_state_tensor_location_settings=onChipLocation,
              accumulator_tensor_location_settings=onChipLocation)
    run_model(tmpdir,
              'pipelined.onnx',
              execution_mode="pipelined",
              num_layers=2,
              batch_size=2,
              num_replicas=1,
              num_iterations=5,
              enable_accum=True,
              accum_factor=6,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=onChipLocation,
              weight_tensor_location_settings=onChipLocation,
              optimizer_state_tensor_location_settings=onChipLocation,
              accumulator_tensor_location_settings=onChipLocation)
    run_model(tmpdir,
              'pipelined_streaming.onnx',
              execution_mode="pipelined",
              num_layers=2,
              batch_size=2,
              num_replicas=1,
              num_iterations=5,
              enable_accum=True,
              accum_factor=6,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=onChipLocation,
              weight_tensor_location_settings=onChipLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=onChipLocation)
    run_model(tmpdir,
              'pipelined_streaming_rep.onnx',
              execution_mode="pipelined",
              num_layers=2,
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              enable_accum=True,
              accum_factor=6,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=onChipLocation,
              weight_tensor_location_settings=onChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=onChipLocation)
    run_model(tmpdir,
              'pipelined_streaming_rep_rts.onnx',
              execution_mode="pipelined",
              num_layers=2,
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              enable_accum=True,
              accum_factor=6,
              optimizer=popart.Adam(optimizer_dict, popart.AdamMode.Lamb),
              activation_tensor_location_settings=onChipLocation,
              weight_tensor_location_settings=onChipLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=onChipLocation)

    normal = onnx.load(str(tmpdir / 'normal.onnx'))
    pipelined = onnx.load(str(tmpdir / 'pipelined.onnx'))
    pipelined_streaming = onnx.load(str(tmpdir / 'pipelined_streaming.onnx'))
    pipelined_streaming_rep = onnx.load(
        str(tmpdir / 'pipelined_streaming_rep.onnx'))
    pipelined_streaming_rep_rts = onnx.load(
        str(tmpdir / 'pipelined_streaming_rep_rts.onnx'))

    check_model(normal, pipelined)
    check_model(normal, pipelined_streaming)
    check_model(normal, pipelined_streaming_rep)
    check_model(normal, pipelined_streaming_rep_rts)
if conf.optimizer == 'SGD':
    optimizer_dict = {
        "defaultLearningRate": (conf.init_lr, False),
        "defaultWeightDecay": (0, True)
    }
    logger.info("Creating SGD optimizer: {}".format(
        json.dumps(optimizer_dict)))
    optimizer = popart.SGD(optimizer_dict)
elif conf.optimizer == 'Adam':
    optimizer_dict = {
        "defaultLearningRate": (conf.init_lr, True),
        "defaultBeta1": (conf.beta1, True),
        "defaultBeta2": (conf.beta2, True),
        "defaultWeightDecay": (0.0, True),
        "defaultEps": (conf.adam_eps, True),
        "lossScaling": (1.0, True),
    }
    logger.info("Creating Adam optimizer: {}".format(
        json.dumps(optimizer_dict)))
    optimizer = popart.Adam(optimizer_dict)
else:
    logger.error("Not a valid optimizer option: {}".format(conf.optimizer))
    sys.exit(-1)

# create training session
logger.info("Creating the training session")
training_session, anchors = \
    conf_utils.create_session_anchors(proto,
                                      ctc_neg_log_likelihood,
                                      device,
                                      dataflow,
                                      session_options,
                                      training=True,
                                      optimizer=optimizer)
logger.info("Sending weights from Host")
def bwd_graph(popart_model,
              torch_model,
              popart_loss_fn,
              torch_loss_fn,
              mapping=None,
              transform=None,
              replication_factor=1,
              replicated_tensor_sharding=False,
              opt_type="SGD"):
    np.random.seed(1984)
    random.seed(1984)
    torch.manual_seed(1984)

    # ------------------- PopART --------------------
    config = popart_model.config
    builder = popart_model.builder

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = builder.addInputTensor(sequence_info)
    positions = builder.addInputTensor(sequence_info)
    segments = builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(0, config.vocab_length,
                          (replication_factor, config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32),
        positions:
        np.random.randint(0, config.sequence_length,
                          (replication_factor, config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32),
        segments:
        np.random.randint(0, 2,
                          (replication_factor, config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32)
    }
    num_reps = 5

    output = popart_model.build_graph(indices, positions, segments)
    ipus = popart_model.total_ipus

    loss = popart_loss_fn(output)

    proto = builder.getModelProto()

    if opt_type == "SGD":
        optimizer = popart.ConstSGD(1e-3)
    elif opt_type == "LAMB":
        optMap = {
            "defaultLearningRate": (1e-3, True),
            "defaultBeta1": (0.9, True),
            "defaultBeta2": (0.999, True),
            "defaultWeightDecay": (0.0, True),
            "maxWeightNorm": (10.0, True),
            "defaultEps": (1e-8, True),
            "lossScaling": (1.0, True),
        }
        optimizer = popart.Adam(optMap, mode=popart.AdamMode.Lamb)
    elif opt_type == "LAMB_NO_BIAS":
        optMap = {
            "defaultLearningRate": (1, False),
            "defaultBeta1": (0, False),
            "defaultBeta2": (0, False),
            "defaultWeightDecay": (0.0, False),
            "defaultEps": (1e-8, False),
            "lossScaling": (1.0, False),
        }
        optimizer = popart.Adam(optMap, mode=popart.AdamMode.LambNoBias)
    else:
        raise ValueError(f"Unknown opt_type={opt_type}")

    outputs, post_proto = run_py(
        proto,
        data,
        output,
        loss=loss,
        optimizer=optimizer,
        replication_factor=replication_factor,
        replicated_tensor_sharding=replicated_tensor_sharding,
        ipus=ipus,
        num_reps=num_reps)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = {
        "input_ids":
        data[indices].reshape(replication_factor * config.micro_batch_size,
                              config.sequence_length).astype(np.int32),
        "position_ids":
        data[positions].reshape(replication_factor * config.micro_batch_size,
                                config.sequence_length).astype(np.int32),
        "token_type_ids":
        data[segments].reshape(replication_factor * config.micro_batch_size,
                               config.sequence_length).astype(np.int32)
    }

    torch_to_onnx = get_mapping(config, init=mapping)

    transform_weights = get_transform(config, init=transform)

    # ------------------- PyTorch -------------------------
    # Turn off dropout
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, torch_to_onnx,
                          transform_weights)

    if opt_type == "SGD":
        optim = torch.optim.SGD(torch_model.parameters(),
                                1e-3,
                                weight_decay=0.0,
                                momentum=0.0)
    elif opt_type in ("LAMB", "LAMB_NO_BIAS"):
        # Bias correction is disabled for the no-bias variant to mirror
        # popart.AdamMode.LambNoBias, so `optim` is defined for both cases.
        optim = torch_lamb.Lamb(torch_model.parameters(),
                                lr=1e-3,
                                weight_decay=0.0,
                                biasCorrection=(opt_type == "LAMB"))

    for _ in range(num_reps):
        torch_outputs = torch_model(
            **{k: torch.from_numpy(t).long() for k, t in inputs.items()})
        torch_loss = torch_loss_fn(torch_outputs)
        torch_loss.backward()
        optim.step()
        optim.zero_grad()

    check_tensors([output.detach().numpy() for output in torch_outputs],
                  outputs,
                  margin=1.5e-06)

    check_model(torch_model,
                post_proto,
                torch_to_onnx,
                transform_weights,
                margin=5e-5)
def session(train=False,
            skip_execution=False,
            include_patterns=True,
            splits=1,
            outline=False,
            optim="Sgd"):
    proto, data, x, loss = model(splits=splits)

    # Required
    extraPatterns = []
    if include_patterns:
        extraPatterns += ["TiedGatherPattern", "TiedGatherAccumulatePattern"]

    patterns = popart.Patterns()
    for extraPattern in extraPatterns:
        patterns.enablePattern(extraPattern, True)

    user_options = {
        "enableOutlining": outline,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
    }

    if optim == "Lamb":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.1, True),
                "lossScaling": (20, False),
            },
            mode=popart.AdamMode.LambNoBias
        )  # NoBias to increase the error of incorrect gradients
        user_options[
            "optimizerStateTensorLocationSettings"] = popart.TensorLocationSettings(
                popart.TensorStorage.OffChip, 0)
    else:
        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            # 0 dampening to increase the error of incorrect gradients
            "defaultDampening": (0, True),
            "lossScaling": (20, False)
        })

    if train:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      loss=loss,
                      optimizer=optimizer,
                      patterns=patterns,
                      user_options=user_options,
                      skip_execution=skip_execution)
    else:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      patterns=patterns,
                      user_options={
                          "enableOutlining": outline,
                          "constantWeights": False
                      },
                      skip_execution=skip_execution)