def test_accumulator_tensor_location_settings_plus_override():
    # Check accumulator tensor location settings work.
    optimizer_with_state = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.0, False),
        "defaultWeightDecay": (0.0, False),
        "defaultDampening": (0.0, True)
    })

    ir = get_ir(
        accumulator_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OffChip, 0),
        tensor_location_setting_override={
            'Accl___W1': popart.TensorLocation(popart.TensorStorage.OnChip)
        },
        optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W1'],
             check_offchip=['Accl___W2', 'Accl___W0'])

    ir = get_ir(
        accumulator_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OnChip, 0),
        tensor_location_setting_override={
            'Accl___W1': popart.TensorLocation(popart.TensorStorage.OffChip)
        },
        optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W2', 'Accl___W0'],
             check_offchip=['Accl___W1'])
def test_activation_tensor_location_settings_plus_override():
    # Check activation tensor location settings work.
    ir = get_ir(
        num_layers=5,
        activation_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OffChip, 0),
        tensor_location_setting_override={
            'MatMul:0/1__t6':
            popart.TensorLocation(popart.TensorStorage.OnChip)
        })
    check_ir(ir,
             check_onchip=['MatMul:0/1__t6'],
             check_offchip=['MatMul:0__t3'])

    ir = get_ir(
        num_layers=5,
        activation_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OnChip, 0),
        tensor_location_setting_override={
            'MatMul:0/1__t6':
            popart.TensorLocation(popart.TensorStorage.OffChip)
        })
    check_ir(ir,
             check_onchip=['MatMul:0__t3'],
             check_offchip=['MatMul:0/1__t6'])
def test_weight_tensor_location_settings_plus_override():
    # Check weight tensor location settings work.
    ir = get_ir(
        weight_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OffChip, 0),
        tensor_location_setting_override={
            'W2': popart.TensorLocation(popart.TensorStorage.OnChip)
        })
    check_ir(ir, check_onchip=['W2'], check_offchip=['W0', 'W1'])

    ir = get_ir(
        weight_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OnChip, 0),
        tensor_location_setting_override={
            'W1': popart.TensorLocation(popart.TensorStorage.OffChip)
        })
    check_ir(ir, check_onchip=['W0', 'W2'], check_offchip=['W1'])
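# A minimal sketch of how the two knobs exercised in the tests above combine
# on a real popart.SessionOptions object (the tensor name 'W1' is a
# placeholder): the per-category TensorLocationSettings sets the default
# location, and tensorLocationSettingsOverride pins individual tensors by
# name.
opts = popart.SessionOptions()
opts.weightTensorLocationSettings = popart.TensorLocationSettings(
    popart.TensorStorage.OffChip, 0)  # default: all weights off-chip
opts.tensorLocationSettingsOverride = {
    'W1': popart.TensorLocation(popart.TensorStorage.OnChip)  # except 'W1'
}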
def session(train=False,
            skip_execution=False,
            include_patterns=True,
            splits=1,
            outline=False,
            optim="Sgd"):
    proto, data, x, loss = model(splits=splits)

    patterns = popart.Patterns()
    patterns.enablePattern("TiedGatherPattern", include_patterns)
    patterns.enablePattern("TiedGatherAccumulatePattern", include_patterns)

    user_options = {
        "enableOutlining": outline,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
        "accumulationAndReplicationReductionType": popart.ReductionType.Mean,
        "meanAccumulationAndReplicationReductionStrategy":
        popart.MeanReductionStrategy.Running
    }

    if optim == "Lamb":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.1, False),
                "defaultWeightDecay": (0.1, True),
                "defaultBeta1": (0.1, True),
                "defaultBeta2": (0.1, True),
                "lossScaling": (20, True),
            },
            # NoBias to increase the error of incorrect gradients
            mode=popart.AdamMode.LambNoBias)
        user_options[
            "optimizerStateTensorLocationSettings"] = popart.TensorLocationSettings(
                popart.TensorLocation(popart.TensorStorage.OffChip,
                                      popart.ReplicatedTensorSharding.On),
                0, 0)
        user_options["enableReplicatedGraphs"] = True
        user_options["replicatedGraphCount"] = 2
        ipus = 2
    else:
        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            # 0 dampening to increase the error of incorrect gradients
            "defaultDampening": (0, True),
            "lossScaling": (20, True)
        })
        ipus = 1

    if train:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      loss=loss,
                      optimizer=optimizer,
                      patterns=patterns,
                      user_options=user_options,
                      skip_execution=skip_execution)
    else:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      patterns=patterns,
                      user_options={
                          "enableOutlining": outline,
                          "constantWeights": False
                      },
                      skip_execution=skip_execution)
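# Illustrative use of the session() helper above (hedged: with
# skip_execution=True, run_py detaches from the device and returns the
# un-executed popart session, so the compiled IR can be inspected without
# running on hardware).
sess = session(train=True, skip_execution=True, optim="Lamb")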
def bert_optimizer_location_settings(args):
    storage = popart.TensorStorage.OnChip
    if args.optimizer_state_offchip:
        storage = popart.TensorStorage.OffChip
    rts = popart.ReplicatedTensorSharding.Off
    if args.replicated_tensor_sharding:
        rts = popart.ReplicatedTensorSharding.On

    return popart.TensorLocationSettings(popart.TensorLocation(storage, rts))
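# Hedged usage sketch: the settings object returned above is assumed to be
# assigned to optimizerStateTensorLocationSettings, matching how these
# settings are applied elsewhere in this codebase; `args` stands for the
# parsed command-line namespace.
options = popart.SessionOptions()
options.optimizerStateTensorLocationSettings = \
    bert_optimizer_location_settings(args)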
def set_ema_weights_offchip(session_options, ema_weight_names):
    """ Sets the tensor locations of EMA weights to be off-chip """
    tensor_location_override_dict = dict()
    for nname in ema_weight_names:
        for _, ema_wname in ema_weight_names[nname]:
            tensor_location_override_dict[ema_wname] = popart.TensorLocation(
                popart.TensorStorage.OffChip)
            logger.info("Setting tensor-location for {} to be OffChip".format(
                ema_wname))
    session_options.tensorLocationSettingsOverride = \
        tensor_location_override_dict
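# Illustrative call (the names are hypothetical): ema_weight_names maps a
# network name to (weight, ema_weight) tensor-name pairs, and the override
# dict built above only touches the EMA copies.
opts = popart.SessionOptions()
set_ema_weights_offchip(opts, {"generator": [("W0", "EMA___W0")]})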
def set_phased_options(options, engine_options, model, args):
    options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
    options.enableOutliningCopyCostPruning = False
    options.outlineThreshold = -np.inf
    options.outlineSequenceBreakCost = 100000.0
    options.executionPhaseSettings.phases = model.total_execution_phases

    options.batchSerializationSettings.factor = args.batch_serialize
    options.batchSerializationSettings.transformContext = \
        popart.BatchSerializationTransformContext.Fwd
    options.batchSerializationSettings.concatOnVirtualGraphChange = False
    options.batchSerializationSettings.concatOnExecutionPhaseChange = False
    options.batchSerializationSettings.concatOnPipelineStageChange = False
    options.batchSerializationSettings.batchSchedule = \
        popart.BatchSerializationBatchSchedule.OverlapOnCompute

    options.autoRecomputation = popart.RecomputationType.Standard
    options.explicitRecomputation = True
    options.aliasZeroCopy = True

    varLocation = popart.TensorLocation()
    varLocation.storage = popart.TensorStorage.OffChip
    varLocation.loadTileSet = popart.TileSet.IO
    varLocation.storageTileSet = popart.TileSet.IO
    varLocation.replicatedTensorSharding = (
        popart.ReplicatedTensorSharding.On
        if args.replicated_tensor_sharding else
        popart.ReplicatedTensorSharding.Off)

    options.weightTensorLocationSettings.location = varLocation
    options.optimizerStateTensorLocationSettings.location = varLocation
    options.accumulatorTensorLocationSettings.location = varLocation
    options.activationTensorLocationSettings.location = varLocation

    if args.tensor_storage_onchip:
        options.weightTensorLocationSettings.location.storage = \
            popart.TensorStorage.OnChip
        options.optimizerStateTensorLocationSettings.location.storage = \
            popart.TensorStorage.OnChip
        options.accumulatorTensorLocationSettings.location.storage = \
            popart.TensorStorage.OnChip

    options.executionPhaseSettings.activationIOSchedule = io_schedule(
        args.activation_io_schedule)
    options.executionPhaseSettings.weightIOSchedule = io_schedule(
        args.weight_io_schedule)
    options.executionPhaseSettings.schedule = optimizer_schedule(
        args.optimizer_schedule)

    options.numIOTiles = args.num_io_tiles
    engine_options["target.syncReplicasIndependently"] = "false"

    if args.activations_on_chip:
        options.activationTensorLocationSettings = \
            popart.TensorLocationSettings(popart.TensorStorage.OnChip, 0)
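# The io_schedule and optimizer_schedule helpers called above are not shown
# in this snippet. A plausible minimal sketch (hypothetical, inferred from the
# string-valued CLI arguments they receive) maps names onto the popart enums
# used elsewhere in this file:
def io_schedule(name):
    return {
        "Preload": popart.ExecutionPhaseIOSchedule.Preload,
        "OnDemand": popart.ExecutionPhaseIOSchedule.OnDemand,
    }[name]


def optimizer_schedule(name):
    return {
        "Interleaving": popart.ExecutionPhaseSchedule.Interleaving,
        "Batch": popart.ExecutionPhaseSchedule.Batch,
        "BatchClusteredIO": popart.ExecutionPhaseSchedule.BatchClusteredIO,
    }[name]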
def test_attention_streamingmemory(tmpdir):
    np.random.seed(0XDEAD1337)
    batches_per_step = 5
    batch_size = 8
    hidden_size = 16
    sequence_length = 8
    attention_heads = 4
    qkv_length = hidden_size / attention_heads

    input_shape = [batch_size * sequence_length, hidden_size]
    mask_shape = [batch_size, 1, 1, sequence_length]

    qkv_data = np.random.normal(
        0, 0.02, [hidden_size, hidden_size * 3]).astype(np.float32)

    r = np.arange(0, sequence_length)
    r = np.reshape(batch_size * [r], mask_shape)
    masks = []
    for i in range(batches_per_step):
        masks.append(np.less(r, i).astype(np.float32))
    mask_data = (1 - np.stack(masks)) * -1000.0

    input_data = np.random.normal(
        0, 0.02, [batches_per_step] + input_shape).astype(np.float32)

    def run_test(index, options):
        per_replica_batch_size = batch_size / options["replication"]
        model_input_shape = input_shape[:]
        model_input_shape[0] = int(model_input_shape[0] /
                                   options["replication"])
        model_mask_shape = mask_shape[:]
        model_mask_shape[0] = int(model_mask_shape[0] /
                                  options["replication"])

        stride = 2 // options["stages"]
        if "stride" in options and options["stride"]:
            stride = options["stride"]

        builder = popart.Builder(opsets={
            "ai.onnx": 9,
            "ai.onnx.ml": 1,
            "ai.graphcore": 1
        })

        mask = builder.addInputTensor(
            popart.TensorInfo("FLOAT", model_mask_shape), "mask")
        x_in = builder.addInputTensor(
            popart.TensorInfo("FLOAT", model_input_shape), "x_in")

        anchors = {}
        x = x_in
        for i in range(options["numLayers"]):
            qkv = builder.addInitializedInputTensor(qkv_data, f"qkv_{i}")
            anchors[popart.reservedGradientPrefix() +
                    qkv] = popart.AnchorReturnType("All")

            vgid = (i % options["stages"]) if options["phasedExecution"] else i

            with builder.virtualGraph(vgid), \
                    builder.executionPhase(i * stride):
                x = builder.aiOnnx.matmul([x, qkv])
                x = attention_onnx(builder, x, mask, per_replica_batch_size,
                                   sequence_length, hidden_size,
                                   attention_heads, qkv_length)

        vgid = ((options["numLayers"] - 1) % options["stages"]
                ) if options["phasedExecution"] else options["numLayers"] - 1

        with builder.virtualGraph(vgid), builder.executionPhase(
                (options["numLayers"] - 1) * stride):
            l1 = builder.aiGraphcore.l1loss([x], 0.2,
                                            popart.ReductionType.Sum)

        proto = builder.getModelProto()

        gradient_keys = list(anchors.keys())
        anchors[x] = popart.AnchorReturnType("All")

        dataFlow = popart.DataFlow(batches_per_step, anchors)

        opts = popart.SessionOptions()
        opts.executionPhaseSettings.stages = options["stages"]
        opts.executionPhaseSettings.phases = (
            options["numLayers"] * stride
            if options["phasedExecution"] else 0)
        opts.enableOutlining = options["outlining"]

        if "phaseSchedule" in options:
            opts.executionPhaseSettings.schedule = options["phaseSchedule"]

        # Phased execution currently does its own recompute annotations
        opts.autoRecomputation = (popart.RecomputationType.Standard
                                  if options["explicitRecomputation"] else
                                  popart.RecomputationType.NoRecompute)

        opts.outlineThreshold = -np.inf
        opts.enableOutliningCopyCostPruning = False
        opts.virtualGraphMode = (popart.VirtualGraphMode.ExecutionPhases
                                 if options["phasedExecution"] else
                                 popart.VirtualGraphMode.Manual)
        opts.explicitRecomputation = options["explicitRecomputation"]
        opts.aliasZeroCopy = options["aliasZeroCopy"]

        opts.batchSerializationSettings.factor = options["batchSerialize"]
        if "batchSchedule" in options:
            opts.batchSerializationSettings.batchSchedule = options[
                "batchSchedule"]
        if "batchConcat" in options:
            # Do not concatenate the batch across phases and virtual graphs
            # (causes more, smaller transfers but allows for individual
            # sub-batch elements to be transferred)
            opts.batchSerializationSettings.concatOnVirtualGraphChange = \
                options["batchConcat"]
            opts.batchSerializationSettings.concatOnExecutionPhaseChange = \
                options["batchConcat"]
            # Wait with loading activations until they are required
            opts.executionPhaseSettings.activationIOSchedule = \
                popart.ExecutionPhaseIOSchedule.OnDemand

        if "tensorLocationSettings" in options and options[
                "tensorLocationSettings"]:
            opts.activationTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.weightTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.optimizerStateTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.accumulatorTensorLocationSettings = options[
                "tensorLocationSettings"]
        if "weightTensorLocationSettings" in options and options[
                "weightTensorLocationSettings"]:
            opts.weightTensorLocationSettings = options[
                "weightTensorLocationSettings"]
        if options["replication"] > 1:
            opts.replicatedGraphCount = options["replication"]
            opts.enableReplicatedGraphs = True
        if "ioTiles" in options:
            opts.numIOTiles = options["ioTiles"]

        pat = popart.Patterns(popart.PatternsLevel.Default)

        if options["phasedExecution"]:
            numIpus = options["stages"]
        else:
            numIpus = options["numLayers"] + 1
        if options["replication"] > 1:
            numIpus = numIpus * options["replication"]
        device = tu.create_test_device(numIpus,
                                       pattern=popart.SyncPattern.Full)

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=opts,
                                         loss=l1,
                                         optimizer=popart.ConstSGD(0.1),
                                         patterns=pat,
                                         deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()

        anchors = session.initAnchorArrays()
        for k, v in anchors.items():
            print(f"anchor_before {k}={v.shape}")

        inputs = {x_in: input_data, mask: mask_data}
        stepio = popart.PyStepIO(inputs, anchors)

        for __ in range(10):
            session.run(stepio)

        session.modelToHost(
            str(tmpdir / f"streamingmemory_attention_{index}.onnx"))

        if options["replication"] > 1:
            for k, v in anchors.items():
                if k in gradient_keys:
                    # The gradient anchors will have an additional
                    # replication axis.
                    anchors[k] = np.sum(v, 1 if batches_per_step > 1 else 0)
                else:
                    # Output tensor needs reshaping.
                    anchors[k] = np.reshape(anchors[k], [
                        batches_per_step, sequence_length * batch_size,
                        hidden_size
                    ])
        for k, v in anchors.items():
            print(f"anchor_after {k}={v.shape}")

        return anchors

    test_results = []

    # AliasZeroCopy is only supported with explicit recomputation, not with
    # standard recomputation.
    # Phased execution is only supported with explicit recomputation, not
    # with standard recomputation.

    test_variants = []

    defaultOffChip = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.Compute,
            storageTileSet=popart.TileSet.Compute,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    ioOffChip = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.IO,
            storageTileSet=popart.TileSet.IO,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    # Ground truth variant
    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": False,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": False,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 4,
        "replication": 1,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": True,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": True,
        "aliasZeroCopy": True,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    # Test batch serialized single device per replica execution, where all
    # streaming memory traffic goes through IO tiles, and activations are
    # stored and loaded one-by-one
    test_variants.append({
        "stages": 1,
        "stride": 4,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": True,
        "aliasZeroCopy": True,
        "batchSerialize": 4,
        "batchConcat": False,
        "replication": 2,
        "tensorLocationSettings": ioOffChip,
        "ioTiles": 192
    })

    # Test batch serialized single device per replica execution, where all
    # streaming memory traffic goes through IO tiles, and loading of the next
    # phase happens before storing the current phase
    test_variants.append({
        "stages": 1,
        "stride": 1,
        "numLayers": 3,
        "phasedExecution": True,
        "phaseSchedule": popart.ExecutionPhaseSchedule.BatchClusteredIO,
        "outlining": False,
        "explicitRecomputation": True,
        "aliasZeroCopy": True,
        "batchSerialize": 4,
        "batchConcat": True,
        "replication": 2,
        "tensorLocationSettings": ioOffChip,
        "ioTiles": 192
    })

    # Test a variety of batch serialisation schedules.
    for batchSchedule in [
            popart.BatchSerializationBatchSchedule.Scheduler,
            popart.BatchSerializationBatchSchedule.Isomorphic,
            popart.BatchSerializationBatchSchedule.OverlapOnIo,
            popart.BatchSerializationBatchSchedule.OverlapOnCompute,
    ]:
        test_variants.append({
            "stages": 1,
            "stride": 4,
            "numLayers": 3,
            "phasedExecution": True,
            "outlining": False,
            "explicitRecomputation": True,
            "aliasZeroCopy": True,
            "batchSerialize": 4,
            "batchSchedule": batchSchedule,
            "batchConcat": False,
            "replication": 2,
            "tensorLocationSettings": ioOffChip,
            "ioTiles": 192
        })

    # Test replicated tensor sharding + on chip (no outlining).
    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 2,
        "tensorLocationSettings": defaultOffChip,
        "weightTensorLocationSettings": popart.TensorLocationSettings(
            location=popart.TensorLocation(
                storage=popart.TensorStorage.OnChip,
                loadTileSet=popart.TileSet.Compute,
                storageTileSet=popart.TileSet.Compute,
                replicatedTensorSharding=popart.ReplicatedTensorSharding.On),
            minElementsForOffChip=0,
            minElementsForReplicatedTensorSharding=2)
    })

    # Test replicated tensor sharding + off chip (no outlining).
    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 2,
        "tensorLocationSettings": defaultOffChip,
        "weightTensorLocationSettings": popart.TensorLocationSettings(
            location=popart.TensorLocation(
                storage=popart.TensorStorage.OffChip,
                loadTileSet=popart.TileSet.Compute,
                storageTileSet=popart.TileSet.Compute,
                replicatedTensorSharding=popart.ReplicatedTensorSharding.On),
            minElementsForOffChip=0,
            minElementsForReplicatedTensorSharding=2)
    })

    index = 0
    for test_option in test_variants:
        print(f"Running {index}: {test_option}")
        test_results.append(run_test(index, test_option))
        index += 1

    gt_onnx = onnx.load(str(tmpdir / "streamingmemory_attention_0.onnx"))

    for i in range(1, index):
        print(f"Testing run {i}: {test_variants[i]}")

        for key in test_results[0].keys():
            assert np.all(
                np.isclose(test_results[0][key],
                           test_results[i][key],
                           equal_nan=False))

        val_onnx = onnx.load(
            str(tmpdir / f"streamingmemory_attention_{i}.onnx"))
        for j in range(len(gt_onnx.graph.initializer)):
            print(f"Checking initializer {j}")
            gt = gt_onnx.graph.initializer[j]
            gt = numpy_helper.to_array(gt)
            val = val_onnx.graph.initializer[j]
            val = numpy_helper.to_array(val)
            assert np.allclose(gt, val, equal_nan=False)
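# For reference, a hedged sketch of the variant schema consumed by run_test
# in the test above; every key shown is one the test actually reads, and any
# variant appended to test_variants is compared against the ground-truth run
# (index 0):
# test_variants.append({
#     "stages": 2,                  # execution phase stages per replica
#     "numLayers": 3,               # attention layers in the toy model
#     "phasedExecution": True,      # ExecutionPhases virtual graph mode
#     "outlining": True,            # subgraph outlining
#     "explicitRecomputation": True,
#     "aliasZeroCopy": True,
#     "batchSerialize": 1,          # batch serialization factor
#     "replication": 1,             # replicated graph count
#     "tensorLocationSettings": defaultOffChip,
# })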
def run_test(aliaszerocopy):
    proto, data, x, loss = model()

    options = popart.SessionOptions()
    patterns = popart.Patterns()

    optimizer = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.9, True),
        "defaultDampening": (0, True)
    })

    options.enableOutlining = True
    options.outlineThreshold = -np.inf
    options.enableOutliningCopyCostPruning = False
    options.autoRecomputation = popart.RecomputationType.Standard
    options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
    options.explicitRecomputation = True
    options.aliasZeroCopy = aliaszerocopy
    options.executionPhaseSettings.phases = 5

    varLocation = popart.TensorLocation()
    varLocation.storage = popart.TensorStorage.OffChip

    options.weightTensorLocationSettings.location = varLocation
    options.optimizerStateTensorLocationSettings.location = varLocation
    options.accumulatorTensorLocationSettings.location = varLocation
    options.activationTensorLocationSettings.location = varLocation

    tempDir = tempfile.TemporaryDirectory()
    options.engineOptions["autoReport.directory"] = tempDir.name
    options.engineOptions["autoReport.all"] = "true"

    request_ipus = 2
    device = tu.create_test_device(request_ipus,
                                   pattern=popart.SyncPattern.Full)

    dataFlow = popart.DataFlow(1, {x: popart.AnchorReturnType("ALL")})

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=options,
                                     loss=loss,
                                     optimizer=optimizer,
                                     patterns=patterns,
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO(data, anchors)
    session.run(stepio)

    file_path = str(tmpdir / "aliaszerocopy_model_test.onnx")
    session.modelToHost(file_path)
    post_proto = onnx.load(file_path)

    device.detach()

    report = session.getReport()
    max_tile_memory = max([
        tile.memory.total.excludingGaps for tile in report.compilation.tiles
    ])
    total_memory = np.sum([
        tile.memory.total.excludingGaps for tile in report.compilation.tiles
    ])
    return anchors[x], post_proto, total_memory
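# Hedged usage of run_test above: run with and without alias zero copy and
# compare results and total memory. The memory expectation is an assumption
# of this sketch (not asserted by the original snippet): identical maths
# with reduced live memory when aliasing is enabled.
outputs_ref, proto_ref, memory_ref = run_test(False)
outputs_azc, proto_azc, memory_azc = run_test(True)
assert np.allclose(outputs_ref, outputs_azc)
assert memory_azc <= memory_ref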
for i in range(len(lhs_model.graph.initializer)):
    lhs = lhs_model.graph.initializer[i]
    for j in range(len(rhs_model.graph.initializer)):
        rhs = rhs_model.graph.initializer[j]
        if rhs.name == lhs.name:
            print(f'Checking initializer {i} ({lhs.name} - {rhs.name})')
            lhsa = numpy_helper.to_array(lhs)
            rhsa = numpy_helper.to_array(rhs)
            assert np.allclose(lhsa, rhsa, rtol=1.e-4, atol=1.e-5)


# Standard OnChip settings
onChipLocation = popart.TensorLocationSettings(
    location=popart.TensorLocation(
        storage=popart.TensorStorage.OnChip,
        loadTileSet=popart.TileSet.Compute,
        storageTileSet=popart.TileSet.Compute,
        replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
    minElementsForOffChip=0,
    minElementsForReplicatedTensorSharding=2)

# Standard OffChip settings
offChipLocation = popart.TensorLocationSettings(
    location=popart.TensorLocation(
        storage=popart.TensorStorage.OffChip,
        loadTileSet=popart.TileSet.Compute,
        storageTileSet=popart.TileSet.Compute,
        replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
    minElementsForOffChip=0,
    minElementsForReplicatedTensorSharding=2)
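# Typical assignment of the two presets above onto session options (a sketch;
# which tensor categories stay on chip is a per-model choice):
opts = popart.SessionOptions()
opts.weightTensorLocationSettings = onChipLocation
opts.optimizerStateTensorLocationSettings = offChipLocation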
def run_model(tmpdir,
              model_file_name,
              schedule=popart.ExecutionPhaseSchedule.Interleaving,
              enable_outlining=False,
              stride=1,
              num_layers=5,
              dsize=128,
              batch_size=4,
              batch_serialize=1,
              batch_schedule=popart.BatchSerializationBatchSchedule.Isomorphic,
              num_iterations=5,
              num_replicas=2,
              optimizer=popart.Adam({"defaultLearningRate": (0.1, False)})):
    np.random.seed(52125)

    builder = popart.Builder()
    ip = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [batch_size, dsize, dsize]))

    def add_layer(index, in_id):
        w = builder.addInitializedInputTensor(
            np.random.rand(dsize, dsize).astype(np.float32), f"W{index}")
        matmul_id = builder.aiOnnx.matmul([in_id, w])
        return matmul_id

    out = ip
    l1 = ""
    final_loss = ""

    for i in range(num_layers):
        vgid = 0
        with builder.executionPhase(i * stride), builder.virtualGraph(vgid):
            for j in range(3):
                out = add_layer(i, out)

        if i == num_layers - 1:
            with builder.executionPhase(i * stride), \
                    builder.virtualGraph(vgid):
                l1 = builder.aiGraphcore.l1loss([out], 0.1,
                                                popart.ReductionType.Sum)
                final_loss = builder.aiGraphcore.identityloss([l1])

    anchorIds = []

    builder.addOutputTensor(out)

    num_ipus = 1

    dfAnchors = {}
    for anchorId in anchorIds:
        dfAnchors.update({anchorId: popart.AnchorReturnType("All")})

    opts = popart.SessionOptions()

    # Cycle counting
    opts.instrumentWithHardwareCycleCounter = True

    # Outlining
    opts.enableOutlining = enable_outlining
    opts.enableOutliningCopyCostPruning = False
    opts.outlineThreshold = -np.inf
    opts.aliasZeroCopy = enable_outlining

    # Replicated graphs
    opts.replicatedGraphCount = num_replicas
    opts.enableReplicatedGraphs = True if num_replicas > 1 else False

    # IO tiles
    opts.numIOTiles = 192

    # Phased execution
    opts.executionPhaseSettings.phases = num_layers * stride
    opts.executionPhaseSettings.stages = 1
    opts.executionPhaseSettings.schedule = schedule
    opts.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases

    # Recomputation
    opts.autoRecomputation = popart.RecomputationType.Standard
    opts.explicitRecomputation = True

    # Batch serialization
    if batch_serialize > 1:
        opts.batchSerializationSettings.factor = batch_serialize
        opts.batchSerializationSettings.concatOnVirtualGraphChange = False
        opts.batchSerializationSettings.concatOnExecutionPhaseChange = False
        opts.batchSerializationSettings.concatOnPipelineStageChange = False
        opts.batchSerializationSettings.batchSchedule = batch_schedule

        # Related execution phase setting
        opts.executionPhaseSettings.activationIOSchedule = \
            popart.ExecutionPhaseIOSchedule.OnDemand

    # Streaming memory
    offChipLocation = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.IO,
            storageTileSet=popart.TileSet.IO,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    offChipRtsLocation = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.IO,
            storageTileSet=popart.TileSet.IO,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.On),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    opts.activationTensorLocationSettings = offChipLocation
    opts.weightTensorLocationSettings = offChipRtsLocation
    opts.optimizerStateTensorLocationSettings = offChipRtsLocation

    proto = builder.getModelProto()

    with tu.create_test_device(num_replicas * num_ipus,
                               pattern=popart.SyncPattern.Full) as device:
        session = popart.TrainingSession(
            fnModel=proto,
            dataFlow=popart.DataFlow(1, dfAnchors),
            optimizer=optimizer,
            loss=final_loss,
            patterns=popart.Patterns(popart.PatternsLevel.All),
            userOptions=opts,
            deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()

        for i in range(num_iterations):
            ip_data = np.random.rand(num_replicas, batch_size, dsize,
                                     dsize).astype(np.float32)
            stepio = popart.PyStepIO({ip: ip_data}, anchors)

            session.run(stepio)

        cycles = session.getCycleCount()

        print("anchors:")
        print(anchors)
        session.modelToHost(str(tmpdir / model_file_name))

    return cycles
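# Hedged usage sketch for run_model above: compare hardware cycle counts of
# two execution phase schedules on the same model (the file names are
# arbitrary).
cycles_interleaving = run_model(
    tmpdir, "interleaving.onnx",
    schedule=popart.ExecutionPhaseSchedule.Interleaving)
cycles_batch = run_model(
    tmpdir, "batch.onnx",
    schedule=popart.ExecutionPhaseSchedule.Batch)
print(f"Interleaving: {cycles_interleaving} cycles, "
      f"Batch: {cycles_batch} cycles")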
def __init__(self,
             name: str,
             input_size,
             hidden_size,
             num_heads,
             serialize_matmul,
             available_memory_proportion,
             epsilon,
             dropout,
             dropout_prob,
             attn_dropout,
             attn_dropout_prob,
             batch_size,
             sequence_length,
             dtype,
             task,
             num_mask_tokens,
             split_qkv=False,
             residual=True,
             prefetch_masks=True,
             use_default_mem_proportion=True,
             mask=None,
             **kwargs):
    if split_qkv:
        params = [
            Parameter(name='Q', shape=[input_size, hidden_size], value=None),
            Parameter(name='K', shape=[input_size, hidden_size], value=None),
            Parameter(name='V', shape=[input_size, hidden_size], value=None),
            Parameter(name='Out',
                      shape=[hidden_size, input_size],
                      value=None)
        ]
    else:
        params = [
            Parameter(name='QKV',
                      shape=[input_size, 3 * hidden_size],
                      value=None),
            Parameter(name='Out',
                      shape=[hidden_size, input_size],
                      value=None)
        ]
    scope_provider = kwargs['scope_provider']
    super(Attention, self).__init__(params=params,
                                    scope=scope_provider.get_scope(
                                        name, 'next'),
                                    dtype=dtype,
                                    **kwargs)
    self.num_heads = num_heads
    self.hidden_size = hidden_size
    self.serialize_matmul = serialize_matmul
    self.available_memory_proportion = available_memory_proportion
    self.use_default_mem_proportion = use_default_mem_proportion
    self.split_qkv = split_qkv
    self.batch_size = batch_size
    self.seq_len = sequence_length
    if hidden_size % num_heads != 0:
        raise ValueError('Hidden size must be a multiple of num_heads')
    self.qkv_length = hidden_size // num_heads
    self.dtype = dtype
    self.residual = residual
    self.task = task
    self.num_mask_tokens = num_mask_tokens
    self.mask = mask

    self.prefetch_masks = prefetch_masks
    if prefetch_masks:
        additional_scopes = [
            self.builder.recomputeOutput(popart.RecomputeType.Checkpoint),
            self.builder.outputTensorLocation(
                popart.TensorLocation(popart.TensorStorage.OnChip))
        ]
        self.mask_execution_phase = scope_provider.get_scope(
            'Mask', 'prev').execution_phase % 2
        self.mask_scope = scope_provider.get_scope(
            'Mask',
            self.mask_execution_phase,
            additional_scopes=additional_scopes)
    else:
        self.mask_scope = scope_provider.get_scope('Mask', 'prev')

    if self.residual:
        self.norm = Norm(scope_provider.get_scope('Norm', 'prev'),
                         hidden_size, epsilon, dtype, **kwargs)
    if dropout:
        self.dropout = Dropout(scope_provider.get_scope('Dropout', 'prev'),
                               dropout_prob, **kwargs)
    else:
        self.dropout = lambda x: x
    if attn_dropout:
        self.attn_dropout = Dropout(
            scope_provider.get_scope('AttnDropout', 'prev'),
            attn_dropout_prob, **kwargs)
    else:
        self.attn_dropout = lambda x: x
    self.total_execution_phases = self.total_phases()
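# The prefetch_masks branch above layers two builder scopes that, per the
# additional_scopes usage, behave as context managers. A standalone sketch of
# the same idiom (assuming that context-manager behaviour): checkpoint an
# output and pin it on chip so it is neither recomputed nor streamed off.
builder = popart.Builder()
t = builder.addInputTensor(popart.TensorInfo("FLOAT", [2, 2]))
with builder.recomputeOutput(popart.RecomputeType.Checkpoint), \
        builder.outputTensorLocation(
            popart.TensorLocation(popart.TensorStorage.OnChip)):
    out = builder.aiOnnx.relu([t])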
def bert_session_options(args, model):
    engine_options = {}
    options = popart.SessionOptions()
    options.virtualGraphMode = popart.VirtualGraphMode.Manual
    options.enableFloatingPointChecks = args.floating_point_exceptions
    options.enableStochasticRounding = args.stochastic_rounding
    options.enableGroupedMatmuls = False
    options.enablePrefetchDatastreams = not args.minimum_latency_inference
    options.enableOutlining = not args.no_outlining
    partials_type = "half" if args.enable_half_partials else "float"
    options.partialsTypeMatMuls = partials_type
    options.convolutionOptions = {'partialsType': partials_type}
    if args.replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = args.replication_factor
        engine_options["target.syncReplicasIndependently"] = "true"
    # Increasing the outlineThreshold prevents creating subgraphs of cheap
    # ops such as add or reshapeInplace; instead, only ops with a high
    # subgraph value, such as matmul or normalisation, are reused.
    options.outlineThreshold = 10.0
    if args.execution_mode == "PIPELINE":
        options.enablePipelining = True
        options.autoRecomputation = popart.RecomputationType.Pipeline
    elif args.execution_mode == "PHASED":
        options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        options.enableOutliningCopyCostPruning = False
        options.outlineThreshold = -np.inf
        options.executionPhaseSettings.phases = model.total_execution_phases
        options.batchSerializationSettings.factor = args.batch_serialize
        options.autoRecomputation = popart.RecomputationType.Standard
        options.explicitRecomputation = True
        options.aliasZeroCopy = True
        options.activationTensorLocationSettings.location.storage = \
            popart.TensorStorage.OffChip

        varLocation = popart.TensorLocation()
        varLocation.storage = popart.TensorStorage.OffChip
        varLocation.loadTileSet = popart.TileSet.IO
        varLocation.storageTileSet = popart.TileSet.IO
        varLocation.replicatedTensorSharding = (
            popart.ReplicatedTensorSharding.On
            if args.replicated_weight_sharding else
            popart.ReplicatedTensorSharding.Off)

        options.weightTensorLocationSettings.location = varLocation
        options.optimizerStateTensorLocationSettings.location = varLocation
        options.accumulatorTensorLocationSettings.location = varLocation

        options.numIOTiles = args.num_io_tiles
        options.timeLimitScheduler = -1
        options.swapLimitScheduler = -1
        engine_options["target.syncReplicasIndependently"] = "false"

        if args.activations_on_chip:
            options.activationTensorLocationSettings = \
                popart.TensorLocationSettings(popart.TensorStorage.OnChip, 0)

    if args.optimizer_state_offchip:
        options.optimizerStateTensorLocationSettings.location.storage = \
            popart.TensorStorage.OffChip

    if args.gradient_accumulation_factor > 1:
        options.enableGradientAccumulation = True
        options.accumulationFactor = args.gradient_accumulation_factor
        if args.gradient_reduction_type == "Mean":
            options.accumulationReductionType = popart.ReductionType.Mean

    # When not replicated, SyncPattern.SinglePipeline will provide better
    # overlap than this option.
    if args.optimizer_state_offchip and args.replication_factor > 1:
        options.accumulateOuterFragmentSettings = \
            popart.AccumulateOuterFragmentSettings(
                popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized,
                [0])

    if args.engine_cache is not None:
        options.enableEngineCaching = True
        options.cachePath = args.engine_cache
    if args.profile:
        options.enableEngineCaching = False
    options.instrumentWithHardwareCycleCounter = args.report_hw_cycle_count
    options.disableGradAccumulationTensorStreams = True
    if args.max_copy_merge_size == -1:
        logger.debug("No copy merge size limit applied")
    else:
        logger.warning(
            f"Copy merge size limit set to {args.max_copy_merge_size}")
        engine_options["opt.maxCopyMergeSize"] = str(args.max_copy_merge_size)

    # Adding {"fullyConnectedPass", "TRAINING_BWD"} to some matmuls causes
    # large transposes before operations.
    if args.disable_fully_connected_pass:
        if args.task == "SQUAD" and args.sequence_length == 384:
            logger.warning(
                "Fully connected pass has been disabled. This may cause "
                "SQuAD 384 12-layer to go OOM.")
        options.enableFullyConnectedPass = False

    if args.inference and args.engine_cache is not None and \
            not args.variable_weights_inference:
        logger.warning(
            "Using engine cache with constant weights. Checkpoint weights "
            "will be ignored. Use the `--variable-weights-inference` flag "
            "if checkpoint weights should be used.")

    if args.variable_weights_inference:
        options.constantWeights = False

    if args.group_host_syncs:
        options.groupHostSync = True

    if args.internal_exchange_optimisation_target is not None:
        engine_options["opt.internalExchangeOptimisationTarget"] = str(
            args.internal_exchange_optimisation_target)

    options.engineOptions = engine_options

    # Set synthetic data mode (if active)
    if args.synthetic_data:
        if args.synthetic_data_initializer == "zeros":
            options.syntheticDataMode = popart.SyntheticDataMode.Zeros
        else:
            options.syntheticDataMode = popart.SyntheticDataMode.RandomNormal
        logger.info(
            f"Running with Synthetic Data Type '{options.syntheticDataMode}'")

    return options
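# Illustrative follow-up (a sketch; proto, data_flow, loss, optimizer and
# device are assumed to come from the surrounding training script):
options = bert_session_options(args, model)
session = popart.TrainingSession(fnModel=proto,
                                 dataFlow=data_flow,
                                 userOptions=options,
                                 loss=loss,
                                 optimizer=optimizer,
                                 deviceInfo=device)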
opts = popart.SessionOptions()
if args.profile:
    opts.engineOptions = {
        "autoReport.all": "true",
        "autoReport.directory": args.profile_dir
    }

if phased_execution:
    # Constant weights cannot be streamed
    opts.constantWeights = False

    opts.executionPhaseSettings.phases = args.num_layers
    opts.executionPhaseSettings.stages = 2
    opts.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
    opts.numIOTiles = 128

    varLocation = popart.TensorLocation()
    varLocation.storage = popart.TensorStorage.OffChip
    varLocation.loadTileSet = popart.TileSet.IO
    varLocation.storageTileSet = popart.TileSet.IO
    opts.weightTensorLocationSettings.location = varLocation
else:
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

print("Compiling.")
session = popart.InferenceSession(fnModel=proto,
                                  dataFlow=popart.DataFlow(
                                      args.batches_per_step, anchor_map),
                                  userOptions=opts,
                                  deviceInfo=device)
session.prepareDevice()
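# A minimal continuation (sketch): stream one step of data through the
# compiled inference session; `inputs` is assumed to map the model's input
# tensor ids to host arrays, matching the PyStepIO usage elsewhere here.
anchors = session.initAnchorArrays()
stepio = popart.PyStepIO(inputs, anchors)
session.run(stepio)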
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[str] = None,
           optimizer: Optional[popart.Optimizer] = None,
           patterns: Optional[popart.Patterns] = None,
           return_stats: bool = False,
           log_dir: Optional[str] = None,
           ipus: Optional[int] = None,
           batches_per_step: int = 1,
           user_options: Optional[Mapping[str, Any]] = None,
           skip_execution: bool = False,
           execution_mode: str = 'DEFAULT',
           replication_factor: int = 1,
           replicated_tensor_sharding: bool = False,
           num_reps: int = 1):
    outputs = make_tuple(outputs)

    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step,
        {output: popart.AnchorReturnType("ALL") for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.reportOptions = {"showVarStorage": "true"}
    if replicated_tensor_sharding:
        options.weightTensorLocationSettings.location.\
            replicatedTensorSharding = popart.ReplicatedTensorSharding.On
        options.optimizerStateTensorLocationSettings.location.\
            replicatedTensorSharding = popart.ReplicatedTensorSharding.On
    if replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = replication_factor

    if execution_mode == 'PHASED':
        options.enableOutlining = True
        options.outlineThreshold = -np.inf
        options.enableOutliningCopyCostPruning = False
        options.autoRecomputation = popart.RecomputationType.Standard
        options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        options.explicitRecomputation = True
        options.aliasZeroCopy = True
        options.batchSerializationSettings.factor = user_options[
            "batchSerializationFactor"]
        options.executionPhaseSettings.phases = user_options[
            "executionPhases"]
        ipus = 1

        options.outlineSequenceBreakCost = 100000.0
        options.batchSerializationSettings.concatOnVirtualGraphChange = False
        options.batchSerializationSettings.concatOnExecutionPhaseChange = \
            False
        options.batchSerializationSettings.concatOnPipelineStageChange = \
            False
        options.batchSerializationSettings.batchSchedule = \
            popart.BatchSerializationBatchSchedule.OverlapOnCompute

        varLocation = popart.TensorLocation()
        varLocation.storage = popart.TensorStorage.OffChip
        varLocation.loadTileSet = popart.TileSet.IO
        varLocation.storageTileSet = popart.TileSet.IO

        options.weightTensorLocationSettings.location = varLocation
        options.optimizerStateTensorLocationSettings.location = varLocation
        options.accumulatorTensorLocationSettings.location = varLocation
        options.activationTensorLocationSettings.location = varLocation

        options.executionPhaseSettings.activationIOSchedule = \
            popart.ExecutionPhaseIOSchedule.OnDemand
        options.executionPhaseSettings.weightIOSchedule = \
            popart.ExecutionPhaseIOSchedule.Preload
        options.executionPhaseSettings.schedule = \
            popart.ExecutionPhaseSchedule.Batch
    else:
        options.enableGroupedMatmuls = False
        options.enableStochasticRounding = False
        options.constantWeights = True
        options.outlineThreshold = 10.0
        if ipus is not None and ipus > 1:
            options.virtualGraphMode = popart.VirtualGraphMode.Manual
        else:
            ipus = 1

    for key, value in user_options.items():
        if key not in ["batchSerializationFactor", "executionPhases"]:
            setattr(options, key, value)

    if return_stats:
        options.engineOptions = {
            "debug.allowOutOfMemory": "true",
            "debug.instrument": "true",
            "opt.internalExchangeOptimisationTarget": "balanced",
        }

    replicas = user_options.get("replicatedGraphCount", 1)
    request_ipus = pow(2, math.ceil(math.log2(ipus * replicas)))
    request_ipus *= replication_factor
    dm = popart.DeviceManager()
    dm.setOnDemandAttachTimeout(int(1e4))
    device = dm.acquireAvailableDevice(
        request_ipus,
        connectionType=popart.DeviceConnectionType.OnDemand,
        selectionCriterion=popart.DeviceSelectionCriterion.Random)
    if device is None:
        raise Exception("Failed to acquire IPU.")

    print("Compiling graph")
    if optimizer is not None:
        session = popart.TrainingSession(fnModel=proto,
                                         deviceInfo=device,
                                         dataFlow=data_flow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFlow=data_flow,
                                          userOptions=options,
                                          patterns=patterns)

    if skip_execution:
        device.detach()
        return session

    # Compile the Poplar graph. If it fails, return the memory stats.
    try:
        session.prepareDevice()
    except popart.session.OutOfMemoryException as e:
        if return_stats and log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            gcprofile.save_popart_report(session,
                                         log_dir=log_dir,
                                         exception=e)
        device.detach()
        raise e
    print("Compilation complete")

    session.weightsFromHost()
    session.setRandomSeed(1984)

    anchors = session.initAnchorArrays()

    # Add a replication dimension if needed
    rf = user_options.get("replicatedGraphCount")
    if rf is not None and rf > 1:
        data = {k: np.repeat(v[np.newaxis], rf, 0) for k, v in data.items()}

    # Add a gradient accumulation factor dimension if needed
    af = user_options.get("accumulationFactor")
    if af is not None and af > 1:
        data = {k: np.repeat(v[np.newaxis], af, 0) for k, v in data.items()}

    # Add a batches_per_step dimension if needed
    if batches_per_step > 1:
        data = {
            k: np.repeat(v[np.newaxis], batches_per_step, 0)
            for k, v in data.items()
        }

    for _ in range(num_reps):
        stepio = popart.PyStepIO(data, anchors)
        session.run(stepio)

    with tempfile.TemporaryDirectory() as tmp:
        file_path = os.path.join(tmp, "model.onnx")
        session.modelToHost(file_path)
        post_proto = onnx.load(file_path)

    # Release device
    device.detach()

    if return_stats:
        if log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            reports = gcprofile.save_popart_report(session, log_dir=log_dir)
            graph_report = json.loads(reports["graph"])
            exec_report = json.loads(reports["execution"])
        else:
            graph_report = json.loads(session.getGraphReport())
            exec_report = json.loads(session.getExecutionReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        cycles = exec_report["simulation"]["cycles"]
        return (anchors[output] for output in outputs), post_proto, \
            total_memory, max_tile_memory, cycles
    return (anchors[output] for output in outputs), post_proto
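# Hedged example of driving run_py directly, with proto/data/x/loss produced
# by a model() helper such as the ones above; note the first return value is
# a generator over the requested output anchors.
proto, data, x, loss = model()
outputs, post_proto = run_py(proto, data=data, outputs=x)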