def test_accumulator_tensor_location_settings_plus_override():
    """Check accumulator location settings combined with a per-tensor override.

    Two IRs are built with the same SGD optimizer. In the first, accumulators
    default to OffChip while 'Accl___W1' is overridden to OnChip; in the
    second the roles are swapped. Each IR is handed to ``check_ir`` together
    with the tensor names expected on-chip and off-chip.
    """
    optimizer_with_state = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.0, False),
        "defaultWeightDecay": (0.0, False),
        "defaultDampening": (0.0, True)
    })

    # Default OffChip, one accumulator pinned OnChip.
    offchip_default = popart.TensorLocationSettings(
        popart.TensorStorage.OffChip, 0)
    pin_onchip = {
        'Accl___W1': popart.TensorLocation(popart.TensorStorage.OnChip)
    }
    ir = get_ir(accumulator_tensor_location_settings=offchip_default,
                tensor_location_setting_override=pin_onchip,
                optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W1'],
             check_offchip=['Accl___W2', 'Accl___W0'])

    # Default OnChip, one accumulator pinned OffChip.
    onchip_default = popart.TensorLocationSettings(
        popart.TensorStorage.OnChip, 0)
    pin_offchip = {
        'Accl___W1': popart.TensorLocation(popart.TensorStorage.OffChip)
    }
    ir = get_ir(accumulator_tensor_location_settings=onchip_default,
                tensor_location_setting_override=pin_offchip,
                optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W2', 'Accl___W0'],
             check_offchip=['Accl___W1'])
def test_optimizer_state_tensor_location_settings():
    """Check that optimizer state tensor location settings are applied.

    Three IRs are built: with no settings (None), with OffChip settings and
    with OnChip settings. ``check_ir`` receives the three 'Accl___W*' names
    as expected on-chip for the None/OnChip cases and as expected off-chip
    for the OffChip case.
    """
    optimizer_with_state = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.0, False),
        "defaultWeightDecay": (0.0, False),
        "defaultDampening": (0.0, True)
    })

    # (storage to request or None for "no settings", expect tensors on-chip?)
    scenarios = (
        (None, True),
        (popart.TensorStorage.OffChip, False),
        (popart.TensorStorage.OnChip, True),
    )
    for storage, expect_onchip in scenarios:
        if storage is None:
            settings = None
        else:
            settings = popart.TensorLocationSettings(storage, 0)
        ir = get_ir(optimizer_state_tensor_location_settings=settings,
                    optimizer=optimizer_with_state)
        state_names = ['Accl___W1', 'Accl___W2', 'Accl___W0']
        if expect_onchip:
            check_ir(ir, check_onchip=state_names, check_offchip=[])
        else:
            check_ir(ir, check_onchip=[], check_offchip=state_names)
def test_activation_tensor_location_settings_plus_override():
    """Check activation location settings combined with a per-tensor override.

    A five-layer IR is built twice. First activations default to OffChip with
    'MatMul:0/1__t6' overridden to OnChip; then activations default to OnChip
    with the same tensor overridden to OffChip. ``check_ir`` is given the
    overridden tensor and 'MatMul:0__t3' on the expected sides.
    """
    overridden = 'MatMul:0/1__t6'
    untouched = 'MatMul:0__t3'

    # Default OffChip, override to OnChip.
    offchip_default = popart.TensorLocationSettings(
        popart.TensorStorage.OffChip, 0)
    to_onchip = {
        overridden: popart.TensorLocation(popart.TensorStorage.OnChip)
    }
    ir = get_ir(num_layers=5,
                activation_tensor_location_settings=offchip_default,
                tensor_location_setting_override=to_onchip)
    check_ir(ir, check_onchip=[overridden], check_offchip=[untouched])

    # Default OnChip, override to OffChip.
    onchip_default = popart.TensorLocationSettings(
        popart.TensorStorage.OnChip, 0)
    to_offchip = {
        overridden: popart.TensorLocation(popart.TensorStorage.OffChip)
    }
    ir = get_ir(num_layers=5,
                activation_tensor_location_settings=onchip_default,
                tensor_location_setting_override=to_offchip)
    check_ir(ir, check_onchip=[untouched], check_offchip=[overridden])
def test_weight_tensor_location_settings():
    """Check that weight tensor location settings are applied.

    Builds an IR with no weight settings (None), with OffChip settings and
    with OnChip settings, and passes 'W0', 'W1', 'W2' to ``check_ir`` as
    expected on-chip (None/OnChip) or expected off-chip (OffChip).
    """
    # (storage to request or None for "no settings", expect weights on-chip?)
    scenarios = (
        (None, True),
        (popart.TensorStorage.OffChip, False),
        (popart.TensorStorage.OnChip, True),
    )
    for storage, expect_onchip in scenarios:
        if storage is None:
            settings = None
        else:
            settings = popart.TensorLocationSettings(storage, 0)
        ir = get_ir(weight_tensor_location_settings=settings)
        weight_names = ['W0', 'W1', 'W2']
        if expect_onchip:
            check_ir(ir, check_onchip=weight_names, check_offchip=[])
        else:
            check_ir(ir, check_onchip=[], check_offchip=weight_names)
def test_weight_tensor_location_settings_plus_override():
    """Check weight location settings combined with a per-tensor override.

    First weights default to OffChip with 'W2' overridden to OnChip; then
    weights default to OnChip with 'W1' overridden to OffChip. ``check_ir``
    receives the matching expected on-chip / off-chip weight names.
    """
    # Default OffChip, 'W2' pinned OnChip.
    offchip_default = popart.TensorLocationSettings(
        popart.TensorStorage.OffChip, 0)
    w2_onchip = {'W2': popart.TensorLocation(popart.TensorStorage.OnChip)}
    ir = get_ir(weight_tensor_location_settings=offchip_default,
                tensor_location_setting_override=w2_onchip)
    check_ir(ir, check_onchip=['W2'], check_offchip=['W0', 'W1'])

    # Default OnChip, 'W1' pinned OffChip.
    onchip_default = popart.TensorLocationSettings(
        popart.TensorStorage.OnChip, 0)
    w1_offchip = {'W1': popart.TensorLocation(popart.TensorStorage.OffChip)}
    ir = get_ir(weight_tensor_location_settings=onchip_default,
                tensor_location_setting_override=w1_offchip)
    check_ir(ir, check_onchip=['W0', 'W2'], check_offchip=['W1'])
def session(splits=1):
    """Build the model with ``splits`` and run it through ``run_py``.

    The run uses gradient accumulation (factor 2) with outlining disabled,
    optimizer state placed OffChip, default ``popart.Patterns()`` and an Adam
    optimizer in LambNoBias mode. Returns whatever ``run_py`` returns.
    """
    proto, data, outputs, loss = model(splits)

    offchip_state = popart.TensorLocationSettings(
        popart.TensorStorage.OffChip, 0)
    user_options = {
        "enableOutlining": False,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
        "optimizerStateTensorLocationSettings": offchip_state
    }

    adam_params = {
        "defaultLearningRate": (0.1, True),
        "defaultBeta1": (0.1, True),
        "defaultBeta2": (0.1, True)
    }
    # NoBias to increase the error of incorrect gradients
    optimizer = popart.Adam(adam_params, mode=popart.AdamMode.LambNoBias)

    return run_py(proto,
                  data=data,
                  outputs=outputs,
                  loss=loss,
                  optimizer=optimizer,
                  patterns=popart.Patterns(),
                  user_options=user_options,
                  skip_execution=False)
def test_onchip_memory(tmpdir):
    """Compare phased runs with selected tensors on-chip against a normal run.

    Runs the model once in "normal" execution mode and three times in
    "phased" mode, each time with exactly one of activations, weights or
    optimizer state using ``onChipLocation`` (the other two use
    ``offChipLocation``; accumulators always use ``onChipLocation``). The
    saved ONNX models are then loaded and each phased model is compared to
    the normal one with ``check_model``.

    Args:
        tmpdir: directory object supporting ``tmpdir / name``; models are
            written to and read back from it.
    """
    run_model(tmpdir, 'model_normal.onnx', execution_mode="normal")

    # (file name, activation settings, weight settings, optimizer state settings)
    phased_runs = [
        ('model_onchip_act.onnx', onChipLocation, offChipLocation,
         offChipLocation),
        ('model_onchip_weights.onnx', offChipLocation, onChipLocation,
         offChipLocation),
        ('model_onchip_opt_state.onnx', offChipLocation, offChipLocation,
         onChipLocation),
    ]
    for file_name, act_loc, weight_loc, opt_state_loc in phased_runs:
        run_model(tmpdir,
                  file_name,
                  execution_mode="phased",
                  activation_tensor_location_settings=act_loc,
                  weight_tensor_location_settings=weight_loc,
                  optimizer_state_tensor_location_settings=opt_state_loc,
                  accumulator_tensor_location_settings=onChipLocation)

    # Load every model first, then compare, as the original test did.
    normal = onnx.load(str(tmpdir / 'model_normal.onnx'))
    phased_models = [
        onnx.load(str(tmpdir / run[0])) for run in phased_runs
    ]
    for phased_model in phased_models:
        check_model(normal, phased_model)
def session(train=False,
            skip_execution=False,
            include_patterns=True,
            splits=1,
            outline=False,
            optim="Sgd"):
    """Build the model and run it through ``run_py`` for training or inference.

    Args:
        train: when truthy, run with loss, optimizer and the training
            ``user_options``; otherwise run without loss/optimizer using only
            ``enableOutlining`` and ``constantWeights: False``.
        skip_execution: forwarded to ``run_py``.
        include_patterns: value passed to ``enablePattern`` for
            "TiedGatherPattern" and "TiedGatherAccumulatePattern".
        splits: forwarded to ``model``.
        outline: value of the "enableOutlining" option.
        optim: "Lamb" selects Adam in LambNoBias mode with replicated,
            off-chip, RTS-sharded optimizer state; anything else selects SGD.

    Returns:
        Whatever ``run_py`` returns.
    """
    proto, data, x, loss = model(splits=splits)

    patterns = popart.Patterns()
    patterns.enablePattern("TiedGatherPattern", include_patterns)
    patterns.enablePattern("TiedGatherAccumulatePattern", include_patterns)

    user_options = {
        "enableOutlining": outline,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
        "accumulationAndReplicationReductionType": popart.ReductionType.Mean,
        "meanAccumulationAndReplicationReductionStrategy":
        popart.MeanReductionStrategy.Running
    }

    # The optimizer is constructed even for inference runs, matching the
    # original control flow. The previously assigned but never read local
    # `ipus` has been dropped.
    if optim == "Lamb":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.1, False),
                "defaultWeightDecay": (0.1, True),
                "defaultBeta1": (0.1, True),
                "defaultBeta2": (0.1, True),
                "lossScaling": (20, True),
            },
            mode=popart.AdamMode.LambNoBias
        )  # NoBias to increase the error of incorrect gradients
        user_options[
            "optimizerStateTensorLocationSettings"] = popart.TensorLocationSettings(
                popart.TensorLocation(popart.TensorStorage.OffChip,
                                      popart.ReplicatedTensorSharding.On), 0,
                0)
        user_options["enableReplicatedGraphs"] = True
        user_options["replicatedGraphCount"] = 2
    else:
        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            "defaultDampening":
            (0, True),  # 0 dampening to increase the error of incorrect gradients
            "lossScaling": (20, True)
        })

    if train:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      loss=loss,
                      optimizer=optimizer,
                      patterns=patterns,
                      user_options=user_options,
                      skip_execution=skip_execution)

    return run_py(proto,
                  data=data,
                  outputs=x,
                  patterns=patterns,
                  user_options={
                      "enableOutlining": outline,
                      "constantWeights": False
                  },
                  skip_execution=skip_execution)
def bert_optimizer_location_settings(args):
    """Return the ``TensorLocationSettings`` selected by ``args``.

    Storage is OffChip when ``args.optimizer_state_offchip`` is truthy and
    OnChip otherwise; replicated tensor sharding is On when
    ``args.replicated_tensor_sharding`` is truthy and Off otherwise.
    """
    if args.optimizer_state_offchip:
        storage = popart.TensorStorage.OffChip
    else:
        storage = popart.TensorStorage.OnChip

    if args.replicated_tensor_sharding:
        rts = popart.ReplicatedTensorSharding.On
    else:
        rts = popart.ReplicatedTensorSharding.Off

    location = popart.TensorLocation(storage, rts)
    return popart.TensorLocationSettings(location)
def test_activation_tensor_location_settings():
    """Check that activation tensor location settings are applied.

    A five-layer IR is built with no activation settings (None), with OffChip
    settings and with OnChip settings. ``check_ir`` is given
    'MatMul:0/1__t6' and 'MatMul:0__t3' as expected on-chip (None/OnChip) or
    expected off-chip (OffChip).
    """
    # (storage to request or None for "no settings", expect on-chip?)
    scenarios = (
        (None, True),
        (popart.TensorStorage.OffChip, False),
        (popart.TensorStorage.OnChip, True),
    )
    for storage, expect_onchip in scenarios:
        if storage is None:
            settings = None
        else:
            settings = popart.TensorLocationSettings(storage, 0)
        ir = get_ir(num_layers=5,
                    activation_tensor_location_settings=settings)
        activation_names = ['MatMul:0/1__t6', 'MatMul:0__t3']
        if expect_onchip:
            check_ir(ir, check_onchip=activation_names, check_offchip=[])
        else:
            check_ir(ir, check_onchip=[], check_offchip=activation_names)
def set_phased_options(options, engine_options, model, args):
    """Configure ``options`` and ``engine_options`` in place for phased execution.

    Sets the ExecutionPhases virtual graph mode, outlining and batch
    serialization parameters, explicit standard recomputation, alias
    zero-copy, tensor locations for weights / optimizer state / accumulators /
    activations, IO schedules and the IO tile count. Returns None.

    Args:
        options: session options object, mutated in place.
        engine_options: dict; "target.syncReplicasIndependently" is set to
            "false".
        model: provides ``total_execution_phases``.
        args: provides ``batch_serialize``, ``replicated_tensor_sharding``,
            ``tensor_storage_onchip``, ``activation_io_schedule``,
            ``weight_io_schedule``, ``optimizer_schedule``, ``num_io_tiles``
            and ``activations_on_chip``.
    """
    options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases

    # Outlining
    options.enableOutliningCopyCostPruning = False
    options.outlineThreshold = -np.inf
    options.outlineSequenceBreakCost = 100000.0

    options.executionPhaseSettings.phases = model.total_execution_phases

    # Batch serialization: never concatenate on graph/phase/stage changes.
    options.batchSerializationSettings.factor = args.batch_serialize
    options.batchSerializationSettings.transformContext = popart.BatchSerializationTransformContext.Fwd
    for concat_flag in ("concatOnVirtualGraphChange",
                        "concatOnExecutionPhaseChange",
                        "concatOnPipelineStageChange"):
        setattr(options.batchSerializationSettings, concat_flag, False)
    options.batchSerializationSettings.batchSchedule = popart.BatchSerializationBatchSchedule.OverlapOnCompute

    # Recomputation and aliasing
    options.autoRecomputation = popart.RecomputationType.Standard
    options.explicitRecomputation = True
    options.aliasZeroCopy = True

    # One off-chip location, loaded and stored through IO tiles, shared by
    # all four tensor categories.
    if args.replicated_tensor_sharding:
        sharding = popart.ReplicatedTensorSharding.On
    else:
        sharding = popart.ReplicatedTensorSharding.Off
    varLocation = popart.TensorLocation()
    varLocation.storage = popart.TensorStorage.OffChip
    varLocation.loadTileSet = popart.TileSet.IO
    varLocation.storageTileSet = popart.TileSet.IO
    varLocation.replicatedTensorSharding = sharding

    for settings_name in ("weightTensorLocationSettings",
                          "optimizerStateTensorLocationSettings",
                          "accumulatorTensorLocationSettings",
                          "activationTensorLocationSettings"):
        getattr(options, settings_name).location = varLocation

    if args.tensor_storage_onchip:
        # Activations are intentionally not part of this switch.
        for settings_name in ("weightTensorLocationSettings",
                              "optimizerStateTensorLocationSettings",
                              "accumulatorTensorLocationSettings"):
            getattr(options,
                    settings_name).location.storage = popart.TensorStorage.OnChip

    # Schedules
    options.executionPhaseSettings.activationIOSchedule = io_schedule(
        args.activation_io_schedule)
    options.executionPhaseSettings.weightIOSchedule = io_schedule(
        args.weight_io_schedule)
    options.executionPhaseSettings.schedule = optimizer_schedule(
        args.optimizer_schedule)

    options.numIOTiles = args.num_io_tiles
    engine_options["target.syncReplicasIndependently"] = "false"

    if args.activations_on_chip:
        # Replaces the activation settings object wholesale.
        options.activationTensorLocationSettings = popart.TensorLocationSettings(
            popart.TensorStorage.OnChip, 0)
def test_attention_streamingmemory(tmpdir):
    """Run an attention model under many session configurations and compare.

    Every entry of ``test_variants`` is trained with ``run_test``; the anchors
    and the saved ONNX initializers of each variant are then compared against
    those of variant 0 (the "ground truth" variant).

    Args:
        tmpdir: directory object supporting ``tmpdir / name``; one ONNX model
            per variant is written to it.
    """
    np.random.seed(0XDEAD1337)
    batches_per_step = 5
    batch_size = 8
    hidden_size = 16
    sequence_length = 8
    attention_heads = 4
    # True division: this is a float (4.0), passed as-is to attention_onnx.
    qkv_length = hidden_size / attention_heads

    input_shape = [batch_size * sequence_length, hidden_size]
    mask_shape = [batch_size, 1, 1, sequence_length]

    qkv_data = np.random.normal(
        0, 0.02, [hidden_size, hidden_size * 3]).astype(np.float32)

    # One mask per batch in the step: positions with index < i map to 0 and
    # all other positions map to -1000.0.
    r = np.arange(0, sequence_length)
    r = np.reshape(batch_size * [r], mask_shape)
    masks = []
    for i in range(batches_per_step):
        masks.append(np.less(r, i).astype(np.float32))
    mask_data = (1 - np.stack(masks)) * -1000.0

    input_data = np.random.normal(0, 0.02, [batches_per_step] + input_shape).astype(np.float32)

    def run_test(index, options):
        """Build, train and save one variant; return its anchor arrays.

        Args:
            index: variant number, used in the saved ONNX file name.
            options: dict describing the variant (see ``test_variants``).
        """
        # True division: float value, passed as-is to attention_onnx.
        per_replica_batch_size = batch_size / options["replication"]
        # Per-replica shapes: the leading dimension is split across replicas.
        model_input_shape = input_shape[:]
        model_input_shape[0] = int(model_input_shape[0] / options["replication"])
        model_mask_shape = mask_shape[:]
        model_mask_shape[0] = int(model_mask_shape[0] / options["replication"])

        # Default stride depends on the stage count; an explicit, truthy
        # "stride" option takes precedence.
        stride = 2 // options["stages"]
        if "stride" in options and options["stride"]:
            stride = options["stride"]

        builder = popart.Builder(opsets={
            "ai.onnx": 9,
            "ai.onnx.ml": 1,
            "ai.graphcore": 1
        })

        mask = builder.addInputTensor(
            popart.TensorInfo("FLOAT", model_mask_shape), "mask")
        x_in = builder.addInputTensor(
            popart.TensorInfo("FLOAT", model_input_shape), "x_in")

        anchors = {}
        x = x_in
        for i in range(options["numLayers"]):
            qkv = builder.addInitializedInputTensor(qkv_data, f"qkv_{i}")
            # Anchor the gradient of every qkv weight.
            anchors[popart.reservedGradientPrefix() + qkv] = popart.AnchorReturnType("All")

            vgid = (i % options["stages"]) if options["phasedExecution"] else i

            with builder.virtualGraph(vgid), builder.executionPhase(i * stride):
                x = builder.aiOnnx.matmul([x, qkv])
                x = attention_onnx(builder, x, mask, per_replica_batch_size, sequence_length, hidden_size, attention_heads, qkv_length)

        # The loss is placed on the same virtual graph / phase as the last layer.
        vgid = ((options["numLayers"] - 1) % options["stages"] ) if options["phasedExecution"] else options["numLayers"] - 1

        with builder.virtualGraph(vgid), builder.executionPhase(
            (options["numLayers"] - 1) * stride):
            l1 = builder.aiGraphcore.l1loss([x], 0.2, popart.ReductionType.Sum)

        proto = builder.getModelProto()

        # Snapshot the gradient anchor names before the output anchor is added.
        gradient_keys = list(anchors.keys())
        anchors[x] = popart.AnchorReturnType("All")

        dataFlow = popart.DataFlow(batches_per_step, anchors)

        opts = popart.SessionOptions()
        opts.executionPhaseSettings.stages = options["stages"]
        opts.executionPhaseSettings.phases = (
            options["numLayers"] * stride if options["phasedExecution"] else 0)
        opts.enableOutlining = options["outlining"]

        if "phaseSchedule" in options:
            opts.executionPhaseSettings.schedule = options["phaseSchedule"]

        # Phased execution currently does its own recompute annotations
        opts.autoRecomputation = (popart.RecomputationType.Standard if options["explicitRecomputation"] else popart.RecomputationType.NoRecompute)

        opts.outlineThreshold = -np.inf
        opts.enableOutliningCopyCostPruning = False
        opts.virtualGraphMode = (popart.VirtualGraphMode.ExecutionPhases if options["phasedExecution"] else popart.VirtualGraphMode.Manual)
        opts.explicitRecomputation = options["explicitRecomputation"]
        opts.aliasZeroCopy = options["aliasZeroCopy"]

        opts.batchSerializationSettings.factor = options["batchSerialize"]
        if "batchSchedule" in options:
            opts.batchSerializationSettings.batchSchedule = options[
                "batchSchedule"]
        if "batchConcat" in options:
            # Do not concatenate the batch across phases and virtual graphs
            # (causes more, smaller transfers but allows for individual
            # sub-batch elements to be transferred)
            opts.batchSerializationSettings.concatOnVirtualGraphChange = options[
                "batchConcat"]
            opts.batchSerializationSettings.concatOnExecutionPhaseChange = options[
                "batchConcat"]
            # Wait with loading activations until they are required
            # NOTE(review): nesting under the "batchConcat" branch is
            # reconstructed from a whitespace-collapsed source - confirm.
            opts.executionPhaseSettings.activationIOSchedule = popart.ExecutionPhaseIOSchedule.OnDemand

        # A shared setting is applied to all four tensor categories first ...
        if "tensorLocationSettings" in options and options[
                "tensorLocationSettings"]:
            opts.activationTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.weightTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.optimizerStateTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.accumulatorTensorLocationSettings = options[
                "tensorLocationSettings"]

        # ... then a weight-specific setting, if given, overrides it.
        if "weightTensorLocationSettings" in options and options[
                "weightTensorLocationSettings"]:
            opts.weightTensorLocationSettings = options[
                "weightTensorLocationSettings"]

        if options["replication"] > 1:
            opts.replicatedGraphCount = options["replication"]
            opts.enableReplicatedGraphs = True
        if "ioTiles" in options:
            opts.numIOTiles = options["ioTiles"]

        pat = popart.Patterns(popart.PatternsLevel.Default)
        # Phased runs request one IPU per stage; non-phased runs request
        # numLayers + 1 IPUs. Both are multiplied by the replication factor.
        if options["phasedExecution"]:
            numIpus = options["stages"]
        else:
            numIpus = options["numLayers"] + 1

        if options["replication"] > 1:
            numIpus = numIpus * options["replication"]
        device = tu.create_test_device(numIpus, pattern=popart.SyncPattern.Full)

        session = popart.TrainingSession(fnModel=proto, dataFlow=dataFlow, userOptions=opts, loss=l1, optimizer=popart.ConstSGD(0.1), patterns=pat, deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        # From here on `anchors` holds the anchor arrays, not AnchorReturnTypes.
        anchors = session.initAnchorArrays()
        for k, v in anchors.items():
            print(f"anchor_before {k}={v.shape}")

        inputs = {x_in: input_data, mask: mask_data}
        stepio = popart.PyStepIO(inputs, anchors)

        for __ in range(10):
            session.run(stepio)

        session.modelToHost(
            str(tmpdir / f"streamingmemory_attention_{index}.onnx"))

        if options["replication"] > 1:
            for k, v in anchors.items():
                if k in gradient_keys:
                    # The gradient anchors will have an additional replication axis.
                    anchors[k] = np.sum(v, 1 if batches_per_step > 1 else 0)
                else:
                    # Output tensor needs reshaping.
                    anchors[k] = np.reshape(anchors[k], [
                        batches_per_step, sequence_length * batch_size, hidden_size
                    ])
            # NOTE(review): nesting of this print loop under the replication
            # branch is reconstructed from a whitespace-collapsed source -
            # confirm.
            for k, v in anchors.items():
                print(f"anchor_after {k}={v.shape}")
        return anchors

    test_results = []

    # AliasZeroCopy only supported with explicit recomputation, but not with
    # standard recomputation
    # Phased execution only supported with explicit recomputation, but not with
    # standard recomputation
    test_variants = []

    # Off-chip storage, loaded and stored via compute tiles, no RTS.
    defaultOffChip = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.Compute,
            storageTileSet=popart.TileSet.Compute,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    # Off-chip storage, loaded and stored via IO tiles, no RTS.
    ioOffChip = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.IO,
            storageTileSet=popart.TileSet.IO,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    # Ground truth variant
    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": False,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": False,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 4,
        "replication": 1,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": True,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": True,
        "aliasZeroCopy": True,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    # Test batch serialized single device per replica execution, where all
    # streaming memory traffic goes through IO tiles, and activations are
    # stored and loaded one-by-one
    test_variants.append({
        "stages": 1,
        "stride": 4,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": True,
        "aliasZeroCopy": True,
        "batchSerialize": 4,
        "batchConcat": False,
        "replication": 2,
        "tensorLocationSettings": ioOffChip,
        "ioTiles": 192
    })

    # Test batch serialized single device per replica execution, where all
    # streaming memory traffic goes through IO tiles, and loading of the next
    # phase happens before storing the current phase
    test_variants.append({
        "stages": 1,
        "stride": 1,
        "numLayers": 3,
        "phasedExecution": True,
        "phaseSchedule": popart.ExecutionPhaseSchedule.BatchClusteredIO,
        "outlining": False,
        "explicitRecomputation": True,
        "aliasZeroCopy": True,
        "batchSerialize": 4,
        "batchConcat": True,
        "replication": 2,
        "tensorLocationSettings": ioOffChip,
        "ioTiles": 192
    })

    # Test a variety of batch serialisation schedules.
    for batchSchedule in [
            popart.BatchSerializationBatchSchedule.Scheduler,
            popart.BatchSerializationBatchSchedule.Isomorphic,
            popart.BatchSerializationBatchSchedule.OverlapOnIo,
            popart.BatchSerializationBatchSchedule.OverlapOnCompute,
    ]:
        test_variants.append({
            "stages": 1,
            "stride": 4,
            "numLayers": 3,
            "phasedExecution": True,
            "outlining": False,
            "explicitRecomputation": True,
            "aliasZeroCopy": True,
            "batchSerialize": 4,
            "batchSchedule": batchSchedule,
            "batchConcat": False,
            "replication": 2,
            "tensorLocationSettings": ioOffChip,
            "ioTiles": 192
        })

    # Test replicated tensor sharding + on chip (no outlining).
    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 2,
        "tensorLocationSettings": defaultOffChip,
        "weightTensorLocationSettings": popart.TensorLocationSettings(location=popart.TensorLocation(
            storage=popart.TensorStorage.OnChip,
            loadTileSet=popart.TileSet.Compute,
            storageTileSet=popart.TileSet.Compute,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.On),
            minElementsForOffChip=0,
            minElementsForReplicatedTensorSharding=2)
    })

    # Test replicated tensor sharding + off chip (no outlining).
    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 2,
        "tensorLocationSettings": defaultOffChip,
        "weightTensorLocationSettings": popart.TensorLocationSettings(location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.Compute,
            storageTileSet=popart.TileSet.Compute,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.On),
            minElementsForOffChip=0,
            minElementsForReplicatedTensorSharding=2)
    })

    # Run every variant in order; `index` ends up equal to the variant count.
    index = 0
    for test_option in test_variants:
        print(f"Running {index}: {test_option}")
        test_results.append(run_test(index, test_option))
        index += 1

    # Variant 0 is the reference for both anchors and saved weights.
    gt_onnx = onnx.load(str(tmpdir / f"streamingmemory_attention_0.onnx"))

    for i in range(1, index):
        print(f"Testing run {i}: {test_variants[i]}")
        # Anchors (gradients and output) must match the reference.
        for key in test_results[0].keys():
            assert np.all(
                np.isclose(test_results[0][key], test_results[i][key], equal_nan=False))

        # Saved initializers are compared position by position.
        val_onnx = onnx.load(
            str(tmpdir / f"streamingmemory_attention_{i}.onnx"))
        for j in range(len(gt_onnx.graph.initializer)):
            print(f"Checking initializer {j}")
            gt = gt_onnx.graph.initializer[j]
            gt = numpy_helper.to_array(gt)
            val = val_onnx.graph.initializer[j]
            val = numpy_helper.to_array(val)
            assert np.allclose(gt, val, equal_nan=False)
for i in range(len(lhs_model.graph.initializer)): lhs = lhs_model.graph.initializer[i] for j in range(len(rhs_model.graph.initializer)): rhs = rhs_model.graph.initializer[j] if (rhs.name == lhs.name): print(f'Checking initializer {i} ({lhs.name} - {rhs.name})') lhsa = numpy_helper.to_array(lhs) rhsa = numpy_helper.to_array(rhs) assert np.allclose(lhsa, rhsa, rtol=1.e-4, atol=1.e-5) # Standard OnChip settings onChipLocation = popart.TensorLocationSettings( location=popart.TensorLocation( storage=popart.TensorStorage.OnChip, loadTileSet=popart.TileSet.Compute, storageTileSet=popart.TileSet.Compute, replicatedTensorSharding=popart.ReplicatedTensorSharding.Off), minElementsForOffChip=0, minElementsForReplicatedTensorSharding=2) # Standard OffChip settings offChipLocation = popart.TensorLocationSettings( location=popart.TensorLocation( storage=popart.TensorStorage.OffChip, loadTileSet=popart.TileSet.Compute, storageTileSet=popart.TileSet.Compute, replicatedTensorSharding=popart.ReplicatedTensorSharding.Off), minElementsForOffChip=0, minElementsForReplicatedTensorSharding=2) # Replicated tensor sharding OffChip settings
def run_model(tmpdir,
              model_file_name,
              schedule=popart.ExecutionPhaseSchedule.Interleaving,
              enable_outlining=False,
              stride=1,
              num_layers=5,
              dsize=128,
              batch_size=4,
              batch_serialize=1,
              batch_schedule=popart.BatchSerializationBatchSchedule.Isomorphic,
              num_iterations=5,
              num_replicas=2,
              optimizer=popart.Adam({"defaultLearningRate": (0.1, False)})):
    """Train a stack of matmul layers under phased execution and save it.

    Each of the ``num_layers`` layers applies three matmuls that all use a
    weight named ``W<layer index>``. An L1 loss wrapped in an identity loss is
    attached in the last layer's execution phase. Activations are streamed
    off-chip through IO tiles; weights and optimizer state additionally use
    replicated tensor sharding.

    Args:
        tmpdir: directory object supporting ``tmpdir / name``.
        model_file_name: file name the trained model is written to.
        schedule: execution phase schedule.
        enable_outlining: enables outlining and alias zero-copy together.
        stride: execution phase stride between layers.
        num_layers: number of layers.
        dsize: side length of the square weights and inputs.
        batch_size: per-replica batch size.
        batch_serialize: batch serialization factor, applied only when > 1.
        batch_schedule: batch serialization schedule, used when serializing.
        num_iterations: number of ``session.run`` calls.
        num_replicas: replicated graph count.
        optimizer: optimizer passed to the training session.

    Returns:
        The value of ``session.getCycleCount()`` after the last iteration.
    """
    np.random.seed(52125)

    builder = popart.Builder()
    input_id = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [batch_size, dsize, dsize]))

    def add_layer(index, in_id):
        weight_data = np.random.rand(dsize, dsize).astype(np.float32)
        weight_id = builder.addInitializedInputTensor(weight_data,
                                                      f"W{index}")
        return builder.aiOnnx.matmul([in_id, weight_id])

    out = input_id
    l1 = ""
    final_loss = ""

    for layer in range(num_layers):
        vgid = 0
        phase = layer * stride
        with builder.executionPhase(phase), builder.virtualGraph(vgid):
            for _ in range(3):
                out = add_layer(layer, out)

        # The loss lives in the last layer's phase.
        if layer == num_layers - 1:
            with builder.executionPhase(phase), builder.virtualGraph(vgid):
                l1 = builder.aiGraphcore.l1loss([out], 0.1,
                                                popart.ReductionType.Sum)
                final_loss = builder.aiGraphcore.identityloss([l1])

    anchorIds = []
    builder.addOutputTensor(out)

    num_ipus = 1

    dfAnchors = {
        anchorId: popart.AnchorReturnType("All")
        for anchorId in anchorIds
    }

    opts = popart.SessionOptions()

    # Cycle counting
    opts.instrumentWithHardwareCycleCounter = True

    # Outlining
    opts.enableOutlining = enable_outlining
    opts.enableOutliningCopyCostPruning = False
    opts.outlineThreshold = -np.inf
    opts.aliasZeroCopy = enable_outlining

    # Replicated graphs
    opts.replicatedGraphCount = num_replicas
    opts.enableReplicatedGraphs = bool(num_replicas > 1)

    # IO tiles
    opts.numIOTiles = 192

    # Phased execution
    opts.executionPhaseSettings.phases = num_layers * stride
    opts.executionPhaseSettings.stages = 1
    opts.executionPhaseSettings.schedule = schedule
    opts.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases

    # Recomputation
    opts.autoRecomputation = popart.RecomputationType.Standard
    opts.explicitRecomputation = True

    # Batch serialization
    if batch_serialize > 1:
        opts.batchSerializationSettings.factor = batch_serialize
        opts.batchSerializationSettings.concatOnVirtualGraphChange = False
        opts.batchSerializationSettings.concatOnExecutionPhaseChange = False
        opts.batchSerializationSettings.concatOnPipelineStageChange = False
        opts.batchSerializationSettings.batchSchedule = batch_schedule
        # Related execution phase setting
        opts.executionPhaseSettings.activationIOSchedule = popart.ExecutionPhaseIOSchedule.OnDemand

    # Streaming memory: the same IO-tile off-chip location, without and with
    # replicated tensor sharding.
    plain_location = popart.TensorLocation(
        storage=popart.TensorStorage.OffChip,
        loadTileSet=popart.TileSet.IO,
        storageTileSet=popart.TileSet.IO,
        replicatedTensorSharding=popart.ReplicatedTensorSharding.Off)
    offChipLocation = popart.TensorLocationSettings(
        location=plain_location,
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    sharded_location = popart.TensorLocation(
        storage=popart.TensorStorage.OffChip,
        loadTileSet=popart.TileSet.IO,
        storageTileSet=popart.TileSet.IO,
        replicatedTensorSharding=popart.ReplicatedTensorSharding.On)
    offChipRtsLocation = popart.TensorLocationSettings(
        location=sharded_location,
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    opts.activationTensorLocationSettings = offChipLocation
    opts.weightTensorLocationSettings = offChipRtsLocation
    opts.optimizerStateTensorLocationSettings = offChipRtsLocation

    proto = builder.getModelProto()

    with tu.create_test_device(num_replicas * num_ipus,
                               pattern=popart.SyncPattern.Full) as device:
        session = popart.TrainingSession(
            fnModel=proto,
            dataFlow=popart.DataFlow(1, dfAnchors),
            optimizer=optimizer,
            loss=final_loss,
            patterns=popart.Patterns(popart.PatternsLevel.All),
            userOptions=opts,
            deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()

        for _ in range(num_iterations):
            batch = np.random.rand(num_replicas, batch_size, dsize,
                                   dsize).astype(np.float32)
            session.run(popart.PyStepIO({input_id: batch}, anchors))

        cycles = session.getCycleCount()

        print("anchors:")
        print(anchors)
        session.modelToHost(str(tmpdir / model_file_name))

        return cycles
def bert_session_options(args, model):
    """Build the ``popart.SessionOptions`` for a BERT run from ``args``.

    Args:
        args: parsed run configuration; the attributes read are visible at
            each use below (execution mode, replication, gradient
            accumulation, caching, profiling, synthetic data, ...).
        model: provides ``total_execution_phases`` (read only in PHASED mode).

    Returns:
        The configured ``popart.SessionOptions``; the collected engine options
        dict is attached to it as ``options.engineOptions``.
    """
    engine_options = {}
    options = popart.SessionOptions()
    options.virtualGraphMode = popart.VirtualGraphMode.Manual
    options.enableFloatingPointChecks = args.floating_point_exceptions
    options.enableStochasticRounding = args.stochastic_rounding
    options.enableGroupedMatmuls = False
    options.enablePrefetchDatastreams = not args.minimum_latency_inference
    options.enableOutlining = not args.no_outlining
    # The same partials type is used for matmuls and convolutions.
    partials_type = "half" if args.enable_half_partials else "float"
    options.partialsTypeMatMuls = partials_type
    options.convolutionOptions = {'partialsType': partials_type}
    if args.replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = args.replication_factor
        # May be overwritten with "false" by the PHASED branch below.
        engine_options["target.syncReplicasIndependently"] = "true"
    # Increasing the outlineThreshold prevents creating subgraphs of cheap Ops
    # such as add or reshapeInplace.
    # Instead only reusing ops with a highSubgraphValue such as matmul or normalisation.
    options.outlineThreshold = 10.0
    if args.execution_mode == "PIPELINE":
        options.enablePipelining = True
        options.autoRecomputation = popart.RecomputationType.Pipeline
    elif args.execution_mode == "PHASED":
        options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        options.enableOutliningCopyCostPruning = False
        # Replaces the 10.0 threshold set above.
        options.outlineThreshold = -np.inf
        options.executionPhaseSettings.phases = model.total_execution_phases
        options.batchSerializationSettings.factor = args.batch_serialize
        options.autoRecomputation = popart.RecomputationType.Standard
        options.explicitRecomputation = True
        options.aliasZeroCopy = True

        options.activationTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
        # Off-chip location using IO tiles, shared by weights, optimizer
        # state and accumulators.
        varLocation = popart.TensorLocation()
        varLocation.storage = popart.TensorStorage.OffChip
        varLocation.loadTileSet = popart.TileSet.IO
        varLocation.storageTileSet = popart.TileSet.IO
        varLocation.replicatedTensorSharding = (
            popart.ReplicatedTensorSharding.On
            if args.replicated_weight_sharding else
            popart.ReplicatedTensorSharding.Off)

        options.weightTensorLocationSettings.location = varLocation
        options.optimizerStateTensorLocationSettings.location = varLocation
        options.accumulatorTensorLocationSettings.location = varLocation

        options.numIOTiles = args.num_io_tiles
        options.timeLimitScheduler = -1
        options.swapLimitScheduler = -1
        engine_options["target.syncReplicasIndependently"] = "false"
        # NOTE(review): nesting of this check inside the PHASED branch is
        # reconstructed from a whitespace-collapsed source - confirm.
        if args.activations_on_chip:
            # Replaces the activation settings object wholesale.
            options.activationTensorLocationSettings = popart.TensorLocationSettings(
                popart.TensorStorage.OnChip, 0)

    if args.optimizer_state_offchip:
        options.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
    if args.gradient_accumulation_factor > 1:
        options.enableGradientAccumulation = True
        options.accumulationFactor = args.gradient_accumulation_factor
        # NOTE(review): nesting of the two checks below inside the gradient
        # accumulation branch is reconstructed from a whitespace-collapsed
        # source - confirm.
        if args.gradient_reduction_type == "Mean":
            options.accumulationReductionType = popart.ReductionType.Mean

        # When not replicated SyncPattern.SinglePipeline will provide better overlap
        # than this option.
        if args.optimizer_state_offchip and args.replication_factor > 1:
            options.accumulateOuterFragmentSettings = popart.AccumulateOuterFragmentSettings(
                popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized, [0])
    if args.engine_cache is not None:
        options.enableEngineCaching = True
        options.cachePath = args.engine_cache
    if args.profile:
        # Turns engine caching off again, even if a cache path was given.
        options.enableEngineCaching = False
    options.instrumentWithHardwareCycleCounter = args.report_hw_cycle_count
    options.disableGradAccumulationTensorStreams = True
    # -1 means "no limit"; any other value is passed to the engine as a string.
    if args.max_copy_merge_size == -1:
        logger.debug("No copy merge size limit applied")
    else:
        logger.warning(
            f"Copy merge size limit set to {args.max_copy_merge_size}")
        engine_options["opt.maxCopyMergeSize"] = str(args.max_copy_merge_size)

    # Adding {"fullyConnectedPass", "TRAINING_BWD"} to some matmuls causes large
    # transposes before operations.
    if args.disable_fully_connected_pass:
        if args.task == "SQUAD" and args.sequence_length == 384:
            logger.warning(
                "Fully connected pass has been disabled. This may cause SQuAD 384 12-layer to go OOM."
            )
        options.enableFullyConnectedPass = False

    # Warn only; this combination is not rejected.
    if args.inference and args.engine_cache is not None and not args.variable_weights_inference:
        logger.warning(
            "Using engine cache with constant weights. Checkpoint weights will be ignored. "
            "Use the `--variable-weights-inference` flag if checkpoint weights should be used."
        )

    if args.variable_weights_inference:
        options.constantWeights = False

    if args.group_host_syncs:
        options.groupHostSync = True

    if args.internal_exchange_optimisation_target is not None:
        engine_options["opt.internalExchangeOptimisationTarget"] = str(
            args.internal_exchange_optimisation_target)

    # Attach the collected engine options once all branches have run.
    options.engineOptions = engine_options

    # Set synthetic data mode (if active)
    if args.synthetic_data:
        # Any initializer other than "zeros" selects RandomNormal.
        if args.synthetic_data_initializer == "zeros":
            options.syntheticDataMode = popart.SyntheticDataMode.Zeros
        else:
            options.syntheticDataMode = popart.SyntheticDataMode.RandomNormal
        logger.info(
            f"Running with Synthetic Data Type '{options.syntheticDataMode}'")
    return options
def session(train=False,
            skip_execution=False,
            include_patterns=True,
            splits=1,
            outline=False,
            optim="Sgd"):
    """Build the model and run it through ``run_py`` for training or inference.

    Args:
        train: when truthy, run with loss, optimizer and the gradient
            accumulation ``user_options``; otherwise run without
            loss/optimizer using only ``enableOutlining`` and
            ``constantWeights: False``.
        skip_execution: forwarded to ``run_py``.
        include_patterns: when truthy, "TiedGatherPattern" and
            "TiedGatherAccumulatePattern" are enabled; otherwise no pattern
            is touched.
        splits: forwarded to ``model``.
        outline: value of the "enableOutlining" option.
        optim: "Lamb" selects Adam in LambNoBias mode with optimizer state
            OffChip; anything else selects SGD.

    Returns:
        Whatever ``run_py`` returns.
    """
    proto, data, outputs, loss = model(splits=splits)

    # Required
    if include_patterns:
        extra_patterns = ["TiedGatherPattern", "TiedGatherAccumulatePattern"]
    else:
        extra_patterns = []

    patterns = popart.Patterns()
    for pattern_name in extra_patterns:
        patterns.enablePattern(pattern_name, True)

    user_options = {
        "enableOutlining": outline,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
    }

    # The optimizer is constructed even for inference runs, as before.
    if optim == "Lamb":
        lamb_params = {
            "defaultLearningRate": (0.1, True),
            "lossScaling": (20, False),
        }
        # NoBias to increase the error of incorrect gradients
        optimizer = popart.Adam(lamb_params, mode=popart.AdamMode.LambNoBias)
        user_options[
            "optimizerStateTensorLocationSettings"] = popart.TensorLocationSettings(
                popart.TensorStorage.OffChip, 0)
    else:
        sgd_params = {
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            # 0 dampening to increase the error of incorrect gradients
            "defaultDampening": (0, True),
            "lossScaling": (20, False)
        }
        optimizer = popart.SGD(sgd_params)

    if not train:
        return run_py(proto,
                      data=data,
                      outputs=outputs,
                      patterns=patterns,
                      user_options={
                          "enableOutlining": outline,
                          "constantWeights": False
                      },
                      skip_execution=skip_execution)

    return run_py(proto,
                  data=data,
                  outputs=outputs,
                  loss=loss,
                  optimizer=optimizer,
                  patterns=patterns,
                  user_options=user_options,
                  skip_execution=skip_execution)