def test_accumulator_tensor_location_settings_plus_override():
    """Check accumulator location settings combined with a per-tensor override.

    For each case the accumulator default storage is set one way and
    'Accl___W1' alone is overridden the other way; the IR is then checked so
    that only the overridden tensor deviates from the default.
    """
    # The checks below expect this optimizer to give rise to 'Accl___W*' tensors.
    sgd = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.0, False),
        "defaultWeightDecay": (0.0, False),
        "defaultDampening": (0.0, True)
    })

    # (default storage, storage forced on 'Accl___W1', expected on-chip,
    #  expected off-chip)
    cases = [
        (popart.TensorStorage.OffChip, popart.TensorStorage.OnChip,
         ['Accl___W1'], ['Accl___W2', 'Accl___W0']),
        (popart.TensorStorage.OnChip, popart.TensorStorage.OffChip,
         ['Accl___W2', 'Accl___W0'], ['Accl___W1']),
    ]

    for default_storage, override_storage, expect_on, expect_off in cases:
        ir = get_ir(
            accumulator_tensor_location_settings=popart.
            TensorLocationSettings(default_storage, 0),
            tensor_location_setting_override={
                'Accl___W1': popart.TensorLocation(override_storage)
            },
            optimizer=sgd)
        check_ir(ir, check_onchip=expect_on, check_offchip=expect_off)
def test_optimizer_state_tensor_location_settings():
    """Check optimizer state tensor location settings.

    Runs three configurations (no settings, OffChip, OnChip) and checks via
    ``check_ir`` that all 'Accl___W*' tensors are reported on-chip, off-chip
    and on-chip respectively.
    """
    sgd = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.0, False),
        "defaultWeightDecay": (0.0, False),
        "defaultDampening": (0.0, True)
    })
    state_tensors = ('Accl___W1', 'Accl___W2', 'Accl___W0')

    # (storage to request, or None for "no settings"; expected to be on-chip?)
    cases = (
        (None, True),
        (popart.TensorStorage.OffChip, False),
        (popart.TensorStorage.OnChip, True),
    )

    for storage, expect_onchip in cases:
        if storage is None:
            settings = None
        else:
            settings = popart.TensorLocationSettings(storage, 0)

        ir = get_ir(optimizer_state_tensor_location_settings=settings,
                    optimizer=sgd)

        # Fresh lists per call, mirroring the literal lists of each check.
        on_chip = list(state_tensors) if expect_onchip else []
        off_chip = [] if expect_onchip else list(state_tensors)
        check_ir(ir, check_onchip=on_chip, check_offchip=off_chip)
def test_activation_tensor_location_settings_plus_override():
    """Check activation location settings combined with a per-tensor override.

    With a 5-layer model, the activation default storage is set one way and
    'MatMul:0/1__t6' alone is overridden the other way; 'MatMul:0__t3' is
    expected to follow the default.
    """
    overridden = 'MatMul:0/1__t6'
    defaulted = 'MatMul:0__t3'

    # (default storage, storage forced on the overridden tensor)
    cases = (
        (popart.TensorStorage.OffChip, popart.TensorStorage.OnChip),
        (popart.TensorStorage.OnChip, popart.TensorStorage.OffChip),
    )

    for case_index, (default_storage, override_storage) in enumerate(cases):
        ir = get_ir(
            num_layers=5,
            activation_tensor_location_settings=popart.TensorLocationSettings(
                default_storage, 0),
            tensor_location_setting_override={
                overridden: popart.TensorLocation(override_storage)
            })

        if case_index == 0:
            # Default OffChip, override OnChip.
            check_ir(ir, check_onchip=[overridden], check_offchip=[defaulted])
        else:
            # Default OnChip, override OffChip.
            check_ir(ir, check_onchip=[defaulted], check_offchip=[overridden])
def test_weight_tensor_location_settings():
    """Check weight tensor location settings.

    Runs three configurations (no settings, OffChip, OnChip) and checks via
    ``check_ir`` that W0, W1 and W2 are reported on-chip, off-chip and
    on-chip respectively.
    """
    weights = ('W0', 'W1', 'W2')

    # (storage to request, or None for "no settings"; expected to be on-chip?)
    cases = (
        (None, True),
        (popart.TensorStorage.OffChip, False),
        (popart.TensorStorage.OnChip, True),
    )

    for storage, expect_onchip in cases:
        settings = (None if storage is None else
                    popart.TensorLocationSettings(storage, 0))
        ir = get_ir(weight_tensor_location_settings=settings)

        if expect_onchip:
            check_ir(ir, check_onchip=list(weights), check_offchip=[])
        else:
            check_ir(ir, check_onchip=[], check_offchip=list(weights))
def test_weight_tensor_location_settings_plus_override():
    """Check weight location settings combined with a per-tensor override.

    Case 1: weights default to OffChip, 'W2' is overridden to OnChip.
    Case 2: weights default to OnChip, 'W1' is overridden to OffChip.
    """
    # (default storage, overridden tensor, override storage,
    #  expected on-chip, expected off-chip)
    cases = [
        (popart.TensorStorage.OffChip, 'W2', popart.TensorStorage.OnChip,
         ['W2'], ['W0', 'W1']),
        (popart.TensorStorage.OnChip, 'W1', popart.TensorStorage.OffChip,
         ['W0', 'W2'], ['W1']),
    ]

    for default_storage, tensor_id, override_storage, on_chip, off_chip in cases:
        default_settings = popart.TensorLocationSettings(default_storage, 0)
        override = {tensor_id: popart.TensorLocation(override_storage)}
        ir = get_ir(weight_tensor_location_settings=default_settings,
                    tensor_location_setting_override=override)
        check_ir(ir, check_onchip=on_chip, check_offchip=off_chip)
def session(splits=1):
    """Build the model with ``splits`` and run it through ``run_py`` for training.

    The run uses gradient accumulation (factor 2), no outlining, off-chip
    optimizer state and an Adam optimizer in ``LambNoBias`` mode.

    Returns whatever ``run_py`` returns.
    """
    proto, data, x, loss = model(splits)

    off_chip_state = popart.TensorLocationSettings(
        popart.TensorStorage.OffChip, 0)
    user_options = {
        "enableOutlining": False,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
        "optimizerStateTensorLocationSettings": off_chip_state
    }

    # NoBias to increase the error of incorrect gradients
    lamb_hyperparameters = {
        "defaultLearningRate": (0.1, True),
        "defaultBeta1": (0.1, True),
        "defaultBeta2": (0.1, True)
    }
    optimizer = popart.Adam(lamb_hyperparameters,
                            mode=popart.AdamMode.LambNoBias)

    result = run_py(proto,
                    data=data,
                    outputs=x,
                    loss=loss,
                    optimizer=optimizer,
                    patterns=popart.Patterns(),
                    user_options=user_options,
                    skip_execution=False)
    return result
示例#7
0
def test_onchip_memory(tmpdir):
    """Compare phased runs with selected tensors kept on chip to a normal run.

    A reference model is produced in "normal" execution mode. Three "phased"
    runs then keep, in turn, the activations, the weights and the optimizer
    state on chip (accumulators are on chip in all three). Each resulting
    model file is loaded and compared with the reference via ``check_model``.

    Args:
        tmpdir: directory (supporting the ``/`` operator) in which the model
            files are written and from which they are read back.
    """
    reference_file = 'model_normal.onnx'
    run_model(tmpdir, reference_file, execution_mode="normal")

    # (model file, activation settings, weight settings, optimizer state
    #  settings) -- exactly one tensor class is on chip per variant.
    phased_variants = [
        ('model_onchip_act.onnx', onChipLocation, offChipLocation,
         offChipLocation),
        ('model_onchip_weights.onnx', offChipLocation, onChipLocation,
         offChipLocation),
        ('model_onchip_opt_state.onnx', offChipLocation, offChipLocation,
         onChipLocation),
    ]

    for file_name, activations, weights, optimizer_state in phased_variants:
        run_model(tmpdir,
                  file_name,
                  execution_mode="phased",
                  activation_tensor_location_settings=activations,
                  weight_tensor_location_settings=weights,
                  optimizer_state_tensor_location_settings=optimizer_state,
                  accumulator_tensor_location_settings=onChipLocation)

    # Load every model before running any comparison.
    normal = onnx.load(str(tmpdir / reference_file))
    phased_models = [
        onnx.load(str(tmpdir / variant[0])) for variant in phased_variants
    ]

    for phased_model in phased_models:
        check_model(normal, phased_model)
示例#8
0
def session(train=False, skip_execution=False, include_patterns=True, splits=1, outline=False, optim="Sgd"):
    """Build the model and run it through ``run_py`` for training or inference.

    Args:
        train: when true, run with a loss, an optimizer and the
            gradient-accumulation options; otherwise run without them.
        skip_execution: forwarded to ``run_py``.
        include_patterns: enable/disable flag passed for both
            "TiedGatherPattern" and "TiedGatherAccumulatePattern".
        splits: forwarded to ``model``.
        outline: value used for the "enableOutlining" option.
        optim: "Lamb" selects Adam in ``LambNoBias`` mode together with
            replicated, sharded off-chip optimizer state; any other value
            selects SGD with momentum.

    Returns:
        Whatever ``run_py`` returns.
    """
    proto, data, x, loss = model(splits=splits)

    patterns = popart.Patterns()
    for pattern_name in ("TiedGatherPattern", "TiedGatherAccumulatePattern"):
        patterns.enablePattern(pattern_name, include_patterns)

    user_options = {
        "enableOutlining": outline,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
        "accumulationAndReplicationReductionType": popart.ReductionType.Mean,
        "meanAccumulationAndReplicationReductionStrategy": popart.MeanReductionStrategy.Running
    }

    # The optimizer (and, for Lamb, the extra options) is built regardless of
    # ``train``; it is only handed to ``run_py`` on the training path.
    if optim != "Lamb":
        sgd_hyperparameters = {
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            # 0 dampening to increase the error of incorrect gradients
            "defaultDampening": (0, True),
            "lossScaling": (20, True)
        }
        optimizer = popart.SGD(sgd_hyperparameters)
        # NOTE(review): `ipus` is assigned but never read in this function --
        # confirm whether it was meant to be passed to run_py.
        ipus = 1
    else:
        lamb_hyperparameters = {
            "defaultLearningRate": (0.1, False),
            "defaultWeightDecay": (0.1, True),
            "defaultBeta1": (0.1, True),
            "defaultBeta2": (0.1, True),
            "lossScaling": (20, True),
        }
        # NoBias to increase the error of incorrect gradients
        optimizer = popart.Adam(lamb_hyperparameters,
                                mode=popart.AdamMode.LambNoBias)
        sharded_off_chip = popart.TensorLocation(
            popart.TensorStorage.OffChip,
            popart.ReplicatedTensorSharding.On)
        user_options["optimizerStateTensorLocationSettings"] = popart.TensorLocationSettings(
            sharded_off_chip, 0, 0)
        user_options["enableReplicatedGraphs"] = True
        user_options["replicatedGraphCount"] = 2
        # NOTE(review): unused, see the note in the SGD branch.
        ipus = 2

    if not train:
        # Inference path: no loss/optimizer and a reduced option set.
        inference_options = {
            "enableOutlining": outline,
            "constantWeights": False
        }
        return run_py(
            proto,
            data=data,
            outputs=x,
            patterns=patterns,
            user_options=inference_options,
            skip_execution=skip_execution)

    return run_py(
        proto,
        data=data,
        outputs=x,
        loss=loss,
        optimizer=optimizer,
        patterns=patterns,
        user_options=user_options,
        skip_execution=skip_execution)
示例#9
0
def bert_optimizer_location_settings(args):
    """Build the optimizer tensor location settings selected by ``args``.

    Args:
        args: object exposing ``optimizer_state_offchip`` (truthy selects
            OffChip storage, otherwise OnChip) and
            ``replicated_tensor_sharding`` (truthy turns replicated tensor
            sharding On, otherwise Off).

    Returns:
        A ``popart.TensorLocationSettings`` wrapping the chosen location.
    """
    storage = (popart.TensorStorage.OffChip
               if args.optimizer_state_offchip else
               popart.TensorStorage.OnChip)
    rts = (popart.ReplicatedTensorSharding.On
           if args.replicated_tensor_sharding else
           popart.ReplicatedTensorSharding.Off)

    location = popart.TensorLocation(storage, rts)
    return popart.TensorLocationSettings(location)
def test_activation_tensor_location_settings():
    """Check activation tensor location settings.

    With a 5-layer model, runs three configurations (no settings, OffChip,
    OnChip) and checks via ``check_ir`` that 'MatMul:0/1__t6' and
    'MatMul:0__t3' are reported on-chip, off-chip and on-chip respectively.
    """
    activations = ('MatMul:0/1__t6', 'MatMul:0__t3')

    # (storage to request, or None for "no settings"; expected to be on-chip?)
    cases = (
        (None, True),
        (popart.TensorStorage.OffChip, False),
        (popart.TensorStorage.OnChip, True),
    )

    for storage, expect_onchip in cases:
        if storage is None:
            settings = None
        else:
            settings = popart.TensorLocationSettings(storage, 0)

        ir = get_ir(num_layers=5,
                    activation_tensor_location_settings=settings)

        on_chip = list(activations) if expect_onchip else []
        off_chip = [] if expect_onchip else list(activations)
        check_ir(ir, check_onchip=on_chip, check_offchip=off_chip)
示例#11
0
def set_phased_options(options, engine_options, model, args):
    """Configure ``options`` and ``engine_options`` for execution phases.

    Args:
        options: session options object, modified in place.
        engine_options: mapping of engine options, modified in place.
        model: provides ``total_execution_phases``.
        args: provides ``batch_serialize``, ``replicated_tensor_sharding``,
            ``tensor_storage_onchip``, ``activation_io_schedule``,
            ``weight_io_schedule``, ``optimizer_schedule``, ``num_io_tiles``
            and ``activations_on_chip``.
    """
    # General phased-execution, outlining and recomputation options.
    options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
    options.enableOutliningCopyCostPruning = False
    options.outlineThreshold = -np.inf
    options.outlineSequenceBreakCost = 100000.0
    options.executionPhaseSettings.phases = model.total_execution_phases

    # Batch serialization: forward-context transform, no concatenation.
    options.batchSerializationSettings.factor = args.batch_serialize
    options.batchSerializationSettings.transformContext = popart.BatchSerializationTransformContext.Fwd
    for concat_flag in ("concatOnVirtualGraphChange",
                        "concatOnExecutionPhaseChange",
                        "concatOnPipelineStageChange"):
        setattr(options.batchSerializationSettings, concat_flag, False)
    options.batchSerializationSettings.batchSchedule = popart.BatchSerializationBatchSchedule.OverlapOnCompute

    options.autoRecomputation = popart.RecomputationType.Standard
    options.explicitRecomputation = True
    options.aliasZeroCopy = True

    # One location (off-chip, IO tiles) used for every tensor class.
    var_location = popart.TensorLocation()
    var_location.storage = popart.TensorStorage.OffChip
    var_location.loadTileSet = popart.TileSet.IO
    var_location.storageTileSet = popart.TileSet.IO
    if args.replicated_tensor_sharding:
        var_location.replicatedTensorSharding = popart.ReplicatedTensorSharding.On
    else:
        var_location.replicatedTensorSharding = popart.ReplicatedTensorSharding.Off

    variable_settings = ("weightTensorLocationSettings",
                         "optimizerStateTensorLocationSettings",
                         "accumulatorTensorLocationSettings")
    for settings_name in variable_settings + (
            "activationTensorLocationSettings", ):
        getattr(options, settings_name).location = var_location

    if args.tensor_storage_onchip:
        # Weights, optimizer state and accumulators only; activations are
        # handled separately at the end of this function.
        for settings_name in variable_settings:
            getattr(options, settings_name
                    ).location.storage = popart.TensorStorage.OnChip

    # IO and optimizer schedules, translated from the command-line values.
    options.executionPhaseSettings.activationIOSchedule = io_schedule(
        args.activation_io_schedule)
    options.executionPhaseSettings.weightIOSchedule = io_schedule(
        args.weight_io_schedule)
    options.executionPhaseSettings.schedule = optimizer_schedule(
        args.optimizer_schedule)

    options.numIOTiles = args.num_io_tiles
    engine_options["target.syncReplicasIndependently"] = "false"

    if args.activations_on_chip:
        # Replaces the activation settings wholesale.
        options.activationTensorLocationSettings = popart.TensorLocationSettings(
            popart.TensorStorage.OnChip, 0)
def test_attention_streamingmemory(tmpdir):
    """Train a small matmul + attention stack under many execution variants.

    Every entry of ``test_variants`` is built, trained and saved by
    ``run_test``. The first variant is treated as ground truth: the anchors
    and the saved ONNX initializers of every other variant must be close to
    those of variant 0.

    Args:
        tmpdir: directory (supporting the ``/`` operator) where each variant's
            model is written as ``streamingmemory_attention_<index>.onnx``.
    """
    # Fixed seed so the generated weights and inputs are reproducible.
    np.random.seed(0XDEAD1337)
    batches_per_step = 5
    batch_size = 8
    hidden_size = 16
    sequence_length = 8
    attention_heads = 4
    # NOTE: true division, so this is a float (4.0), not an int.
    qkv_length = hidden_size / attention_heads

    input_shape = [batch_size * sequence_length, hidden_size]
    mask_shape = [batch_size, 1, 1, sequence_length]

    # The same initial QKV weight data is used for every layer.
    qkv_data = np.random.normal(
        0, 0.02, [hidden_size, hidden_size * 3]).astype(np.float32)

    # One mask per batch in the step: for batch i, sequence positions < i hold
    # 0 and all other positions hold -1000.0.
    r = np.arange(0, sequence_length)
    r = np.reshape(batch_size * [r], mask_shape)
    masks = []
    for i in range(batches_per_step):
        masks.append(np.less(r, i).astype(np.float32))
    mask_data = (1 - np.stack(masks)) * -1000.0

    input_data = np.random.normal(0, 0.02, [batches_per_step] +
                                  input_shape).astype(np.float32)

    def run_test(index, options):
        """Build, train and save the model for one variant.

        Args:
            index: variant number, used in the saved model's file name.
            options: dict describing the variant. Keys read unconditionally:
                "replication", "stages", "numLayers", "phasedExecution",
                "outlining", "explicitRecomputation", "aliasZeroCopy",
                "batchSerialize". Optional keys: "stride", "phaseSchedule",
                "batchSchedule", "batchConcat", "tensorLocationSettings",
                "weightTensorLocationSettings", "ioTiles".

        Returns:
            The anchor arrays after the final run; with replication > 1 they
            are reduced/reshaped so they can be compared to the
            non-replicated variants.
        """
        # NOTE: true division, so this is a float.
        per_replica_batch_size = batch_size / options["replication"]
        # Each replica sees 1/replication of the leading dimension.
        model_input_shape = input_shape[:]
        model_input_shape[0] = int(model_input_shape[0] /
                                   options["replication"])
        model_mask_shape = mask_shape[:]
        model_mask_shape[0] = int(model_mask_shape[0] / options["replication"])

        # Execution phase distance between consecutive layers; an explicit
        # (truthy) "stride" option overrides the default.
        stride = 2 // options["stages"]
        if "stride" in options and options["stride"]:
            stride = options["stride"]

        builder = popart.Builder(opsets={
            "ai.onnx": 9,
            "ai.onnx.ml": 1,
            "ai.graphcore": 1
        })

        mask = builder.addInputTensor(
            popart.TensorInfo("FLOAT", model_mask_shape), "mask")
        x_in = builder.addInputTensor(
            popart.TensorInfo("FLOAT", model_input_shape), "x_in")

        anchors = {}
        x = x_in
        for i in range(options["numLayers"]):
            qkv = builder.addInitializedInputTensor(qkv_data, f"qkv_{i}")
            # Anchor the gradient of every layer's QKV weight.
            anchors[popart.reservedGradientPrefix() +
                    qkv] = popart.AnchorReturnType("All")

            # Phased: layers cycle over the stages; otherwise one virtual
            # graph per layer.
            vgid = (i % options["stages"]) if options["phasedExecution"] else i

            with builder.virtualGraph(vgid), builder.executionPhase(i *
                                                                    stride):
                x = builder.aiOnnx.matmul([x, qkv])
                x = attention_onnx(builder, x, mask, per_replica_batch_size,
                                   sequence_length, hidden_size,
                                   attention_heads, qkv_length)

        # The loss is placed with the last layer (same virtual graph and
        # execution phase).
        vgid = ((options["numLayers"] - 1) % options["stages"]
                ) if options["phasedExecution"] else options["numLayers"] - 1

        with builder.virtualGraph(vgid), builder.executionPhase(
            (options["numLayers"] - 1) * stride):
            l1 = builder.aiGraphcore.l1loss([x], 0.2, popart.ReductionType.Sum)

        proto = builder.getModelProto()

        # Remember which anchors are gradients before adding the output.
        gradient_keys = list(anchors.keys())
        anchors[x] = popart.AnchorReturnType("All")

        dataFlow = popart.DataFlow(batches_per_step, anchors)

        opts = popart.SessionOptions()
        opts.executionPhaseSettings.stages = options["stages"]

        opts.executionPhaseSettings.phases = (
            options["numLayers"] * stride if options["phasedExecution"] else 0)
        opts.enableOutlining = options["outlining"]

        if "phaseSchedule" in options:
            opts.executionPhaseSettings.schedule = options["phaseSchedule"]

        # Phased execution currently does its own recompute annotations
        opts.autoRecomputation = (popart.RecomputationType.Standard
                                  if options["explicitRecomputation"] else
                                  popart.RecomputationType.NoRecompute)

        opts.outlineThreshold = -np.inf
        opts.enableOutliningCopyCostPruning = False
        opts.virtualGraphMode = (popart.VirtualGraphMode.ExecutionPhases
                                 if options["phasedExecution"] else
                                 popart.VirtualGraphMode.Manual)
        opts.explicitRecomputation = options["explicitRecomputation"]
        opts.aliasZeroCopy = options["aliasZeroCopy"]

        opts.batchSerializationSettings.factor = options["batchSerialize"]
        if "batchSchedule" in options:
            opts.batchSerializationSettings.batchSchedule = options[
                "batchSchedule"]
        if "batchConcat" in options:
            # Do not concatenate the batch across phases and virtual graphs
            # (causes more, smaller transfers but allows for individual sub-batch
            # elements to be transferred)
            opts.batchSerializationSettings.concatOnVirtualGraphChange = options[
                "batchConcat"]
            opts.batchSerializationSettings.concatOnExecutionPhaseChange = options[
                "batchConcat"]
            # Wait with loading activations until they are required
            opts.executionPhaseSettings.activationIOSchedule = popart.ExecutionPhaseIOSchedule.OnDemand

        # A generic location setting applies to all four tensor classes ...
        if "tensorLocationSettings" in options and options[
                "tensorLocationSettings"]:
            opts.activationTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.weightTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.optimizerStateTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.accumulatorTensorLocationSettings = options[
                "tensorLocationSettings"]
        # ... and a weight-specific setting then takes precedence for weights.
        if "weightTensorLocationSettings" in options and options[
                "weightTensorLocationSettings"]:
            opts.weightTensorLocationSettings = options[
                "weightTensorLocationSettings"]
        if options["replication"] > 1:
            opts.replicatedGraphCount = options["replication"]
            opts.enableReplicatedGraphs = True
        if "ioTiles" in options:
            opts.numIOTiles = options["ioTiles"]

        pat = popart.Patterns(popart.PatternsLevel.Default)
        # Device size: one IPU per stage when phased, otherwise one per layer
        # plus one; multiplied by the replication factor.
        if options["phasedExecution"]:
            numIpus = options["stages"]
        else:
            numIpus = options["numLayers"] + 1
        if options["replication"] > 1:
            numIpus = numIpus * options["replication"]
        device = tu.create_test_device(numIpus,
                                       pattern=popart.SyncPattern.Full)

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=opts,
                                         loss=l1,
                                         optimizer=popart.ConstSGD(0.1),
                                         patterns=pat,
                                         deviceInfo=device)

        session.prepareDevice()

        session.weightsFromHost()

        # From here on `anchors` holds the anchor arrays, not the
        # AnchorReturnType map built above.
        anchors = session.initAnchorArrays()
        for k, v in anchors.items():
            print(f"anchor_before {k}={v.shape}")

        inputs = {x_in: input_data, mask: mask_data}
        stepio = popart.PyStepIO(inputs, anchors)

        # Train for 10 steps on the same data.
        for __ in range(10):
            session.run(stepio)

        session.modelToHost(
            str(tmpdir / f"streamingmemory_attention_{index}.onnx"))

        if options["replication"] > 1:
            for k, v in anchors.items():
                if k in gradient_keys:
                    # The gradient anchors will have an additional replication axis.
                    anchors[k] = np.sum(v, 1 if batches_per_step > 1 else 0)
                else:
                    # Output tensor needs reshaping.
                    anchors[k] = np.reshape(anchors[k], [
                        batches_per_step, sequence_length * batch_size,
                        hidden_size
                    ])
            for k, v in anchors.items():
                print(f"anchor_after {k}={v.shape}")

        return anchors

    test_results = []

    # AliasZeroCopy only supported with explicit recomputation, but not with
    # standard recomputation
    # Phased execution only supported with explicit recomputation, but not with
    # standard recomputation

    test_variants = []

    # Off-chip storage, loaded and stored via compute tiles, RTS off.
    defaultOffChip = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.Compute,
            storageTileSet=popart.TileSet.Compute,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    # Off-chip storage, loaded and stored via IO tiles, RTS off.
    ioOffChip = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.IO,
            storageTileSet=popart.TileSet.IO,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    # Ground truth variant
    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": False,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
    })

    # Non-phased, batch serialized by a factor of 4.
    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": False,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 4,
        "replication": 1,
    })

    # Phased execution with off-chip tensors; the next four variants enable
    # outlining, explicit recomputation and aliasZeroCopy one after another.
    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": True,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": True,
        "aliasZeroCopy": True,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    # Test batch serialized single device per replica execution, where all
    # streaming memory traffic goes through IO tiles, and activations are
    # stored and loaded one-by-one
    test_variants.append({
        "stages": 1,
        "stride": 4,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": True,
        "aliasZeroCopy": True,
        "batchSerialize": 4,
        "batchConcat": False,
        "replication": 2,
        "tensorLocationSettings": ioOffChip,
        "ioTiles": 192
    })

    # Test batch serialized single device per replica execution, where all
    # streaming memory traffic goes through IO tiles, and loading of the next
    # phase happens before storing the current phase
    test_variants.append({
        "stages": 1,
        "stride": 1,
        "numLayers": 3,
        "phasedExecution": True,
        "phaseSchedule": popart.ExecutionPhaseSchedule.BatchClusteredIO,
        "outlining": False,
        "explicitRecomputation": True,
        "aliasZeroCopy": True,
        "batchSerialize": 4,
        "batchConcat": True,
        "replication": 2,
        "tensorLocationSettings": ioOffChip,
        "ioTiles": 192
    })

    # Test a variety of batch serialisation schedules.
    for batchSchedule in [
            popart.BatchSerializationBatchSchedule.Scheduler,
            popart.BatchSerializationBatchSchedule.Isomorphic,
            popart.BatchSerializationBatchSchedule.OverlapOnIo,
            popart.BatchSerializationBatchSchedule.OverlapOnCompute,
    ]:

        test_variants.append({
            "stages": 1,
            "stride": 4,
            "numLayers": 3,
            "phasedExecution": True,
            "outlining": False,
            "explicitRecomputation": True,
            "aliasZeroCopy": True,
            "batchSerialize": 4,
            "batchSchedule": batchSchedule,
            "batchConcat": False,
            "replication": 2,
            "tensorLocationSettings": ioOffChip,
            "ioTiles": 192
        })

    # Test replicated tensor sharding + on chip (no outlining).
    test_variants.append({
        "stages":
        2,
        "numLayers":
        3,
        "phasedExecution":
        True,
        "outlining":
        False,
        "explicitRecomputation":
        False,
        "aliasZeroCopy":
        False,
        "batchSerialize":
        1,
        "replication":
        2,
        "tensorLocationSettings":
        defaultOffChip,
        "weightTensorLocationSettings":
        popart.TensorLocationSettings(location=popart.TensorLocation(
            storage=popart.TensorStorage.OnChip,
            loadTileSet=popart.TileSet.Compute,
            storageTileSet=popart.TileSet.Compute,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.On),
                                      minElementsForOffChip=0,
                                      minElementsForReplicatedTensorSharding=2)
    })

    # Test replicated tensor sharding + off chip (no outlining).
    test_variants.append({
        "stages":
        2,
        "numLayers":
        3,
        "phasedExecution":
        True,
        "outlining":
        False,
        "explicitRecomputation":
        False,
        "aliasZeroCopy":
        False,
        "batchSerialize":
        1,
        "replication":
        2,
        "tensorLocationSettings":
        defaultOffChip,
        "weightTensorLocationSettings":
        popart.TensorLocationSettings(location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.Compute,
            storageTileSet=popart.TileSet.Compute,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.On),
                                      minElementsForOffChip=0,
                                      minElementsForReplicatedTensorSharding=2)
    })

    # Run every variant; results are stored in variant order so that index 0
    # is the ground truth.
    index = 0
    for test_option in test_variants:
        print(f"Running {index}: {test_option}")
        test_results.append(run_test(index, test_option))
        index += 1

    gt_onnx = onnx.load(str(tmpdir / f"streamingmemory_attention_0.onnx"))

    # Compare every other variant against the ground truth.
    for i in range(1, index):
        print(f"Testing run {i}: {test_variants[i]}")
        # Anchors (QKV weight gradients and the model output) must match.
        for key in test_results[0].keys():
            assert np.all(
                np.isclose(test_results[0][key],
                           test_results[i][key],
                           equal_nan=False))

        # Saved initializers must match too; they are paired by position,
        # not by name.
        val_onnx = onnx.load(
            str(tmpdir / f"streamingmemory_attention_{i}.onnx"))
        for j in range(len(gt_onnx.graph.initializer)):
            print(f"Checking initializer {j}")
            gt = gt_onnx.graph.initializer[j]
            gt = numpy_helper.to_array(gt)
            val = val_onnx.graph.initializer[j]
            val = numpy_helper.to_array(val)
            assert np.allclose(gt, val, equal_nan=False)
示例#13
0
    for i in range(len(lhs_model.graph.initializer)):
        lhs = lhs_model.graph.initializer[i]
        for j in range(len(rhs_model.graph.initializer)):
            rhs = rhs_model.graph.initializer[j]
            if (rhs.name == lhs.name):
                print(f'Checking initializer {i} ({lhs.name} - {rhs.name})')
                lhsa = numpy_helper.to_array(lhs)
                rhsa = numpy_helper.to_array(rhs)
                assert np.allclose(lhsa, rhsa, rtol=1.e-4, atol=1.e-5)


def _compute_tile_location_settings(storage):
    """Return location settings for ``storage`` using compute tiles.

    The location loads and stores via ``popart.TileSet.Compute`` with
    replicated tensor sharding off; the settings use
    ``minElementsForOffChip=0`` and
    ``minElementsForReplicatedTensorSharding=2``.
    """
    location = popart.TensorLocation(
        storage=storage,
        loadTileSet=popart.TileSet.Compute,
        storageTileSet=popart.TileSet.Compute,
        replicatedTensorSharding=popart.ReplicatedTensorSharding.Off)
    return popart.TensorLocationSettings(
        location=location,
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)


# Standard OnChip settings
onChipLocation = _compute_tile_location_settings(popart.TensorStorage.OnChip)

# Standard OffChip settings
offChipLocation = _compute_tile_location_settings(popart.TensorStorage.OffChip)

# Replicated tensor sharding OffChip settings
def run_model(tmpdir,
              model_file_name,
              schedule=popart.ExecutionPhaseSchedule.Interleaving,
              enable_outlining=False,
              stride=1,
              num_layers=5,
              dsize=128,
              batch_size=4,
              batch_serialize=1,
              batch_schedule=popart.BatchSerializationBatchSchedule.Isomorphic,
              num_iterations=5,
              num_replicas=2,
              optimizer=popart.Adam({"defaultLearningRate": (0.1, False)})):
    """Train a chain of matmul layers with phased execution; return cycles.

    For every layer index ``i`` three matmuls (each with a fresh random
    ``dsize x dsize`` weight named ``W{i}``) are added in execution phase
    ``i * stride`` on virtual graph 0. An L1 loss wrapped in an identity loss
    is attached in the phase of the last layer. The model is trained for
    ``num_iterations`` steps on random input and then written to
    ``tmpdir / model_file_name``.

    Args:
        tmpdir: Directory object supporting the ``/`` operator.
        model_file_name: File name the trained model is saved under.
        schedule: Execution phase schedule.
        enable_outlining: Enables both outlining and alias zero copy.
        stride: Distance between the execution phases of successive layers.
        num_layers: Number of layer groups (three matmuls each).
        dsize: Edge length of the square weight and input matrices.
        batch_size: Leading dimension of the input tensor.
        batch_serialize: Batch serialization factor; only applied when > 1.
        batch_schedule: Batch schedule used when batch serialization is on.
        num_iterations: Number of ``session.run`` calls.
        num_replicas: Replicated graph count; replication is enabled when > 1.
        optimizer: Optimizer for the training session. The default instance
            is created once, when the function is defined.

    Returns:
        The value of ``session.getCycleCount()`` after the last iteration.
    """
    # Fixed seed: weights and input data are drawn from the global numpy RNG.
    np.random.seed(52125)

    builder = popart.Builder()
    ip = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [batch_size, dsize, dsize]))

    def add_layer(index, in_id):
        # One matmul against a newly initialised random weight.
        weight_data = np.random.rand(dsize, dsize).astype(np.float32)
        w = builder.addInitializedInputTensor(weight_data, f"W{index}")
        return builder.aiOnnx.matmul([in_id, w])

    out = ip
    l1 = ""
    final_loss = ""
    vgid = 0
    last_layer = num_layers - 1

    for layer in range(num_layers):
        phase = layer * stride
        with builder.executionPhase(phase), builder.virtualGraph(vgid):
            for _ in range(3):
                out = add_layer(layer, out)

        if layer != last_layer:
            continue

        # The loss lives in the same phase as the final layer group.
        with builder.executionPhase(phase), builder.virtualGraph(vgid):
            l1 = builder.aiGraphcore.l1loss([out], 0.1,
                                            popart.ReductionType.Sum)
            final_loss = builder.aiGraphcore.identityloss([l1])

    builder.addOutputTensor(out)

    # No anchors are requested, so the anchor map stays empty.
    anchor_ids = []
    anchor_map = {
        anchor_id: popart.AnchorReturnType("All")
        for anchor_id in anchor_ids
    }

    num_ipus = 1

    session_opts = popart.SessionOptions()

    # Hardware cycle counter, needed for getCycleCount() below
    session_opts.instrumentWithHardwareCycleCounter = True

    # Outlining (alias zero copy follows the outlining switch)
    session_opts.enableOutlining = enable_outlining
    session_opts.enableOutliningCopyCostPruning = False
    session_opts.outlineThreshold = -np.inf
    session_opts.aliasZeroCopy = enable_outlining

    # Graph replication
    session_opts.replicatedGraphCount = num_replicas
    session_opts.enableReplicatedGraphs = bool(num_replicas > 1)

    # Tiles reserved for IO
    session_opts.numIOTiles = 192

    # Execution phases
    phase_settings = session_opts.executionPhaseSettings
    phase_settings.phases = num_layers * stride
    phase_settings.stages = 1
    phase_settings.schedule = schedule
    session_opts.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases

    # Explicit standard recomputation
    session_opts.autoRecomputation = popart.RecomputationType.Standard
    session_opts.explicitRecomputation = True

    # Batch serialization is only configured for factors above one
    if batch_serialize > 1:
        serialization = session_opts.batchSerializationSettings
        serialization.factor = batch_serialize
        serialization.concatOnVirtualGraphChange = False
        serialization.concatOnExecutionPhaseChange = False
        serialization.concatOnPipelineStageChange = False
        serialization.batchSchedule = batch_schedule
        # Execution phase setting that goes with batch serialization
        phase_settings.activationIOSchedule = popart.ExecutionPhaseIOSchedule.OnDemand

    def io_tile_offchip_settings(sharding):
        # Off-chip storage, loaded and stored through the IO tiles.
        return popart.TensorLocationSettings(
            location=popart.TensorLocation(
                storage=popart.TensorStorage.OffChip,
                loadTileSet=popart.TileSet.IO,
                storageTileSet=popart.TileSet.IO,
                replicatedTensorSharding=sharding),
            minElementsForOffChip=0,
            minElementsForReplicatedTensorSharding=2)

    # Streaming memory: activations unsharded, weights and optimizer state
    # with replicated tensor sharding.
    session_opts.activationTensorLocationSettings = io_tile_offchip_settings(
        popart.ReplicatedTensorSharding.Off)
    sharded_settings = io_tile_offchip_settings(
        popart.ReplicatedTensorSharding.On)
    session_opts.weightTensorLocationSettings = sharded_settings
    session_opts.optimizerStateTensorLocationSettings = sharded_settings

    proto = builder.getModelProto()

    with tu.create_test_device(num_replicas * num_ipus,
                               pattern=popart.SyncPattern.Full) as device:

        session = popart.TrainingSession(
            fnModel=proto,
            dataFlow=popart.DataFlow(1, anchor_map),
            optimizer=optimizer,
            loss=final_loss,
            patterns=popart.Patterns(popart.PatternsLevel.All),
            userOptions=session_opts,
            deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()

        for _ in range(num_iterations):
            # Fresh random input for every replica on each step.
            input_shape = (num_replicas, batch_size, dsize, dsize)
            ip_data = np.random.rand(*input_shape).astype(np.float32)
            session.run(popart.PyStepIO({ip: ip_data}, anchors))

        cycles = session.getCycleCount()

        print("anchors:")
        print(anchors)
        session.modelToHost(str(tmpdir / model_file_name))

        return cycles
示例#15
0
def bert_session_options(args, model):
    """Build the ``popart.SessionOptions`` for a run from parsed arguments.

    Args:
        args: Object exposing the run configuration as attributes (for
            example ``execution_mode``, ``replication_factor``,
            ``gradient_accumulation_factor``, ``engine_cache``).
        model: Only read in "PHASED" execution mode, where
            ``model.total_execution_phases`` sets the number of phases.

    Returns:
        The populated session options. Engine options gathered on the way
        are stored in its ``engineOptions`` attribute.
    """
    options = popart.SessionOptions()
    engine_opts = {}

    # Settings that apply to every execution mode.
    options.virtualGraphMode = popart.VirtualGraphMode.Manual
    options.enableFloatingPointChecks = args.floating_point_exceptions
    options.enableStochasticRounding = args.stochastic_rounding
    options.enableGroupedMatmuls = False
    options.enablePrefetchDatastreams = not args.minimum_latency_inference
    options.enableOutlining = not args.no_outlining

    # The same partials type is used for matmuls and convolutions.
    if args.enable_half_partials:
        partials = "half"
    else:
        partials = "float"
    options.partialsTypeMatMuls = partials
    options.convolutionOptions = {'partialsType': partials}

    replicated = args.replication_factor > 1
    if replicated:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = args.replication_factor
        engine_opts["target.syncReplicasIndependently"] = "true"

    # A higher outlineThreshold stops cheap Ops such as add or reshapeInplace
    # from being turned into subgraphs, so only Ops with a high subgraph
    # value (matmul, normalisation) are reused.
    options.outlineThreshold = 10.0

    mode = args.execution_mode
    if mode == "PIPELINE":
        options.enablePipelining = True
        options.autoRecomputation = popart.RecomputationType.Pipeline
    elif mode == "PHASED":
        options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        options.enableOutliningCopyCostPruning = False
        options.outlineThreshold = -np.inf
        options.executionPhaseSettings.phases = model.total_execution_phases
        options.batchSerializationSettings.factor = args.batch_serialize
        options.autoRecomputation = popart.RecomputationType.Standard
        options.explicitRecomputation = True
        options.aliasZeroCopy = True

        options.activationTensorLocationSettings.location.storage = popart.TensorStorage.OffChip

        # One off-chip, IO-tile location shared by weights, optimizer state
        # and accumulators.
        var_location = popart.TensorLocation()
        var_location.storage = popart.TensorStorage.OffChip
        var_location.loadTileSet = popart.TileSet.IO
        var_location.storageTileSet = popart.TileSet.IO
        if args.replicated_weight_sharding:
            var_location.replicatedTensorSharding = popart.ReplicatedTensorSharding.On
        else:
            var_location.replicatedTensorSharding = popart.ReplicatedTensorSharding.Off

        for location_settings in (
                options.weightTensorLocationSettings,
                options.optimizerStateTensorLocationSettings,
                options.accumulatorTensorLocationSettings):
            location_settings.location = var_location

        options.numIOTiles = args.num_io_tiles
        options.timeLimitScheduler = -1
        options.swapLimitScheduler = -1
        engine_opts["target.syncReplicasIndependently"] = "false"
        if args.activations_on_chip:
            # Replaces the off-chip activation settings chosen above.
            options.activationTensorLocationSettings = popart.TensorLocationSettings(
                popart.TensorStorage.OnChip, 0)

    if args.optimizer_state_offchip:
        options.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip

    if args.gradient_accumulation_factor > 1:
        options.enableGradientAccumulation = True
        options.accumulationFactor = args.gradient_accumulation_factor
        if args.gradient_reduction_type == "Mean":
            options.accumulationReductionType = popart.ReductionType.Mean

        # Without replication, SyncPattern.SinglePipeline gives better
        # overlap than this option, so it is only set when replicated.
        if args.optimizer_state_offchip and replicated:
            options.accumulateOuterFragmentSettings = popart.AccumulateOuterFragmentSettings(
                popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized,
                [0])

    # Engine caching: enabled by a cache path, always disabled by profiling.
    if args.engine_cache is not None:
        options.enableEngineCaching = True
        options.cachePath = args.engine_cache
    if args.profile:
        options.enableEngineCaching = False

    options.instrumentWithHardwareCycleCounter = args.report_hw_cycle_count
    options.disableGradAccumulationTensorStreams = True

    # -1 means "no limit"; any other value is forwarded as an engine option.
    if args.max_copy_merge_size != -1:
        logger.warning(
            f"Copy merge size limit set to {args.max_copy_merge_size}")
        engine_opts["opt.maxCopyMergeSize"] = str(args.max_copy_merge_size)
    else:
        logger.debug("No copy merge size limit applied")

    # {"fullyConnectedPass", "TRAINING_BWD"} on some matmuls leads to large
    # transposes before operations, hence the switch to turn the pass off.
    if args.disable_fully_connected_pass:
        squad_384 = args.task == "SQUAD" and args.sequence_length == 384
        if squad_384:
            logger.warning(
                "Fully connected pass has been disabled. This may cause SQuAD 384 12-layer to go OOM."
            )
        options.enableFullyConnectedPass = False

    cached_constant_inference = (args.inference
                                 and args.engine_cache is not None
                                 and not args.variable_weights_inference)
    if cached_constant_inference:
        logger.warning(
            "Using engine cache with constant weights. Checkpoint weights will be ignored. "
            "Use the `--variable-weights-inference` flag if checkpoint weights should be used."
        )

    if args.variable_weights_inference:
        options.constantWeights = False

    if args.group_host_syncs:
        options.groupHostSync = True

    exchange_target = args.internal_exchange_optimisation_target
    if exchange_target is not None:
        engine_opts["opt.internalExchangeOptimisationTarget"] = str(
            exchange_target)

    options.engineOptions = engine_opts

    # Synthetic data mode, when requested.
    if args.synthetic_data:
        options.syntheticDataMode = (
            popart.SyntheticDataMode.Zeros
            if args.synthetic_data_initializer == "zeros" else
            popart.SyntheticDataMode.RandomNormal)
        logger.info(
            f"Running with Synthetic Data Type '{options.syntheticDataMode}'")
    return options
示例#16
0
def session(train=False,
            skip_execution=False,
            include_patterns=True,
            splits=1,
            outline=False,
            optim="Sgd"):
    """Build the test model and run it through ``run_py``.

    Args:
        train: If true, run with a loss and optimizer and with gradient
            accumulation (factor 2); otherwise run without them and with
            ``constantWeights`` disabled.
        skip_execution: Forwarded unchanged to ``run_py``.
        include_patterns: Enable ``TiedGatherPattern`` and
            ``TiedGatherAccumulatePattern``.
        splits: Forwarded to ``model``.
        outline: Value of the ``enableOutlining`` user option.
        optim: ``"Lamb"`` selects Adam in LambNoBias mode with off-chip
            optimizer state; any other value selects SGD with momentum.

    Returns:
        Whatever ``run_py`` returns.
    """
    proto, data, x, loss = model(splits=splits)

    patterns = popart.Patterns()
    if include_patterns:
        # Required
        for pattern_name in ("TiedGatherPattern",
                             "TiedGatherAccumulatePattern"):
            patterns.enablePattern(pattern_name, True)

    training_options = {
        "enableOutlining": outline,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
    }

    if optim == "Lamb":
        # NoBias to increase the error of incorrect gradients
        lamb_params = {
            "defaultLearningRate": (0.1, True),
            "lossScaling": (20, False),
        }
        optimizer = popart.Adam(lamb_params, mode=popart.AdamMode.LambNoBias)
        training_options["optimizerStateTensorLocationSettings"] = (
            popart.TensorLocationSettings(popart.TensorStorage.OffChip, 0))
    else:
        sgd_params = {
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            # 0 dampening to increase the error of incorrect gradients
            "defaultDampening": (0, True),
            "lossScaling": (20, False)
        }
        optimizer = popart.SGD(sgd_params)

    # Keyword arguments common to the training and inference calls.
    shared_kwargs = dict(data=data,
                         outputs=x,
                         patterns=patterns,
                         skip_execution=skip_execution)

    if not train:
        inference_options = {
            "enableOutlining": outline,
            "constantWeights": False
        }
        return run_py(proto, user_options=inference_options, **shared_kwargs)

    return run_py(proto,
                  loss=loss,
                  optimizer=optimizer,
                  user_options=training_options,
                  **shared_kwargs)