def test_accumulator_tensor_location_settings_plus_override():
    # Check accumulator tensor location settings work.
    optimizer_with_state = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.0, False),
        "defaultWeightDecay": (0.0, False),
        "defaultDampening": (0.0, True)
    })

    ir = get_ir(
        accumulator_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OffChip, 0),
        tensor_location_setting_override={
            'Accl___W1': popart.TensorLocation(popart.TensorStorage.OnChip)
        },
        optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W1'],
             check_offchip=['Accl___W2', 'Accl___W0'])

    ir = get_ir(
        accumulator_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OnChip, 0),
        tensor_location_setting_override={
            'Accl___W1': popart.TensorLocation(popart.TensorStorage.OffChip)
        },
        optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W2', 'Accl___W0'],
             check_offchip=['Accl___W1'])
def test_activation_tensor_location_settings_plus_override():
    # Check activation tensor location settings work.
    ir = get_ir(
        num_layers=5,
        activation_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OffChip, 0),
        tensor_location_setting_override={
            'MatMul:0/1__t6':
            popart.TensorLocation(popart.TensorStorage.OnChip)
        })
    check_ir(ir,
             check_onchip=['MatMul:0/1__t6'],
             check_offchip=['MatMul:0__t3'])

    ir = get_ir(
        num_layers=5,
        activation_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OnChip, 0),
        tensor_location_setting_override={
            'MatMul:0/1__t6':
            popart.TensorLocation(popart.TensorStorage.OffChip)
        })
    check_ir(ir,
             check_onchip=['MatMul:0__t3'],
             check_offchip=['MatMul:0/1__t6'])
def test_weight_tensor_location_settings_plus_override():
    # Check weight tensor location settings work.
    ir = get_ir(
        weight_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OffChip, 0),
        tensor_location_setting_override={
            'W2': popart.TensorLocation(popart.TensorStorage.OnChip)
        })
    check_ir(ir, check_onchip=['W2'], check_offchip=['W0', 'W1'])

    ir = get_ir(
        weight_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OnChip, 0),
        tensor_location_setting_override={
            'W1': popart.TensorLocation(popart.TensorStorage.OffChip)
        })
    check_ir(ir, check_onchip=['W0', 'W2'], check_offchip=['W1'])
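# A minimal sketch of how the two knobs exercised in the tests above combine
# on a real popart.SessionOptions object (the tensor name 'W1' is a
# placeholder): the per-category TensorLocationSettings sets the default
# location, and tensorLocationSettingsOverride pins individual tensors by
# name.
opts = popart.SessionOptions()
opts.weightTensorLocationSettings = popart.TensorLocationSettings(
    popart.TensorStorage.OffChip, 0)  # default: all weights off-chip
opts.tensorLocationSettingsOverride = {
    'W1': popart.TensorLocation(popart.TensorStorage.OnChip)  # except 'W1'
}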
def session(train=False,
            skip_execution=False,
            include_patterns=True,
            splits=1,
            outline=False,
            optim="Sgd"):
    proto, data, x, loss = model(splits=splits)

    patterns = popart.Patterns()
    patterns.enablePattern("TiedGatherPattern", include_patterns)
    patterns.enablePattern("TiedGatherAccumulatePattern", include_patterns)

    user_options = {
        "enableOutlining": outline,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
        "accumulationAndReplicationReductionType": popart.ReductionType.Mean,
        "meanAccumulationAndReplicationReductionStrategy":
        popart.MeanReductionStrategy.Running
    }

    if optim == "Lamb":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.1, False),
                "defaultWeightDecay": (0.1, True),
                "defaultBeta1": (0.1, True),
                "defaultBeta2": (0.1, True),
                "lossScaling": (20, True),
            },
            # NoBias to increase the error of incorrect gradients
            mode=popart.AdamMode.LambNoBias)
        user_options[
            "optimizerStateTensorLocationSettings"] = popart.TensorLocationSettings(
                popart.TensorLocation(popart.TensorStorage.OffChip,
                                      popart.ReplicatedTensorSharding.On),
                0, 0)
        user_options["enableReplicatedGraphs"] = True
        user_options["replicatedGraphCount"] = 2
        ipus = 2
    else:
        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            # 0 dampening to increase the error of incorrect gradients
            "defaultDampening": (0, True),
            "lossScaling": (20, True)
        })
        ipus = 1

    if train:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      loss=loss,
                      optimizer=optimizer,
                      patterns=patterns,
                      user_options=user_options,
                      skip_execution=skip_execution)
    else:
        return run_py(proto,
                      data=data,
                      outputs=x,
                      patterns=patterns,
                      user_options={
                          "enableOutlining": outline,
                          "constantWeights": False
                      },
                      skip_execution=skip_execution)
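# Illustrative use of the session() helper above (hedged: with
# skip_execution=True, run_py detaches from the device and returns the
# un-executed popart session, so the compiled IR can be inspected without
# running on hardware).
sess = session(train=True, skip_execution=True, optim="Lamb")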
def bert_optimizer_location_settings(args):
    storage = popart.TensorStorage.OnChip
    if args.optimizer_state_offchip:
        storage = popart.TensorStorage.OffChip
    rts = popart.ReplicatedTensorSharding.Off
    if args.replicated_tensor_sharding:
        rts = popart.ReplicatedTensorSharding.On

    return popart.TensorLocationSettings(popart.TensorLocation(storage, rts))
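# Hedged usage sketch: the settings object returned above is assumed to be
# assigned to optimizerStateTensorLocationSettings, matching how these
# settings are applied elsewhere in this codebase; `args` stands for the
# parsed command-line namespace.
options = popart.SessionOptions()
options.optimizerStateTensorLocationSettings = \
    bert_optimizer_location_settings(args)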
def set_ema_weights_offchip(session_options, ema_weight_names):
    """ Sets the tensor locations of EMA weights to be off-chip """
    tensor_location_override_dict = dict()
    for nname in ema_weight_names:
        for _, ema_wname in ema_weight_names[nname]:
            tensor_location_override_dict[ema_wname] = popart.TensorLocation(
                popart.TensorStorage.OffChip)
            logger.info("Setting tensor-location for {} to be OffChip".format(
                ema_wname))
    session_options.tensorLocationSettingsOverride = \
        tensor_location_override_dict
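# Illustrative call (the names are hypothetical): ema_weight_names maps a
# network name to (weight, ema_weight) tensor-name pairs, and the override
# dict built above only touches the EMA copies.
opts = popart.SessionOptions()
set_ema_weights_offchip(opts, {"generator": [("W0", "EMA___W0")]})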
def set_phased_options(options, engine_options, model, args):
    options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
    options.enableOutliningCopyCostPruning = False
    options.outlineThreshold = -np.inf
    options.outlineSequenceBreakCost = 100000.0
    options.executionPhaseSettings.phases = model.total_execution_phases

    options.batchSerializationSettings.factor = args.batch_serialize
    options.batchSerializationSettings.transformContext = \
        popart.BatchSerializationTransformContext.Fwd
    options.batchSerializationSettings.concatOnVirtualGraphChange = False
    options.batchSerializationSettings.concatOnExecutionPhaseChange = False
    options.batchSerializationSettings.concatOnPipelineStageChange = False
    options.batchSerializationSettings.batchSchedule = \
        popart.BatchSerializationBatchSchedule.OverlapOnCompute

    options.autoRecomputation = popart.RecomputationType.Standard
    options.explicitRecomputation = True
    options.aliasZeroCopy = True

    varLocation = popart.TensorLocation()
    varLocation.storage = popart.TensorStorage.OffChip
    varLocation.loadTileSet = popart.TileSet.IO
    varLocation.storageTileSet = popart.TileSet.IO
    varLocation.replicatedTensorSharding = (
        popart.ReplicatedTensorSharding.On
        if args.replicated_tensor_sharding else
        popart.ReplicatedTensorSharding.Off)

    options.weightTensorLocationSettings.location = varLocation
    options.optimizerStateTensorLocationSettings.location = varLocation
    options.accumulatorTensorLocationSettings.location = varLocation
    options.activationTensorLocationSettings.location = varLocation

    if args.tensor_storage_onchip:
        options.weightTensorLocationSettings.location.storage = \
            popart.TensorStorage.OnChip
        options.optimizerStateTensorLocationSettings.location.storage = \
            popart.TensorStorage.OnChip
        options.accumulatorTensorLocationSettings.location.storage = \
            popart.TensorStorage.OnChip

    options.executionPhaseSettings.activationIOSchedule = io_schedule(
        args.activation_io_schedule)
    options.executionPhaseSettings.weightIOSchedule = io_schedule(
        args.weight_io_schedule)
    options.executionPhaseSettings.schedule = optimizer_schedule(
        args.optimizer_schedule)

    options.numIOTiles = args.num_io_tiles
    engine_options["target.syncReplicasIndependently"] = "false"

    if args.activations_on_chip:
        options.activationTensorLocationSettings = \
            popart.TensorLocationSettings(popart.TensorStorage.OnChip, 0)
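# The io_schedule and optimizer_schedule helpers called above are not shown
# in this snippet. A plausible minimal sketch (hypothetical, inferred from the
# string-valued CLI arguments they receive) maps names onto the popart enums
# used elsewhere in this file:
def io_schedule(name):
    return {
        "Preload": popart.ExecutionPhaseIOSchedule.Preload,
        "OnDemand": popart.ExecutionPhaseIOSchedule.OnDemand,
    }[name]


def optimizer_schedule(name):
    return {
        "Interleaving": popart.ExecutionPhaseSchedule.Interleaving,
        "Batch": popart.ExecutionPhaseSchedule.Batch,
        "BatchClusteredIO": popart.ExecutionPhaseSchedule.BatchClusteredIO,
    }[name]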
def test_attention_streamingmemory(tmpdir):
    np.random.seed(0XDEAD1337)
    batches_per_step = 5
    batch_size = 8
    hidden_size = 16
    sequence_length = 8
    attention_heads = 4
    qkv_length = hidden_size / attention_heads

    input_shape = [batch_size * sequence_length, hidden_size]
    mask_shape = [batch_size, 1, 1, sequence_length]

    qkv_data = np.random.normal(
        0, 0.02, [hidden_size, hidden_size * 3]).astype(np.float32)

    r = np.arange(0, sequence_length)
    r = np.reshape(batch_size * [r], mask_shape)
    masks = []
    for i in range(batches_per_step):
        masks.append(np.less(r, i).astype(np.float32))
    mask_data = (1 - np.stack(masks)) * -1000.0

    input_data = np.random.normal(
        0, 0.02, [batches_per_step] + input_shape).astype(np.float32)

    def run_test(index, options):
        per_replica_batch_size = batch_size / options["replication"]
        model_input_shape = input_shape[:]
        model_input_shape[0] = int(model_input_shape[0] /
                                   options["replication"])
        model_mask_shape = mask_shape[:]
        model_mask_shape[0] = int(model_mask_shape[0] /
                                  options["replication"])

        stride = 2 // options["stages"]
        if "stride" in options and options["stride"]:
            stride = options["stride"]

        builder = popart.Builder(opsets={
            "ai.onnx": 9,
            "ai.onnx.ml": 1,
            "ai.graphcore": 1
        })

        mask = builder.addInputTensor(
            popart.TensorInfo("FLOAT", model_mask_shape), "mask")
        x_in = builder.addInputTensor(
            popart.TensorInfo("FLOAT", model_input_shape), "x_in")

        anchors = {}
        x = x_in
        for i in range(options["numLayers"]):
            qkv = builder.addInitializedInputTensor(qkv_data, f"qkv_{i}")
            anchors[popart.reservedGradientPrefix() +
                    qkv] = popart.AnchorReturnType("All")

            vgid = (i % options["stages"]) if options["phasedExecution"] else i

            with builder.virtualGraph(vgid), \
                    builder.executionPhase(i * stride):
                x = builder.aiOnnx.matmul([x, qkv])
                x = attention_onnx(builder, x, mask, per_replica_batch_size,
                                   sequence_length, hidden_size,
                                   attention_heads, qkv_length)

        vgid = ((options["numLayers"] - 1) % options["stages"]
                ) if options["phasedExecution"] else options["numLayers"] - 1

        with builder.virtualGraph(vgid), builder.executionPhase(
                (options["numLayers"] - 1) * stride):
            l1 = builder.aiGraphcore.l1loss([x], 0.2,
                                            popart.ReductionType.Sum)

        proto = builder.getModelProto()

        gradient_keys = list(anchors.keys())
        anchors[x] = popart.AnchorReturnType("All")

        dataFlow = popart.DataFlow(batches_per_step, anchors)

        opts = popart.SessionOptions()
        opts.executionPhaseSettings.stages = options["stages"]
        opts.executionPhaseSettings.phases = (
            options["numLayers"] * stride
            if options["phasedExecution"] else 0)
        opts.enableOutlining = options["outlining"]

        if "phaseSchedule" in options:
            opts.executionPhaseSettings.schedule = options["phaseSchedule"]

        # Phased execution currently does its own recompute annotations
        opts.autoRecomputation = (popart.RecomputationType.Standard
                                  if options["explicitRecomputation"] else
                                  popart.RecomputationType.NoRecompute)

        opts.outlineThreshold = -np.inf
        opts.enableOutliningCopyCostPruning = False
        opts.virtualGraphMode = (popart.VirtualGraphMode.ExecutionPhases
                                 if options["phasedExecution"] else
                                 popart.VirtualGraphMode.Manual)
        opts.explicitRecomputation = options["explicitRecomputation"]
        opts.aliasZeroCopy = options["aliasZeroCopy"]

        opts.batchSerializationSettings.factor = options["batchSerialize"]
        if "batchSchedule" in options:
            opts.batchSerializationSettings.batchSchedule = options[
                "batchSchedule"]
        if "batchConcat" in options:
            # Do not concatenate the batch across phases and virtual graphs
            # (causes more, smaller transfers but allows for individual
            # sub-batch elements to be transferred)
            opts.batchSerializationSettings.concatOnVirtualGraphChange = \
                options["batchConcat"]
            opts.batchSerializationSettings.concatOnExecutionPhaseChange = \
                options["batchConcat"]
            # Wait with loading activations until they are required
            opts.executionPhaseSettings.activationIOSchedule = \
                popart.ExecutionPhaseIOSchedule.OnDemand

        if "tensorLocationSettings" in options and options[
                "tensorLocationSettings"]:
            opts.activationTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.weightTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.optimizerStateTensorLocationSettings = options[
                "tensorLocationSettings"]
            opts.accumulatorTensorLocationSettings = options[
                "tensorLocationSettings"]
        if "weightTensorLocationSettings" in options and options[
                "weightTensorLocationSettings"]:
            opts.weightTensorLocationSettings = options[
                "weightTensorLocationSettings"]
        if options["replication"] > 1:
            opts.replicatedGraphCount = options["replication"]
            opts.enableReplicatedGraphs = True
        if "ioTiles" in options:
            opts.numIOTiles = options["ioTiles"]

        pat = popart.Patterns(popart.PatternsLevel.Default)

        if options["phasedExecution"]:
            numIpus = options["stages"]
        else:
            numIpus = options["numLayers"] + 1
        if options["replication"] > 1:
            numIpus = numIpus * options["replication"]
        device = tu.create_test_device(numIpus,
                                       pattern=popart.SyncPattern.Full)

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=opts,
                                         loss=l1,
                                         optimizer=popart.ConstSGD(0.1),
                                         patterns=pat,
                                         deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()

        anchors = session.initAnchorArrays()
        for k, v in anchors.items():
            print(f"anchor_before {k}={v.shape}")

        inputs = {x_in: input_data, mask: mask_data}
        stepio = popart.PyStepIO(inputs, anchors)

        for __ in range(10):
            session.run(stepio)

        session.modelToHost(
            str(tmpdir / f"streamingmemory_attention_{index}.onnx"))

        if options["replication"] > 1:
            for k, v in anchors.items():
                if k in gradient_keys:
                    # The gradient anchors will have an additional
                    # replication axis.
                    anchors[k] = np.sum(v, 1 if batches_per_step > 1 else 0)
                else:
                    # Output tensor needs reshaping.
                    anchors[k] = np.reshape(anchors[k], [
                        batches_per_step, sequence_length * batch_size,
                        hidden_size
                    ])
        for k, v in anchors.items():
            print(f"anchor_after {k}={v.shape}")

        return anchors

    test_results = []

    # AliasZeroCopy is only supported with explicit recomputation, not with
    # standard recomputation.
    # Phased execution is only supported with explicit recomputation, not
    # with standard recomputation.

    test_variants = []

    defaultOffChip = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.Compute,
            storageTileSet=popart.TileSet.Compute,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    ioOffChip = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.IO,
            storageTileSet=popart.TileSet.IO,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    # Ground truth variant
    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": False,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": False,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 4,
        "replication": 1,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": True,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": True,
        "aliasZeroCopy": True,
        "batchSerialize": 1,
        "replication": 1,
        "tensorLocationSettings": defaultOffChip,
    })

    # Test batch serialized single device per replica execution, where all
    # streaming memory traffic goes through IO tiles, and activations are
    # stored and loaded one-by-one
    test_variants.append({
        "stages": 1,
        "stride": 4,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": True,
        "explicitRecomputation": True,
        "aliasZeroCopy": True,
        "batchSerialize": 4,
        "batchConcat": False,
        "replication": 2,
        "tensorLocationSettings": ioOffChip,
        "ioTiles": 192
    })

    # Test batch serialized single device per replica execution, where all
    # streaming memory traffic goes through IO tiles, and loading of the next
    # phase happens before storing the current phase
    test_variants.append({
        "stages": 1,
        "stride": 1,
        "numLayers": 3,
        "phasedExecution": True,
        "phaseSchedule": popart.ExecutionPhaseSchedule.BatchClusteredIO,
        "outlining": False,
        "explicitRecomputation": True,
        "aliasZeroCopy": True,
        "batchSerialize": 4,
        "batchConcat": True,
        "replication": 2,
        "tensorLocationSettings": ioOffChip,
        "ioTiles": 192
    })

    # Test a variety of batch serialisation schedules.
    for batchSchedule in [
            popart.BatchSerializationBatchSchedule.Scheduler,
            popart.BatchSerializationBatchSchedule.Isomorphic,
            popart.BatchSerializationBatchSchedule.OverlapOnIo,
            popart.BatchSerializationBatchSchedule.OverlapOnCompute,
    ]:
        test_variants.append({
            "stages": 1,
            "stride": 4,
            "numLayers": 3,
            "phasedExecution": True,
            "outlining": False,
            "explicitRecomputation": True,
            "aliasZeroCopy": True,
            "batchSerialize": 4,
            "batchSchedule": batchSchedule,
            "batchConcat": False,
            "replication": 2,
            "tensorLocationSettings": ioOffChip,
            "ioTiles": 192
        })

    # Test replicated tensor sharding + on chip (no outlining).
    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 2,
        "tensorLocationSettings": defaultOffChip,
        "weightTensorLocationSettings": popart.TensorLocationSettings(
            location=popart.TensorLocation(
                storage=popart.TensorStorage.OnChip,
                loadTileSet=popart.TileSet.Compute,
                storageTileSet=popart.TileSet.Compute,
                replicatedTensorSharding=popart.ReplicatedTensorSharding.On),
            minElementsForOffChip=0,
            minElementsForReplicatedTensorSharding=2)
    })

    # Test replicated tensor sharding + off chip (no outlining).
    test_variants.append({
        "stages": 2,
        "numLayers": 3,
        "phasedExecution": True,
        "outlining": False,
        "explicitRecomputation": False,
        "aliasZeroCopy": False,
        "batchSerialize": 1,
        "replication": 2,
        "tensorLocationSettings": defaultOffChip,
        "weightTensorLocationSettings": popart.TensorLocationSettings(
            location=popart.TensorLocation(
                storage=popart.TensorStorage.OffChip,
                loadTileSet=popart.TileSet.Compute,
                storageTileSet=popart.TileSet.Compute,
                replicatedTensorSharding=popart.ReplicatedTensorSharding.On),
            minElementsForOffChip=0,
            minElementsForReplicatedTensorSharding=2)
    })

    index = 0
    for test_option in test_variants:
        print(f"Running {index}: {test_option}")
        test_results.append(run_test(index, test_option))
        index += 1

    gt_onnx = onnx.load(str(tmpdir / "streamingmemory_attention_0.onnx"))

    for i in range(1, index):
        print(f"Testing run {i}: {test_variants[i]}")

        for key in test_results[0].keys():
            assert np.all(
                np.isclose(test_results[0][key],
                           test_results[i][key],
                           equal_nan=False))

        val_onnx = onnx.load(
            str(tmpdir / f"streamingmemory_attention_{i}.onnx"))
        for j in range(len(gt_onnx.graph.initializer)):
            print(f"Checking initializer {j}")
            gt = gt_onnx.graph.initializer[j]
            gt = numpy_helper.to_array(gt)
            val = val_onnx.graph.initializer[j]
            val = numpy_helper.to_array(val)
            assert np.allclose(gt, val, equal_nan=False)
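# For reference, a hedged sketch of the variant schema consumed by run_test
# in the test above; every key shown is one the test actually reads, and any
# variant appended to test_variants is compared against the ground-truth run
# (index 0):
# test_variants.append({
#     "stages": 2,                  # execution phase stages per replica
#     "numLayers": 3,               # attention layers in the toy model
#     "phasedExecution": True,      # ExecutionPhases virtual graph mode
#     "outlining": True,            # subgraph outlining
#     "explicitRecomputation": True,
#     "aliasZeroCopy": True,
#     "batchSerialize": 1,          # batch serialization factor
#     "replication": 1,             # replicated graph count
#     "tensorLocationSettings": defaultOffChip,
# })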
def run_test(aliaszerocopy):
    proto, data, x, loss = model()

    options = popart.SessionOptions()
    patterns = popart.Patterns()

    optimizer = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.9, True),
        "defaultDampening": (0, True)
    })

    options.enableOutlining = True
    options.outlineThreshold = -np.inf
    options.enableOutliningCopyCostPruning = False
    options.autoRecomputation = popart.RecomputationType.Standard
    options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
    options.explicitRecomputation = True
    options.aliasZeroCopy = aliaszerocopy
    options.executionPhaseSettings.phases = 5

    varLocation = popart.TensorLocation()
    varLocation.storage = popart.TensorStorage.OffChip

    options.weightTensorLocationSettings.location = varLocation
    options.optimizerStateTensorLocationSettings.location = varLocation
    options.accumulatorTensorLocationSettings.location = varLocation
    options.activationTensorLocationSettings.location = varLocation

    tempDir = tempfile.TemporaryDirectory()
    options.engineOptions["autoReport.directory"] = tempDir.name
    options.engineOptions["autoReport.all"] = "true"

    request_ipus = 2
    device = tu.create_test_device(request_ipus,
                                   pattern=popart.SyncPattern.Full)

    dataFlow = popart.DataFlow(1, {x: popart.AnchorReturnType("ALL")})

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=options,
                                     loss=loss,
                                     optimizer=optimizer,
                                     patterns=patterns,
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO(data, anchors)
    session.run(stepio)

    file_path = str(tmpdir / "aliaszerocopy_model_test.onnx")
    session.modelToHost(file_path)
    post_proto = onnx.load(file_path)

    device.detach()

    report = session.getReport()
    max_tile_memory = max([
        tile.memory.total.excludingGaps for tile in report.compilation.tiles
    ])
    total_memory = np.sum([
        tile.memory.total.excludingGaps for tile in report.compilation.tiles
    ])
    return anchors[x], post_proto, total_memory
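# Hedged usage of run_test above: run with and without alias zero copy and
# compare results and total memory. The memory expectation is an assumption
# of this sketch (not asserted by the original snippet): identical maths
# with reduced live memory when aliasing is enabled.
outputs_ref, proto_ref, memory_ref = run_test(False)
outputs_azc, proto_azc, memory_azc = run_test(True)
assert np.allclose(outputs_ref, outputs_azc)
assert memory_azc <= memory_ref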
for i in range(len(lhs_model.graph.initializer)):
    lhs = lhs_model.graph.initializer[i]
    for j in range(len(rhs_model.graph.initializer)):
        rhs = rhs_model.graph.initializer[j]
        if rhs.name == lhs.name:
            print(f'Checking initializer {i} ({lhs.name} - {rhs.name})')
            lhsa = numpy_helper.to_array(lhs)
            rhsa = numpy_helper.to_array(rhs)
            assert np.allclose(lhsa, rhsa, rtol=1.e-4, atol=1.e-5)


# Standard OnChip settings
onChipLocation = popart.TensorLocationSettings(
    location=popart.TensorLocation(
        storage=popart.TensorStorage.OnChip,
        loadTileSet=popart.TileSet.Compute,
        storageTileSet=popart.TileSet.Compute,
        replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
    minElementsForOffChip=0,
    minElementsForReplicatedTensorSharding=2)

# Standard OffChip settings
offChipLocation = popart.TensorLocationSettings(
    location=popart.TensorLocation(
        storage=popart.TensorStorage.OffChip,
        loadTileSet=popart.TileSet.Compute,
        storageTileSet=popart.TileSet.Compute,
        replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
    minElementsForOffChip=0,
    minElementsForReplicatedTensorSharding=2)
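# Typical assignment of the two presets above onto session options (a sketch;
# which tensor categories stay on chip is a per-model choice):
opts = popart.SessionOptions()
opts.weightTensorLocationSettings = onChipLocation
opts.optimizerStateTensorLocationSettings = offChipLocation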
def run_model(tmpdir,
              model_file_name,
              schedule=popart.ExecutionPhaseSchedule.Interleaving,
              enable_outlining=False,
              stride=1,
              num_layers=5,
              dsize=128,
              batch_size=4,
              batch_serialize=1,
              batch_schedule=popart.BatchSerializationBatchSchedule.Isomorphic,
              num_iterations=5,
              num_replicas=2,
              optimizer=popart.Adam({"defaultLearningRate": (0.1, False)})):
    np.random.seed(52125)

    builder = popart.Builder()
    ip = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [batch_size, dsize, dsize]))

    def add_layer(index, in_id):
        w = builder.addInitializedInputTensor(
            np.random.rand(dsize, dsize).astype(np.float32), f"W{index}")
        matmul_id = builder.aiOnnx.matmul([in_id, w])
        return matmul_id

    out = ip
    l1 = ""
    final_loss = ""

    for i in range(num_layers):
        vgid = 0
        with builder.executionPhase(i * stride), builder.virtualGraph(vgid):
            for j in range(3):
                out = add_layer(i, out)

        if i == num_layers - 1:
            with builder.executionPhase(i * stride), \
                    builder.virtualGraph(vgid):
                l1 = builder.aiGraphcore.l1loss([out], 0.1,
                                                popart.ReductionType.Sum)
                final_loss = builder.aiGraphcore.identityloss([l1])

    anchorIds = []

    builder.addOutputTensor(out)

    num_ipus = 1

    dfAnchors = {}
    for anchorId in anchorIds:
        dfAnchors.update({anchorId: popart.AnchorReturnType("All")})

    opts = popart.SessionOptions()

    # Cycle counting
    opts.instrumentWithHardwareCycleCounter = True

    # Outlining
    opts.enableOutlining = enable_outlining
    opts.enableOutliningCopyCostPruning = False
    opts.outlineThreshold = -np.inf
    opts.aliasZeroCopy = enable_outlining

    # Replicated graphs
    opts.replicatedGraphCount = num_replicas
    opts.enableReplicatedGraphs = True if num_replicas > 1 else False

    # IO tiles
    opts.numIOTiles = 192

    # Phased execution
    opts.executionPhaseSettings.phases = num_layers * stride
    opts.executionPhaseSettings.stages = 1
    opts.executionPhaseSettings.schedule = schedule
    opts.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases

    # Recomputation
    opts.autoRecomputation = popart.RecomputationType.Standard
    opts.explicitRecomputation = True

    # Batch serialization
    if batch_serialize > 1:
        opts.batchSerializationSettings.factor = batch_serialize
        opts.batchSerializationSettings.concatOnVirtualGraphChange = False
        opts.batchSerializationSettings.concatOnExecutionPhaseChange = False
        opts.batchSerializationSettings.concatOnPipelineStageChange = False
        opts.batchSerializationSettings.batchSchedule = batch_schedule

        # Related execution phase setting
        opts.executionPhaseSettings.activationIOSchedule = \
            popart.ExecutionPhaseIOSchedule.OnDemand

    # Streaming memory
    offChipLocation = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.IO,
            storageTileSet=popart.TileSet.IO,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.Off),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    offChipRtsLocation = popart.TensorLocationSettings(
        location=popart.TensorLocation(
            storage=popart.TensorStorage.OffChip,
            loadTileSet=popart.TileSet.IO,
            storageTileSet=popart.TileSet.IO,
            replicatedTensorSharding=popart.ReplicatedTensorSharding.On),
        minElementsForOffChip=0,
        minElementsForReplicatedTensorSharding=2)

    opts.activationTensorLocationSettings = offChipLocation
    opts.weightTensorLocationSettings = offChipRtsLocation
    opts.optimizerStateTensorLocationSettings = offChipRtsLocation

    proto = builder.getModelProto()

    with tu.create_test_device(num_replicas * num_ipus,
                               pattern=popart.SyncPattern.Full) as device:
        session = popart.TrainingSession(
            fnModel=proto,
            dataFlow=popart.DataFlow(1, dfAnchors),
            optimizer=optimizer,
            loss=final_loss,
            patterns=popart.Patterns(popart.PatternsLevel.All),
            userOptions=opts,
            deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()

        for i in range(num_iterations):
            ip_data = np.random.rand(num_replicas, batch_size, dsize,
                                     dsize).astype(np.float32)
            stepio = popart.PyStepIO({ip: ip_data}, anchors)

            session.run(stepio)

        cycles = session.getCycleCount()

        print("anchors:")
        print(anchors)
        session.modelToHost(str(tmpdir / model_file_name))

    return cycles
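# Hedged usage sketch for run_model above: compare hardware cycle counts of
# two execution phase schedules on the same model (the file names are
# arbitrary).
cycles_interleaving = run_model(
    tmpdir, "interleaving.onnx",
    schedule=popart.ExecutionPhaseSchedule.Interleaving)
cycles_batch = run_model(
    tmpdir, "batch.onnx",
    schedule=popart.ExecutionPhaseSchedule.Batch)
print(f"Interleaving: {cycles_interleaving} cycles, "
      f"Batch: {cycles_batch} cycles")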
def __init__(self,
             name: str,
             input_size,
             hidden_size,
             num_heads,
             serialize_matmul,
             available_memory_proportion,
             epsilon,
             dropout,
             dropout_prob,
             attn_dropout,
             attn_dropout_prob,
             batch_size,
             sequence_length,
             dtype,
             task,
             num_mask_tokens,
             split_qkv=False,
             residual=True,
             prefetch_masks=True,
             use_default_mem_proportion=True,
             mask=None,
             **kwargs):
    if split_qkv:
        params = [
            Parameter(name='Q', shape=[input_size, hidden_size], value=None),
            Parameter(name='K', shape=[input_size, hidden_size], value=None),
            Parameter(name='V', shape=[input_size, hidden_size], value=None),
            Parameter(name='Out',
                      shape=[hidden_size, input_size],
                      value=None)
        ]
    else:
        params = [
            Parameter(name='QKV',
                      shape=[input_size, 3 * hidden_size],
                      value=None),
            Parameter(name='Out',
                      shape=[hidden_size, input_size],
                      value=None)
        ]
    scope_provider = kwargs['scope_provider']
    super(Attention, self).__init__(params=params,
                                    scope=scope_provider.get_scope(
                                        name, 'next'),
                                    dtype=dtype,
                                    **kwargs)
    self.num_heads = num_heads
    self.hidden_size = hidden_size
    self.serialize_matmul = serialize_matmul
    self.available_memory_proportion = available_memory_proportion
    self.use_default_mem_proportion = use_default_mem_proportion
    self.split_qkv = split_qkv
    self.batch_size = batch_size
    self.seq_len = sequence_length
    if hidden_size % num_heads != 0:
        raise ValueError('Hidden size must be a multiple of num_heads')
    self.qkv_length = hidden_size // num_heads
    self.dtype = dtype
    self.residual = residual
    self.task = task
    self.num_mask_tokens = num_mask_tokens
    self.mask = mask

    self.prefetch_masks = prefetch_masks
    if prefetch_masks:
        additional_scopes = [
            self.builder.recomputeOutput(popart.RecomputeType.Checkpoint),
            self.builder.outputTensorLocation(
                popart.TensorLocation(popart.TensorStorage.OnChip))
        ]
        self.mask_execution_phase = scope_provider.get_scope(
            'Mask', 'prev').execution_phase % 2
        self.mask_scope = scope_provider.get_scope(
            'Mask',
            self.mask_execution_phase,
            additional_scopes=additional_scopes)
    else:
        self.mask_scope = scope_provider.get_scope('Mask', 'prev')

    if self.residual:
        self.norm = Norm(scope_provider.get_scope('Norm', 'prev'),
                         hidden_size, epsilon, dtype, **kwargs)
    if dropout:
        self.dropout = Dropout(scope_provider.get_scope('Dropout', 'prev'),
                               dropout_prob, **kwargs)
    else:
        self.dropout = lambda x: x
    if attn_dropout:
        self.attn_dropout = Dropout(
            scope_provider.get_scope('AttnDropout', 'prev'),
            attn_dropout_prob, **kwargs)
    else:
        self.attn_dropout = lambda x: x
    self.total_execution_phases = self.total_phases()
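# The prefetch_masks branch above layers two builder scopes that, per the
# additional_scopes usage, behave as context managers. A standalone sketch of
# the same idiom (assuming that context-manager behaviour): checkpoint an
# output and pin it on chip so it is neither recomputed nor streamed off.
builder = popart.Builder()
t = builder.addInputTensor(popart.TensorInfo("FLOAT", [2, 2]))
with builder.recomputeOutput(popart.RecomputeType.Checkpoint), \
        builder.outputTensorLocation(
            popart.TensorLocation(popart.TensorStorage.OnChip)):
    out = builder.aiOnnx.relu([t])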
def bert_session_options(args, model):
    engine_options = {}
    options = popart.SessionOptions()
    options.virtualGraphMode = popart.VirtualGraphMode.Manual
    options.enableFloatingPointChecks = args.floating_point_exceptions
    options.enableStochasticRounding = args.stochastic_rounding
    options.enableGroupedMatmuls = False
    options.enablePrefetchDatastreams = not args.minimum_latency_inference
    options.enableOutlining = not args.no_outlining
    partials_type = "half" if args.enable_half_partials else "float"
    options.partialsTypeMatMuls = partials_type
    options.convolutionOptions = {'partialsType': partials_type}
    if args.replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = args.replication_factor
        engine_options["target.syncReplicasIndependently"] = "true"
    # Increasing the outlineThreshold prevents creating subgraphs of cheap
    # ops such as add or reshapeInplace; instead, only ops with a high
    # subgraph value, such as matmul or normalisation, are reused.
    options.outlineThreshold = 10.0
    if args.execution_mode == "PIPELINE":
        options.enablePipelining = True
        options.autoRecomputation = popart.RecomputationType.Pipeline
    elif args.execution_mode == "PHASED":
        options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        options.enableOutliningCopyCostPruning = False
        options.outlineThreshold = -np.inf
        options.executionPhaseSettings.phases = model.total_execution_phases
        options.batchSerializationSettings.factor = args.batch_serialize
        options.autoRecomputation = popart.RecomputationType.Standard
        options.explicitRecomputation = True
        options.aliasZeroCopy = True
        options.activationTensorLocationSettings.location.storage = \
            popart.TensorStorage.OffChip

        varLocation = popart.TensorLocation()
        varLocation.storage = popart.TensorStorage.OffChip
        varLocation.loadTileSet = popart.TileSet.IO
        varLocation.storageTileSet = popart.TileSet.IO
        varLocation.replicatedTensorSharding = (
            popart.ReplicatedTensorSharding.On
            if args.replicated_weight_sharding else
            popart.ReplicatedTensorSharding.Off)

        options.weightTensorLocationSettings.location = varLocation
        options.optimizerStateTensorLocationSettings.location = varLocation
        options.accumulatorTensorLocationSettings.location = varLocation

        options.numIOTiles = args.num_io_tiles
        options.timeLimitScheduler = -1
        options.swapLimitScheduler = -1
        engine_options["target.syncReplicasIndependently"] = "false"

        if args.activations_on_chip:
            options.activationTensorLocationSettings = \
                popart.TensorLocationSettings(popart.TensorStorage.OnChip, 0)

    if args.optimizer_state_offchip:
        options.optimizerStateTensorLocationSettings.location.storage = \
            popart.TensorStorage.OffChip

    if args.gradient_accumulation_factor > 1:
        options.enableGradientAccumulation = True
        options.accumulationFactor = args.gradient_accumulation_factor
        if args.gradient_reduction_type == "Mean":
            options.accumulationReductionType = popart.ReductionType.Mean

    # When not replicated, SyncPattern.SinglePipeline will provide better
    # overlap than this option.
    if args.optimizer_state_offchip and args.replication_factor > 1:
        options.accumulateOuterFragmentSettings = \
            popart.AccumulateOuterFragmentSettings(
                popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized,
                [0])

    if args.engine_cache is not None:
        options.enableEngineCaching = True
        options.cachePath = args.engine_cache
    if args.profile:
        options.enableEngineCaching = False
    options.instrumentWithHardwareCycleCounter = args.report_hw_cycle_count
    options.disableGradAccumulationTensorStreams = True
    if args.max_copy_merge_size == -1:
        logger.debug("No copy merge size limit applied")
    else:
        logger.warning(
            f"Copy merge size limit set to {args.max_copy_merge_size}")
        engine_options["opt.maxCopyMergeSize"] = str(args.max_copy_merge_size)

    # Adding {"fullyConnectedPass", "TRAINING_BWD"} to some matmuls causes
    # large transposes before operations.
    if args.disable_fully_connected_pass:
        if args.task == "SQUAD" and args.sequence_length == 384:
            logger.warning(
                "Fully connected pass has been disabled. This may cause "
                "SQuAD 384 12-layer to go OOM.")
        options.enableFullyConnectedPass = False

    if args.inference and args.engine_cache is not None and \
            not args.variable_weights_inference:
        logger.warning(
            "Using engine cache with constant weights. Checkpoint weights "
            "will be ignored. Use the `--variable-weights-inference` flag "
            "if checkpoint weights should be used.")

    if args.variable_weights_inference:
        options.constantWeights = False

    if args.group_host_syncs:
        options.groupHostSync = True

    if args.internal_exchange_optimisation_target is not None:
        engine_options["opt.internalExchangeOptimisationTarget"] = str(
            args.internal_exchange_optimisation_target)

    options.engineOptions = engine_options

    # Set synthetic data mode (if active)
    if args.synthetic_data:
        if args.synthetic_data_initializer == "zeros":
            options.syntheticDataMode = popart.SyntheticDataMode.Zeros
        else:
            options.syntheticDataMode = popart.SyntheticDataMode.RandomNormal
        logger.info(
            f"Running with Synthetic Data Type '{options.syntheticDataMode}'")

    return options
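# Illustrative follow-up (a sketch; proto, data_flow, loss, optimizer and
# device are assumed to come from the surrounding training script):
options = bert_session_options(args, model)
session = popart.TrainingSession(fnModel=proto,
                                 dataFlow=data_flow,
                                 userOptions=options,
                                 loss=loss,
                                 optimizer=optimizer,
                                 deviceInfo=device)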
opts = popart.SessionOptions()
if args.profile:
    opts.engineOptions = {
        "autoReport.all": "true",
        "autoReport.directory": args.profile_dir
    }

if phased_execution:
    # Constant weights cannot be streamed
    opts.constantWeights = False

    opts.executionPhaseSettings.phases = args.num_layers
    opts.executionPhaseSettings.stages = 2
    opts.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
    opts.numIOTiles = 128

    varLocation = popart.TensorLocation()
    varLocation.storage = popart.TensorStorage.OffChip
    varLocation.loadTileSet = popart.TileSet.IO
    varLocation.storageTileSet = popart.TileSet.IO
    opts.weightTensorLocationSettings.location = varLocation
else:
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

print("Compiling.")
session = popart.InferenceSession(fnModel=proto,
                                  dataFlow=popart.DataFlow(
                                      args.batches_per_step, anchor_map),
                                  userOptions=opts,
                                  deviceInfo=device)
session.prepareDevice()
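# A minimal continuation (sketch): stream one step of data through the
# compiled inference session; `inputs` is assumed to map the model's input
# tensor ids to host arrays, matching the PyStepIO usage elsewhere here.
anchors = session.initAnchorArrays()
stepio = popart.PyStepIO(inputs, anchors)
session.run(stepio)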
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[str] = None,
           optimizer: Optional[popart.Optimizer] = None,
           patterns: Optional[popart.Patterns] = None,
           return_stats: bool = False,
           log_dir: Optional[str] = None,
           ipus: Optional[int] = None,
           batches_per_step: int = 1,
           user_options: Optional[Mapping[str, Any]] = None,
           skip_execution: bool = False,
           execution_mode: str = 'DEFAULT',
           replication_factor: int = 1,
           replicated_tensor_sharding: bool = False,
           num_reps: int = 1):
    outputs = make_tuple(outputs)

    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step,
        {output: popart.AnchorReturnType("ALL") for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.reportOptions = {"showVarStorage": "true"}
    if replicated_tensor_sharding:
        options.weightTensorLocationSettings.location.\
            replicatedTensorSharding = popart.ReplicatedTensorSharding.On
        options.optimizerStateTensorLocationSettings.location.\
            replicatedTensorSharding = popart.ReplicatedTensorSharding.On
    if replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = replication_factor

    if execution_mode == 'PHASED':
        options.enableOutlining = True
        options.outlineThreshold = -np.inf
        options.enableOutliningCopyCostPruning = False
        options.autoRecomputation = popart.RecomputationType.Standard
        options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        options.explicitRecomputation = True
        options.aliasZeroCopy = True
        options.batchSerializationSettings.factor = user_options[
            "batchSerializationFactor"]
        options.executionPhaseSettings.phases = user_options[
            "executionPhases"]
        ipus = 1

        options.outlineSequenceBreakCost = 100000.0
        options.batchSerializationSettings.concatOnVirtualGraphChange = False
        options.batchSerializationSettings.concatOnExecutionPhaseChange = \
            False
        options.batchSerializationSettings.concatOnPipelineStageChange = \
            False
        options.batchSerializationSettings.batchSchedule = \
            popart.BatchSerializationBatchSchedule.OverlapOnCompute

        varLocation = popart.TensorLocation()
        varLocation.storage = popart.TensorStorage.OffChip
        varLocation.loadTileSet = popart.TileSet.IO
        varLocation.storageTileSet = popart.TileSet.IO

        options.weightTensorLocationSettings.location = varLocation
        options.optimizerStateTensorLocationSettings.location = varLocation
        options.accumulatorTensorLocationSettings.location = varLocation
        options.activationTensorLocationSettings.location = varLocation

        options.executionPhaseSettings.activationIOSchedule = \
            popart.ExecutionPhaseIOSchedule.OnDemand
        options.executionPhaseSettings.weightIOSchedule = \
            popart.ExecutionPhaseIOSchedule.Preload
        options.executionPhaseSettings.schedule = \
            popart.ExecutionPhaseSchedule.Batch
    else:
        options.enableGroupedMatmuls = False
        options.enableStochasticRounding = False
        options.constantWeights = True
        options.outlineThreshold = 10.0
        if ipus is not None and ipus > 1:
            options.virtualGraphMode = popart.VirtualGraphMode.Manual
        else:
            ipus = 1

    for key, value in user_options.items():
        if key not in ["batchSerializationFactor", "executionPhases"]:
            setattr(options, key, value)

    if return_stats:
        options.engineOptions = {
            "debug.allowOutOfMemory": "true",
            "debug.instrument": "true",
            "opt.internalExchangeOptimisationTarget": "balanced",
        }

    replicas = user_options.get("replicatedGraphCount", 1)
    request_ipus = pow(2, math.ceil(math.log2(ipus * replicas)))
    request_ipus *= replication_factor
    dm = popart.DeviceManager()
    dm.setOnDemandAttachTimeout(int(1e4))
    device = dm.acquireAvailableDevice(
        request_ipus,
        connectionType=popart.DeviceConnectionType.OnDemand,
        selectionCriterion=popart.DeviceSelectionCriterion.Random)
    if device is None:
        raise Exception("Failed to acquire IPU.")

    print("Compiling graph")
    if optimizer is not None:
        session = popart.TrainingSession(fnModel=proto,
                                         deviceInfo=device,
                                         dataFlow=data_flow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFlow=data_flow,
                                          userOptions=options,
                                          patterns=patterns)

    if skip_execution:
        device.detach()
        return session

    # Compile the Poplar graph. If it fails, return the memory stats.
    try:
        session.prepareDevice()
    except popart.session.OutOfMemoryException as e:
        if return_stats and log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            gcprofile.save_popart_report(session,
                                         log_dir=log_dir,
                                         exception=e)
        device.detach()
        raise e
    print("Compilation complete")

    session.weightsFromHost()
    session.setRandomSeed(1984)

    anchors = session.initAnchorArrays()

    # Add a replication dimension if needed
    rf = user_options.get("replicatedGraphCount")
    if rf is not None and rf > 1:
        data = {k: np.repeat(v[np.newaxis], rf, 0) for k, v in data.items()}

    # Add a gradient accumulation factor dimension if needed
    af = user_options.get("accumulationFactor")
    if af is not None and af > 1:
        data = {k: np.repeat(v[np.newaxis], af, 0) for k, v in data.items()}

    # Add a batches_per_step dimension if needed
    if batches_per_step > 1:
        data = {
            k: np.repeat(v[np.newaxis], batches_per_step, 0)
            for k, v in data.items()
        }

    for _ in range(num_reps):
        stepio = popart.PyStepIO(data, anchors)
        session.run(stepio)

    with tempfile.TemporaryDirectory() as tmp:
        file_path = os.path.join(tmp, "model.onnx")
        session.modelToHost(file_path)
        post_proto = onnx.load(file_path)

    # Release device
    device.detach()

    if return_stats:
        if log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            reports = gcprofile.save_popart_report(session, log_dir=log_dir)
            graph_report = json.loads(reports["graph"])
            exec_report = json.loads(reports["execution"])
        else:
            graph_report = json.loads(session.getGraphReport())
            exec_report = json.loads(session.getExecutionReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        cycles = exec_report["simulation"]["cycles"]
        return (anchors[output] for output in outputs), post_proto, \
            total_memory, max_tile_memory, cycles
    return (anchors[output] for output in outputs), post_proto
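# Hedged example of driving run_py directly, with proto/data/x/loss produced
# by a model() helper such as the ones above; note the first return value is
# a generator over the requested output anchors.
proto, data, x, loss = model()
outputs, post_proto = run_py(proto, data=data, outputs=x)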