Example 1
 def __init__(self,
              learning_rate=0.001,
              momentum=0.0005,
              weight_decay=0.0001,
              use_locking=False,
              name='Momentum',
              use_nesterov=False,
              clip_norm=None,
              lossScaling=1.0,
              specific_dic={}):
     assert use_locking is False
     assert use_nesterov is False
     self.learning_rate = learning_rate
     self.name = name
     self.clip_norm = clip_norm
     self.lossScaling = lossScaling
     self.opti_cfg = {
         "defaultLearningRate": (self.learning_rate, False),
         "defaultMomentum": (momentum, True),
         "defaultWeightDecay": (weight_decay, True),
     }
     if self.lossScaling != 1.0:
         self.opti_cfg['lossScaling'] = (self.lossScaling, True)
     if clip_norm is not None:
         print('clip norm gradients:', clip_norm)
         self.gc_optimizer = popart.SGD(
             self.opti_cfg,
             clip_norm_settings=[
                 popart.ClipNormSettings.clipAllWeights(clip_norm)
             ])
     else:
         self.gc_optimizer = popart.SGD(self.opti_cfg)
     for name in specific_dic:
         self.gc_optimizer.insertSpecific(name, specific_dic[name])
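A minimal usage sketch of the wrapper above. It assumes the __init__ belongs to a wrapper class (called Optimizer here) and uses a hypothetical weight tensor id; neither name comes from the original snippet.
# Hypothetical usage; "Optimizer" and "conv1/weight" are illustrative assumptions.
opt = Optimizer(learning_rate=0.01,
                clip_norm=1.0,
                specific_dic={"conv1/weight": {"learningRate": (0.001, False)}})
sgd = opt.gc_optimizer  # the underlying popart.SGD instance handed to the session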
Example 2
def test_sgd_param_check():
    """
    In this test we check that the optimizer tensors (scaled learning rate,
    weight decay scale factor and loss scaling), returned as anchors, match
    the values derived from those supplied to the optimizer constructor.
    """

    lrName = popart.reservedDefaultScaledLearningRate0Prefix() + "FLOAT"
    wdName = popart.reservedDefaultWeightDecayScaleFactor0Prefix() + "FLOAT"
    lsName = popart.reservedLossScalingPrefix() + "FLOAT"

    anchorNames = {
        lrName: popart.AnchorReturnType("All"),
        wdName: popart.AnchorReturnType("All"),
        lsName: popart.AnchorReturnType("All")
    }

    # Just a placeholder optimizer. We overwrite the hyper-parameters in this
    # test once the session is created
    userSGD = popart.SGD({
        "defaultLearningRate": (0.5, False),
        "defaultWeightDecay": (0.6, False),
        "lossScaling": (10.0, False)
    })
    stepSize = 2

    session, inputsUserSgd = trainSession(anchorNames, userSGD, stepSize)
    anchorsArrays = session.initAnchorArrays()

    # train
    numSteps = 3
    learningRate = np.random.rand(numSteps).astype('float32')
    weightDecay = np.random.rand(numSteps).astype('float32')
    lossScaling = np.random.rand(numSteps).astype('float32')

    for step in range(numSteps):

        # Update learning rate parameter between training steps
        stepLr = learningRate[step]
        stepWd = weightDecay[step]
        stepLs = lossScaling[step]
        session.updateOptimizerFromHost(
            popart.SGD({
                "defaultLearningRate": (stepLr, False),
                "defaultWeightDecay": (stepWd, False),
                "lossScaling": (stepLs, False)
            }))

        stepio = popart.PyStepIO(inputsUserSgd, anchorsArrays)

        session.run(stepio)

        assert (np.array_equal(anchorsArrays[lsName][0], stepLs))

        scaled = (stepLr / stepLs)
        assert (np.array_equal(anchorsArrays[lrName][0], scaled))

        # The weight decay tensor is scaled by lr on the host
        # before training
        scaled = 1 - (stepWd * stepLr)
        assert (np.allclose(anchorsArrays[wdName][0], scaled))
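For orientation, a small worked sketch of the host-side scaling that the assertions above rely on, using illustrative values rather than the test's random ones.
# The anchored learning-rate tensor holds lr / lossScaling and the weight-decay
# scale factor holds 1 - wd * lr (illustrative numbers only).
lr, wd, ls = 0.3, 0.1, 8.0
scaled_lr = lr / ls            # expected value of anchorsArrays[lrName][0]
wd_scale_factor = 1 - wd * lr  # expected value of anchorsArrays[wdName][0]
assert abs(scaled_lr - 0.0375) < 1e-9 and abs(wd_scale_factor - 0.97) < 1e-9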
Example 3
def getOptimizers():
    optimizers = []

    # SGD
    sgd0 = popart.SGD({
        "lossScaling": (10.0, False),
        "defaultMomentum": (0.5, False),
        "defaultVelocityScaling": (0.5, False),
        "defaultDampening": (0.5, False),
        "defaultWeightDecay": (0.5, False)
    })
    sgd1 = popart.SGD({
        "lossScaling": (0.2, False),
        "defaultMomentum": (0.2, False),
        "defaultVelocityScaling": (0.2, False),
        "defaultDampening": (0.2, False),
        "defaultWeightDecay": (0.2, False)
    })
    optimizers.append([sgd0, sgd1])

    # Adam
    adam0 = popart.Adam({
        "lossScaling": (10.0, False),
        "defaultLearningRate": (0.5, False),
        "defaultWeightDecay": (0.5, False),
        "defaultBeta1": (0.5, False),
        "defaultBeta2": (0.5, False),
        "defaultEps": (0.5, False)
    })
    adam1 = popart.Adam({
        "lossScaling": (0.2, False),
        "defaultLearningRate": (0.2, False),
        "defaultWeightDecay": (0.2, False),
        "defaultBeta1": (0.2, False),
        "defaultBeta2": (0.2, False),
        "defaultEps": (0.2, False)
    })
    optimizers.append([adam0, adam1])

    # Adaptive
    adaptive0 = popart.Adaptive({
        "lossScaling": (10.0, False),
        "defaultLearningRate": (0.5, False),
        "defaultAlpha": (0.5, False),
        "defaultMomentum": (0.5, False),
        "defaultWeightDecay": (0.5, False),
        "defaultEps": (0.5, False)
    })
    adaptive1 = popart.Adaptive({
        "lossScaling": (0.2, False),
        "defaultLearningRate": (0.2, False),
        "defaultAlpha": (0.2, False),
        "defaultMomentum": (0.2, False),
        "defaultWeightDecay": (0.2, False),
        "defaultEps": (0.2, False)
    })
    optimizers.append([adaptive0, adaptive1])

    return optimizers
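One plausible way a test could consume these pairs, sketched as an assumption rather than taken from the original test: build a session around the first optimizer, then switch to the second from the host.
# Hedged sketch; make_session is a hypothetical helper, not part of the snippet.
for initial_opt, updated_opt in getOptimizers():
    session = make_session(optimizer=initial_opt)  # hypothetical session factory
    session.updateOptimizerFromHost(updated_opt)   # same API as in Example 2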
Example 4
def test_replicated_sgd1_weight_update(tmpdir):

    optimizer_dict = {
        "defaultLearningRate": (0.00001, False),
        "defaultMomentum": (0.9, False),
        "defaultDampening": (0.2, False),
        "defaultVelocityScaling": (0.1, False),
        "lossScaling": (1.0, True),
        "defaultWeightDecay": (0.2, True)
    }

    run_model(tmpdir,
              'phased.onnx',
              execution_mode="phased",
              batch_size=2,
              num_replicas=1,
              num_iterations=5,
              optimizer=popart.SGD(optimizer_dict),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation)
    run_model(tmpdir,
              'phased_replicated.onnx',
              execution_mode="phased",
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              optimizer=popart.SGD(optimizer_dict),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation)
    run_model(tmpdir,
              'phased_replicated_rws.onnx',
              execution_mode="phased",
              batch_size=1,
              num_replicas=2,
              num_iterations=5,
              optimizer=popart.SGD(optimizer_dict),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipRtsLocation,
              optimizer_state_tensor_location_settings=offChipRtsLocation,
              accumulator_tensor_location_settings=offChipRtsLocation)

    phased = onnx.load(str(tmpdir / 'phased.onnx'))
    phased_replicated = onnx.load(str(tmpdir / 'phased_replicated.onnx'))
    phased_replicated_rws = onnx.load(
        str(tmpdir / 'phased_replicated_rws.onnx'))

    check_model(phased, phased_replicated)
    check_model(phased, phased_replicated_rws)
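offChipLocation and offChipRtsLocation are defined elsewhere in the test module. A hedged guess at their shape, based only on the popart.TensorLocationSettings and popart.TensorLocation calls visible in other examples on this page:
# Assumed definitions, for orientation only.
offChipLocation = popart.TensorLocationSettings(
    popart.TensorLocation(popart.TensorStorage.OffChip), 0, 0)
offChipRtsLocation = popart.TensorLocationSettings(
    popart.TensorLocation(popart.TensorStorage.OffChip,
                          popart.ReplicatedTensorSharding.On), 0, 0)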
Example 5
def test_sgd_mixed_mode_0(tmpdir):

    # optimizer parameters
    defaultLearningRate = 1e-4
    defaultMomentum = 0.7
    defaultVelocityScaling = 1.0
    defaultWeightDecay = 0.1
    defaultDampening = 0.05
    lossScaling = 10.0

    optMaps = [{
        0:
        popart.SGD({
            "defaultLearningRate": (defaultLearningRate, True),
            "defaultMomentum": (defaultMomentum, True),
            "defaultVelocityScaling": (defaultVelocityScaling, True),
            "defaultWeightDecay": (defaultWeightDecay, True),
            "defaultDampening": (defaultDampening, True),
            "lossScaling": (lossScaling, True),
        })
    }]
    outlining = [False]

    for i in range(6):
        optMap = {
            "defaultLearningRate": (defaultLearningRate, i != 0),
            "defaultMomentum": (defaultMomentum, i != 1),
            "defaultVelocityScaling": (defaultVelocityScaling, i != 2),
            "defaultWeightDecay": (defaultWeightDecay, i != 3),
            "defaultDampening": (defaultDampening, i != 4),
            "lossScaling": (lossScaling, i != 5),
        }
        optMaps = optMaps + [{0: popart.SGD(optMap)}]
        outlining = outlining + [False]

    for i in range(6):
        optMap = {
            "defaultLearningRate": (defaultLearningRate, i != 0),
            "defaultMomentum": (defaultMomentum, i != 1),
            "defaultVelocityScaling": (defaultVelocityScaling, i != 2),
            "defaultWeightDecay": (defaultWeightDecay, i != 3),
            "defaultDampening": (defaultDampening, i != 4),
            "lossScaling": (lossScaling, i != 5),
        }
        optMaps = optMaps + [{0: popart.SGD(optMap)}]
        outlining = outlining + [True]

    run_sgd_mixed_mode(10, optMaps, outlining, tmpdir, np.float32)
    run_sgd_mixed_mode(10, optMaps, outlining, tmpdir, np.float16)
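Throughout these examples each optimizer option is a (value, isConst) pair. A small illustration of the difference, inferred from how the other examples use it:
# A const entry is fixed at compile time; a non-const entry becomes an optimizer
# tensor that updateOptimizerFromHost can overwrite (see Examples 2 and 6).
const_lr = popart.SGD({"defaultLearningRate": (1e-4, True)})
variable_lr = popart.SGD({"defaultLearningRate": (1e-4, False)})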
Example 6
def test_sgd_with_zero_learning_rate():
    """
    In this test we check that we can run a training step with a zero learning
    rate, and that it behaves as expected (i.e. no weight update).
    """

    # Let's start with an optimizer with a variable, non-zero learning rate
    optSettings = {
        "defaultLearningRate": (0.5, False),
        "defaultWeightDecay": (0.6, False),
        "lossScaling": (10.0, False)
    }
    stepSize = 2
    session, inputsUserSgd = trainSession({}, popart.SGD(optSettings),
                                          stepSize)
    anchorsArrays = session.initAnchorArrays()

    # Get the initial weights:
    fn = "init.onnx"
    session.modelToHost(fn)
    wId = "init_input"
    weights = {wId: np.empty(shape=[2, 2, 3, 3], dtype=np.float32)}
    weightsio = popart.PyWeightsIO(weights)
    session.readWeights(weightsio)
    init_weights = np.copy(weights[wId])

    # Run for a step with non-zero lr, observe that the weights have changed
    stepio = popart.PyStepIO(inputsUserSgd, anchorsArrays)
    session.run(stepio)
    session.weightsToHost()
    session.readWeights(weightsio)
    updated_weights = np.copy(weights[wId])
    assert np.array_equal(init_weights, updated_weights) is False

    # Update optimizer with zero lr, (only valid if variable)
    optSettings["defaultLearningRate"] = (0.0, True)
    with pytest.raises(popart.popart_exception) as e_info:
        session.updateOptimizerFromHost(popart.SGD(optSettings))
    assert e_info.value.args[0].startswith(
        "Constant, zero learning rate in SGD")

    # Run a training step with the zero (non-const) learning rate, and confirm
    # that the weights haven't updated
    optSettings["defaultLearningRate"] = (0.0, False)
    session.updateOptimizerFromHost(popart.SGD(optSettings))
    session.run(stepio)

    session.weightsToHost()
    session.readWeights(weightsio)
    assert np.array_equal(weights[wId], updated_weights)
Example 7
    def create(self):
        self.iteration.learning_rate = self.optimizer_options[
            "defaultLearningRate"][0]

        optimizer = popart.SGD(self.optimizer_options)

        for stage in self.pipeline_stage_tensors:
            specific_parameters = {}
            if self.lr_scaling:
                default_lr, lr_is_const = self.optimizer_options[
                    "defaultLearningRate"]
                specific_parameters["learningRate"] = (
                    default_lr * self.pipeline_stage_lr_scaling[stage],
                    lr_is_const)
            if self.momentum_scaling:
                # Momentum values are scaled inversely with the pipeline stage
                momentum = 1 - ((1 - self.option_values["defaultMomentum"]) *
                                self.pipeline_stage_momentum_scaling[stage])
                specific_parameters["momentum"] = (momentum, True)
                dampening = 1 - ((1 - self.option_values["defaultDampening"]) *
                                 self.pipeline_stage_dampening_scaling[stage])
                specific_parameters["dampening"] = (dampening, True)
            for tensor_id in self.pipeline_stage_tensors[stage]:
                optimizer.insertSpecific(tensor_id, specific_parameters)
        return optimizer
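A worked example of the inverse momentum scaling used above, with illustrative numbers that are not taken from any particular configuration:
# With defaultMomentum = 0.9 and a stage scaling factor of 0.5:
default_momentum = 0.9
stage_scaling = 0.5
momentum = 1 - (1 - default_momentum) * stage_scaling  # = 0.95, i.e. pushed towards 1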
Example 8
def test_implicit_recompute_op_scheduled_pre_loss_no():
    """
    Regression test for T36828. Confirm that compilation completes without an
    exception being thrown.

    It is possible that the MulGrad op that produces Gradient___t3 is scheduled
    early (e.g. at index 0 in the schedule). If this happens, all ops after it
    in the schedule are classified as 'post loss'.

    The matmul operation is recomputed in the backwards pass. The implicit
    recomputation setting forbids an op that is to be recomputed from being a
    'post loss' op.
    """
    builder = popart.Builder()
    t0 = builder.addInputTensor("FLOAT", [2, 2])
    t1 = builder.addInitializedInputTensor(
        np.random.rand(2, 2).astype(np.float32))
    t2 = builder.aiOnnx.matmul([t0, t1])
    t3 = builder.aiGraphcore.l1loss([t2], 0.1)

    const = np.array([4]).astype(np.float32)
    t5 = builder.aiOnnx.constant(const)
    t6 = builder.aiOnnx.mul([t3, t5])

    builder.recomputeOutputInBackwardPass(t2)

    session = popart.TrainingSession(deviceInfo=tu.create_test_device(),
                                     fnModel=builder.getModelProto(),
                                     dataFlow=popart.DataFlow(1, []),
                                     loss=t6,
                                     optimizer=popart.SGD(
                                         {"lossScaling": (2.0, False)}))

    session.prepareDevice()
Example 9
def test_optimizer_state_tensor_location_settings():
    # Check optimizer state tensor location settings work.
    optimizer_with_state = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.0, False),
        "defaultWeightDecay": (0.0, False),
        "defaultDampening": (0.0, True)
    })
    ir = get_ir(optimizer_state_tensor_location_settings=None,
                optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W1', 'Accl___W2', 'Accl___W0'],
             check_offchip=[])

    ir = get_ir(
        optimizer_state_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OffChip, 0),
        optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=[],
             check_offchip=['Accl___W1', 'Accl___W2', 'Accl___W0'])

    ir = get_ir(
        optimizer_state_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OnChip, 0),
        optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W1', 'Accl___W2', 'Accl___W0'],
             check_offchip=[])
Example 10
def step(session, anchors, data, update_optimizer_lr=None):
    if update_optimizer_lr is not None:
        optimizer = popart.SGD(update_optimizer_lr)
        session.updateOptimizer(optimizer)

    stepio = popart.PyStepIO(data, anchors)
    session.run(stepio)
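A hedged usage sketch of step; the option map passed as update_optimizer_lr is an assumption modelled on the other examples:
# Hypothetical call; session, anchors and data come from an existing setup.
step(session, anchors, data,
     update_optimizer_lr={"defaultLearningRate": (0.01, False)})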
Example 11
def test_auto_loss_scaling_with_mixed_precision_trackable_tensors():
    """
    Create a Session with automatic loss scaling and a model that contains
    both fp32 and fp16 initializers, and see that no incompatibility error is
    thrown.
    """
    builder = popart.Builder()
    t0 = builder.addInputTensor("FLOAT", [2, 2])
    t1_data = np.random.rand(2, 2).astype(np.float32)
    t1 = builder.addInitializedInputTensor(t1_data)
    mm0 = builder.aiOnnx.matmul([t0, t1])
    t2 = builder.aiOnnx.cast([mm0], "FLOAT16")
    t3 = builder.addInputTensor("FLOAT16", [2, 2])
    mm1 = builder.aiOnnx.matmul([t2, t3])
    loss = builder.aiGraphcore.identityloss([mm1])

    optimizer = popart.SGD({"lossScaling": (2, False)})

    opts = popart.SessionOptions()
    opts.automaticLossScalingSettings.enabled = True
    opts.automaticLossScalingSettings.binEdgeLocation = 0.5
    opts.automaticLossScalingSettings.thresholdUpperCountProportion = 0.2

    session = popart.TrainingSession(builder.getModelProto(),
                                     deviceInfo=tu.create_test_device(),
                                     dataFlow=popart.DataFlow(1, [loss]),
                                     loss=loss,
                                     optimizer=optimizer,
                                     userOptions=opts)
    session.prepareDevice()
Example 12
 def adj_lr(self, lr, sess, specific_dic={}):
     self.opti_cfg['defaultLearningRate'] = (lr, False)
     new_optimizer = popart.SGD(self.opti_cfg)
     for name in specific_dic:
         new_optimizer.insertSpecific(name, specific_dic[name])
     sess.updateOptimizerFromHost(new_optimizer)
     self.gc_optimizer = new_optimizer
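A minimal sketch of how adj_lr might be called during training, assuming an instance of the wrapper from Example 1 and a hypothetical tensor id:
# Hypothetical usage; "opt" is the wrapper instance and "sess" an existing session.
opt.adj_lr(0.001, sess,
           specific_dic={"conv1/weight": {"learningRate": (0.0001, False)}})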
Example 13
def test_auto_loss_scaling_and_continuous_update_pipelining():
    """
    Create a Session with automatic loss scaling and pipelining
    enabled, but gradient accumulation disabled, and see that an
    incompatibility error is thrown.
    """
    builder = popart.Builder()

    t0 = builder.addInputTensor("FLOAT", [2, 2])
    mm0 = builder.aiOnnx.matmul([t0, t0])
    loss = builder.aiGraphcore.identityloss([mm0])

    optimizer = popart.SGD({"lossScaling": (2, False)})

    builder.virtualGraph(mm0, 0)
    builder.virtualGraph(loss, 0)

    opts = popart.SessionOptions()
    opts.automaticLossScalingSettings.enabled = True
    opts.automaticLossScalingSettings.binEdgeLocation = 0.5
    opts.automaticLossScalingSettings.thresholdUpperCountProportion = 0.2
    opts.enablePipelining = True
    opts.enableGradientAccumulation = False
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    with pytest.raises(popart.popart_exception) as e_info:
        session = popart.TrainingSession(builder.getModelProto(),
                                         deviceInfo=tu.create_test_device(2),
                                         dataFlow=popart.DataFlow(1, [loss]),
                                         loss=loss,
                                         optimizer=optimizer,
                                         userOptions=opts)
    assert e_info.value.args[0].endswith(
        "Automatic loss scaling is not currently supported when the 'enablePipelining' SessionOption is set to 'true', but the 'enableGradientAccumulation' SessionOption is set to 'false'"
    )
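For contrast, a configuration sketch that avoids the error above by enabling gradient accumulation alongside pipelining; the accumulation factor is an illustrative assumption:
# Per the error message above, pipelining plus automatic loss scaling also needs
# gradient accumulation enabled.
opts.enablePipelining = True
opts.enableGradientAccumulation = True
opts.accumulationFactor = 4  # illustrative value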
Example 14
def test_auto_loss_scaling_with_no_tracked_tensors():
    """
    Build a model with ops whose outputs the auto loss scale transform does
    not decide to 'track'. Observe an error from the auto loss scale
    transform.
    """
    builder = popart.Builder()

    t0 = builder.addInputTensor("FLOAT", [2, 2])
    out = builder.aiOnnx.relu([t0])
    loss = builder.aiGraphcore.identityloss([out])

    optimizer = popart.SGD({"lossScaling": (2, False)})

    opts = popart.SessionOptions()
    opts.automaticLossScalingSettings.enabled = True
    opts.automaticLossScalingSettings.binEdgeLocation = 0.5
    opts.automaticLossScalingSettings.thresholdUpperCountProportion = 0.2

    with pytest.raises(popart.popart_exception) as e_info:
        session = popart.TrainingSession(builder.getModelProto(),
                                         deviceInfo=tu.create_test_device(),
                                         dataFlow=popart.DataFlow(1, [loss]),
                                         loss=loss,
                                         optimizer=optimizer,
                                         userOptions=opts)
    assert e_info.value.args[0].endswith("No tracked tensors were found")
Example 15
def test_auto_loss_scaling_with_const_loss_scale_tensor():
    """
    Create a session with auto loss scaling enabled, and with an optimizer
    with a constant loss scale value. Observe an error from the auto loss
    scale transform.
    """
    builder = popart.Builder()

    t0 = builder.addInputTensor("FLOAT", [2, 2])
    t1_data = np.random.rand(2, 2).astype(np.float32)
    t1 = builder.addInitializedInputTensor(t1_data)
    out = builder.aiOnnx.matmul([t0, t1])
    loss = builder.aiGraphcore.identityloss([out])

    makeLossScalingTensorConst = True
    optimizer = popart.SGD({"lossScaling": (2, makeLossScalingTensorConst)})

    opts = popart.SessionOptions()
    opts.automaticLossScalingSettings.enabled = True
    opts.automaticLossScalingSettings.binEdgeLocation = 0.5
    opts.automaticLossScalingSettings.thresholdUpperCountProportion = 0.2

    with pytest.raises(popart.popart_exception) as e_info:
        session = popart.TrainingSession(builder.getModelProto(),
                                         deviceInfo=tu.create_test_device(),
                                         dataFlow=popart.DataFlow(1, []),
                                         loss=loss,
                                         optimizer=optimizer,
                                         userOptions=opts)
    assert e_info.value.args[0].endswith(
        "The optimizer must have non-const loss scaling")
Example 16
def test_accumulator_tensor_location_settings_plus_override():
    # Check optimizer state tensor location settings work
    optimizer_with_state = popart.SGD({
        "defaultLearningRate": (0.1, True),
        "defaultMomentum": (0.0, False),
        "defaultWeightDecay": (0.0, False),
        "defaultDampening": (0.0, True)
    })
    ir = get_ir(
        accumulator_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OffChip, 0),
        tensor_location_setting_override={
            'Accl___W1': popart.TensorLocation(popart.TensorStorage.OnChip)
        },
        optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W1'],
             check_offchip=['Accl___W2', 'Accl___W0'])

    ir = get_ir(
        accumulator_tensor_location_settings=popart.TensorLocationSettings(
            popart.TensorStorage.OnChip, 0),
        tensor_location_setting_override={
            'Accl___W1': popart.TensorLocation(popart.TensorStorage.OffChip)
        },
        optimizer=optimizer_with_state)
    check_ir(ir,
             check_onchip=['Accl___W2', 'Accl___W0'],
             check_offchip=['Accl___W1'])
Example 17
def session(train=False, skip_execution=False, include_patterns=True, splits=1, outline=False, optim="Sgd"):
    proto, data, x, loss = model(splits=splits)
    patterns = popart.Patterns()
    patterns.enablePattern("TiedGatherPattern", include_patterns)
    patterns.enablePattern("TiedGatherAccumulatePattern", include_patterns)

    user_options = {
        "enableOutlining": outline,
        "enableGradientAccumulation": True,
        "accumulationFactor": 2,
        "accumulationAndReplicationReductionType": popart.ReductionType.Mean,
        "meanAccumulationAndReplicationReductionStrategy": popart.MeanReductionStrategy.Running
    }

    if optim == "Lamb":
        optimizer = popart.Adam({
            "defaultLearningRate": (0.1, False),
            "defaultWeightDecay": (0.1, True),
            "defaultBeta1": (0.1, True),
            "defaultBeta2": (0.1, True),
            "lossScaling": (20, True),
        }, mode=popart.AdamMode.LambNoBias)  # NoBias to increase the error of incorrect gradients
        user_options["optimizerStateTensorLocationSettings"] = popart.TensorLocationSettings(
            popart.TensorLocation(
                popart.TensorStorage.OffChip,
                popart.ReplicatedTensorSharding.On),
            0, 0)
        user_options["enableReplicatedGraphs"] = True
        user_options["replicatedGraphCount"] = 2
        ipus = 2
    else:
        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            "defaultDampening": (0, True),  # 0 dampening to increase the error of incorrect gradients
            "lossScaling": (20, True)})
        ipus = 1

    if train:
        return run_py(
            proto,
            data=data,
            outputs=x,
            loss=loss,
            optimizer=optimizer,
            patterns=patterns,
            user_options=user_options,
            skip_execution=skip_execution)
    else:
        return run_py(
            proto,
            data=data,
            outputs=x,
            patterns=patterns,
            user_options={
                "enableOutlining": outline,
                "constantWeights": False
            },
            skip_execution=skip_execution)
Example 18
def run_model(builder_fn,
              steps,
              seed,
              training=True,
              options=popart.SessionOptions()):
    """
    Helper function that runs a model and returns the anchors.

      builder_fn - a function that takes a PopART builder and returns a tuple
                   comprising a loss, a dictionary of inputs and a dictionary
                   that maps python variable names to PopART tensor IDs for
                   anchors.
      steps      - number of batches per step
      seed       - random seed to pass to the PopART session.

    Returns a named tuple with .anchors being the anchors, .seed the seed used,
    .steps the number of batches per step and .random_outs the anchor mapping.
    """

    builder = popart.Builder()
    loss, inputs, random_outs = builder_fn(builder)
    dataFlow = popart.DataFlow(
        steps,
        {op[1]: popart.AnchorReturnType("ALL")
         for op in random_outs.items()})

    proto = builder.getModelProto()
    optimizer = popart.SGD({"defaultLearningRate": (0.1, True)})
    patterns = popart.Patterns()

    device = tu.create_test_device(1, pattern=popart.SyncPattern.Full)

    if training:
        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns,
                                         deviceInfo=device)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          dataFlow=dataFlow,
                                          userOptions=options,
                                          patterns=patterns,
                                          deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    session.setRandomSeed(seed)
    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO(inputs, anchors)
    session.run(stepio)

    return Run(anchors=anchors,
               seed=seed,
               steps=steps,
               random_outs=random_outs)
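A minimal builder_fn sketch matching the contract described in the docstring above; shapes, names and values are assumptions for illustration only (it relies on the same popart and numpy imports as the surrounding examples):
def example_builder(builder):
    # A tiny matmul model with an identity loss.
    t0 = builder.addInputTensor(popart.TensorInfo("FLOAT", [2, 2]))
    w = builder.addInitializedInputTensor(np.ones([2, 2], np.float32))
    out = builder.aiOnnx.matmul([t0, w])
    loss = builder.aiGraphcore.identityloss([out])
    inputs = {t0: np.random.rand(2, 2, 2).astype(np.float32)}  # leading dim = steps
    anchors = {"out": out}  # python name -> PopART tensor id
    return loss, inputs, anchors

# run = run_model(example_builder, steps=2, seed=0)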
Example 19
def train_builder(opts):
    builder, data, outputs, loss, __ = eval_builder(opts)

    return [
        builder,
        data,
        outputs,
        loss,
        popart.SGD(0.01)
    ]
Example 20
    def create(self):
        self.iteration.learning_rate = self.option_values["defaultLearningRate"]

        if self.opt_type == "SGD":
            optimizer = popart.SGD(self.optimizer_options)
        elif self.opt_type == "ADAM":
            optimizer = popart.Adam(self.optimizer_options,
                                    accl1_type=self.accl1_type,
                                    scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)
        elif self.opt_type == "ADAM_NO_BIAS":
            optimizer = popart.Adam(self.optimizer_options,
                                    mode=popart.AdamMode.AdamNoBias,
                                    accl1_type=self.accl1_type,
                                    scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)
        elif self.opt_type == "LAMB":
            optimizer = popart.Adam(self.optimizer_options,
                                    mode=popart.AdamMode.Lamb,
                                    accl1_type=self.accl1_type,
                                    scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)
        elif self.opt_type == "LAMB_NO_BIAS":
            optimizer = popart.Adam(self.optimizer_options,
                                    mode=popart.AdamMode.LambNoBias,
                                    accl1_type=self.accl1_type,
                                    scaled_optimizer_state=self.accl1_type == popart.DataType.FLOAT16)


        weight_decay_tensor_list = []

        for stage, tensors in self.tensors.items():
            for tensor_id in tensors:
                params = self.option_values.copy()

                if self.include_for_weight_decay(tensor_id):
                    params["weightDecay"] = self.weight_decay
                    weight_decay_tensor_list.append(tensor_id)
                else:
                    params["weightDecay"] = 0

                if self.disable_lamb(tensor_id):
                    params["maxWeightNorm"] = 0

                for transform in self.transforms:
                    params = transform(tensor_id, params, stage)

                specific_params = {
                    k: v for k, v in params.items() if k not in self.option_values
                }
                if specific_params:
                    p = self._make_tuple_options(specific_params)
                    optimizer.insertSpecific(tensor_id, p)

        if len(weight_decay_tensor_list) != 0:
            logger.debug(f" Weight decay of {self.weight_decay} applied to: {weight_decay_tensor_list}")

        return optimizer
Example 21
def test_incomplete_grad():

    # Reproducer for T37001, included as a regression test. This test doesn't
    # actually check any assertions; it just ensures that a code path that
    # previously failed does not result in any exceptions.
    #
    # The problem originally revealed by this test was that an exception was
    # thrown if for some inputs of a fwd subgraph the backwards pass creator was
    # not able to create gradients for those inputs (for example for a seed
    # input). This problem was fixed in the code base by allowing subgraph
    # inputs in the fwd subgraph to not have associated gradient outputs in
    # the associated bwd subgraph.

    def get_subgraph_builder(builder, weights, labels):

        subgraph_builder = builder.createSubgraphBuilder()
        subgraph_builder.addInputTensorFromParentGraph(weights)
        input = subgraph_builder.addInputTensor(
            popart.TensorInfo("FLOAT16", [4, 32, 1, 64]))
        subgraph_builder.addInputTensorFromParentGraph(labels)

        matmul_out = subgraph_builder.aiOnnx.matmul([input, weights])
        log_probs = subgraph_builder.aiOnnx.logsoftmax([matmul_out], axis=3)
        log_probs_compact = subgraph_builder.aiOnnx.gather([log_probs, labels],
                                                           axis=3)
        subgraph_builder.addOutputTensor(log_probs_compact)

        return subgraph_builder

    builder = popart.Builder()

    float16_input = builder.addInputTensor(
        popart.TensorInfo("FLOAT16", [4, 32, 1, 64]), "float16_input")
    int32_input = builder.addInputTensor(popart.TensorInfo("INT32", [4, 2]),
                                         "int32_input")
    weights = builder.addInitializedInputTensor(np.zeros([64, 64], np.float16),
                                                "weights")

    fn = get_subgraph_builder(builder, weights, int32_input)
    log_probs_compact = builder.aiGraphcore.call(
        [weights, float16_input, int32_input], 1, fn)[0]
    l1_loss = builder.aiGraphcore.l1loss([log_probs_compact], 1.0)

    optimizer = popart.SGD({
        "defaultLearningRate": (0.1, False),
        "defaultWeightDecay": (0, True)
    })

    training_session = popart.TrainingSession(
        builder.getModelProto(),
        loss=l1_loss,
        deviceInfo=popart.DeviceManager().createIpuModelDevice({}),
        optimizer=optimizer,
        dataFlow=popart.DataFlow(1, {}),
        userOptions=popart.SessionOptions())
Example 22
    def create(self):
        self.iteration.learning_rate = self.optimizer_options[
            "defaultLearningRate"][0]

        optimizer = popart.SGD(self.optimizer_options)

        projection_scale_added = False

        for stage in self.pipeline_stage_tensors:
            specific_parameters = {}
            if self.lr_scaling:
                default_lr, lr_is_const = self.optimizer_options[
                    "defaultLearningRate"]
                specific_parameters["learningRate"] = (
                    default_lr * self.pipeline_stage_lr_scaling[stage],
                    lr_is_const)
            if self.momentum_scaling:
                # Momentum values are scaled inversely with the pipeline stage
                if self.option_values["defaultMomentum"] != 0:
                    # This arithmetic will create FP rounding errors if momentum == 0.
                    momentum = 1 - (
                        (1 - self.option_values["defaultMomentum"]) *
                        self.pipeline_stage_momentum_scaling[stage])
                else:
                    momentum = 0
                specific_parameters["momentum"] = (momentum, True)

                if self.option_values["defaultDampening"] != 0:
                    dampening = 1 - (
                        (1 - self.option_values["defaultDampening"]) *
                        self.pipeline_stage_dampening_scaling[stage])
                else:
                    dampening = 0
                specific_parameters["dampening"] = (dampening, True)
            for tensor_id in self.pipeline_stage_tensors[stage]:
                # Special case for embedding/projection variable.
                if self.projection_lr_scaling and "Embedding_Dict" in tensor_id:
                    lr = specific_parameters.get(
                        "learningRate",
                        self.optimizer_options["defaultLearningRate"])
                    params = specific_parameters.copy()
                    params["learningRate"] = (lr[0] * self.projection_lr_scale,
                                              lr[1])
                    optimizer.insertSpecific(tensor_id, params)
                    projection_scale_added = True
                else:
                    optimizer.insertSpecific(tensor_id, specific_parameters)

        if self.projection_lr_scaling and not projection_scale_added:
            lr = self.optimizer_options["defaultLearningRate"]
            optimizer.insertSpecific(
                "Embedding/Embedding_Dict",
                {"learningRate": (lr[0] * self.projection_lr_scale, lr[1])})

        return optimizer
Example 23
    def run_test(aliaszerocopy):
        proto, data, x, loss = model()

        options = popart.SessionOptions()
        patterns = popart.Patterns()

        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, True),
            "defaultMomentum": (0.9, True),
            "defaultDampening": (0, True)
        })

        options.enableOutlining = True
        options.outlineThreshold = -np.inf
        options.enableOutliningCopyCostPruning = False
        options.autoRecomputation = popart.RecomputationType.Standard
        options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        options.explicitRecomputation = True
        options.aliasZeroCopy = aliaszerocopy
        options.executionPhaseSettings.phases = 5
        request_ipus = 2

        device = tu.create_test_device(2, pattern=popart.SyncPattern.Full)

        dataFlow = popart.DataFlow(1, {x: popart.AnchorReturnType("ALL")})

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns,
                                         deviceInfo=device)

        session.prepareDevice()

        session.weightsFromHost()

        anchors = session.initAnchorArrays()

        stepio = popart.PyStepIO(data, anchors)

        session.run(stepio)

        file_path = str(tmpdir / f"aliaszerocopy_model_test.onnx")
        session.modelToHost(file_path)
        post_proto = onnx.load(file_path)

        device.detach()

        graph_report = json.loads(session.getGraphReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        return anchors[x], post_proto, total_memory
Example 24
    def run_test(outlining):
        proto, data, x, loss = model()

        options = popart.SessionOptions()
        patterns = popart.Patterns()

        optimizer = popart.SGD({
            "defaultLearningRate": (0.1, True),
        })

        options.enableOutlining = outlining
        options.outlineThreshold = 10.0
        options.enableGradientAccumulation = True
        options.accumulationFactor = 4
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = 2
        options.virtualGraphMode = popart.VirtualGraphMode.Manual
        if pipeline:
            options.enablePipelining = True
            options.autoRecomputation = popart.RecomputationType.Pipeline

        device = tu.create_test_device(4)

        dataFlow = popart.DataFlow(1, {x: popart.AnchorReturnType("ALL")})

        session = popart.TrainingSession(fnModel=proto,
                                         dataFlow=dataFlow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns,
                                         deviceInfo=device)

        session.prepareDevice()

        session.weightsFromHost()

        anchors = session.initAnchorArrays()

        stepio = popart.PyStepIO(data, anchors)

        session.run(stepio)

        file_path = str(tmpdir / f"outlining_execution_context_model.onnx")
        session.modelToHost(file_path)
        post_proto = onnx.load(file_path)

        device.detach()

        graph_report = json.loads(session.getGraphReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        return session, anchors[x], post_proto, total_memory
Example 25
 def createOptimizer(self):
     if not isinstance(self.optimizer, torch.optim.SGD):
         raise RuntimeError("PopART currently only accepts SGD optimizers.")
     elif self.optimizer.defaults["nesterov"]:
         raise RuntimeError("Nesterov momentum is currently not supported.")
     return popart.SGD({
         "defaultLearningRate": (self.optimizer.defaults["lr"], False),
         "defaultMomentum": (self.optimizer.defaults["momentum"], False),
         "defaultWeightDecay":
         (self.optimizer.defaults["weight_decay"], False),
         "defaultDampening": (self.optimizer.defaults["dampening"], False)
     })
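A hedged illustration of the mapping above: a torch.optim.SGD with the following (assumed) settings would translate into the corresponding non-const popart.SGD entries.
# Illustrative values only; requires torch alongside popart.
import torch
model = torch.nn.Linear(4, 4)
torch_opt = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9,
                            weight_decay=1e-4, dampening=0.0)
# createOptimizer would then build roughly:
# popart.SGD({"defaultLearningRate": (0.01, False),
#             "defaultMomentum": (0.9, False),
#             "defaultWeightDecay": (0.0001, False),
#             "defaultDampening": (0.0, False)})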
Example 26
def test_sgd_with_float16_model():
    popart.getLogger().setLevel("TRACE")

    input1 = np.zeros((2, 2, 4, 4), dtype=np.float16)
    input2 = np.zeros((2, 2, 3, 3), dtype=np.float16)
    input3 = np.zeros((2, 2, 3, 3), dtype=np.float16)

    builder = popart.Builder()
    inid1 = builder.addInputTensor(popart.TensorInfo(input1))
    inid2 = builder.addInitializedInputTensor(input2)
    inid3 = builder.addInitializedInputTensor(input3)

    c1 = builder.aiOnnx.conv([inid1, inid2],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1])
    c2 = builder.aiOnnx.conv([c1, inid3],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1])

    # Reduce to scalar
    out = builder.aiGraphcore.identityloss([c2])

    proto = builder.getModelProto()

    optimizer = popart.SGD({
        "defaultLearningRate": (0.1, False),
        "defaultWeightDecay": (0.1, False),
        "lossScaling": (1000, False)
    })

    anchorNames = {
        popart.reservedGradientPrefix() + inid1:
        popart.AnchorReturnType("All"),
    }

    opts = popart.SessionOptions()

    session = popart.TrainingSession(
        fnModel=proto,
        dataFlow=popart.DataFlow(1, anchorNames),
        loss=out,
        optimizer=optimizer,
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    session.prepareDevice()
    session.weightsFromHost()

    anchorArrays = session.initAnchorArrays()

    stepio = popart.PyStepIO({inid1: input1}, anchorArrays)
    session.run(stepio)
Example 27
def test_postnrepl_overzealous_elimination():

    # Reproducer for T36270, included as a regression test. This test doesn't
    # actually make any assertions; it just checks that a code path that
    # previously failed does not result in any exceptions.
    #
    # The bug was that the PostNRepl pattern removed the gradient sum op that
    # produces Gradient_<in0> (which has 1 input) in the backwards subgraph, also
    # rewriting the subgraph itself to use the input to the gradient sum op
    # instead, as it's identical. However, the tensor produced by the op is a
    # graph output that is used by a call op in the main graph. The pattern did
    # not adjust this CallOp or the subgraph's output tensors and so the CallOp
    # in the main graph fails because it's using a tensor that no longer exists.

    def get_subgraph_builder(b, w):
        builder = b.createSubgraphBuilder()
        builder.addInputTensorFromParentGraph(w)

        in0 = builder.addInputTensor(
            popart.TensorInfo("FLOAT16", [4, 32, 16, 64]))

        x = builder.aiOnnx.matmul([in0, w])

        builder.addOutputTensor(x)
        return builder

    # building model and dataflow
    builder = popart.Builder()

    in0 = builder.addInputTensor(popart.TensorInfo('FLOAT16', [4, 32, 1, 64]),
                                 "in0")
    w = builder.addInitializedInputTensor(np.zeros([64, 64], np.float16),
                                          "weights")

    fn = get_subgraph_builder(builder, w)
    x = builder.aiGraphcore.call([w, in0], 1, fn)[0]
    l1_loss = builder.aiGraphcore.l1loss([x], 1.0)

    optimizer = popart.SGD({
        "defaultLearningRate": (0.1, False),
        "defaultWeightDecay": (0, True)
    })
    device = popart.DeviceManager().createIpuModelDevice({})

    # create training session
    popart.TrainingSession(fnModel=builder.getModelProto(),
                           loss=l1_loss,
                           deviceInfo=device,
                           optimizer=optimizer,
                           dataFlow=popart.DataFlow(1, {}),
                           userOptions=popart.SessionOptions())
Example 28
def test_gradient_accumulation_model_proto(tmpdir, explicit_loops):
    np.random.seed(1234)
    label_array = np.random.randint(0, hidden_size, batch_size)
    accl_initial_proto, accl_proto_filename, accl_anchor_arrays = run_mm_graph(
        # Using Momentum to create accl tensors.
        popart.SGD({
            "defaultLearningRate": (0.1, False),
            "defaultMomentum": (0.9, True)
        }),
        label_array=label_array,
        accum_factor=4,
        enable_accum=True,
        batches_per_step=5,
        number_of_steps=3,
        final_proto_filename=os.path.join(tmpdir, "accl5batches3steps"),
        enable_multi_ipu=False,
        full_anchorage=False,
        explicit_loops=explicit_loops)

    model = onnx.load(accl_proto_filename)
    names = [t.name for t in model.graph.initializer]

    grad_accl_names = []
    weight_names = []
    for name in names:
        if grad_accl_prefix in name:
            grad_accl_names.append(name)
        elif "weight" in name:
            weight_names.append(name)

    # Model should have 6 weight tensors
    assert len(weight_names) == 6
    assert len(grad_accl_names) == len(weight_names)

    tensor_mapping = {}
    for tensor in model.graph.initializer:
        tensor_mapping[tensor.name] = tensor

    rev_map = {}
    for w_name in weight_names:
        assert grad_accl_prefix + w_name in grad_accl_names
        rev_map[grad_accl_prefix + w_name] = w_name

    for g_a_name in grad_accl_names:
        weight_tensor = tensor_mapping[rev_map[g_a_name]]
        g_a_tensor = tensor_mapping[g_a_name]
        for d_i, v in enumerate(weight_tensor.float_data):
            # initialisation as per equations. When velocity scaling != 1 this
            # will need changing : T12001
            assert g_a_tensor.float_data[d_i] - v * wd < 1e-8
Example 29
def test_inplacing_phased_constraints(tmpdir):
    # This used to fail, see T23985
    run_model(tmpdir,
              'phased.onnx',
              execution_mode="phased",
              num_layers=5,
              optimizer=popart.SGD({
                  "defaultLearningRate": (0.1, True),
                  "defaultMomentum": (0.0, False),
                  "defaultWeightDecay": (0.0, False),
                  "defaultDampening": (0.0, True)
              }),
              activation_tensor_location_settings=offChipLocation,
              weight_tensor_location_settings=offChipLocation,
              optimizer_state_tensor_location_settings=offChipLocation,
              accumulator_tensor_location_settings=offChipLocation)
Example 30
def init_session(proto, loss, dataFlow, userOpts, device):
    # Create a session to compile and execute the graph
    optimizer = popart.SGD({"defaultLearningRate": (0.1, False)})
    session = popart.TrainingSession(fnModel=proto,
                                     loss=loss,
                                     deviceInfo=device,
                                     optimizer=optimizer,
                                     dataFlow=dataFlow,
                                     userOptions=userOpts)

    session.prepareDevice()
    session.setRandomSeed(42)

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    return Session(session, anchors), optimizer
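A short sketch of how the returned pair might be used, assuming Session is a small namedtuple-style wrapper with .session and .anchors fields (as the return statement suggests) and that the surrounding code already defines the graph and data:
# Hypothetical follow-on; proto, loss, dataFlow, userOpts, device and data are assumed.
wrapper, optimizer = init_session(proto, loss, dataFlow, userOpts, device)
stepio = popart.PyStepIO(data, wrapper.anchors)
wrapper.session.run(stepio)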