def run_graph(optimizer, input_shape, initial_onnx_model, input_tensor_name,
              output_tensor_name, label_tensor_name, label_array, accum_factor,
              enable_accum, batches_per_step, number_of_steps,
              final_proto_filename, enable_multi_ipu, full_anchorage,
              inference_mode):
    """Train ``initial_onnx_model`` for ``number_of_steps`` steps and save it.

    Relies on module-level names that are not defined in this function:
    ``tu``, ``num_ipus``, ``testTilesPerIPU``, ``xi`` and ``npr``.

    Args:
        optimizer: Optimizer handed to ``popart.TrainingSession``.
        input_shape: Shape (a list) of the input tensor. An outer dimension
            of ``batches_per_step * accum_factor`` is prepended when that
            product is greater than 1.
        initial_onnx_model: Serialized ONNX model used as ``fnModel``.
        input_tensor_name: Name of the data input tensor.
        output_tensor_name: Name of the tensor used as the loss; it is
            always anchored.
        label_tensor_name: Name of the label input tensor.
        label_array: Label array; repeated ``batches_per_step`` times and/or
            reshaped to ``[accum_factor * batches_per_step, -1]`` depending
            on those two factors.
        accum_factor: Value written to ``opts.accumulationFactor``.
        enable_accum: Value written to ``opts.enableGradientAccumulation``.
        batches_per_step: Batches per step for the ``popart.DataFlow``.
        number_of_steps: How many times ``session.run`` is called.
        final_proto_filename: Path passed verbatim to ``modelToHost``.
        enable_multi_ipu: Selects a ``num_ipus`` device with manual virtual
            graphs instead of a default single device.
        full_anchorage: Also anchor the gradient (and, with accumulation,
            the accumulator tensors) of the first initializer of the model.
        inference_mode: If true, an ``InferenceSession`` is constructed first
            with the same options.

    Returns:
        Tuple ``(final_proto_filename, anchor_arrays)``.
    """
    art = popart.AnchorReturnType("All")
    anchorNames = {output_tensor_name: art}

    if full_anchorage:
        # Name of the first initializer stored in the ONNX model.
        w0 = onnx.load_from_string(
            initial_onnx_model).graph.initializer[0].name

        anchorNames[popart.reservedGradientPrefix() + w0] = art

        if enable_accum:
            anchorNames[popart.reservedAcclPrefix() + w0] = art
            anchorNames[popart.reservedAcclToUpdatePrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.enableGradientAccumulation = enable_accum
    opts.accumulationFactor = accum_factor
    opts.enableOutlining = False

    # The virtual graph mode is chosen together with the device, once.
    if enable_multi_ipu:
        device = tu.create_test_device(numIpus=num_ipus,
                                       tilesPerIPU=testTilesPerIPU,
                                       opts={"compileIPUCode": False})
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
    else:
        device = tu.create_test_device(tilesPerIPU=testTilesPerIPU,
                                       opts={"compileIPUCode": False})
        opts.virtualGraphMode = popart.VirtualGraphMode.Off

    # only for test purposes, inference with gradient_accumulation should never work
    if inference_mode:
        # The session object is deliberately discarded: only the act of
        # constructing it matters here.
        popart.InferenceSession(fnModel=initial_onnx_model,
                                dataFlow=popart.DataFlow(
                                    batches_per_step, anchorNames),
                                userOptions=opts,
                                deviceInfo=device)

    session = popart.TrainingSession(fnModel=initial_onnx_model,
                                     dataFlow=popart.DataFlow(
                                         batches_per_step, anchorNames),
                                     deviceInfo=device,
                                     loss=output_tensor_name,
                                     optimizer=optimizer,
                                     userOptions=opts)

    session.prepareDevice()
    session.weightsFromHost()

    anchor_arrays = session.initAnchorArrays()

    # Build the outer (batches_per_step * accum_factor) dimension of the
    # host-side data.
    outer_dim = 1
    if batches_per_step > 1:
        outer_dim *= batches_per_step
        label_array = np.repeat(label_array[np.newaxis], batches_per_step, 0)
    if accum_factor > 1:
        outer_dim *= accum_factor
        label_array = label_array.reshape(
            [accum_factor * batches_per_step, -1])
    if outer_dim > 1:
        # Rebinds the local name; the caller's list is not mutated.
        input_shape = [outer_dim] + input_shape

    stepio = popart.PyStepIO(
        {
            input_tensor_name:
            (1.0 - xi * npr.rand(*input_shape)).astype(np.float32),
            label_tensor_name:
            label_array.astype(np.int32)
        }, anchor_arrays)

    for _ in range(number_of_steps):
        session.run(stepio)

    # The model is written to the path exactly as given by the caller; no
    # file extension is appended.
    session.modelToHost(final_proto_filename)

    return final_proto_filename, anchor_arrays
예제 #2
0
        # NOTE(review): this is the interior of a function whose header is not
        # visible here; `export_name`, `n`, `nTracks`, `nFeatures` and
        # `nHidden` are defined outside this excerpt.

        # Wrap the model referenced by `export_name` in a GraphTransformer.
        graph_transformer = popart.GraphTransformer(export_name)

        # Declare the shapes of the two inputs explicitly.
        inputShapeInfo = popart.InputShapeInfo()
        inputShapeInfo.add("data",
                           popart.TensorInfo("FLOAT", [n, nTracks, nFeatures]))
        inputShapeInfo.add("init_hc",
                           popart.TensorInfo("FLOAT", [1, n, nHidden]))

        # Anchor the tensor named "tag", with one batch per step.
        anchors = {"tag": popart.AnchorReturnType("ALL")}
        dataFeed = popart.DataFlow(1, anchors)
        # Alternative left by the original author (IPU Model instead of
        # an acquired device):
        # device = popart.DeviceManager().createIpuModelDevice({})
        device = popart.DeviceManager().acquireAvailableDevice(1)

        # Inference session over the (possibly transformed) model proto.
        session = popart.InferenceSession(
            graph_transformer.getModelProto(),
            dataFeed,
            device,
            inputShapeInfo=inputShapeInfo,
        )

        session.prepareDevice()

        # Host buffers that receive the anchored tensors.
        inferenceAnchors = session.initAnchorArrays()

        # Random float32 data and an all-zero "init_hc" input, with the same
        # shapes that were declared in `inputShapeInfo` above.
        data_input = np.random.rand(n, nTracks, nFeatures).astype(np.float32)
        init_hc = np.zeros([1, n, nHidden]).astype(np.float32)

        # Bind the host inputs and the anchor buffers for a session run.
        stepio = popart.PyStepIO({
            "data": data_input,
            "init_hc": init_hc
        }, inferenceAnchors)
예제 #3
0
def get_model_anchors_model2(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             returnRawInput=False,
                             labelArray=None):
    """Build a sin -> exp -> conv -> softmax -> nll model, run it, return anchors.

    Reads the module-level ``batch_size`` and the ``tu`` test helper, neither
    of which is defined in this function. ``labelArray`` is accepted but not
    used by the body.

    Args:
        doSharding: Unless this is exactly ``False``, the ops are placed on
            three virtual graphs and three IPUs are requested.
        doPipelining: Value written to ``opts.enablePipelining``.
        batchesPerStep: Batches per step for the ``popart.DataFlow``.
        doTraining: ``True`` builds a ``TrainingSession`` (ConstSGD 0.01),
            anything else an ``InferenceSession``.
        doGradAccl: Value written to ``opts.enableGradientAccumulation``.
        gradAcclFactor: Value written to ``opts.accumulationFactor``; also
            divides ``batch_size`` to obtain the micro batch size.
        doProfiling: When ``True``, ``gcprofile.save_popart_report`` is called.
        doDevicex: When ``False``, return ``None`` right after the session
            has been constructed.
        anchorRestoredTensors: With training and pipelining both ``True``,
            additionally anchor the input and the restored tensors.
        returnRawInput: When ``True``, store the input array under the
            ``"input_raw"`` key of the returned dict.
        labelArray: Unused.

    Returns:
        The anchor dict produced by ``session.initAnchorArrays()`` after six
        runs, or ``None`` when ``doDevicex`` is ``False``.
    """
    np.random.seed(1234)
    bld = popart.Builder()
    micro_batch_size = batch_size // gradAcclFactor

    input_dims = [micro_batch_size, 2, 4, 4]
    label_dims = [batch_size]

    # --- graph construction -------------------------------------------------
    input_id = bld.addInputTensor(popart.TensorInfo("FLOAT", input_dims),
                                  "inp")
    weight_id = bld.addInitializedInputTensor(
        np.ones([2, 2, 3, 3], dtype=np.float32), "weights")

    sin_id = bld.aiOnnx.sin([input_id], "s0")
    exp_id = bld.aiOnnx.exp([sin_id], "e0")
    conv_id = bld.aiOnnx.conv([exp_id, weight_id],
                              dilations=[1, 1],
                              pads=[1, 1, 1, 1],
                              strides=[1, 1],
                              debugPrefix="c0")
    flat_id = bld.reshape_const(bld.aiOnnx, [conv_id],
                                [micro_batch_size, 32])
    probs_id = bld.aiOnnx.softmax([flat_id], axis=1, debugPrefix="sfm")

    label_id = bld.addInputTensor(
        popart.TensorInfo("INT32", [micro_batch_size]), "label")
    loss_id = bld.aiGraphcore.nllloss([probs_id, label_id])

    # --- anchors ------------------------------------------------------------
    ret_all = popart.AnchorReturnType("All")
    anchor_map = {
        loss_id: ret_all,
        weight_id: ret_all,
        exp_id: ret_all,
        sin_id: ret_all,
        conv_id: ret_all,
    }
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + input_id] = ret_all
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + exp_id] = ret_all
            anchor_map[input_id] = ret_all
            anchor_map[popart.reservedRestoredPrefix() + input_id] = ret_all
        if doGradAccl is True:
            anchor_map[popart.reservedAcclToUpdatePrefix() +
                       weight_id] = ret_all

    # --- session options ----------------------------------------------------
    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor

    if doSharding is not False:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        placement = (
            (sin_id, 0),
            (exp_id, 1),
            (conv_id, 1),
            (flat_id, 2),
            (probs_id, 2),
            (loss_id, 2),
        )
        for tensor_id, vgraph in placement:
            bld.virtualGraph(tensor_id, vgraph)
    else:
        numIPUs = 1

    # --- session ------------------------------------------------------------
    common_kwargs = dict(
        fnModel=bld.getModelProto(),
        dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
        userOptions=opts,
        deviceInfo=tu.create_test_device(numIpus=numIPUs))
    if doTraining is True:
        session = popart.TrainingSession(loss=loss_id,
                                         optimizer=popart.ConstSGD(0.01),
                                         **common_kwargs)
    else:
        session = popart.InferenceSession(**common_kwargs)

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    # --- host data ----------------------------------------------------------
    classes = np.prod(input_dims) / (micro_batch_size * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=label_dims).astype(np.int32)

    outer_dim = 1
    if batchesPerStep > 1:
        # One copy of the labels per batch of the step, so that the labels
        # stay consistent between differently shaped runs.
        outer_dim *= batchesPerStep
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Split every batch into gradAcclFactor micro batches.
        outer_dim *= gradAcclFactor
        label = label.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Prepend the gradAcclFactor * batchesPerStep dimension to the input.
        input_dims = [outer_dim] + input_dims
    data = np.ones(input_dims, dtype=np.float32)

    stepio = popart.PyStepIO({input_id: data, label_id: label}, anchors)

    session.weightsFromHost()

    for _ in range(6):
        session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
예제 #4
0
def build_and_run_graph(data_size):
    """Run an add/mul graph through callback-based step IO and time it.

    Two float vectors of length ``data_size`` are added and multiplied
    element-wise on an IPU Model device. Data is exchanged with the session
    through ``popart.PyStepIOCallback``. ``PerfIntervalTimer`` is defined
    elsewhere in the file.

    Args:
        data_size: Length of each of the two input vectors.

    Returns:
        Dict mapping each output tensor id to the value of
        ``timer.interval()`` taken when that output was reported complete.
        The dict is also printed.
    """
    builder = popart.Builder()

    # Both inputs share the same 1-D float spec.
    vector_info = popart.TensorInfo("FLOAT", [data_size])
    id_a = builder.addInputTensor(vector_info)
    id_b = builder.addInputTensor(vector_info)

    # The computation: element-wise sum and product.
    sum_id = builder.aiOnnx.add([id_a, id_b])
    prod_id = builder.aiOnnx.mul([id_a, id_b])
    result_ids = (sum_id, prod_id)

    # Both results are graph outputs and are anchored.
    for result_id in result_ids:
        builder.addOutputTensor(result_id)
    data_flow = popart.DataFlow(
        1,
        {result_id: popart.AnchorReturnType("ALL")
         for result_id in result_ids})

    # Inference session on an IPU Model device, then compile.
    session = popart.InferenceSession(
        fnModel=builder.getModelProto(),
        dataFlow=data_flow,
        deviceInfo=popart.DeviceManager().createIpuModelDevice({}),
    )
    session.prepareDevice()

    # Host-side input buffers (random) and output buffers (anchors).
    inputs = {
        id_a: np.random.rand(data_size).astype(np.float32),
        id_b: np.random.rand(data_size).astype(np.float32),
    }
    anchors = session.initAnchorArrays()

    timer = PerfIntervalTimer()
    rtts = {}

    def input_callback(tensor_id, is_prefetch: bool):
        # Prefetch requests get no buffer. The timer is reset the first
        # time a non-prefetch input is requested.
        if is_prefetch:
            return
        if timer.not_set():
            timer.reset()
        return inputs[tensor_id]

    def input_complete_callback(tensor_id):
        # Nothing to do once an input buffer has been consumed.
        return

    def output_callback(tensor_id):
        # Hand out the anchor buffer the result should be written to.
        return anchors[tensor_id]

    def output_complete_callback(tensor_id):
        # The output buffer is filled: record the elapsed interval.
        rtts[tensor_id] = timer.interval()

    stepio = popart.PyStepIOCallback(
        input_callback,
        input_complete_callback,
        output_callback,
        output_complete_callback,
    )

    # Run the graph and return timings:
    session.run(stepio, 'AddAndMulCallback')
    print(rtts)
    return rtts
def test_train_then_infer_via_file():
    """Train a conv+relu model, save it, and reload the weights for inference.

    An inference session and a training session are created on the same
    device. After four training runs the model is written to ``test.onnx``;
    the inference session then resets its host weights from that file and
    runs once.
    """
    builder = popart.Builder()

    # ---- Model: conv -> relu, with an L1 loss on the relu output ----------
    input_info = popart.TensorInfo("FLOAT", [1, 2, 4, 4])
    weight_info = popart.TensorInfo("FLOAT", [3, 2, 3, 3])

    initial_weights = np.ones([3, 2, 3, 3], np.float32)
    in_id = builder.addInputTensor(input_info)
    weights_id = builder.addInitializedInputTensor(initial_weights)
    conv_id = builder.aiOnnx.conv([in_id, weights_id],
                                  dilations=[1, 1],
                                  pads=[1, 1, 1, 1],
                                  strides=[1, 1])
    out_id = builder.aiOnnx.relu([conv_id])

    loss_id = builder.aiGraphcore.l1loss([out_id], 0.1)

    # ---- Data flows --------------------------------------------------------
    # Training anchors: the output plus the gradients of input and weights.
    anchor_names = [
        out_id,
        popart.reservedGradientPrefix() + in_id,
        popart.reservedGradientPrefix() + weights_id,
    ]
    training_dataFlow = popart.DataFlow(
        1, {name: popart.AnchorReturnType("All")
            for name in anchor_names})

    opts = popart.SessionOptions()
    opts.constantWeights = False  # Allow the weights to be updated

    # ---- Device ------------------------------------------------------------
    device = tu.create_test_device(1, opts={"compileIPUCode": True})
    device.attach()

    # ---- Input data --------------------------------------------------------
    input_data = np.ones(input_info.shape(), dtype=np.float32)

    # ---- Inference session (compiled first) --------------------------------
    inference_dataFlow = popart.DataFlow(
        1, {out_id: popart.AnchorReturnType("All")})

    inference_session = popart.InferenceSession(
        fnModel=builder.getModelProto(),
        dataFlow=inference_dataFlow,
        userOptions=opts,
        deviceInfo=device)
    inference_session.prepareDevice()

    # ---- Training session --------------------------------------------------
    training_session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=training_dataFlow,
        loss=loss_id,
        optimizer=popart.ConstSGD(0.01),
        userOptions=opts,
        deviceInfo=device)
    training_session.prepareDevice()

    # ---- Train -------------------------------------------------------------
    training_session.weightsFromHost()

    training_anchors = training_session.initAnchorArrays()
    training_inputs = {in_id: input_data}

    for _ in range(4):
        step = popart.PyStepIO(training_inputs, training_anchors)
        training_session.run(step)

    # Save the trained weights
    training_session.modelToHost("test.onnx")

    # ---- Infer with the weights written by the training session ------------
    inference_session.resetHostWeights("test.onnx")
    inference_session.weightsFromHost()

    inference_anchors = inference_session.initAnchorArrays()
    inference_inputs = {in_id: input_data}

    inference_session.run(popart.PyStepIO(inference_inputs, inference_anchors))
예제 #6
0
def sparse_mm_infer(sparse_mm_type, lhs_dims, vanilla_rhs_dims, block_size,
                    sparsity_level, transpose_rhs, memory_cycle_ratio,
                    inner_group_size):
    """Run the ``BSMatMul`` custom op and build a NumPy reference result.

    Relies on names defined elsewhere in the file:
    ``g_sparseMatMulTypeLookup``, ``create_dense_matrix``,
    ``create_sparse_matrix``, ``mm`` and the ``g_*_data_type`` globals.

    Args:
        sparse_mm_type: One of the values of ``g_sparseMatMulTypeLookup`` for
            the keys ``'DENSE_LHS_SPARSE_RHS_DENSE_OUT'`` or
            ``'DENSE_LHS_DENSE_RHS_SPARSE_OUT'``.
        lhs_dims: Dimensions of the dense left-hand matrix.
        vanilla_rhs_dims: Dimensions of the right-hand matrix in dense form.
        block_size: Three block sizes; index 0 and 2 are the row/column block
            sizes of the output, ``block_size[1:]`` those of the RHS.
        sparsity_level: Forwarded to ``create_sparse_matrix``.
        transpose_rhs: Forwarded to the op; also selects how ``matrix_dims``
            is built and whether the reference RHS is transposed.
        memory_cycle_ratio: Forwarded to the op as an attribute.
        inner_group_size: Forwarded to the op as an attribute.

    Returns:
        Tuple ``(ipuOutput, goldOutput)``: the anchored op output and the
        reference computed on the host.

    Raises:
        ValueError: If ``sparse_mm_type`` is neither of the two supported
            types.
    """
    sparse_rhs_type = g_sparseMatMulTypeLookup[
        'DENSE_LHS_SPARSE_RHS_DENSE_OUT']
    sparse_out_type = g_sparseMatMulTypeLookup[
        'DENSE_LHS_DENSE_RHS_SPARSE_OUT']
    # Fail early and clearly instead of hitting an unbound local further down.
    if sparse_mm_type != sparse_rhs_type and sparse_mm_type != sparse_out_type:
        raise ValueError(
            "Unsupported sparse_mm_type: {!r}".format(sparse_mm_type))
    rhs_is_sparse = sparse_mm_type == sparse_rhs_type

    if transpose_rhs:
        matmul_dims = [
            lhs_dims[-2], vanilla_rhs_dims[-1], vanilla_rhs_dims[-2]
        ]
    else:
        matmul_dims = [
            lhs_dims[-2], vanilla_rhs_dims[-2], vanilla_rhs_dims[-1]
        ]

    lhs = create_dense_matrix(lhs_dims)
    if rhs_is_sparse:
        bsr_rhs, lengths_per_2d_plane, vanilla_rhs, sparsity_mask = create_sparse_matrix(
            vanilla_rhs_dims, block_size[1:], sparsity_level)

        rhs = bsr_rhs
        rhs_dims = bsr_rhs.shape
    else:
        # Output shape = LHS shape with the last dim replaced by the RHS
        # column count. Built as a new list so lhs_dims may be a tuple.
        output_dims = list(lhs_dims[:-1]) + [vanilla_rhs_dims[-1]]
        output_block_size = [block_size[0], block_size[2]]

        # Only the layout information of the sparse output is needed here.
        _, lengths_per_2d_plane, _, sparsity_mask = create_sparse_matrix(
            output_dims, output_block_size, sparsity_level)

        rhs_dims = vanilla_rhs_dims
        rhs = create_dense_matrix(rhs_dims)

    # Create a builder and construct a graph
    builder = popart.Builder()

    lhs_tensorInfo = popart.TensorInfo("FLOAT", lhs_dims)
    rhs_tensorInfo = popart.TensorInfo("FLOAT", rhs_dims)

    lhsTensor = builder.addInputTensor(lhs_tensorInfo)
    rhsTensor = builder.addInputTensor(rhs_tensorInfo)

    outTensor = builder.customOp(opName="BSMatMul",
                                 opVersion=1,
                                 domain="ai.graphcore",
                                 inputs=[lhsTensor, rhsTensor],
                                 attributes={
                                     "bsr_rhs_lengths_per_2d_plane":
                                     lengths_per_2d_plane.tolist(),
                                     "matrix_dims":
                                     matmul_dims,
                                     "block_size":
                                     block_size,
                                     "sparsity_mask":
                                     sparsity_mask.tolist(),
                                     "bsmatmul_type":
                                     sparse_mm_type,
                                     "transpose_rhs":
                                     transpose_rhs,
                                     "memory_cycle_ratio":
                                     memory_cycle_ratio,
                                     "inner_group_size":
                                     inner_group_size,
                                     "in_type":
                                     g_input_data_type,
                                     "out_type":
                                     g_output_data_type,
                                     "pp_type":
                                     g_pp_data_type
                                 })[0]

    builder.addOutputTensor(outTensor)

    proto = builder.getModelProto()

    # Describe how to run the model
    dataFlow = popart.DataFlow(1, {outTensor: popart.AnchorReturnType("ALL")})

    # Create a session to compile and execute the graph
    session = popart.InferenceSession(
        fnModel=proto,
        dataFlow=dataFlow,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(1))

    # Compile graph
    session.prepareDevice()

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    rhs = np.array(rhs, dtype=g_input_data_type)

    stepio = popart.PyStepIO({lhsTensor: lhs, rhsTensor: rhs}, anchors)
    session.run(stepio)

    ipuOutput = anchors[outTensor]

    if rhs_is_sparse:
        # Dense reference: multiply by the dense RHS, swapping its last two
        # axes first when the op was asked to transpose it.
        if transpose_rhs:
            transpose_indices = list(range(len(vanilla_rhs_dims)))
            transpose_indices[-2], transpose_indices[-1] = transpose_indices[
                -1], transpose_indices[-2]
            vanilla_rhs = vanilla_rhs.transpose(tuple(transpose_indices))
        goldOutput = mm(lhs, vanilla_rhs)
    else:
        assert len(lhs.shape) == len(rhs.shape)
        if len(lhs.shape) == 2:
            lhs = np.expand_dims(lhs, 0)
            rhs = np.expand_dims(rhs, 0)

        mmOutput = mm(lhs, rhs)

        totalGroupDims = int(np.prod(lhs_dims[:-2]))

        num_rows_sparsity_mask_2d = output_dims[-2] // block_size[0]
        num_cols_sparsity_mask_2d = output_dims[-1] // block_size[2]
        # Number of mask entries that belong to one 2-D plane.
        blocks_per_plane = (num_rows_sparsity_mask_2d *
                            num_cols_sparsity_mask_2d)

        assert sparsity_mask.shape == (totalGroupDims * blocks_per_plane, )
        mmOutput = mmOutput.reshape(
            (totalGroupDims, lhs_dims[-2], rhs_dims[-1]))

        # Sparse reference: keep only the blocks enabled in the mask, each
        # flattened, in mask order.
        goldOutput = []
        for dim in range(totalGroupDims):
            mmOutput_2d = mmOutput[dim]
            plane_start = dim * blocks_per_plane
            plane_mask = sparsity_mask[plane_start:plane_start +
                                       blocks_per_plane]

            for block_idx, block_enabled in enumerate(plane_mask):
                if not block_enabled:
                    continue
                row_start = (block_idx //
                             num_cols_sparsity_mask_2d) * block_size[0]
                col_start = (block_idx %
                             num_cols_sparsity_mask_2d) * block_size[2]

                block = mmOutput_2d[row_start:row_start + block_size[0],
                                    col_start:col_start + block_size[2]]
                goldOutput.append(
                    block.reshape(block_size[0] * block_size[2]))

        goldOutput = np.array(goldOutput)

    return ipuOutput, goldOutput
예제 #7
0
    def run(transposed):
        """Build and run a three-layer matmul chain with batch serialisation.

        Each layer multiplies by an all-ones weight and its result is added
        to a zero-initialised accumulator; the L1 loss of the accumulator is
        the anchored output. With ``transposed`` set, the input is transposed
        first, the weight is the left operand of each matmul and the
        accumulator is created with shape ``[dsize, dsize, bsize]``.
        """
        bsize, dsize = 8, 10
        builder = popart.Builder()
        ip = builder.addInputTensor(
            popart.TensorInfo("FLOAT", [bsize, dsize, dsize]))

        if transposed:
            # Explicitly specify the batch dimension for init
            acc_shape, batch_axis = [dsize, dsize, bsize], 2
        else:
            acc_shape, batch_axis = [bsize, dsize, dsize], 0
        acc = builder.aiGraphcore.init(acc_shape, popart.DataType.FLOAT,
                                       popart.InitType.Zero, batch_axis)

        def add_layer(in_id):
            # One matmul against a fresh all-ones weight; the operand order
            # depends on the layout.
            w = builder.addInitializedInputTensor(
                np.ones([dsize, dsize], np.float32))
            operands = [w, in_id] if transposed else [in_id, w]
            return builder.aiOnnx.matmul(operands)

        layer_out = builder.aiOnnx.transpose([ip]) if transposed else ip
        # Three chained layers, each result accumulated into `acc`.
        for _ in range(3):
            layer_out = add_layer(layer_out)
            acc = builder.aiOnnx.add([acc, layer_out])

        out = builder.aiGraphcore.l1loss([acc], 0.1)
        builder.addOutputTensor(out)

        device = tu.create_test_device(1)

        opts = popart.SessionOptions()
        opts.enableOutlining = True
        opts.batchSerializationSettings.factor = 4

        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(1,
                                     {out: popart.AnchorReturnType("All")}),
            patterns=popart.Patterns(popart.PatternsLevel.All),
            userOptions=opts,
            deviceInfo=device)

        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()

        ip_data = np.ones((bsize, dsize, dsize), dtype=np.float32)
        session.run(popart.PyStepIO({ip: ip_data}, anchors))
예제 #8
0
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[str] = None,
           optimizer: Optional[popart.Optimizer] = None,
           patterns: Optional[popart.Patterns] = None,
           return_stats: bool = False,
           log_dir: Optional[str] = None,
           ipus: Optional[int] = None,
           batches_per_step: int = 1,
           user_options: Optional[Mapping[str, Any]] = None,
           skip_execution: bool = False,
           execution_mode: str = 'DEFAULT',
           replication_factor: int = 1,
           replicated_weight_sharding: bool = False,
           num_reps: int = 1):
    """Compile and run a model with PopART and return the anchored outputs.

    Args:
        proto: Model passed as ``fnModel`` to the session.
        data: Input arrays keyed by tensor name. Extra leading dimensions
            are added for gradient accumulation and ``batches_per_step``.
        outputs: Tensor name(s) to anchor; normalised with ``make_tuple``.
        loss: Loss tensor, used only for training.
        optimizer: If given, a ``TrainingSession`` is created, otherwise an
            ``InferenceSession``.
        patterns: Patterns forwarded to the session.
        return_stats: Enable instrumentation and additionally return memory
            and cycle statistics.
        log_dir: With ``return_stats``, directory where ``gcprofile`` reports
            are saved.
        ipus: Number of IPUs for the non-phased mode (defaults to 1); forced
            to 2 in ``'PHASED'`` mode.
        batches_per_step: Batches per step for the ``popart.DataFlow``.
        user_options: Extra session options set with ``setattr``. The keys
            ``"batchSerializationFactor"`` and ``"executionPhases"`` are
            consumed by the ``'PHASED'`` mode instead, and
            ``"accumulationFactor"`` also drives the input replication.
        skip_execution: Return the uncompiled session right after creation.
        execution_mode: ``'PHASED'`` selects the phased-execution options;
            any other value selects the default options.
        replication_factor: Enables replicated graphs when greater than 1 and
            multiplies the number of requested IPUs.
        replicated_weight_sharding: Turn on replicated tensor sharding for
            weights and optimizer state.
        num_reps: Number of ``session.run`` calls.

    Returns:
        ``session`` when ``skip_execution`` is set. Otherwise
        ``(outputs_gen, post_proto)``, or, with ``return_stats``,
        ``(outputs_gen, post_proto, total_memory, max_tile_memory, cycles)``,
        where ``outputs_gen`` is a generator over the anchored arrays in the
        order of ``outputs``.

    Raises:
        Exception: If no device could be acquired.
        popart.session.OutOfMemoryException: Re-raised from compilation.
    """
    outputs = make_tuple(outputs)

    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step, {output: popart.AnchorReturnType("ALL")
                           for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.reportOptions = {"showVarStorage": "true"}
    if replicated_weight_sharding:
        # These must be assignments: merely evaluating `...sharding.On`
        # leaves the option at its previous value.
        options.weightTensorLocationSettings.location.replicatedTensorSharding = (
            popart.ReplicatedTensorSharding.On)
        options.optimizerStateTensorLocationSettings.location.replicatedTensorSharding = (
            popart.ReplicatedTensorSharding.On)
    if replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = replication_factor
    if execution_mode == 'PHASED':
        options.enableOutlining = True
        options.outlineThreshold = -np.inf
        options.enableOutliningCopyCostPruning = False
        options.autoRecomputation = popart.RecomputationType.Standard
        options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        options.explicitRecomputation = True
        options.aliasZeroCopy = True
        options.batchSerializationSettings.factor = user_options[
            "batchSerializationFactor"]
        options.executionPhaseSettings.phases = user_options["executionPhases"]
        ipus = 2
    else:
        options.enableGroupedMatmuls = False
        options.enableStochasticRounding = False
        options.constantWeights = True
        options.outlineThreshold = 10.0
        if ipus is not None and ipus > 1:
            options.virtualGraphMode = popart.VirtualGraphMode.Manual
        else:
            ipus = 1

    # Remaining user options map directly onto SessionOptions attributes.
    for key, value in user_options.items():
        if key not in ["batchSerializationFactor", "executionPhases"]:
            setattr(options, key, value)

    if return_stats:
        options.engineOptions = {
            "debug.allowOutOfMemory": "true",
            "debug.instrument": "true",
            "opt.internalExchangeOptimisationTarget": "balanced",
        }

    # Round the IPU count up to a power of two, then scale by replication.
    request_ipus = pow(2, math.ceil(math.log2(ipus)))
    request_ipus *= replication_factor
    dm = popart.DeviceManager()
    dm.setOnDemandAttachTimeout(int(1e4))
    device = dm.acquireAvailableDevice(
        request_ipus,
        connectionType=popart.DeviceConnectionType.OnDemand,
        selectionCriterion=popart.DeviceSelectionCriterion.Random)
    if device is None:
        raise Exception("Failed to acquire IPU.")

    print("Compiling graph")
    if optimizer is not None:
        session = popart.TrainingSession(fnModel=proto,
                                         deviceInfo=device,
                                         dataFlow=data_flow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFlow=data_flow,
                                          userOptions=options,
                                          patterns=patterns)

    if skip_execution:
        device.detach()
        return session

    try:
        # Compile the Poplar Graph. If it runs out of memory, save the
        # memory report (when requested) before propagating the error.
        try:
            session.prepareDevice()
        except popart.session.OutOfMemoryException as e:
            if return_stats and log_dir:
                import gcprofile
                os.makedirs(log_dir, exist_ok=True)
                gcprofile.save_popart_report(session,
                                             log_dir=log_dir,
                                             exception=e)
            raise
        print("Compilation complete")

        session.weightsFromHost()
        session.setRandomSeed(1984)

        anchors = session.initAnchorArrays()

        # Add a gradient accumulation factor dimension if needed
        af = user_options.get("accumulationFactor")
        if af is not None and af > 1:
            data = {k: np.repeat(v[np.newaxis], af, 0)
                    for k, v in data.items()}

        # Add a batches_per_step dimension if needed
        if batches_per_step > 1:
            data = {k: np.repeat(v[np.newaxis], batches_per_step, 0)
                    for k, v in data.items()}

        for _ in range(num_reps):
            stepio = popart.PyStepIO(data, anchors)
            session.run(stepio)

        # Round-trip the model through a temporary file to obtain the
        # post-run proto.
        with tempfile.TemporaryDirectory() as tmp:
            file_path = os.path.join(tmp, "model.onnx")
            session.modelToHost(file_path)
            post_proto = onnx.load(file_path)
    finally:
        # Release the device on every path, including failures.
        device.detach()

    if return_stats:
        if log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            reports = gcprofile.save_popart_report(session, log_dir=log_dir)
            graph_report = json.loads(reports["graph"])
            exec_report = json.loads(reports["execution"])
        else:
            graph_report = json.loads(session.getGraphReport())
            exec_report = json.loads(session.getExecutionReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        cycles = exec_report["simulation"]["cycles"]
        return (anchors[output] for output in outputs
                ), post_proto, total_memory, max_tile_memory, cycles
    return (anchors[output] for output in outputs), post_proto
                                                                                                     input_tensor = builder.addInputTensor(popart.TensorInfo("FLOAT", [input_len])) print("Shape of {}: {}".format(input_tensor, builder.getTensorShape(input_tensor)))

                                                                                                                                                                                                                                     output_tensor = builder.customOp(opName = "Rsqrt", opVersion = 1, domain = "ai.graphcore", inputs =[input_tensor], attributes = {})[0]

                                                                                                                                                                                                                                                                      print("Inputs: {}".format(builder.getInputTensorIds())) print("Outputs: {}".format(builder.getOutputTensorIds())) print("Values: {}".format(builder.getValueTensorIds())) print("Shape of {}: {}".format(output_tensor, builder.getTensorShape(output_tensor)))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  builder.addOutputTensor(output_tensor)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              proto = builder.getModelProto()

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  anchors = {output_tensor : popart.AnchorReturnType("FINAL") } dataFlow = popart.DataFlow(1, anchors)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  if run_on_ipu : device = popart.DeviceManager().acquireAvailableDevice(1) print("IPU hardware device acquired") else : device = popart.DeviceManager().createIpuModelDevice({}) print("Running on IPU Model")

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            session = popart.InferenceSession(proto, dataFlow, device)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   session.prepareDevice() result = session.initAnchorArrays()

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             X =(np.array(input_data)).astype(np.float32) print("X={}".format(X))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    stepio = popart.PyStepIO({input_tensor : X }, result) session.run(stepio)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      return result

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              def load_custom_ops_lib() : so_path = os.path.join(os.path.dirname(__file__), "build/custom_ops.so")

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                if not os.path.isfile(so_path) : print("Build the custom ops library with `make` before running this script") exit(1)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       ctypes.cdll.LoadLibrary(so_path)
예제 #10
0
def run(benchmark, opts):
    """Compile and execute a benchmark graph, printing per-step timings.

    Args:
        benchmark: Object providing ``graph_builder(opts)`` (returning the
            model proto, input data, anchors, losses and optimizer) and
            ``iteration_report(opts, duration)``.
        opts: Parsed options controlling the session, device and step count.

    Returns:
        ``(compilation_duration, average_batches_per_sec)``. When
        ``opts.report`` is set, the result of ``save_reports(opts, session)``
        is returned right after the first step instead.

    Raises:
        OSError: if no hardware device could be acquired.
    """
    proto, data, outputs, losses, optimizer = benchmark.graph_builder(opts)

    if opts.save_graph:
        with open('model.onnx', "wb") as model_file:
            model_file.write(proto)
            print("Written to file: model.onnx")

    data_flow = popart.DataFlow(opts.batches_per_step, outputs)

    # Session options used for compilation and execution
    options = popart.SessionOptions()
    if not opts.use_data:
        options.syntheticDataMode = popart.SyntheticDataMode.Zeros
    options.instrumentWithHardwareCycleCounter = opts.report_hw_cycle_count
    instrument_compute = "true" if opts.report else "false"
    options.engineOptions = {"debug.instrumentCompute": instrument_compute}
    if opts.convolution_options:
        options.convolutionOptions = json.loads(opts.convolution_options)

    if opts.shards > 1:
        options.virtualGraphMode = (popart.VirtualGraphMode.Auto
                                    if opts.auto_sharding else
                                    popart.VirtualGraphMode.Manual)

    options.enablePipelining = opts.pipeline

    # Pick either an IPU model (simulation) or real hardware
    manager = popart.DeviceManager()
    if opts.simulation:
        device = manager.createIpuModelDevice({
            "compileIPUCode": True,
            'numIPUs': opts.shards,
            "tilesPerIPU": 1216,
        })
    else:
        device = manager.acquireAvailableDevice(opts.shards)
        if device is None:
            raise OSError("Failed to acquire IPU.")

    # Arguments shared by both session flavours
    session_args = dict(fnModel=proto,
                        deviceInfo=device,
                        dataFlow=data_flow,
                        userOptions=options)
    if opts.mode == 'train':
        session = popart.TrainingSession(loss=losses,
                                         optimizer=optimizer,
                                         **session_args)
    else:
        session = popart.InferenceSession(**session_args)

    print("Compiling...")
    compile_started = time.time()
    session.prepareDevice()
    compilation_duration = time.time() - compile_started
    print("Duration: {:.3f} seconds\n".format(compilation_duration))

    if opts.tensor_tile_mapping:
        with open("tile_mapping.json", 'w') as mapping_file:
            json.dump(session.getTensorTileMap(), mapping_file)
            print("Written to file: tile_mapping.json")

    # Host buffers that receive the anchored results
    anchors = session.initAnchorArrays()

    # Upload weights and optimizer parameters
    session.weightsFromHost()

    # Prepend a batches_per_step dimension when more than one batch per step
    if opts.batches_per_step > 1:
        data = {
            name: np.repeat(array[np.newaxis], opts.batches_per_step, 0)
            for name, array in data.items()
        }

    stepio = popart.PyStepIO(data, anchors)

    print("Executing...")
    average_batches_per_sec = 0
    for _ in range(opts.steps):
        step_started = time.time()
        session.run(stepio)
        duration = time.time() - step_started

        # Reporting mode stops after the first executed step
        if opts.report:
            return save_reports(opts, session)

        average_batches_per_sec += (opts.batches_per_step /
                                    duration) / opts.steps
        timing = "{:<8.3} sec/itr.".format(duration)
        print(timing + "   " + benchmark.iteration_report(opts, duration))

    if opts.report_hw_cycle_count:
        print("Hardware cycle count per 'run':", session.getCycleCount())

    return compilation_duration, average_batches_per_sec
예제 #11
0
def get_model_anchors(doSharding,
                      doPipelining,
                      batchesPerStep,
                      doTraining,
                      replicated_graph_count=1,
                      doProfiling=False,
                      doDropout=False,
                      doGradientAccl=False,
                      acclSteps=1,
                      doDevicex=True,
                      anchorRestoredTensors=False,
                      returnRawInput=False):
    """Build a small sin/exp/conv/softmax/nll model, run one step, return anchors.

    Args:
        doSharding: Unless this is exactly ``False``, the model is split over
            two virtual graphs (manual mode) and two IPUs per replica are
            requested.
        doPipelining: Value assigned to ``enablePipelining``.
        batchesPerStep: Batches per step passed to ``popart.DataFlow``.
        doTraining: If exactly ``True`` a ``TrainingSession`` with
            ``ConstSGD(0.01)`` is created, otherwise an ``InferenceSession``.
        replicated_graph_count: Number of replicas; values above 1 enable
            replicated graphs and multiply the number of IPUs requested.
        doProfiling: If exactly ``True``, ``gcprofile.save_popart_report`` is
            called on the session after the run.
        doDropout: Insert a dropout (ratio 0.2) before the softmax.
        doGradientAccl: Value assigned to ``enableGradientAccumulation``.
        acclSteps: Accumulation factor; the batch of 16 is also divided by it
            to obtain the micro batch size.
        doDevicex: If exactly ``False`` the function returns ``None`` right
            after the session has been constructed.
        anchorRestoredTensors: With training and pipelining both ``True``,
            additionally anchor ``d0`` and the restored ``e0``/``d0`` tensors.
        returnRawInput: If exactly ``True`` the generated input data is added
            to the result under the key ``"input_raw"``.

    Returns:
        The anchor dictionary from ``session.initAnchorArrays()`` after one
        ``session.run``, or ``None`` when ``doDevicex`` is ``False``.
    """
    # Fixed seed so that the labels and input data drawn below are repeatable
    np.random.seed(seed=1)

    builder = popart.Builder()
    batchSize = 16
    # Computed from acclSteps regardless of whether doGradientAccl is set
    microBatchSize = batchSize // acclSteps

    shape_d0 = [microBatchSize, 2, 4, 4]
    shape_l0 = [microBatchSize]

    # d0: data input, w0: conv weights initialised to ones, l0: integer labels
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0))
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0)
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", shape_l0))

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    # Flatten the conv output to [microBatchSize, 32] for the softmax
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [microBatchSize, 32])
    if doDropout:
        do0 = builder.aiOnnx.dropout([r0], num_outputs=1, ratio=0.2)[0]
        out = builder.aiOnnx.softmax([do0], axis=1, debugPrefix="sfm")
    else:
        out = builder.aiOnnx.softmax([r0], axis=1, debugPrefix="sfm")
    nll = builder.aiGraphcore.nllloss([out, l0],
                                      reduction=popart.ReductionType.Sum)

    art = popart.AnchorReturnType("All")

    # Always anchor the loss, the weights and the exp output
    anchor_map = {nll: art, w0: art, e0: art}
    if doTraining is True:
        # Gradient of the data input is only available when training
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradientAccl
    opts.accumulationFactor = acclSteps
    opts.enableStochasticRounding = False

    if doSharding is False:
        numIpus = 1 * replicated_graph_count
    else:
        # Manual sharding: sin/exp/conv on virtual graph 0, the rest on 1
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIpus = 2 * replicated_graph_count
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 0)
        builder.virtualGraph(c0, 0)
        builder.virtualGraph(r0, 1)
        if doDropout:
            builder.virtualGraph(do0, 1)
        builder.virtualGraph(out, 1)
        builder.virtualGraph(nll, 1)

    if replicated_graph_count > 1:
        opts.replicatedGraphCount = replicated_graph_count
        opts.enableReplicatedGraphs = True

    device = tu.create_test_device(numIpus=numIpus)

    if doTraining is True:
        session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                         dataFlow=popart.DataFlow(
                                             batchesPerStep, anchor_map),
                                         loss=nll,
                                         optimizer=popart.ConstSGD(0.01),
                                         userOptions=opts,
                                         deviceInfo=device)
    else:
        session = popart.InferenceSession(fnModel=builder.getModelProto(),
                                          dataFlow=popart.DataFlow(
                                              batchesPerStep, anchor_map),
                                          userOptions=opts,
                                          deviceInfo=device)

    # Callers that only want the session to be constructed stop here
    if doDevicex is False:
        return None

    session.prepareDevice()
    anchors = session.initAnchorArrays()
    session.setRandomSeed(0)

    # Exclusive upper bound for the random labels drawn below
    classes = np.prod(shape_d0) // (batchSize * batchesPerStep)

    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    # With all options enabled return anchors are of the shape:
    # [batches_per_step, accl_factor, repl_factor, micro_batch, *data_shape]
    if acclSteps > 1:
        shape_d0.insert(0, acclSteps)
        label = label.reshape([acclSteps, -1])
    if batchesPerStep > 1:
        shape_d0.insert(0, batchesPerStep)
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)

    # Input data is drawn after the labels, so the order of these two random
    # calls determines the generated values for the fixed seed
    data = np.random.random_sample(shape_d0).astype(np.float32)

    # This is a slightly odd case - we want the same data to be input for both
    # replicated graphs, but the dimension we need to repeat on is either the
    # first or second (the replication dimension) depending on whether we
    # have gradient accumulation enabled.
    # If we are not testing, this is a lot simpler as we can split samples however
    # we want.
    if replicated_graph_count > 1:
        if acclSteps > 1:
            data = np.repeat(data[np.newaxis], replicated_graph_count, 2)
            label = label.reshape([replicated_graph_count, -1])
        else:
            data = np.repeat(data[np.newaxis], replicated_graph_count, 1)
            label = label.reshape([replicated_graph_count, -1])

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)
    # NOTE(review): presumably turns off PyStepIO's input validation because
    # of the hand-built shapes above - confirm against the PopART docs
    stepio.enableRuntimeAsserts(False)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        # Imported lazily so gcprofile is only required when profiling
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
예제 #12
0
        opts.executionPhaseSettings.stages = 2
        opts.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        opts.numIOTiles = 128

        varLocation = popart.TensorLocation()
        varLocation.storage = popart.TensorStorage.OffChip
        varLocation.loadTileSet = popart.TileSet.IO
        varLocation.storageTileSet = popart.TileSet.IO
        opts.weightTensorLocationSettings.location = varLocation
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    print("Compiling.")
    session = popart.InferenceSession(fnModel=proto,
                                      dataFlow=popart.DataFlow(
                                          args.batches_per_step, anchor_map),
                                      userOptions=opts,
                                      deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()

    print("Running.")
    for i in range(args.iters):
        input_data = np.random.rand(args.batches_per_step, args.batch_size,
                                    args.dsize, args.dsize).astype(dtype)
        stepio = popart.PyStepIO({input_id: input_data}, anchors)
        start = time.time()
        session.run(stepio)
        duration = time.time() - start
예제 #13
0
        timePerEvent = elapsed / n

        print("{:.12f}".format(elapsed))

    else:

        # POPART IMPORT

        graph_transformer = popart.GraphTransformer(onnx_model)

        anchors = {"tag": popart.AnchorReturnType("ALL")}
        dataFeed = popart.DataFlow(1, anchors)
        device = popart.DeviceManager().acquireAvailableDevice(1)

        session = popart.InferenceSession(
            graph_transformer.getModelProto(), dataFeed, device
        )

        session.prepareDevice()

        inferenceAnchors = session.initAnchorArrays()

        inputs = np.random.rand(n, nFeatures, nTracks, 1).astype(np.float32)

        stepio = popart.PyStepIO({"data": inputs}, inferenceAnchors)

        # for i in range(10): session.run(stepio)

        start = time.perf_counter()

        session.run(stepio)
예제 #14
0
        def run(self,
                init_builder,
                reference,
                step_type='infer',
                opsets=None,
                optimizer=popart.ConstSGD(0.01),
                seed=None):
            """Build a model, run a single step and verify it against a reference.

            Args:
                init_builder: Callable invoked with the ``Builder``; it must
                    return the tensor ids to check. For ``'train'`` the first
                    returned id is used as the loss input.
                reference: Callable invoked with a ``RefData``; it must return
                    one reference value per id from ``init_builder`` (``None``
                    skips the check for that id).
                step_type: Either ``'infer'`` or ``'train'``.
                opsets: Forwarded to the ``Builder`` constructor.
                optimizer: Optimizer for the training session. Note that the
                    default instance is created once, when the function is
                    defined, and shared between calls.
                seed: If not ``None``, passed to ``session.setRandomSeed``.

            Returns:
                The session that was created and run.

            Raises:
                Exception: if a builder input is not C-contiguous, or the
                    reference returns a value of an unsupported type.
            """
            assert step_type in ('infer', 'train')

            bld = Builder(opsets=opsets, check_model=self.check_model)

            anchors = {}

            # Allows to pass additional arguments to init_builder, if required
            # by the specific init_builder function implementation.
            kwargs = {}
            kwargs = tu.filter_dict(kwargs, init_builder)
            anchorIds = init_builder(bld, **kwargs)

            # Initialised inputs (weights) are not anchored; they are read
            # back with readWeights during verification instead.
            for anchorId in anchorIds:
                if anchorId not in bld._init_input_map:
                    anchors[anchorId] = popart.AnchorReturnType("All")

            dataFlow = popart.DataFlow(1, anchors)

            self.options.logDir = self.logging_dir

            if self.tilesPerIPU is not None:
                device = tu.create_test_device(numIpus=self.numIPUs,
                                               tilesPerIPU=self.tilesPerIPU)
                print(f"Created device {device} with {self.numIPUs}"
                      f" IPUs and {self.tilesPerIPU} tiles per IPU")
            else:
                device = tu.create_test_device(numIpus=self.numIPUs)
                print(f"Created device {device} with {self.numIPUs} IPUs")

            self.patterns.InPlace = self.inplacing
            if step_type == 'infer':
                session = popart.InferenceSession(fnModel=bld.getModelProto(),
                                                  dataFlow=dataFlow,
                                                  deviceInfo=device,
                                                  patterns=self.patterns,
                                                  userOptions=self.options)
            else:
                assert step_type == 'train'
                # Apply reduction to output (assumed to be the
                # first anchorId) to ensure it is scalar
                lossId = anchorIds[0]
                lossId = bld.aiGraphcore.identityloss(
                    [lossId], reduction=self.lossReduction)

                session = popart.TrainingSession(fnModel=bld.getModelProto(),
                                                 dataFlow=dataFlow,
                                                 loss=lossId,
                                                 optimizer=optimizer,
                                                 deviceInfo=device,
                                                 patterns=self.patterns,
                                                 userOptions=self.options)

            anchor_map = session.initAnchorArrays()

            session.prepareDevice()

            if seed is not None:
                session.setRandomSeed(seed)

            for k, v in bld._input_map.items():
                if not v.flags['C_CONTIGUOUS']:
                    # need to call np.ascontiguousarray
                    # `x = np.ascontiguousarray(x)`
                    raise Exception(
                        'Input "{}" to popart.PyStepIO is not C_CONTIGUOS'.
                        format(k))

            # Add the replication dimension to the inputs
            inputs = {}
            for k, v in bld._input_map.items():
                if self.options.replicatedGraphCount > 1:
                    um = (self.options.replicatedGraphCount, )
                    um = um + tuple([1] * np.ndim(v))

                    # we add this offset to ensure that samples on devices are distinct
                    offset = 1 * np.arange(
                        self.options.replicatedGraphCount).astype(
                            v.dtype).reshape(um)

                    inputs[k] = np.tile(v, um) + offset

                else:
                    inputs[k] = v

            stepio = popart.PyStepIO(inputs, anchor_map)

            if (step_type == 'train'):
                session.weightsFromHost()

            session.run(stepio)

            if (step_type == 'train'):
                session.weightsToHost()

            ref_out = reference(RefData(bld._outputs, anchor_map))

            def fix_type(t):
                """Convert a reference value to something verifyTensor accepts."""
                if isinstance(t, torch.Tensor):
                    return t.data.numpy()
                if t is None or isinstance(
                        t, (np.ndarray, np.float32, np.float16, np.int32)):
                    return t
                raise Exception('unexpected type', type(t))

            ref_out = [fix_type(i) for i in ref_out]
            for index, key in enumerate(anchorIds):
                if key in anchors:
                    if ref_out[index] is not None:
                        print('Testing anchor "{}"...'.format(key))
                        self.verifyTensor(anchor_map[key], ref_out[index])
                    else:
                        print('Not Testing anchor "{}" as it is None'.format(
                            key))
                elif key in bld._init_input_map:
                    if ref_out[index] is not None:
                        print('Testing weight "{}"...'.format(key))
                        weightInfo = session.getInfo(key)
                        # The placeholders were previously printed verbatim
                        # because the values were passed as extra print
                        # arguments instead of being formatted in.
                        print('Weight info shape:{} type:{}'.format(
                            weightInfo.shape(), weightInfo.data_type_lcase()))
                        weights = {}
                        weights[key] = np.empty(
                            shape=weightInfo.shape(),
                            dtype=weightInfo.data_type_lcase())
                        weightsIo = popart.PyWeightsIO(weights)
                        session.readWeights(weightsIo)

                        self.verifyTensor(weights[key], ref_out[index])

                    else:
                        print('Not Testing weight "{}" as it is None'.format(
                            key))

            return session
예제 #15
0
def test_auto_virtual_graph_subgraphs_2():
    """Check automatic virtual graph placement for three chained subgraphs.

    Two inputs feed four matmuls whose weights are tagged "TESTID-A".."D".
    With two IPUs and ``VirtualGraphMode.Auto`` the ops consuming weights A
    and B must be placed on IPU 0 and those consuming C and D on IPU 1.
    """
    ipus = 2

    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    def add_weight(tag):
        # Zero-initialised 64x64 half weights, identified by their debug tag
        return builder.addInitializedInputTensor(
            np.zeros([64, 64], np.float16), tag)

    input_shape = [1, 64]
    input1 = builder.addInputTensor(popart.TensorInfo("FLOAT16", input_shape))
    input2 = builder.addInputTensor(popart.TensorInfo("FLOAT16", input_shape))

    # Subgraph 0: two matmuls chained on the first input
    x0 = builder.aiOnnx.matmul([input1, add_weight("TESTID-A")])
    x0 = builder.aiOnnx.matmul([x0, add_weight("TESTID-B")])

    # Subgraph 1: a single matmul on the second input
    x1 = builder.aiOnnx.matmul([input2, add_weight("TESTID-C")])

    # Subgraph 2: join both branches, then one more matmul
    joined = builder.aiOnnx.add([x0, x1])
    output = builder.aiOnnx.matmul([joined, add_weight("TESTID-D")])
    builder.addOutputTensor(output)

    # Desired split is:
    # ipu1: 0. ipu2: 1,2

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {output: popart.AnchorReturnType("Final")})

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Auto

    device = tu.create_test_device(numIpus=ipus)

    session = popart.InferenceSession(fnModel=proto,
                                      dataFlow=dataFlow,
                                      userOptions=opts,
                                      deviceInfo=device)
    ir = json.loads(session._serializeIr(popart.IrSerializationFormat.JSON))

    # IPU each tagged weight's consumer is expected to be placed on
    expected_ipu = {
        "TESTID-A": 0,
        "TESTID-B": 0,
        "TESTID-C": 1,
        "TESTID-D": 1,
    }
    for op in ir["maingraph"]:
        ipu = op["attributes"]["__ipu_number"]
        for tensor in op["inputs"]:
            for tag, wanted in expected_ipu.items():
                if tag in tensor["name"]:
                    assert (int(ipu) == wanted)
예제 #16
0
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[str] = None,
           optimizer: Optional[popart.Optimizer] = None,
           patterns: Optional[popart.Patterns] = None,
           return_stats: bool = False,
           log_dir: Optional[str] = None,
           ipus: Optional[int] = None,
           batches_per_step: int = 1,
           user_options: Optional[Mapping[str, Any]] = None,
           skip_execution: bool = False):
    """Compile and run a model once on hardware, returning its anchored outputs.

    Args:
        proto: Model to run.
        data: Input arrays keyed by tensor id. With ``batches_per_step > 1``
            each array is repeated along a new leading dimension.
        outputs: Tensor id(s) to anchor and return.
        loss: Loss tensor id, used only when ``optimizer`` is given.
        optimizer: If given a training session is created, otherwise an
            inference session.
        patterns: Patterns forwarded to the session.
        return_stats: Enable instrumentation and additionally return memory
            and cycle statistics.
        log_dir: Directory for ``gcprofile`` reports when ``return_stats`` is
            set.
        ipus: Number of IPUs the model needs; values above 1 enable manual
            virtual graphs. The device request is rounded up to a power of two.
        batches_per_step: Batches per step for the data flow.
        user_options: Attribute name/value pairs set on the session options.
        skip_execution: Return the constructed session without compiling.

    Returns:
        The session if ``skip_execution`` is set. Otherwise
        ``(anchors_generator, post_proto)``, or, with ``return_stats``,
        ``(anchors_generator, post_proto, total_memory, max_tile_memory,
        cycles)``. ``anchors_generator`` lazily yields one array per output.

    Raises:
        Exception: if no device could be acquired.
        popart.session.OutOfMemoryException: re-raised from compilation after
            optionally saving a report.
    """
    outputs = make_tuple(outputs)

    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step, {output: popart.AnchorReturnType("ALL")
                           for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.enableGroupedMatmuls = False
    options.enableStochasticRounding = False
    options.constantWeights = True
    options.outlineThreshold = 10.0
    options.reportOptions = {
        "showVarStorage": "true"
    }
    if ipus is not None and ipus > 1:
        options.virtualGraphMode = popart.VirtualGraphMode.Manual
    else:
        ipus = 1
    for key, value in user_options.items():
        setattr(options, key, value)

    # Set after the user options so statistics collection always has the
    # instrumentation it relies on, whatever the caller passed in.
    if return_stats:
        options.engineOptions = {
            "debug.allowOutOfMemory": "true",
            "debug.instrument": "true"
        }

    request_ipus = pow(2, math.ceil(math.log2(ipus)))
    device = popart.DeviceManager().acquireAvailableDevice(request_ipus)
    if device is None:
        raise Exception("Failed to acquire IPU.")

    # From here on the device is always released, including when session
    # construction, compilation or execution raises.
    try:
        print("Compiling graph")
        if optimizer is not None:
            session = popart.TrainingSession(fnModel=proto,
                                             deviceInfo=device,
                                             dataFlow=data_flow,
                                             userOptions=options,
                                             loss=loss,
                                             optimizer=optimizer,
                                             patterns=patterns)
        else:
            session = popart.InferenceSession(fnModel=proto,
                                              deviceInfo=device,
                                              dataFlow=data_flow,
                                              userOptions=options,
                                              patterns=patterns)

        if skip_execution:
            return session

        # Compile the Poplar Graph. If it runs out of memory, save the
        # memory report (when requested) and re-raise.
        try:
            session.prepareDevice()
        except popart.session.OutOfMemoryException as e:
            if return_stats and log_dir:
                import gcprofile
                os.makedirs(log_dir, exist_ok=True)
                gcprofile.save_popart_report(session,
                                             log_dir=log_dir,
                                             exception=e)
            raise
        print("Compilation complete")

        session.weightsFromHost()
        session.setRandomSeed(1984)

        anchors = session.initAnchorArrays()

        # Add a batches_per_step dimension if needed
        if batches_per_step > 1:
            data = {k: np.repeat(v[np.newaxis], batches_per_step, 0)
                    for k, v in data.items()}

        stepio = popart.PyStepIO(data, anchors)

        session.run(stepio)

        # Round-trip the model through a file to obtain the post-run proto
        with tempfile.TemporaryDirectory() as tmp:
            file_path = os.path.join(tmp, "model.onnx")
            session.modelToHost(file_path)
            post_proto = onnx.load(file_path)
    finally:
        # Release device
        device.detach()

    if return_stats:
        if log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            reports = gcprofile.save_popart_report(session, log_dir=log_dir)
            graph_report = json.loads(reports["graph"])
            exec_report = json.loads(reports["execution"])
        else:
            graph_report = json.loads(session.getGraphReport())
            exec_report = json.loads(session.getExecutionReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        cycles = exec_report["simulation"]["cycles"]
        return (anchors[output] for output in outputs
                ), post_proto, total_memory, max_tile_memory, cycles
    return (anchors[output] for output in outputs), post_proto
예제 #17
0
def test_rng_set_and_get():
    """
    Check that a training session's RNG state can be read, restored and
    validated around an interfering inference session on the same device.

    1. Create a training and validation session
        with the option to enable rng set/get.
    2. Get the initial RNG state values
    3. Step 1 : do 5 runs of the training session, twice
    4. Step 2 :
        - reset the RNG to the initial state
        - do 5 runs of the training session
        - capture the rng state
        - do 1 run of the validation session
        - restore the rng
        - do 5 runs of the training session again
    5. Step 3 :
        - Reset the RNG to the initial state
        - do 5 runs of the training session,
        - do 1 run of the validation session
        - do 5 runs of the training session again
    6. Results comparison:
        Steps 1 and 2 must have the same outputs
        after the series of 5 runs.
        Step 3 must have a different output after the second
        series of 5 runs, due to the interfering session overwriting
        the RNG state.
    7. Finally, check that setRNGState/getRNGState round-trip an arbitrary
        state of the right size and that a state of the wrong size raises
        a popart_exception.
    """

    # Fix the host-side NumPy seed so the weights and input data generated
    # below are the same on every run of the test.
    np.random.seed(0)

    # Model definition
    builder = popart.Builder()
    dShape = [100, 100]
    i0 = builder.addInputTensor(popart.TensorInfo("FLOAT16", dShape))
    wData = np.random.rand(*dShape).astype(np.float16)
    w0 = builder.addInitializedInputTensor(wData)
    out = builder.aiOnnx.matmul([i0, w0])
    loss = builder.aiGraphcore.l1loss([out], 0.1)

    # Both sessions below are created on this single device.
    device = tu.create_test_device(1)

    # Enable the options
    options = popart.SessionOptions()
    options.enableLoadAndOffloadRNGState = True
    options.enableStochasticRounding = True
    options.constantWeights = False
    options._enableRngStateManagement = True

    # Training session
    # bps is the first DataFlow argument (batches per step); data_a below
    # has a matching leading dimension of 5.
    bps = 5
    tr_opt = popart.SGD({"defaultMomentum": (0.01, True)})
    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=popart.DataFlow(bps, [out]),
                                     loss=loss,
                                     optimizer=tr_opt,
                                     deviceInfo=device,
                                     userOptions=options)
    session.prepareDevice()
    anchors = session.initAnchorArrays()

    # Get the initial RNG state before any other operation.
    init_rng = session.getRNGState()

    # Interfering inference session
    interfering_session = popart.InferenceSession(
        fnModel=builder.getModelProto(),
        dataFlow=popart.DataFlow(bps, [out]),
        deviceInfo=device,
        userOptions=options)
    interfering_session.prepareDevice()
    inf_anchors = interfering_session.initAnchorArrays()

    # Input data
    data_a = np.random.rand(5, 100, 100).astype(np.float16)

    def run_session(session):
        # Run one step of the given session on data_a and return the RNG
        # state read back afterwards together with the anchored output as
        # nested Python lists (so results can be compared with ==).
        # NOTE(review): 'MatMul:0' is presumably the tensor id of `out`.
        stepio = popart.PyStepIO({i0: data_a}, anchors)
        session.run(stepio)
        return session.getRNGState(), anchors['MatMul:0'].tolist()

    def run_interference(interfering_session):
        # Run one step of the inference session in between training runs.
        interfering_session.weightsFromHost()
        inf_stepio = popart.PyStepIO({i0: data_a}, inf_anchors)
        interfering_session.run(inf_stepio)

    # Step 1 -> training, training
    # Reference sequence: two training runs with no interference.
    session.weightsFromHost()
    session.setRNGState(init_rng)
    rng, pre1 = run_session(session)
    session.weightsFromHost()
    rng2, output1 = run_session(session)
    # The RNG state must have changed between the two runs.
    assert rng != rng2

    # Step 2 -> interleaved training, validation, training
    session.weightsFromHost()
    session.setRNGState(init_rng)
    rng, pre2 = run_session(session)
    run_interference(interfering_session)
    session.weightsFromHost()
    # Restore the state captured after the first run, undoing whatever the
    # interfering session did to the RNG.
    session.setRNGState(rng)
    rng2, output2 = run_session(session)
    assert output1 == output2

    # Step 3 -> interleaved training, validation, RNG not restored
    session.weightsFromHost()
    session.setRNGState(init_rng)
    rng, pre3 = run_session(session)
    run_interference(interfering_session)
    session.weightsFromHost()
    rng2, output3 = run_session(session)

    # The first run of every step started from init_rng, so they all agree;
    # only the second run of step 3 is expected to diverge.
    assert pre1 == pre2 == pre3
    assert (output3 != output1)

    # Small tests about the seed
    init_rng = session.getRNGState()

    # not all states are valid, but we don't check that as long as the size is correct
    new_rng = [k for k in range(len(init_rng))]
    session.setRNGState(new_rng)
    rng1 = session.getRNGState()
    assert (rng1 == new_rng)

    session.setRNGState(init_rng)
    rng2 = session.getRNGState()
    assert (rng2 == init_rng)

    # check that an RNGState of the wrong size raises an exception
    # (appending one element makes the state one entry too long)
    init_rng.append(0)
    with pytest.raises(popart.popart_exception) as e_info:
        session.setRNGState(init_rng)
    assert e_info.value.args[0].startswith(
        "Devicex::setRngStateValue received rngState of size")
예제 #18
0
def test_outlining_bca2():
    """
    In this test we check that the default behaviour is for matmul to be
    cached.

    The graph contains four matmuls over two pairs of identically shaped
    inputs. After running an inference step, the summary report is scanned
    for matmul compute sets and exactly one is expected.
    """

    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    matmul_lhs_shape = popart.TensorInfo("FLOAT", [2, 3])
    matmul_rhs_shape = popart.TensorInfo("FLOAT", [3, 4])

    i1 = builder.addInputTensor(matmul_lhs_shape)
    i2 = builder.addInputTensor(matmul_rhs_shape)
    i3 = builder.addInputTensor(matmul_lhs_shape)
    i4 = builder.addInputTensor(matmul_rhs_shape)

    # First group: two matmuls, their relus, and the sum of all four tensors.
    c1 = builder.aiOnnx.matmul([i1, i2])
    c2 = builder.aiOnnx.matmul([i3, i4])

    r1 = builder.aiOnnx.relu([c1])
    r2 = builder.aiOnnx.relu([c2])

    a1 = builder.aiOnnx.sum([r1, r2, c1, c2])

    # Second group: the same structure built again from the same inputs.
    c3 = builder.aiOnnx.matmul([i1, i2])
    c4 = builder.aiOnnx.matmul([i3, i4])

    r3 = builder.aiOnnx.relu([c3])
    r4 = builder.aiOnnx.relu([c4])

    a2 = builder.aiOnnx.sum([r3, r4, c3, c4])

    o = builder.aiOnnx.add([a1, a2])

    builder.addOutputTensor(o)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {o: popart.AnchorReturnType("All")})

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}

    session = popart.InferenceSession(
        fnModel=proto,
        dataFlow=dataFlow,
        userOptions=opts,
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    anchors = session.initAnchorArrays()

    session.prepareDevice()

    matmul1_lhs = np.ones(matmul_lhs_shape.shape(), dtype=np.float32)
    matmul1_rhs = np.ones(matmul_rhs_shape.shape(), dtype=np.float32)

    matmul2_lhs = np.ones(matmul_lhs_shape.shape(), dtype=np.float32)
    matmul2_rhs = np.ones(matmul_rhs_shape.shape(), dtype=np.float32)

    inputs = {
        i1: matmul1_lhs,
        i2: matmul1_rhs,
        i3: matmul2_lhs,
        i4: matmul2_rhs
    }
    stepio = popart.PyStepIO(inputs, anchors)

    session.run(stepio)

    # Check that there is only one matmul compute set in the report. The
    # regex below matches the "Convolve" compute set nested under
    # "matmulGrouped".
    summaryReport = session.getSummaryReport()
    computeSets = tu.get_compute_sets_from_report(summaryReport)

    num_matmuls = tu.get_compute_set_regex_count(
        r'^.+/matmulGrouped/Conv_1/Convolve$', computeSets)
    # There should be only one matmul
    assert (num_matmuls == 1)
예제 #19
0
def sparse_softmax(dims, block_size, sparsity_level, inner_group_size):
    """Run the BsSoftmax custom op and build the matching reference output.

    A block-sparse input is generated with `create_sparse_matrix`, pushed
    through a single-op PopART inference graph, and the dense softmax of the
    same data is rearranged into blocks so that only the blocks enabled in
    the sparsity mask remain.

    Args:
        dims: full matrix dimensions; the last two entries are the 2D matrix
            size, any leading entries are treated as group dimensions.
        block_size: two-element block size (rows, columns).
        sparsity_level: forwarded to `create_sparse_matrix`.
        inner_group_size: forwarded to the op as "innerGroupSize".

    Returns:
        Tuple `(ipu_output, gold_output)`; their shapes are asserted equal.
    """

    sparse_input, lengths_per_2d_plane, dense_input, sparsity_mask = create_sparse_matrix(
        dims, block_size, sparsity_level, -1000)

    # Build a graph holding just the custom softmax op.
    builder = popart.Builder()
    input_tensor = builder.addInputTensor(
        popart.TensorInfo("FLOAT", sparse_input.shape))

    op_attributes = {
        "matrixDims": dims,
        "blockSize": block_size,
        "sparsity": sparsity_mask.tolist(),
        "groupSizes": lengths_per_2d_plane.tolist(),
        "innerGroupSize": inner_group_size,
        "subBlockMaskPerGroup": "None" * len(lengths_per_2d_plane)
    }
    output_tensor = builder.customOp(opName="BsSoftmax",
                                     opVersion=1,
                                     domain="ai.graphcore",
                                     inputs=[input_tensor],
                                     attributes=op_attributes)[0]
    builder.addOutputTensor(output_tensor)

    proto = builder.getModelProto()

    # One batch per step, returning every value of the output tensor.
    dataFlow = popart.DataFlow(1,
                               {output_tensor: popart.AnchorReturnType("ALL")})

    # Compile for one available device, then execute a single step.
    session = popart.InferenceSession(
        fnModel=proto,
        dataFlow=dataFlow,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(1))
    session.prepareDevice()
    anchors = session.initAnchorArrays()

    sparse_input = np.array(sparse_input, dtype=g_input_data_type)
    session.run(popart.PyStepIO({input_tensor: sparse_input}, anchors))

    ipu_output = anchors[output_tensor]

    # Block geometry of one 2D plane.
    mat_dims = dims[-2:]
    block_grid = [mat_dims[0] // block_size[0], mat_dims[1] // block_size[1]]
    num_blocks_2d = block_grid[0] * block_grid[1]
    block_area = block_size[0] * block_size[1]

    total_group_dims = int(np.prod(dims[:-2]))
    assert sparsity_mask.shape == (total_group_dims * num_blocks_2d, )

    cpu_output = softmax(dense_input)

    np.set_printoptions(precision=2, suppress=True)

    # Rearrange the dense result to (group, block index, flattened block):
    # split rows/columns into blocks, bring the two block-grid axes together,
    # then flatten the grid and the block contents.
    cpu_output = cpu_output.reshape([
        total_group_dims, block_grid[0], block_size[0], block_grid[1],
        block_size[1]
    ])
    cpu_output = cpu_output.transpose([0, 1, 3, 2, 4]).reshape(
        total_group_dims, num_blocks_2d, block_area)

    # Keep only the blocks whose mask entry is set, plane by plane.
    gold_rows = []
    for plane_idx, plane in enumerate(cpu_output):
        first = plane_idx * num_blocks_2d
        plane_mask = sparsity_mask[first:first + num_blocks_2d]
        gold_rows.extend(plane[block_idx]
                         for block_idx in range(num_blocks_2d)
                         if plane_mask[block_idx])

    gold_output = np.array(gold_rows)
    assert ipu_output.shape == gold_output.shape

    return ipu_output, gold_output
예제 #20
0
# Feed every model input with a flat float32 vector of ones holding as many
# elements as the input's declared shape.
# NOTE(review): `sess` is presumably an ONNX Runtime session - confirm.
# np.prod replaces np.product, which was removed in NumPy 2.0.
for it in sess.get_inputs():
    space_input[it.name] = np.array([1.0] * int(np.prod(it.shape)),
                                    dtype=np.float32)
# Anchor every model output.
for it in sess.get_outputs():
    space_output[it.name] = popart.AnchorReturnType("ALL")

if 'PROF' in os.environ:
    popart.getLogger().setLevel("DEBUG")

anchors = space_output

dataFeed = popart.DataFlow(1, anchors)

# Try to build the session on an available device first and fall back to the
# IPU Model if that fails. Catch Exception rather than using a bare except so
# that KeyboardInterrupt / SystemExit still propagate.
try:
    session = popart.InferenceSession(
        model_path, dataFeed,
        popart.DeviceManager().acquireAvailableDevice())
except Exception:
    session = popart.InferenceSession(
        model_path, dataFeed,
        popart.DeviceManager().createIpuModelDevice({}))
    print('Using IPU Model ..')
else:
    print('Using IPU Hardware ..')

session.prepareDevice()

# From here on `anchors` refers to the output buffers allocated by the session.
anchors = session.initAnchorArrays()
stepio = popart.PyStepIO(space_input, anchors)

session.run(stepio)
import time
예제 #21
0
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
import popart

import torch.onnx
import torchvision

# Random example input used to trace the model during ONNX export.
input_ = torch.FloatTensor(torch.randn(4, 3, 224, 224))
model = torchvision.models.alexnet(pretrained=True)

# Name given to the exported graph output; reused below as the anchor key.
output_name = "output"

# Write the model to "alexnet.onnx" in the current working directory.
torch.onnx.export(model, input_, "alexnet.onnx", output_names=[output_name])

# Create a runtime environment
# Anchor the exported output with return type "All"; 100 is the first
# DataFlow argument (batches per step).
anchors = {output_name: popart.AnchorReturnType("All")}
dataFlow = popart.DataFlow(100, anchors)
device = popart.DeviceManager().createCpuDevice()

# Build an inference session directly from the exported ONNX file.
session = popart.InferenceSession("alexnet.onnx", dataFlow, device)
예제 #22
0
# Pick the tensor ids: a (truthy) command-line value wins, otherwise fall
# back to the first input / output id reported by the builder.
input_ = args.input_tensor or builder.getInputTensorIds()[0]
output = args.output_tensor or builder.getOutputTensorIds()[0]

print("Input:", input_, "Output:", output)

# Turn fixed-point initializers into constants before building the session.
graph_transformer = popart.GraphTransformer(onnx_model)
graph_transformer.convertAllFixedPointInitializersToConstants()

# Forward-pass session on an IPU Model device, anchoring the chosen output.
session = popart.InferenceSession(
    fnModel=graph_transformer.getModelProto(),
    dataFlow=popart.DataFlow(1, {output: popart.AnchorReturnType("All")}),
    deviceInfo=popart.DeviceManager().createIpuModelDevice({}),
)

# Compile the graph.
print("Compiling...")
session.prepareDevice()

# Allocate the output buffers and bind the first test input to the graph input.
inferenceAnchors = session.initAnchorArrays()
stepio = popart.PyStepIO({input_: inputs[0]}, inferenceAnchors)

# Execute one inference step.
session.run(stepio)

# Check the output from the test data is approximately equal to our inference
try:
예제 #23
0
def test_stepio_callbackinput(tmpdir):
    """Feed and drain an add graph through PyStepIOCallback callbacks.

    The graph computes `o = i1 + i2` and anchors both inputs and the output.
    Input batches are handed out one at a time by `input_callback`, and the
    anchor buffers are handed out one batch at a time by `output_callback`.
    After the run, the anchored inputs must equal the data that was fed and
    the anchored output must equal their sum.

    Args:
        tmpdir: pytest fixture; not used by the test body.
    """

    builder = popart.Builder()
    shape = popart.TensorInfo("FLOAT", [2])

    i1 = builder.addInputTensor(shape)
    i2 = builder.addInputTensor(shape)
    o = builder.aiOnnx.add([i1, i2])
    builder.addOutputTensor(o)

    proto = builder.getModelProto()

    batches_per_step = 2

    dataFlow = popart.DataFlow(
        batches_per_step, {
            i1: popart.AnchorReturnType("All"),
            i2: popart.AnchorReturnType("All"),
            o: popart.AnchorReturnType("All")
        })

    session = popart.InferenceSession(fnModel=proto,
                                      dataFlow=dataFlow,
                                      deviceInfo=tu.create_test_device())

    session.prepareDevice()

    anchors = session.initAnchorArrays()

    i1_data = np.random.rand(batches_per_step, 2).astype(np.float32)
    i2_data = np.random.rand(batches_per_step, 2).astype(np.float32)

    inputs = {i1: i1_data, i2: i2_data}

    # Index of the next batch to hand out for each input stream.
    next_input = {i1: 0, i2: 0}

    def input_callback(tensor_id, prefetch):
        # NOTE(review): the sleep presumably simulates a slow data producer;
        # confirm before shortening it.
        time.sleep(2)
        print("input_callback ", tensor_id)

        t = inputs[tensor_id]

        print(t)
        print("input_callback ", tensor_id, len(t))

        index = next_input[tensor_id]
        if index >= len(t):
            # Fail with a clear message instead of falling through to an
            # unbound local when more batches are requested than exist.
            raise IndexError(
                "input_callback: no data left for tensor {} "
                "(requested batch {}, have {})".format(
                    tensor_id, index, len(t)))
        next_input[tensor_id] = index + 1
        result = t[index]

        print(result)

        return result

    def input_complete_callback(tensor_id):
        print("input_complete_callback ", tensor_id)

    # Index of the next anchor slice to hand out for each anchored tensor.
    next_output = {i1: 0, i2: 0, o: 0}

    def output_callback(tensor_id):
        time.sleep(2)
        print("output_callback ", tensor_id)

        index = next_output[tensor_id]
        result = anchors[tensor_id][index]
        next_output[tensor_id] = index + 1

        return result

    def output_complete_callback(tensor_id):
        print("output_complete_callback ", tensor_id)

    stepio = popart.PyStepIOCallback(input_callback, input_complete_callback,
                                     output_callback, output_complete_callback)

    session.run(stepio)

    # confirm that writing device-to-host of a Stream Tensor returns correctly (unchanged)
    assert (np.allclose(anchors[i1], i1_data))
    assert (np.allclose(anchors[i2], i2_data))

    expected_result = i1_data + i2_data
    assert (np.allclose(anchors[o], expected_result))
예제 #24
0
def main():
    """Train `Net` through popart.torch and evaluate it after every epoch.

    For each of 10 epochs the function runs the training session over
    `trainloader`, prints accuracy, loss and throughput, saves the trained
    model to "torchModel.onnx", and then builds a fresh PopART inference
    session from that file to measure accuracy over `testloader`.

    Relies on module-level names defined elsewhere in the file: `Net`,
    `trainloader`, `testloader`, `trainset`, `testset`, `batch_size` and
    `batches_per_step`.
    """
    net = Net()

    criterion = nn.NLLLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    # One sample batch is needed to convert the model. Use the builtin
    # next(): iterators only guarantee __next__, not a .next() method.
    inputs, labels = next(iter(trainloader))

    opts = popart.SessionOptions()

    start = time.process_time()
    # Pass all the pytorch stuff to the session
    torchSession = popart.torch.TrainingSession(
        torchModel=net,
        inputs=inputs,
        targets=labels,
        optimizer=optimizer,
        losses=criterion,
        batch_size=batch_size,
        batches_per_step=batches_per_step,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(1),
        userOptions=opts)
    print("Converting pytorch model took {:.2f}s".format(time.process_time() -
                                                         start))

    # Prepare for training.
    anchors = torchSession.initAnchorArrays()

    print("Compiling model...")
    torchSession.prepareDevice()

    torchSession.weightsFromHost()

    for epoch in range(10):  # loop over the dataset multiple times
        start_time = time.time()

        running_loss = 0.0
        running_accuracy = 0
        print("#" * 20, "Train phase:", "#" * 20)
        for i, data in enumerate(trainloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            torchSession.run(inputs, labels)
            running_loss += np.mean(anchors["loss_0"])

            # 20-character progress bar based on the samples seen so far.
            progress = 20 * (
                i + 1) * batch_size * batches_per_step // len(trainset)
            print('\repoch {} [{}{}]  '.format(epoch + 1, progress * '.',
                                               (20 - progress) * ' '),
                  end='')

            # Predicted class = argmax over the 10 outputs of every sample.
            results = np.argmax(
                anchors['output_0'].reshape(
                    [batches_per_step * batch_size, 10]), 1)
            num_correct = np.sum(results == anchors['target_0'].reshape(
                [batches_per_step * batch_size]))
            running_accuracy += num_correct
        print("Accuracy: {}%".format(running_accuracy * 100 / len(trainset)))

        end_time = time.time()
        print('loss: {:.2f}'.format(running_loss / (i + 1)))
        print("Images per second: {:.0f}".format(
            len(trainset) / (end_time - start_time)))

        # Save the model with weights
        torchSession.modelToHost("torchModel.onnx")

        # Pytorch currently doesn't support importing from onnx:
        # https://github.com/pytorch/pytorch/issues/21683
        # And pytorch->onnx->caffe2 is broken:
        # https://github.com/onnx/onnx/issues/2463
        # So we import into popart session and infer.
        # Alternatively, use any other ONNX compatible runtime.

        builder = popart.Builder("torchModel.onnx")

        # NOTE(review): a new device is acquired on every epoch while the
        # training session still holds its own - confirm enough devices
        # are available for this.
        inferenceSession = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(
                batches_per_step,
                {"output_0": popart.AnchorReturnType("All")}),
            deviceInfo=popart.DeviceManager().acquireAvailableDevice(1))

        print("Compiling test model...")
        inferenceSession.prepareDevice()
        inferenceAnchors = inferenceSession.initAnchorArrays()
        print("#" * 20, "Test phase:", "#" * 20)
        test_accuracy = 0
        for j, data in enumerate(testloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            stepio = popart.PyStepIO({"input_0": inputs.data.numpy()},
                                     inferenceAnchors)

            inferenceSession.run(stepio)

            progress = 20 * (j +
                             1) * batch_size * batches_per_step // len(testset)
            print('\rtest epoch {} [{}{}]  '.format(epoch + 1, progress * '.',
                                                    (20 - progress) * ' '),
                  end='')

            results = np.argmax(
                inferenceAnchors['output_0'].reshape(
                    [batches_per_step * batch_size, 10]), 1)
            num_correct = np.sum(results == labels.data.numpy().reshape(
                [batches_per_step * batch_size]))
            test_accuracy += num_correct

        print("Accuracy: {}%".format(test_accuracy * 100 / len(testset)))
    print('Finished Training')
예제 #25
0
def main(argv):
    """Compile the model from `graph_builder()` and time inference steps.

    Prints the run configuration, builds a PopART inference session on one
    acquired device, compiles it (saving a report and exiting with status 1
    on a PrepareDeviceException), then runs steps and reports per-step
    timings. In synthetic mode the same feed is run `FLAGS.iterations`
    times; otherwise every element yielded by `data` is run once.

    Args:
        argv: not used by the body. NOTE(review): presumably supplied by the
            absl application runner - confirm.

    Raises:
        Exception: if no device could be acquired.
    """
    FLAGS = flags.FLAGS
    print(f"micro batch size is {FLAGS.micro_batch_size}")
    print(f"batch size is {FLAGS.batch_size}")
    print(f"batches_per_step is {FLAGS.batches_per_step}")
    proto, data, outputs, output_id = graph_builder()
    print(f"Model: {FLAGS.model_name}")
    if not FLAGS.synthetic:
        print(f"Data_dir: {FLAGS.data_dir}")
    else:
        print("Using synthetic data")
    print(f"Data_sub_dir for this process: {FLAGS.data_sub_dir}")
    print(f"num_workers: {FLAGS.num_workers}")
    print(f"batches per step: {FLAGS.batches_per_step}")
    dataFlow = popart.DataFlow(FLAGS.batches_per_step, outputs)

    # Create a session to compile and execute the graph
    options = popart.SessionOptions()
    if FLAGS.synthetic:
        options.syntheticDataMode = popart.SyntheticDataMode.Zeros
    options.instrumentWithHardwareCycleCounter = FLAGS.report_hw_cycle_count

    # Configure precision of convolutions and MatMuls
    if FLAGS.half_partials:
        options.convolutionOptions = {'partialsType': 'half'}
        options.partialsTypeMatMuls = "half"

    # Select a device; fail before printing so that "None" is never reported
    # as the selected device.
    deviceManager = popart.DeviceManager()
    device = deviceManager.acquireAvailableDevice(1)
    if device is None:
        raise Exception("Not enough IPUs available.")
    print(f"{device}\n")

    session = popart.InferenceSession(fnModel=proto,
                                      deviceInfo=device,
                                      dataFlow=dataFlow,
                                      userOptions=options)

    print("Compiling...")
    start = time.time()
    try:
        session.prepareDevice()
    except popart.PrepareDeviceException as e:
        # Save whatever report is available for the failed compilation.
        import gcprofile
        gcprofile.save_popart_report(session, exception=e)
        sys.exit(1)
    compilation_duration = time.time() - start
    print("Time to compile: {:.3f} seconds\n".format(compilation_duration))

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()
    # Copy weights and optimisation parameters onto the device
    session.weightsFromHost()

    def report_time(duration, data_duration=None, compute_duration=None):
        # Print one line of timing for a step: total time, optional
        # preprocessing/compute split, and throughput.
        report_string = "Total {:<8.3} sec.".format(duration)
        if data_duration:
            report_string += "   Preprocessing {:<8.3} sec ({:4.3}%).".format(
                data_duration, 100 * (data_duration / duration))
        if compute_duration:
            report_string += "   Compute {:<8.3} sec ({:4.3}%).".format(
                compute_duration, 100 * (compute_duration / duration))
        # NOTE(review): "{:5f}" renders the truncated int with six decimals;
        # kept as-is to preserve the output format.
        report_string += "   {:5f} images/sec.".format(
            int(FLAGS.micro_batch_size * FLAGS.batches_per_step / duration))
        print(report_string)
        if FLAGS.report_hw_cycle_count:
            print("Hardware cycle count per 'run':", session.getCycleCount())

    print("Executing...")

    # Synthetic mode replays the same feed FLAGS.iterations times; otherwise
    # `data` is iterated and each element is one step's feed.
    if FLAGS.synthetic:
        step_feeds = (data for _ in range(FLAGS.iterations))
    else:
        step_feeds = data

    # Run
    start = time.time()
    durations = []
    for feed in step_feeds:
        stepio = popart.PyStepIO(feed, anchors)
        # Calc data duration (time since the previous step finished)
        data_time = time.time()
        data_d = data_time - start
        # Run compute
        session.run(stepio)
        # Calc compute duration; the output anchor is looked up inside the
        # timed region, as before.
        results = anchors[output_id]
        comp_d = time.time() - data_time
        # Calc total duration
        t = time.time() - start
        report_time(t, data_d, comp_d)
        durations.append(t)
        start = time.time()
    duration = np.mean(durations)
예제 #26
0
    d3 = np.random.rand(1, 3 * hidden_size, hidden_size).astype(np.float32)

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", d1.shape))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", d2.shape))
    i3 = builder.addInputTensor(popart.TensorInfo("FLOAT", d3.shape))
    Y, Y_h = builder.aiOnnx.gru([i1, i2, i3],
                                2,
                                clip=None,
                                direction="bidirectional")
    builder.addOutputTensor(Y)

    dataFlow = popart.DataFlow(1, {Y: popart.AnchorReturnType("All")})

    # Create a session to compile and the graph for inference
    #------------------------------------------------------------------------------
    inferenceOptions = popart.SessionOptions()
    # Need to compile the inference graph with variable weights we they can be updated
    # before execution

    inferenceSession = popart.InferenceSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        userOptions=inferenceOptions,
        deviceInfo=popart.DeviceManager().createIpuModelDevice({}))

    # Compile graph
    inferenceSession.prepareDevice()

    # Create buffers to receive results from the execution
    inferenceAnchors = inferenceSession.initAnchorArrays()
예제 #27
0
def get_model_anchors(doSharding,
                      doPipelining,
                      batchesPerStep,
                      doTraining,
                      doProfiling=False,
                      doDevicex=True,
                      anchorRestoredTensors=False,
                      returnRawInput=False):
    """Build a small sin/exp/conv/softmax/nll model, run one step, return anchors.

    Args:
        doSharding: unless exactly False, place the ops on three virtual
            graphs and request three IPUs.
        doPipelining: value assigned to the session's enablePipelining option.
        batchesPerStep: batches per step for the DataFlow; when > 1 a
            leading dimension of this size is added to the generated data.
        doTraining: when exactly True, create a TrainingSession (and anchor
            the gradient of the data input); otherwise an InferenceSession.
        doProfiling: when exactly True, save a report via gcprofile.
        doDevicex: when exactly False, return None right after the session
            has been constructed.
        anchorRestoredTensors: with training and pipelining both True, also
            anchor the data input and the restored copies of e0 and d0.
        returnRawInput: when exactly True, add the generated input data to
            the returned dict under "input_raw".

    Returns:
        The anchors dict from the session, or None if `doDevicex` is False.
    """
    # Seed first: the random data and labels drawn below must be reproducible.
    np.random.seed(seed=1)

    builder = popart.Builder()
    batchSize = 2
    shape_d0 = [batchSize, 2, 4, 4]
    shape_l0 = [batchSize]
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0))
    w0 = builder.addInitializedInputTensor(
        np.ones(shape=[2, 2, 3, 3]).astype(np.float32))
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", shape_l0))

    # Forward graph: sin -> exp -> conv -> reshape -> softmax -> nll loss.
    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [batchSize, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugPrefix="sfm")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")

    anchor_map = {nll: art, w0: art, e0: art}
    if doTraining is True:
        extra_anchors = [popart.reservedGradientPrefix() + d0]
        if doPipelining is True and anchorRestoredTensors is True:
            extra_anchors += [
                popart.reservedRestoredPrefix() + e0,
                d0,
                popart.reservedRestoredPrefix() + d0,
            ]
        for tensor_id in extra_anchors:
            anchor_map[tensor_id] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining

    numIPUs = 1
    if doSharding is not False:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        # (tensor, virtual graph) placement of every op output.
        placement = ((s0, 0), (e0, 1), (c0, 1), (r0, 2), (out, 2), (nll, 2))
        for tensor_id, vgraph in placement:
            builder.virtualGraph(tensor_id, vgraph)

    # Arguments shared by both session types.
    session_kwargs = dict(
        fnModel=builder.getModelProto(),
        dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
        userOptions=opts,
        deviceInfo=tu.create_test_device(numIpus=numIPUs, tilesPerIPU=20))
    if doTraining is True:
        session = popart.TrainingSession(loss=nll,
                                         optimizer=popart.ConstSGD(0.01),
                                         **session_kwargs)
    else:
        session = popart.InferenceSession(**session_kwargs)

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    # Prepend the batches-per-step dimension to the host data when needed.
    if batchesPerStep > 1:
        shape_d0 = [batchesPerStep] + shape_d0
        shape_l0 = [batchesPerStep] + shape_l0
    data = np.random.uniform(low=-10.0, high=10.0,
                             size=shape_d0).astype(np.float32)
    # Number of elements per sample, used as the exclusive label upper bound.
    classes = np.prod(shape_d0) / (batchSize * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    stepio = popart.PyStepIO({d0: data, l0: label}, anchors)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
예제 #28
0
def get_model_anchors_model1(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             labelArray=None):
    """Build a three-stage matmul model with an NLL loss, run it once and
    return the anchor arrays.

    The model is three stages of two ``hidden_size x hidden_size`` matmuls
    (weights initialised to ones), followed by ``nllloss`` in the last stage.
    ``batch_size`` and ``hidden_size`` are read from module scope.

    Args:
        doSharding: When truthy, stage ``k`` is placed on virtual graph ``k``;
            otherwise every stage goes on virtual graph 0. One IPU is
            requested only when this ``is False``, otherwise three.
        doPipelining: Value assigned to ``SessionOptions.enablePipelining``.
        batchesPerStep: Batches per step given to ``popart.DataFlow``. When
            greater than 1, the inputs get a matching outer dimension.
        doTraining: ``True`` creates a ``TrainingSession`` (``ConstSGD(0.01)``
            with the NLL output as loss); anything else creates an
            ``InferenceSession``.
        doGradAccl: Value assigned to
            ``SessionOptions.enableGradientAccumulation``. When ``True`` and
            training, the accl-to-update tensor of the last weight is anchored.
        gradAcclFactor: Accumulation factor. The micro batch size is
            ``batch_size // gradAcclFactor``.
        doProfiling: When ``True``, ``gcprofile.save_popart_report`` is called
            after the run.
        doDevicex: When ``False``, return ``None`` right after the session is
            constructed, without preparing or running the device.
        anchorRestoredTensors: When ``True`` (and training with pipelining),
            the restored versions of the loss and last weight are anchored.
        labelArray: Labels fed to the loss; required whenever the model is
            actually run. Presumably holds ``batch_size`` class indices for
            one batch -- TODO confirm against callers.

    Returns:
        The dict returned by ``session.initAnchorArrays()`` after one
        ``session.run``, or ``None`` when ``doDevicex is False``.

    Raises:
        ValueError: If the model is to be run and ``labelArray`` is ``None``.
    """
    micro_batch_size = batch_size // gradAcclFactor
    builder = popart.Builder()

    input_shape = [micro_batch_size, hidden_size]
    input_ = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape))

    x = input_
    last_stage = 2
    for stage in range(last_stage + 1):
        # Without sharding everything lives on virtual graph 0.
        with builder.virtualGraph(stage if doSharding else 0):
            for i in range(2):
                w = builder.addInitializedInputTensor(
                    np.ones([hidden_size, hidden_size]).astype(np.float32),
                    f"weight_{stage}_{i}")
                x = builder.aiOnnx.matmul([x, w])
            if stage == last_stage:
                # The label input and the loss share the final stage's
                # virtual graph.
                label = builder.addInputTensor("INT32", [micro_batch_size])
                x = builder.aiGraphcore.nllloss([x, label])
    # The last weight created (weight_2_1) is the one that gets anchored.
    w0 = w

    output = x

    builder.addOutputTensor(output)

    art = popart.AnchorReturnType("All")
    anchor_map = {x: art, w0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + x] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + x] = art
            anchor_map[popart.reservedRestoredPrefix() + w0] = art
        if doGradAccl is True:
            anchor_map[popart.reservedAcclToUpdatePrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor
    # Manual mode is always needed: ops are explicitly assigned to virtual
    # graph 0 even when the model is not sharded.
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    numIPUs = 1 if doSharding is False else 3

    # Arguments common to both session kinds.
    session_kwargs = dict(
        fnModel=builder.getModelProto(),
        dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
        userOptions=opts,
        deviceInfo=tu.create_test_device(numIpus=numIPUs))
    if doTraining is True:
        session = popart.TrainingSession(loss=output,
                                         optimizer=popart.ConstSGD(0.01),
                                         **session_kwargs)
    else:
        session = popart.InferenceSession(**session_kwargs)

    if doDevicex is False:
        return None

    # Fail with a clear message before the (expensive) device preparation
    # rather than with an AttributeError/TypeError on None further down.
    if labelArray is None:
        raise ValueError(
            "labelArray must be provided when doDevicex is not False")
    labels = np.asarray(labelArray)

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of batchesPerStep. The labels are repeated
        # so every batch in the step sees the same labels, keeping results
        # consistent between runs that use different input shapes.
        outer_dim *= batchesPerStep
        labels = np.repeat(labels[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Split each batch into gradAcclFactor micro batches, giving
        # gradAcclFactor * batchesPerStep rows of labels in total.
        outer_dim *= gradAcclFactor
        labels = labels.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Prepend the gradAcclFactor * batchesPerStep dimension to the input.
        input_shape = [outer_dim] + input_shape

    stepio = popart.PyStepIO(
        {
            input_: np.ones(input_shape, np.float32),
            label: labels.astype(np.int32)
        }, anchors)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    return anchors