def get_updated_p1_popart():
    builder = popart.Builder()

    # Computation is out = matmul(c1, p1)
    c1 = builder.addInputTensor(popart.TensorInfo("FLOAT", c1_shape))
    p1 = builder.addInitializedInputTensor(p1_init)
    out = builder.aiOnnx.matmul([c1, p1])

    # Set up a training session.
    device = tu.create_test_device()
    dataFlow = popart.DataFlow(
        1, {
            c1: popart.AnchorReturnType("Final"),
            p1: popart.AnchorReturnType("Final"),
            out: popart.AnchorReturnType("Final")
        })

    # We're testing that losses other than nll/l1 work.
    loss = builder.aiOnnx.reducesum([out])
    optimizer = popart.SGD({
        "defaultLearningRate": (sgd_learning_rate, True),
        "defaultMomentum": (sgd_moment, False),
        "lossScaling": (200, constLossScaling)
    })
    session = popart.TrainingSession(builder.getModelProto(),
                                     deviceInfo=device,
                                     dataFlow=dataFlow,
                                     loss=loss,
                                     optimizer=optimizer)
    session.prepareDevice()
    session.weightsFromHost()

    # Run the popart session to get an answer.
    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO({c1: c1_init}, anchors)
    session.run(stepio)
    return anchors[c1], anchors[p1], anchors[out]

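# get_updated_p1_popart reads several values from the enclosing test module
# (c1_shape, c1_init, p1_init, sgd_learning_rate, sgd_moment,
# constLossScaling). A minimal sketch of plausible definitions follows, for
# illustration only -- the real values are defined elsewhere in the
# original file.
c1_shape = [2, 2]
c1_init = np.random.rand(*c1_shape).astype(np.float32)
p1_init = np.random.rand(*c1_shape).astype(np.float32)
sgd_learning_rate = 0.1
sgd_moment = 0.9
constLossScaling = False  # whether the loss-scaling value is constant
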
def test_distributed_replicated_allreduce():
    mpi_params = get_mpi_params()
    mpi_size, mpi_rank = mpi_params
    input_data = np.array(range(10), dtype=np.float32)

    builder = popart.Builder()
    t = builder.addInitializedInputTensor(input_data, "input")
    o = builder.aiGraphcore.replicatedallreduce([t])
    builder.addOutputTensor(o)
    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {o: popart.AnchorReturnType("All")})

    opts = popart.SessionOptions()
    opts.enableReplicatedGraphs = False
    opts.enableDistributedReplicatedGraphs = True
    opts.globalReplicaOffset = mpi_rank
    opts.globalReplicationFactor = 2

    numIpus = 1
    device = tu.create_test_device(numIpus=numIpus)
    session = popart.InferenceSession(fnModel=proto,
                                      dataFlow=dataFlow,
                                      userOptions=opts,
                                      deviceInfo=device)

    session.prepareDevice()
    anchors = session.initAnchorArrays()
    inputs = {}
    stepio = popart.PyStepIO(inputs, anchors)
    session.run(stepio)

    ground_truth = 2.0 * np.array(range(10), dtype=np.float32)
    assert np.allclose(anchors[o], ground_truth)

def test_stream_on_off(tmpdir):
    builder = popart.Builder()

    shape = popart.TensorInfo("FLOAT16", [2])

    i1 = builder.addInputTensor(shape)
    i2 = builder.addInputTensor(shape)
    o = builder.aiOnnx.add([i1, i2])
    builder.addOutputTensor(o)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(
        1, {
            i1: popart.AnchorReturnType("All"),
            i2: popart.AnchorReturnType("All"),
            o: popart.AnchorReturnType("All")
        })

    session = popart.InferenceSession(fnModel=proto,
                                      dataFlow=dataFlow,
                                      deviceInfo=tu.create_test_device())
    session.prepareDevice()

    anchors = session.initAnchorArrays()
    inputs = {
        i1: np.array([1., 3.], dtype=np.float16),
        i2: np.array([7., 8.], dtype=np.float16)
    }
    stepio = popart.PyStepIO(inputs, anchors)
    session.run(stepio)

    # Confirm that the device-to-host write of a stream tensor returns it
    # unchanged.
    assert np.allclose(anchors[i1], np.array([1., 3.], dtype=np.float16))
    assert np.allclose(anchors[i2], np.array([7., 8.], dtype=np.float16))

def check_op_with_invalid_axes(opset, reduceOp, axis):
    with pytest.raises(popart.popart_exception) as e_info:
        builder = popart.Builder()
        tensor_info = popart.TensorInfo("FLOAT", SHAPE)
        x = builder.addInputTensor(tensor_info, "input")

        # reducemedian returns 2 outputs in an array, so we convert the
        # singleton outputs into arrays as well
        ys = getattr(getattr(builder, opset), reduceOp)([x], axes=[axis])
        if not isinstance(ys, list):
            ys = [ys]
        for y in ys:
            builder.addOutputTensor(y)

        anchors = {y: popart.AnchorReturnType("ALL") for y in ys}
        proto = builder.getModelProto()
        dataFlow = popart.DataFlow(1, anchors)
        device = popart.DeviceManager().createCpuDevice()
        # this should throw an error
        session = popart.InferenceSession(proto, dataFlow, device)

    assert (e_info.value.args[0] == (
        "Axis {} is out of acceptable range [{}, {}]").format(
            axis, -RANK, RANK - 1))

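# Hedged usage sketch: check_op_with_invalid_axes is presumably driven by a
# parametrized test over reduce ops and out-of-range axes. SHAPE and RANK are
# module-level constants in the original file; the values and the test below
# are illustrative assumptions only.
SHAPE = [2, 3, 4]
RANK = len(SHAPE)


@pytest.mark.parametrize("reduceOp", ["reducesum", "reducemax", "reducemin"])
@pytest.mark.parametrize("axis", [-RANK - 1, RANK])
def test_reduce_op_with_invalid_axes(reduceOp, axis):
    # Any axis outside [-RANK, RANK - 1] should raise a popart_exception.
    check_op_with_invalid_axes("aiOnnx", reduceOp, axis)
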
def run_lstm_popart(onnx_file_name, inputs):
    # generate a popart session
    builder = popart.Builder(onnx_file_name)
    outputs = builder.getOutputTensorIds()
    dataFlow = popart.DataFlow(1, outputs)
    device = tu.create_test_device(1)
    s = popart.InferenceSession(fnModel=onnx_file_name,
                                dataFlow=dataFlow,
                                deviceInfo=device)

    anchor_map = s.initAnchorArrays()
    s.prepareDevice()

    # run the popart session
    input_map = {
        'X': inputs[0],
        'initial_h': inputs[1],
        'initial_c': inputs[2]
    }
    stepio = popart.PyStepIO(input_map, anchor_map)
    s.run(stepio)
    return (anchor_map['Y'], anchor_map['Y_h'], anchor_map['Y_c'])

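# Hedged usage sketch for run_lstm_popart: the ONNX file comes from elsewhere
# (e.g. a torch LSTM export) with inputs named X, initial_h and initial_c.
# The shapes below ([seq_len, batch, input_size] for X and
# [num_directions, batch, hidden_size] for the initial states) are
# illustrative assumptions only.
def example_run_lstm_popart(onnx_file_name):
    seq_len, batch_size, input_size, hidden_size = 5, 2, 3, 7
    inputs = [
        np.random.rand(seq_len, batch_size, input_size).astype(np.float32),
        np.random.rand(1, batch_size, hidden_size).astype(np.float32),
        np.random.rand(1, batch_size, hidden_size).astype(np.float32)
    ]
    return run_lstm_popart(onnx_file_name, inputs)
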
def test_enabled_recomputation():
    """
    In this test we check that NO error is thrown when doing pipelining
    if recomputation is enabled
    """
    builder, op0_out, op1_out, op2_out, op3_out, anchor_map = get_simple_linear_model(
    )
    opts = popart.SessionOptions()
    opts.enablePipelining = True
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual
    opts.autoRecomputation = popart.RecomputationType.Standard

    builder.virtualGraph(op0_out, 0)
    builder.virtualGraph(op1_out, 1)
    builder.virtualGraph(op2_out, 1)
    builder.virtualGraph(op3_out, 1)

    session = popart.InferenceSession(fnModel=builder.getModelProto(),
                                      dataFlow=popart.DataFlow(10, anchor_map),
                                      userOptions=opts,
                                      deviceInfo=tu.create_test_device(
                                          numIpus=2, tilesPerIpu=20))

def getTrainingSession(fn):
    opts = popart.SessionOptions()
    opts.enableGradientAccumulation = True
    opts.accumulationFactor = accum_factor
    opts.disableGradAccumulationTensorStreams = False
    if explicit_loops:
        opts.enableExplicitMainLoops = True
        opts.aliasZeroCopy = True
        opts.explicitRecomputation = True
        opts.useHostCopyOps = True

    sess = popart.TrainingSession(
        fnModel=fn,
        dataFlow=popart.DataFlow(1, {}),
        deviceInfo=tu.create_test_device(tilesPerIPU=testTilesPerIPU),
        loss=output_name,
        optimizer=adam_optimizer,
        userOptions=opts)
    sess.prepareDevice()

    sess.weightsFromHost()

    return sess

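# getTrainingSession assumes several values from its enclosing scope
# (accum_factor, explicit_loops, testTilesPerIPU, output_name,
# adam_optimizer). A minimal sketch of plausible definitions, for
# illustration only -- the real values live in the surrounding test:
#
#     accum_factor = 4
#     explicit_loops = True
#     testTilesPerIPU = 16
#     output_name = loss_id  # id of the loss tensor in the serialized model
#     adam_optimizer = popart.Adam({"defaultLearningRate": (0.01, True)})
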
def test_fail_due_to_mismatch_permutation():
    d1 = np.random.randn(10, 20, 30).astype(np.float32)

    builder = popart.Builder()
    d = builder.addInputTensor("FLOAT", d1.shape)
    o = builder.aiOnnx.transpose([d], perm=(0, 2, 1))
    o = builder.aiOnnx.transpose([o], perm=(1, 2, 0))

    sess = popart.InferenceSession(fnModel=builder.getModelProto(),
                                   deviceInfo=tu.create_test_device(),
                                   dataFlow=popart.DataFlow(1, [o]))
    sess.prepareDevice()
    anchors = sess.initAnchorArrays()

    stepio = popart.PyStepIO({d: d1}, anchors)
    sess.weightsFromHost()
    sess.run(stepio)

    ir = json.loads(sess._serializeIr(popart.IrSerializationFormat.JSON))
    assert len(
        list(filter(lambda op: "Transpose" in op["type"],
                    ir["maingraph"]))) == 2

def test_no_virtual_graph():
    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    o1 = builder.aiOnnx.add([i1, i2])
    o2 = builder.aiOnnx.add([i1, i2])
    o = builder.aiOnnx.add([o1, o2])
    builder.addOutputTensor(o)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {o: popart.AnchorReturnType("All")})

    opts = popart.SessionOptions()

    popart.InferenceSession(fnModel=proto,
                            dataFlow=dataFlow,
                            userOptions=opts,
                            deviceInfo=tu.create_test_device())

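# The test below takes binEdgeLocation as a parameter; in the original test
# file it is presumably driven by pytest parametrization along these lines
# (the values are illustrative assumptions covering both sides of [0, 1]):
@pytest.mark.parametrize("binEdgeLocation", [-0.1, 0.0, 0.5, 1.0, 1.1])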
def test_auto_loss_scaling_bin_edge_factor_range(binEdgeLocation):
    """Test that an error is thrown if the binEdgeLocation hyperparameter
    is outside [0, 1].
    """
    builder = popart.Builder()
    t0 = builder.addInputTensor("FLOAT", [2, 2])
    t1_data = np.random.rand(2, 2).astype(np.float32)
    t1 = builder.addInitializedInputTensor(t1_data)
    mm0 = builder.aiOnnx.matmul([t0, t1])
    loss = builder.aiGraphcore.identityloss([mm0])
    optimizer = popart.SGD({"lossScaling": (2, False)})

    opts = popart.SessionOptions()
    opts.automaticLossScalingSettings.enabled = True
    opts.automaticLossScalingSettings.binEdgeLocation = binEdgeLocation
    opts.automaticLossScalingSettings.thresholdUpperCountProportion = 0.2

    with ExitStack() as stack:
        e_info = None
        if not 0 <= binEdgeLocation <= 1:
            e_info = stack.enter_context(
                pytest.raises(popart.popart_exception))
        session = popart.TrainingSession(builder.getModelProto(),
                                         deviceInfo=tu.create_test_device(),
                                         dataFlow=popart.DataFlow(1, [loss]),
                                         loss=loss,
                                         optimizer=optimizer,
                                         userOptions=opts)

    if e_info:
        assert e_info.value.args[0].startswith(
            "[AutomaticLossScale transform] Out of range value for 'binEdgeLocation'."
        )
    else:
        session.prepareDevice()

def test_overwriting_external_data_file():
    # Verify that when calling modelToHost twice, the external data is
    # overwritten correctly and not corrupted.
    builder = popart.Builder()
    d1 = np.random.rand(3, 3).astype(np.float32)
    i1 = builder.addInitializedInputTensor(d1)
    o = builder.aiOnnx.matmul([i1, i1])
    loss = builder.aiGraphcore.identityloss([o])

    with TemporaryDirectory() as tmpdir:
        tmpfile0 = os.path.join(tmpdir, "model_tensors0.onnx")
        builder.saveInitializersExternally([i1], tmpfile0)

        optimizer = popart.SGD({
            "defaultLearningRate": (0.2, True),
            "defaultMomentum": (0.5, True)
        })
        session = popart.TrainingSession(
            deviceInfo=popart.DeviceManager().createCpuDevice(),
            fnModel=builder.getModelProto(),
            loss=loss,
            optimizer=optimizer,
            dataFlow=popart.DataFlow(1, []))
        session.prepareDevice()
        session.weightsFromHost()
        anchors = session.initAnchorArrays()
        session.run(popart.PyStepIO({}, anchors))

        # Should overwrite external data with the same data
        tmpfile1 = os.path.join(tmpdir, "model0.onnx")
        session.modelToHost(tmpfile1)
        weights0 = np.fromfile(tmpfile0, dtype=np.float32)
        session.modelToHost(tmpfile1)
        weights1 = np.fromfile(tmpfile0, dtype=np.float32)
        assert np.array_equal(weights0, weights1)

def test_identity_inference_session(inputShape, inputArray, BPS, art, R,
                                    explicit, expected):
    builder = popart.Builder()

    inInfo = popart.TensorInfo("FLOAT", inputShape)

    i1 = builder.addInputTensor(inInfo)
    o = builder.aiOnnx.identity([i1])
    builder.addOutputTensor(o)

    proto = builder.getModelProto()

    batchesPerStep = BPS
    dataFlow = popart.DataFlow(batchesPerStep, {o: art})

    opts = popart.SessionOptions()
    opts.replicatedGraphCount = R
    opts.enableReplicatedGraphs = R > 1
    opts.enableExplicitMainLoops = explicit
    opts.useHostCopyOps = explicit

    device = tu.create_test_device(numIpus=R)

    session = popart.InferenceSession(fnModel=proto,
                                      dataFlow=dataFlow,
                                      deviceInfo=device,
                                      userOptions=opts)

    session.prepareDevice()
    anchors = session.initAnchorArrays()

    inputs = {i1: np.array(inputArray, dtype=np.float32)}
    stepio = popart.PyStepIO(inputs, anchors)

    session.run(stepio)

    assert np.array_equal(anchors[o], expected)

def test_ipu_copy_bca2():
    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    o1 = builder.aiOnnx.add([i1, i2])
    o2 = builder.aiOnnx.add([i1, i2])
    o3 = builder.aiOnnx.add([o1, o2])
    o4 = builder.aiOnnx.add([o1, o2])
    o = builder.aiOnnx.add([o3, o4])
    builder.addOutputTensor(o)

    builder.virtualGraph(o1, 0)
    builder.virtualGraph(o2, 0)
    builder.virtualGraph(o3, 1)
    builder.virtualGraph(o4, 1)
    builder.virtualGraph(o, 2)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {o: popart.AnchorReturnType("All")})

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    s = popart.InferenceSession(fnModel=proto,
                                dataFlow=dataFlow,
                                userOptions=opts,
                                deviceInfo=tu.create_test_device(numIpus=3))
    s.prepareDevice()

def get_simple_model_cycle_count(bps):
    builder = popart.Builder()

    # Make the model large enough that the cycle count is dominated
    # by compute and internal exchange (as opposed to host exchange).
    d_shape = [200, 200]
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", d_shape))
    out = d0
    for layer in range(100):
        out = builder.aiOnnx.sin([out])

    opts = popart.SessionOptions()
    opts.instrumentWithHardwareCycleCounter = True
    # Verify that we can still measure cycles when data streams
    # (inputs/weights/anchors) are off.
    opts.syntheticDataMode = popart.SyntheticDataMode.Zeros

    session = popart.InferenceSession(
        fnModel=builder.getModelProto(),
        dataFlow=popart.DataFlow(bps, {out: popart.AnchorReturnType("All")}),
        userOptions=opts,
        deviceInfo=tu.create_test_device(),
        patterns=popart.Patterns(popart.PatternsLevel.NoPatterns))
    session.prepareDevice()
    anchors = session.initAnchorArrays()
    if bps > 1:
        d_shape.insert(0, bps)
    stepio = popart.PyStepIO(
        {d0: np.random.rand(*d_shape).astype(np.float32)}, anchors)
    session.run(stepio)
    cycles = session.getCycleCount()
    cycles_ = session.getCycleCount()
    print("BPS: ", bps, " Cycles: ", cycles)
    # Verify that the cycle-count tensor is not overwritten when streamed
    # off the device a second time.
    assert cycles == cycles_
    return cycles

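# Hedged usage sketch: the cycle counter is only meaningful on real hardware,
# and the original file presumably compares counts across batches-per-step
# values roughly like this (illustrative only):
def example_compare_cycle_counts():
    cycles_bps1 = get_simple_model_cycle_count(bps=1)
    cycles_bps10 = get_simple_model_cycle_count(bps=10)
    # Running more batches per step should cost more cycles overall.
    assert cycles_bps10 > cycles_bps1
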
def get_replicated_dropout_session(replication_factor=4,
                                   dsize=10,
                                   num_layers=1,
                                   ratio=0.3,
                                   batches_per_step=1,
                                   seed=0):
    builder = popart.Builder()
    ip = builder.addInputTensor(popart.TensorInfo("FLOAT", [dsize]))
    d__ip = popart.reservedGradientPrefix() + ip
    out = ip
    for layer in range(num_layers):
        [out] = builder.aiOnnx.dropout([out], num_outputs=1, ratio=ratio)
    loss = builder.aiGraphcore.identityloss([out])
    builder.addOutputTensor(loss)

    device = tu.create_test_device(replication_factor)

    dfAnchors = [out, ip, d__ip]
    dfAnchors = {i: popart.AnchorReturnType("All") for i in dfAnchors}

    opts = popart.SessionOptions()
    opts.enableReplicatedGraphs = True
    opts.replicatedGraphCount = replication_factor

    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=popart.DataFlow(
                                         batches_per_step, dfAnchors),
                                     optimizer=popart.ConstSGD(0.1),
                                     loss=loss,
                                     userOptions=opts,
                                     deviceInfo=device)

    session.prepareDevice()
    session.setRandomSeed(seed)
    session.weightsFromHost()
    anchors = session.initAnchorArrays()
    return session, ip, out, d__ip, anchors

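# Hedged usage sketch: a typical consumer of this fixture runs one step and
# checks that the replicas draw different dropout masks. This is illustrative
# only; the original tests may make stronger statistical checks.
def example_replicated_dropout_masks_differ(replication_factor=4, dsize=10):
    session, ip, out, d__ip, anchors = get_replicated_dropout_session(
        replication_factor=replication_factor, dsize=dsize)
    data = np.ones([replication_factor, dsize], dtype=np.float32)
    stepio = popart.PyStepIO({ip: data}, anchors)
    session.run(stepio)
    # With per-replica RNG streams, the masks should not all be identical.
    masks = [anchors[out][i] for i in range(replication_factor)]
    assert any(not np.array_equal(masks[0], m) for m in masks[1:])
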
def test_np_memory_layout_add_initialized_input_tensor1():
    """ Test that when we create a parameter input with a non-contiguous
    array things still work (first test). """
    np.random.seed(1)

    # Build a computational graph. Initialise an input parameter with a
    # transposed input (which happens to be non-contiguous in numpy).
    builder = popart.Builder()
    input1Value = np.random.randint(0, 100, size=(2, 3), dtype='int32')
    input1Value = np.transpose(input1Value, [1, 0])
    input1 = builder.addInitializedInputTensor(input1Value)
    input1 = builder.aiOnnx.identity([input1])
    builder.addOutputTensor(input1)

    # Prepare a session.
    anchorConfig = {input1: popart.AnchorReturnType("ALL")}
    dataFlow = popart.DataFlow(1, anchorConfig)
    deviceConfig = {'numIPUs': 1}
    dm = popart.DeviceManager()
    device = dm.createIpuModelDevice(deviceConfig)
    session = popart.InferenceSession(fnModel=builder.getModelProto(),
                                      dataFlow=dataFlow,
                                      deviceInfo=device)
    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()

    # Run the session.
    stepio = popart.PyStepIO({}, anchors)
    session.run(stepio)

    # Compare outputs.
    assert (anchors[input1] == input1Value
            ).all(), f"Expected {anchors[input1]} to match {input1Value}"

def test_valid_recompute_options():
    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    r1 = builder.aiOnnx.relu([i1])
    o = builder.aiOnnx.relu([r1])

    # specify manual recomputation
    builder.recomputeOutputInBackwardPass(r1)

    # specify auto recomputation as well
    opts = popart.SessionOptions()
    opts.autoRecomputation = popart.RecomputationType.Standard

    with pytest.raises(popart.popart_exception) as e_info:
        session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                         dataFlow=popart.DataFlow(1, [o]),
                                         optimizer=popart.ConstSGD(0.001),
                                         loss=o,
                                         patterns=popart.Patterns([]),
                                         userOptions=opts,
                                         deviceInfo=tu.create_test_device())
    assert (e_info.value.args[0] ==
            "A mixture of auto and manual recomputaion is not supported")

def test_execution_report(tmpdir):
    builder = popart.Builder()

    shape = popart.TensorInfo("FLOAT", [1])

    i1 = builder.addInputTensor(shape)
    i2 = builder.addInputTensor(shape)
    o = builder.aiOnnx.add([i1, i2])
    builder.addOutputTensor(o)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {o: popart.AnchorReturnType("All")})

    session = popart.InferenceSession(
        fnModel=proto,
        dataFlow=dataFlow,
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    anchors = session.initAnchorArrays()

    session.prepareDevice()

    d1 = np.array([10.]).astype(np.float32)
    d2 = np.array([11.]).astype(np.float32)
    stepio = popart.PyStepIO({i1: d1, i2: d2}, anchors)

    session.run(stepio, "Test message")

    rep = session.getExecutionReport()

    # Need to convert bytes to string
    details = json.loads(rep.decode("utf-8"))
    assert details['runs'][0]['name'] == "Test message"

def test_view_simplify(a, b, target):
    d1 = np.random.randn(10, 20).astype(np.float32)

    builder = popart.Builder()
    d = builder.addInputTensor("FLOAT", d1.shape)
    o = a(builder, d, [1, *d1.shape])
    o = b(builder, o, [*reversed(d1.shape)])

    opts = popart.SessionOptions()
    # ViewSimplifyPattern only runs when outlining.
    opts.enableOutlining = True
    # Set the threshold high so nothing actually gets outlined.
    # This makes it easier to parse the IR.
    opts.outlineThreshold = 100000

    sess = popart.InferenceSession(fnModel=builder.getModelProto(),
                                   deviceInfo=tu.create_test_device(),
                                   userOptions=opts,
                                   dataFlow=popart.DataFlow(1, [o]))
    sess.prepareDevice()
    anchors = sess.initAnchorArrays()

    stepio = popart.PyStepIO({d: d1}, anchors)
    sess.weightsFromHost()
    sess.run(stepio)

    ir = json.loads(sess._serializeIr(popart.IrSerializationFormat.JSON))

    def outputs_o(op):
        return o in map(lambda t: t["name"], op["outputs"])

    def matches_target(op):
        return target in op["type"] and outputs_o(op)

    assert len(list(filter(matches_target, ir["maingraph"]))) == 1
    assert np.allclose(anchors[o].flatten(), d1.flatten())

def test_auto_virtual_graph_train():
    ipus = 2

    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    input_shape = [1, 64]
    input = builder.addInputTensor(popart.TensorInfo("FLOAT16", input_shape))

    x = input
    for i in range(ipus):
        w = builder.addInitializedInputTensor(np.zeros([64, 64], np.float16))
        x = builder.aiOnnx.matmul([x, w])
    output = x
    builder.addOutputTensor(output)
    loss = builder.aiGraphcore.identityloss([output])

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {loss: popart.AnchorReturnType("Final")})

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Auto

    device = tu.create_test_device(numIpus=ipus)

    popart.TrainingSession(fnModel=proto,
                           dataFlow=dataFlow,
                           userOptions=opts,
                           loss=loss,
                           optimizer=popart.SGD(
                               {"defaultLearningRate": (0.01, True)}),
                           deviceInfo=device)

def test_execution_report_reset(tmpdir):
    builder = popart.Builder()

    shape = popart.TensorInfo("FLOAT", [1])

    i1 = builder.addInputTensor(shape)
    i2 = builder.addInputTensor(shape)
    o = builder.aiOnnx.add([i1, i2])
    builder.addOutputTensor(o)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {o: popart.AnchorReturnType("All")})

    opts = popart.SessionOptions()
    opts.engineOptions = {"debug.instrument": "true"}

    session = popart.InferenceSession(
        fnModel=proto,
        dataFlow=dataFlow,
        userOptions=opts,
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    anchors = session.initAnchorArrays()

    session.prepareDevice()

    d1 = np.array([10.]).astype(np.float32)
    d2 = np.array([11.]).astype(np.float32)
    stepio = popart.PyStepIO({i1: d1, i2: d2}, anchors)

    session.run(stepio)

    rep1 = session.getExecutionReport(resetProfile=False)
    rep2 = session.getExecutionReport(resetProfile=False)
    assert len(rep1) == len(rep2)

def test_matmul_serialization_invalid_factor(tmpdir):
    lhs_shape = [2, 2]
    rhs_shape = [2, 4]
    lhs_data = np.random.rand(*lhs_shape).astype(np.float32)
    rhs_data = np.random.rand(*rhs_shape).astype(np.float32)

    builder = popart.Builder()

    lhs = builder.addInputTensor(popart.TensorInfo("FLOAT", lhs_shape), "lhs")
    rhs = builder.addInputTensor(popart.TensorInfo("FLOAT", rhs_shape), "rhs")

    o = builder.aiOnnx.matmul([lhs, rhs])
    builder.setSerializeMatMul({o}, "output_channels", 3)

    builder.addOutputTensor(o)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {o: popart.AnchorReturnType("All")})

    opts = getBaseOptions()

    pat = popart.Patterns(['MatMulOp', 'MatMulRhsGradOp', 'MatMulLhsGradOp'])
    pat.enableRuntimeAsserts(False)

    with pytest.raises(popart.popart_exception) as e_info:
        session = popart.InferenceSession(
            fnModel=proto,
            dataFlow=dataFlow,
            userOptions=opts,
            patterns=pat,
            deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    assert (e_info.value.args[0].startswith(
        "Invalid serialisation factor 3 for output channels dim 4. output_channels dim should be a multple of the serialisation factor"
    ))

def getAnchors(extraReduction):
    builder = popart.Builder()
    ip = builder.addInitializedInputTensor(ip_data)
    lb = builder.addInputTensor("INT32", lshape)

    sm = builder.aiOnnx.softmax([ip], axis=np.size(lshape))
    if extraReduction:
        nll = builder.aiGraphcore.nllloss(
            [sm, lb], reduction=popart.ReductionType.NoReduction)
        loss = builder.aiOnnx.reducesum([nll])
    else:
        loss = builder.aiGraphcore.nllloss(
            [sm, lb], reduction=popart.ReductionType.Sum)

    anchors = [popart.reservedGradientPrefix() + ip]
    # Always test 'loss' too, except when testing with the SoftmaxGradDirect
    # pattern on its own (without NlllWithSoftmaxGradDirect), which requires
    # that 'loss' is not anchored.
    if 'SoftmaxGradDirect' not in patternsList or 'NlllWithSoftmaxGradDirect' in patternsList:
        anchors.append(loss)

    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        loss=loss,
        dataFlow=popart.DataFlow(1, anchors),
        optimizer=popart.ConstSGD(0.1),
        deviceInfo=tu.create_test_device(),
        patterns=popart.Patterns(patternsList).enableRuntimeAsserts(False))

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO({lb: lb_data.astype(np.int32)}, anchors)
    session.run(stepio)
    return anchors

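# Hedged usage sketch: getAnchors is presumably called for both loss
# constructions and the results compared, since reducesum(nllloss(...,
# NoReduction)) and nllloss(..., Sum) should be numerically equivalent.
# Anchor ids differ between the two graphs, so compare values pairwise
# (illustrative only):
def example_compare_nll_reductions():
    anchors_sum = getAnchors(extraReduction=False)
    anchors_reduce = getAnchors(extraReduction=True)
    for v0, v1 in zip(anchors_sum.values(), anchors_reduce.values()):
        assert np.allclose(v0, v1)
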
def test_summary_report_before_execution(tmpdir):
    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1, 2, 32, 32]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1, 2, 32, 32]))
    o = builder.aiOnnx.add([i1, i2])
    builder.addOutputTensor(o)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {o: popart.AnchorReturnType("All")})

    session = popart.InferenceSession(fnModel=proto,
                                      dataFlow=dataFlow,
                                      deviceInfo=tu.create_test_device())

    session.initAnchorArrays()

    with pytest.raises(popart.popart_exception) as e_info:
        session.getSummaryReport()

    assert (e_info.value.args[0].endswith(
        "Session must have been prepared before a report can be fetched"))

def run_pt_session(syntheticDataMode, inputType=None, d_shape=[100]):
    builder = popart.Builder()
    if inputType is not None:
        d0_i8 = builder.addInputTensor(
            popart.TensorInfo(inputType.builder_type, d_shape))
        d0 = builder.aiOnnx.cast([d0_i8], "FLOAT")
        in_name = d0_i8
    else:
        d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", d_shape))
        in_name = d0
    p = builder.aiGraphcore.printtensor([d0])

    opts = popart.SessionOptions()
    opts.syntheticDataMode = syntheticDataMode

    session = popart.InferenceSession(fnModel=builder.getModelProto(),
                                      dataFlow=popart.DataFlow(1, [p]),
                                      userOptions=opts,
                                      deviceInfo=tu.create_test_device())
    session.prepareDevice()
    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO({in_name: np.ones(d_shape)}, anchors)
    session.run(stepio)

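# Hedged usage sketch: run_pt_session is presumably exercised across the
# available synthetic data modes, along these lines (illustrative only):
def example_run_all_synthetic_modes():
    for mode in (popart.SyntheticDataMode.Off, popart.SyntheticDataMode.Zeros,
                 popart.SyntheticDataMode.RandomNormal):
        run_pt_session(mode)
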
def run_embedding_layer(args):
    set_library_seeds(args.seed)

    config = bert_config_from_args(args)

    initializers = bert_pretrained_initialisers(config, args)

    logger.info("Building Model")
    # Specifying ai.onnx opset9 for the slice syntax
    # TODO: Change slice to opset10
    model = Bert(config,
                 builder=popart.Builder(opsets={
                     "ai.onnx": 9,
                     "ai.onnx.ml": 1,
                     "ai.graphcore": 1
                 }),
                 initializers=initializers,
                 execution_mode=args.execution_mode)

    # If config.host_embedding is enabled, indices and positions will have
    # the matrices instead of the index vector.
    indices, positions, segments, masks, labels = bert_add_inputs(args, model)
    logits = tuple([model.embedding(indices, positions, segments)])

    if args.inference:
        outputs = bert_add_logit_outputs(model, logits)

        writer = None

        dataset = get_bert_dataset(
            model, args, [indices, positions, segments, masks, labels])

        data_flow = popart.DataFlow(dataset.batches_per_step, outputs)

        iteration = Iteration(
            args,
            steps_per_epoch=len(dataset),
            writer=writer,
            recording_steps=args.aggregate_metrics_over_steps)

        request_ipus = bert_required_ipus(args, model)

        device = acquire_device(args, request_ipus)

        session, anchors = bert_inference_session(model, args, data_flow,
                                                  device)
        logger.info("Inference Started")
        inputs = [indices, positions, segments, *masks]
        """bert_infer_loop(args, session, dataset, inputs, logits, anchors, iteration)"""
        save_results = args.task == "SQUAD" and not (args.synthetic_data
                                                     or args.generated_data)

        start_times = defaultdict(list)
        end_times = defaultdict(list)

        # Create the stepio once outside of the inference loop:
        static_data = {}
        if args.low_latency_inference and args.task == "SQUAD":
            stepio = create_callback_stepio(static_data, anchors, start_times,
                                            end_times,
                                            dataset.batches_per_step)
        else:
            stepio = None

        output = []
        logger.info(dataset)
        for data in dataset:
            static_data.update({t: data[t] for t in inputs})
            result = bert_process_infer_data(args, session, static_data,
                                             anchors, logits, iteration,
                                             start_times, end_times, stepio)

            if save_results:
                output.append(result)
            break

        device.detach()
        return output

    return None

def test_save_back_externally_saved_tensors():
    """
    Test that initializers (stored externally in the onnx model) that are
    updated in a training session are written back correctly when the onnx
    model is written using the Session API
    Model:
    in0 --
           \
            Matmul0 -- Matmul1 -- out
           /          /
       w0--       w1--
    """
    builder = popart.Builder()
    shape = [4, 4]
    elms = np.prod(shape)
    numLayers = 2
    in0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape))
    initWeights = []
    weightsIds = []
    anchorsDef = {}
    out = in0
    for layer in range(numLayers):
        w_init = np.random.rand(*shape).astype('float32')
        initWeights.append(w_init)
        weightsIds.append(builder.addInitializedInputTensor(w_init))
        anchorsDef[weightsIds[layer]] = popart.AnchorReturnType("All")
        out = builder.aiOnnx.matmul([out, weightsIds[layer]])
    loss = builder.aiGraphcore.identityloss([out])

    tmpdir = tempfile.mkdtemp()
    tmpfile_weights = os.path.join(tmpdir, "weights.onnx")
    builder.saveInitializersExternally(weightsIds, tmpfile_weights)

    # Verify the initial weights are saved correctly
    for layer in range(numLayers):
        saved_weights = np.fromfile(tmpfile_weights,
                                    dtype=np.float32,
                                    count=elms,
                                    offset=layer * elms * 4)
        assert np.array_equal(initWeights[layer].flatten(), saved_weights)

    opts = popart.SessionOptions()
    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=popart.DataFlow(1, anchorsDef),
        deviceInfo=popart.DeviceManager().createCpuDevice(),
        optimizer=popart.ConstSGD(10),
        loss=loss)

    anchors = session.initAnchorArrays()

    inputs = {in0: np.random.rand(*shape).astype('float32')}
    stepio = popart.PyStepIO(inputs, anchors)

    session.prepareDevice()
    session.weightsFromHost()

    session.run(stepio)

    # Check the weights have been updated
    for layer in range(numLayers):
        assert not np.allclose(anchors[weightsIds[layer]], initWeights[layer])

    # Save the model with updated weights back to disk
    tmpfile_model = os.path.join(tmpdir, "model.onnx")
    session.modelToHost(tmpfile_model)

    # Verify that the file containing tensor data has also been updated
    for layer in range(numLayers):
        saved_weights = np.fromfile(tmpfile_weights,
                                    dtype=np.float32,
                                    count=elms,
                                    offset=layer * elms * 4)
        assert np.array_equal(anchors[weightsIds[layer]].flatten(),
                              saved_weights)

def main(args):
    set_library_seeds(args.seed)

    config = bert_config_from_args(args)

    initializers = bert_pretrained_initialisers(config, args)

    logger.info("Building Model")
    # Specifying ai.onnx opset9 for the slice syntax
    model = Bert(config,
                 builder=popart.Builder(opsets={
                     "ai.onnx": 9,
                     "ai.onnx.ml": 1,
                     "ai.graphcore": 1
                 }),
                 initializers=initializers,
                 execution_mode=args.execution_mode)

    # If config.host_embedding is enabled, indices and positions will have
    # the matrices instead of the index vector.
    indices, positions, segments, masks, labels = bert_add_inputs(args, model)
    logits = bert_logits_graph(model, indices, positions, segments, masks)

    if args.inference:
        predictions = None
        losses = []
        if args.task == "PRETRAINING":
            # If this is a pretraining session, labels for NSP and MLM are
            # already within the dataset, so we can always calculate
            # prediction performance
            predictions, _ = bert_infer_graph(model,
                                              logits,
                                              include_probs=False)
            if args.inference_lm_perplexity:
                losses = bert_perplexity_graph(model, logits, labels)
            outputs = bert_add_validation_outputs(model, predictions, losses)
        else:
            if args.inference_lm_perplexity:
                raise RuntimeError(
                    "Masked LM perplexity is only supported in pretraining.")
            outputs = bert_add_logit_outputs(model, logits)

        writer = None
    else:
        predictions, probs = bert_infer_graph(model, logits)
        losses = bert_loss_graph(model, probs, labels)
        outputs = bert_add_validation_outputs(model, predictions, losses)
        writer = bert_writer(args)

    embedding_dict, positional_dict = model.get_model_embeddings()

    dataset = get_bert_dataset(model, args,
                               [indices, positions, segments, masks, labels],
                               embedding_dict, positional_dict,
                               config.host_embedding == "MERGE")
    logger.info(f"Dataset length: {len(dataset)}")

    data_flow = popart.DataFlow(dataset.batches_per_step, outputs)

    iteration = Iteration(
        args,
        batches_per_step=dataset.batches_per_step,
        steps_per_epoch=len(dataset),
        writer=writer,
        recording_steps=args.aggregate_metrics_over_steps)

    request_ipus, required_ipus = calc_required_ipus(args, model)

    device = acquire_device(args, request_ipus)

    if args.inference:
        session, anchors = bert_inference_session(model, args, data_flow,
                                                  device)
        logger.info("Inference Started")
        inputs = [indices, positions, segments, *masks, *labels]
        bert_infer_loop(args, session, dataset, inputs, logits, anchors,
                        labels, predictions, losses, iteration)
        device.detach()
    else:
        if not args.no_training:
            optimizer_factory = ScheduledOptimizerFactory(
                args, iteration, model.tensors)

            session, anchors = bert_training_session(model, args, data_flow,
                                                     losses, device,
                                                     optimizer_factory)
            logger.info("Training Started")
            bert_train_loop(args, session, writer, dataset, labels,
                            predictions, losses, anchors, iteration,
                            optimizer_factory)

            device.detach()
            logger.info("Training Finished")

    return session, iteration

def train(opts):
    if opts.fix_seed:
        print('Fixing the seed for result reproducibility')
        np.random.seed(0)
    train_data, train_labels, test_data, test_labels = load_mnist(
        opts.data_folder)

    # Limit batches_per_step so the test set isn't evaluated more than once.
    max_value = len(test_data) // opts.batch_size
    if max_value < opts.batches_per_step:
        print("(batches-per-step * batch-size) is larger than test set!\n"
              " Reduced batches-per-step to: {}\n".format(max_value))
        opts.batches_per_step = max_value

    training_set = DataSet(opts.batch_size, opts.batches_per_step, train_data,
                           train_labels)
    test_set = DataSet(opts.batch_size, opts.batches_per_step, test_data,
                       test_labels)

    print("Creating ONNX model.")
    model = MNIST_model(hidden_size=opts.hidden_size)
    proto, data_in, labels_in, output, loss = model.create_proto(
        opts.batch_size)

    # Describe how to run the model
    anchor_desc = {
        output: popart.AnchorReturnType("ALL"),
        loss: popart.AnchorReturnType("ALL")
    }
    dataFlow = popart.DataFlow(opts.batches_per_step, anchor_desc)

    # Options
    userOpts = popart.SessionOptions()

    # The validation graph by default will be optimized to change all
    # variables to constants. This prevents that, which allows for
    # checkpoints to be loaded into the model without recompiling.
    userOpts.constantWeights = False

    # Enable auto-sharding
    if opts.num_ipus > 1:
        userOpts.virtualGraphMode = popart.VirtualGraphMode.Auto

    # Enable pipelining
    if opts.pipeline:
        userOpts.enablePipelining = True

    userOpts.separateCallOpPdfs = False

    device = get_device(opts.num_ipus, opts.simulation)

    training = init_session(proto, loss, dataFlow, userOpts, device,
                            training=True)
    validation = init_session(proto, loss, dataFlow, userOpts, device,
                              training=False)

    print("Running training loop.")
    for i in range(opts.epochs):
        # Training
        training.session.weightsFromHost()
        for step, (data, labels) in enumerate(training_set):
            stepio = popart.PyStepIO({
                data_in: data,
                labels_in: labels
            }, training.anchors)

            training.session.run(
                stepio, 'Epoch ' + str(i) + ' training step ' + str(step))

        aggregated_loss = 0
        aggregated_accuracy = 0

        training.session.modelToHost('ckpt.onnx')
        validation.session.resetHostWeights('ckpt.onnx')
        validation.session.weightsFromHost()

        # Evaluation
        for step, (data, labels) in enumerate(test_set):
            stepio = popart.PyStepIO({
                data_in: data,
                labels_in: labels
            }, validation.anchors)
            validation.session.run(
                stepio, 'Epoch ' + str(i) + ' evaluation step ' + str(step))

            # Loss
            aggregated_loss += np.mean(validation.anchors[loss])
            # Accuracy
            results = np.argmax(
                validation.anchors[output].reshape(
                    [test_set.inputs_per_step, 10]), 1)
            num_correct = np.sum(
                results == labels.reshape([test_set.inputs_per_step]))
            aggregated_accuracy += num_correct / test_set.inputs_per_step

        # Log statistics
        aggregated_loss /= len(test_set)
        aggregated_accuracy /= len(test_set)
        print("Epoch #{}".format(i + 1))
        print(" Loss={0:.4f}".format(aggregated_loss))
        print(" Accuracy={0:.2f}%".format(aggregated_accuracy * 100))

import cmdline
import popart
from popart.torch import torchwriter
# we require torch in this file to create the torch Module
import torch

args = cmdline.parse()

nInChans = 3
nOutChans = 10
batchSize = 2
batchesPerStep = 4
anchors = {
    "loss": popart.AnchorReturnType("EveryN", 2),
    "image0": popart.AnchorReturnType("All")
}
dataFlow = popart.DataFlow(batchesPerStep, anchors)
inputShapeInfo = popart.InputShapeInfo()
inputShapeInfo.add("image0",
                   popart.TensorInfo("FLOAT", [batchSize, nInChans, 32, 32]))
inputShapeInfo.add("image1",
                   popart.TensorInfo("FLOAT", [batchSize, nInChans, 32, 32]))
inputShapeInfo.add("label", popart.TensorInfo("INT32", [batchSize]))

inNames = ["image0", "image1"]
cifarInIndices = {"image0": 0, "image1": 0, "label": 1}
outNames = ["loss"]

willowOptPatterns = popart.Patterns(popart.PatternsLevel.All)


def nllloss(logprobs, targets):
    targets = targets.unsqueeze(1)