def test_embedding_bwd(custom_ops):
    # ------------------- PopART --------------------
    config = BertConfig(task="SQUAD",
                        vocab_length=9728,
                        micro_batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        activation_type='relu',
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        update_embedding_dict=True)
    popart_model = Bert(config)
    # Prevent virtualGraph attributes being added to the ops

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = popart_model.builder.addInputTensor(sequence_info)
    positions = popart_model.builder.addInputTensor(sequence_info)
    segments = popart_model.builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(0, config.vocab_length,
                          (config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32),
        positions:
        np.random.randint(0, config.max_positional_length,
                          (config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32),
        segments:
        np.random.randint(0, 2,
                          (config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32)
    }

    optimizer = popart.ConstSGD(0.01)
    l1_lambda = 0.1

    with popart_model.builder.nameScope("Embedding"):
        output = popart_model.embedding(indices, positions, segments)
    l1 = popart_model.builder.aiGraphcore.l1loss(
        [output],
        l1_lambda,
        debugContext="l1LossVal",
        reduction=popart.ReductionType.Sum)

    num_reps = 5
    proto = popart_model.builder.getModelProto()
    outputs, post_proto = run_py(proto,
                                 data,
                                 output,
                                 ipus=1,
                                 loss=l1,
                                 num_reps=num_reps,
                                 optimizer=optimizer)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = [
        data[t].reshape(config.micro_batch_size,
                        config.sequence_length).astype(np.int32)
        for t in [indices, positions, segments]
    ]

    # ------------------- PyTorch -------------------------
    torch_model = BertEmbeddings(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        update_embedding_dict=config.update_embedding_dict))
    # Turn off dropout
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, TORCH_TO_ONNX, {})

    optim = torch.optim.SGD(torch_model.parameters(), 0.01)
    for _ in range(num_reps):
        torch_output = torch_model(
            *[torch.from_numpy(t).long() for t in inputs])
        torch_loss = l1_lambda * torch.norm(torch_output, 1)
        torch_loss.backward()
        optim.step()
        optim.zero_grad()

    torch_outputs = [torch_output.detach().numpy()]

    check_tensors(torch_outputs, outputs, margin=7e-6)

    check_model(torch_model, post_proto, TORCH_TO_ONNX, {}, margin=7e-06)
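# `check_tensors` and `check_model` above are shared helpers from this test
# suite's utilities. A minimal sketch of the kind of comparison assumed for
# `check_tensors` (`_check_tensors_sketch` is a hypothetical stand-in for
# illustration, not the actual implementation):
def _check_tensors_sketch(torch_outputs, popart_outputs, margin=1e-7):
    # Compare each pair of outputs elementwise within an absolute tolerance.
    for expected, actual in zip(torch_outputs, popart_outputs):
        assert np.allclose(expected,
                           np.array(actual).reshape(np.shape(expected)),
                           atol=margin)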
def run_test(index, options):
    per_replica_batch_size = batch_size / options["replication"]
    model_input_shape = input_shape[:]
    model_input_shape[0] = int(model_input_shape[0] / options["replication"])
    model_mask_shape = mask_shape[:]
    model_mask_shape[0] = int(model_mask_shape[0] / options["replication"])

    stride = 2 // options["stages"]
    if "stride" in options and options["stride"]:
        stride = options["stride"]

    builder = popart.Builder(opsets={
        "ai.onnx": 9,
        "ai.onnx.ml": 1,
        "ai.graphcore": 1
    })

    mask = builder.addInputTensor(
        popart.TensorInfo("FLOAT", model_mask_shape), "mask")
    x_in = builder.addInputTensor(
        popart.TensorInfo("FLOAT", model_input_shape), "x_in")

    anchors = {}
    x = x_in
    for i in range(options["numLayers"]):
        qkv = builder.addInitializedInputTensor(qkv_data, f"qkv_{i}")
        anchors[popart.reservedGradientPrefix() +
                qkv] = popart.AnchorReturnType("All")

        vgid = (i % options["stages"]) if options["phasedExecution"] else i

        with builder.virtualGraph(vgid), builder.executionPhase(i * stride):
            x = builder.aiOnnx.matmul([x, qkv])
            x = attention_onnx(builder, x, mask, per_replica_batch_size,
                               sequence_length, hidden_size, attention_heads,
                               qkv_length)

    vgid = ((options["numLayers"] - 1) %
            options["stages"]) if options["phasedExecution"] else options[
                "numLayers"] - 1

    with builder.virtualGraph(vgid), builder.executionPhase(
        (options["numLayers"] - 1) * stride):
        l1 = builder.aiGraphcore.l1loss([x], 0.2, popart.ReductionType.Sum)

    proto = builder.getModelProto()

    gradient_keys = list(anchors.keys())
    anchors[x] = popart.AnchorReturnType("All")

    dataFlow = popart.DataFlow(batches_per_step, anchors)

    opts = popart.SessionOptions()
    opts.executionPhaseSettings.stages = options["stages"]
    opts.executionPhaseSettings.phases = (options["numLayers"] * stride
                                          if options["phasedExecution"] else 0)
    opts.enableOutlining = options["outlining"]

    if "phaseSchedule" in options:
        opts.executionPhaseSettings.schedule = options["phaseSchedule"]

    # Phased execution currently does its own recompute annotations
    opts.autoRecomputation = (popart.RecomputationType.Standard
                              if options["explicitRecomputation"] else
                              popart.RecomputationType.NoRecompute)

    opts.outlineThreshold = -np.inf
    opts.enableOutliningCopyCostPruning = False
    opts.virtualGraphMode = (popart.VirtualGraphMode.ExecutionPhases
                             if options["phasedExecution"] else
                             popart.VirtualGraphMode.Manual)
    opts.explicitRecomputation = options["explicitRecomputation"]
    opts.aliasZeroCopy = options["aliasZeroCopy"]

    opts.batchSerializationSettings.factor = options["batchSerialize"]
    if "batchSchedule" in options:
        opts.batchSerializationSettings.batchSchedule = options[
            "batchSchedule"]
    if "batchConcat" in options:
        # Do not concatenate the batch across phases and virtual graphs
        # (causes more, smaller transfers but allows for individual sub-batch
        # elements to be transferred)
        opts.batchSerializationSettings.concatOnVirtualGraphChange = options[
            "batchConcat"]
        opts.batchSerializationSettings.concatOnExecutionPhaseChange = options[
            "batchConcat"]

    # Wait with loading activations until they are required
    opts.executionPhaseSettings.activationIOSchedule = \
        popart.ExecutionPhaseIOSchedule.OnDemand

    if "tensorLocationSettings" in options and options[
            "tensorLocationSettings"]:
        opts.activationTensorLocationSettings = options[
            "tensorLocationSettings"]
        opts.weightTensorLocationSettings = options["tensorLocationSettings"]
        opts.optimizerStateTensorLocationSettings = options[
            "tensorLocationSettings"]
        opts.accumulatorTensorLocationSettings = options[
            "tensorLocationSettings"]
    if "weightTensorLocationSettings" in options and options[
            "weightTensorLocationSettings"]:
        opts.weightTensorLocationSettings = options[
            "weightTensorLocationSettings"]
    if options["replication"] > 1:
        opts.replicatedGraphCount = options["replication"]
        opts.enableReplicatedGraphs = True
    if "ioTiles" in options:
        opts.numIOTiles = options["ioTiles"]

    pat = popart.Patterns(popart.PatternsLevel.Default)

    if options["phasedExecution"]:
        numIpus = options["stages"]
    else:
        numIpus = options["numLayers"] + 1
    if options["replication"] > 1:
        numIpus = numIpus * options["replication"]
    device = tu.create_test_device(numIpus, pattern=popart.SyncPattern.Full)

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     loss=l1,
                                     optimizer=popart.ConstSGD(0.1),
                                     patterns=pat,
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()
    for k, v in anchors.items():
        print(f"anchor_before {k}={v.shape}")

    inputs = {x_in: input_data, mask: mask_data}
    stepio = popart.PyStepIO(inputs, anchors)

    for __ in range(10):
        session.run(stepio)

    session.modelToHost(
        str(tmpdir / f"streamingmemory_attention_{index}.onnx"))

    if options["replication"] > 1:
        for k, v in anchors.items():
            if k in gradient_keys:
                # The gradient anchors will have an additional replication
                # axis.
                anchors[k] = np.sum(v, 1 if batches_per_step > 1 else 0)
            else:
                # Output tensor needs reshaping.
                anchors[k] = np.reshape(anchors[k], [
                    batches_per_step, sequence_length * batch_size,
                    hidden_size
                ])

    for k, v in anchors.items():
        print(f"anchor_after {k}={v.shape}")

    return anchors
def test_pipelined_recomputed_dropout():
    dsize = 10
    ratio = 0.5
    ipus = 4
    layers = 4
    batches_per_step = 7

    # Ensure inputs in range [1.0, 2.0] to ensure comparing with 0 is valid
    ip_shape = [dsize]
    ip_data = np.full([batches_per_step] + ip_shape, 1).astype(np.float32)

    dropouts = []
    dropoutGrads = []
    dropoutInputs = []
    dropoutOutputs = []

    builder = popart.Builder()
    ip = builder.addInputTensor(popart.TensorInfo("FLOAT", ip_shape))

    def add_layer(layer_input, vgraph_num):
        # This is to get the output of the dropout in the bwd pass.
        # D_next_layer_in also includes the gradient of the AddOp.
        identity0 = builder.aiOnnx.identity([layer_input])
        builder.virtualGraph(identity0, vgraph_num)

        [dropout0] = builder.aiOnnx.dropout([identity0],
                                            num_outputs=1,
                                            ratio=ratio)
        builder.virtualGraph(dropout0, vgraph_num)

        # the input to the forward pass dropout
        dropoutInputs.append(identity0)
        # the input to the backward pass dropout
        dropoutInputs.append(popart.reservedGradientPrefix() + dropout0)
        # the output of the backward pass dropout
        dropoutGrads.append(popart.reservedGradientPrefix() + identity0)
        # the output of the forward pass dropout
        dropouts.append(dropout0)

        relu0 = builder.aiOnnx.relu([dropout0])
        builder.virtualGraph(relu0, vgraph_num)

        # This ensures that all input elements to the dropouts, in both
        # the forward and backward passes, will be non-zero.
        add0 = builder.aiOnnx.add([layer_input, dropout0])
        builder.virtualGraph(add0, vgraph_num)

        return add0

    # construct a graph of `layers` number of layers
    # with each layer on a different IPU.
    next_layer_in = ip
    for vgraph in range(layers):
        next_layer_in = add_layer(next_layer_in, vgraph)
    out = next_layer_in

    # TODO: use the tu.requires_ipu decorator
    if tu.ipu_available(ipus):
        device = tu.create_test_device(numIpus=ipus)
    else:
        pytest.skip("Test needs to run on IPU, but none are available")

    dfAnchors = {}
    for x in dropouts + dropoutGrads + dropoutInputs:
        dfAnchors[x] = popart.AnchorReturnType("All")
    dataFlow = popart.DataFlow(batches_per_step, dfAnchors)

    loss = builder.aiGraphcore.identityloss([out])
    builder.virtualGraph(loss, layers - 1)

    userOptions = popart.SessionOptions()
    userOptions.virtualGraphMode = popart.VirtualGraphMode.Manual
    userOptions.enablePipelining = True
    userOptions.autoRecomputation = popart.RecomputationType.Pipeline

    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=dataFlow,
                                     optimizer=popart.ConstSGD(0.1),
                                     loss=loss,
                                     userOptions=userOptions,
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()
    session.setRandomSeed(0)

    stepio = popart.PyStepIO({ip: ip_data}, anchors)
    session.run(stepio)
    print(anchors.keys())

    # Check that none of the elements of the dropout inputs are zero
    for tid in dropoutInputs:
        x = anchors[tid]
        print(f'{tid}: {x}')
        zero = np.zeros(x.shape)
        assert not np.any(np.equal(x, zero)), \
            f'Some elements of dropout input {tid} are zero'
    print()

    # For each dropout, check that the masked out elements are the same
    # in the forward and backward passes.
    for fwdId, bwdId in zip(dropouts, dropoutGrads):
        print(f'{fwdId}:\n{np.sign(anchors[fwdId])}')
        print(f'{bwdId}:\n{np.sign(anchors[bwdId])}')
        lhs = np.sign(anchors[fwdId])
        rhs = np.sign(anchors[bwdId])
        assert np.array_equal(lhs, rhs), \
            f'{fwdId} and {bwdId} did not use the same dropout mask'
        print()
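# A small worked example (hypothetical values) of the mask comparison above:
# dropout zeroes the same positions in the forward output and the incoming
# gradient, so their sign patterns agree even though the magnitudes differ.
_fwd = np.array([0.0, 2.0, 0.0, 4.0])
_bwd = np.array([0.0, 0.5, 0.0, 0.5])
assert np.array_equal(np.sign(_fwd), np.sign(_bwd))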
def test_virtual_graph4():
    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i3 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))

    with builder.virtualGraph(3):
        o1 = builder.aiOnnx.add([i1, i2])
        o1l1 = builder.aiGraphcore.l1loss([o1], 0.1)
        o2 = builder.aiOnnx.add([i3, o1])
        o2l1 = builder.aiGraphcore.l1loss([o2], 0.1)

    with builder.virtualGraph(2):
        o3 = builder.aiOnnx.mul([i1, i3])
        o3l1 = builder.aiGraphcore.l1loss([o3], 0.1)

    with builder.virtualGraph(3):
        loss = builder.aiOnnx.sum([o1l1, o2l1, o3l1])

    proto = builder.getModelProto()

    # Need to anchor the output of the backward pass to stop it being pruned
    dataFlow = popart.DataFlow(
        1, {
            o1: popart.AnchorReturnType("All"),
            o2: popart.AnchorReturnType("All"),
            o3: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i1:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i2:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i3:
            popart.AnchorReturnType("All")
        })

    optimizer = popart.ConstSGD(0.01)

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    s = popart.TrainingSession(fnModel=proto,
                               dataFlow=dataFlow,
                               loss=loss,
                               optimizer=optimizer,
                               userOptions=opts,
                               deviceInfo=tu.create_test_device(numIpus=4))
    s.prepareDevice()

    anchors = s.initAnchorArrays()

    data1 = np.ones([1], dtype=np.float32)
    data2 = np.ones([1], dtype=np.float32)
    data3 = np.ones([1], dtype=np.float32)

    inputs = {i1: data1, i2: data2, i3: data3}
    stepio = popart.PyStepIO(inputs, anchors)

    s.run(stepio)
    s.weightsFromHost()
def test_virtual_graph3():
    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i3 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i4 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))

    with builder.virtualGraph(3):
        o1 = builder.aiOnnx.add([i1, i2])
        o2 = builder.aiOnnx.add([i3, i4])

    with builder.virtualGraph(2):
        o3 = builder.aiOnnx.add([o1, o2])
        o = builder.aiOnnx.add([i1, o3])
        o = builder.aiGraphcore.l1loss([o], 0.1)

    proto = builder.getModelProto()

    # Need to anchor the output of the backward pass to stop it being pruned
    dataFlow = popart.DataFlow(
        1, {
            o: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i1:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i2:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i3:
            popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i4:
            popart.AnchorReturnType("All")
        })

    optimizer = popart.SGD({"defaultLearningRate": (0.01, True)})

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    s = popart.TrainingSession(fnModel=proto,
                               dataFlow=dataFlow,
                               loss=o,
                               optimizer=optimizer,
                               userOptions=opts,
                               deviceInfo=tu.create_test_device(numIpus=4))
    s.prepareDevice()

    anchors = s.initAnchorArrays()

    data1 = np.ones([1], dtype=np.float32)
    data2 = np.ones([1], dtype=np.float32)
    data3 = np.ones([1], dtype=np.float32)
    data4 = np.ones([1], dtype=np.float32)

    inputs = {i1: data1, i2: data2, i3: data3, i4: data4}
    stepio = popart.PyStepIO(inputs, anchors)

    s.run(stepio)
    s.weightsFromHost()
def test_detach_grad_branches(detach_branch_popart, detach_branch_pytorch):
    # fix the random seed for this test
    np.random.seed(0)

    Batchsize = 8
    Classes = 32

    dshape = [Batchsize, 2, 4, 4]
    lshape = [Batchsize]
    wshape = [2, 2, 3, 3]

    ip_data = np.random.rand(*dshape).astype(np.float32)
    w1_data = np.random.rand(*wshape).astype(np.float32)
    w2_data = np.random.rand(*wshape).astype(np.float32)
    lb_data = np.random.randint(Classes, size=lshape)

    builder = popart.Builder()

    input_ = builder.addInputTensor(popart.TensorInfo("FLOAT", dshape),
                                    "input_i1")
    lb = builder.addInputTensor(popart.TensorInfo("INT32", lshape))

    w1 = builder.addInitializedInputTensor(w1_data)
    w2 = builder.addInitializedInputTensor(w2_data)

    conv1 = builder.aiOnnx.conv([input_, w1],
                                dilations=[1, 1],
                                pads=[1, 1, 1, 1],
                                strides=[1, 1],
                                debugPrefix="conv")
    r1 = builder.reshape_const(builder.aiOnnx, [conv1], [Batchsize, Classes])

    conv2 = builder.aiOnnx.conv([input_, w2],
                                dilations=[1, 1],
                                pads=[1, 1, 1, 1],
                                strides=[1, 1],
                                debugPrefix="conv")
    r2 = builder.reshape_const(builder.aiOnnx, [conv2], [Batchsize, Classes])

    if detach_branch_popart:
        r2 = builder.aiGraphcore.detach([r2])

    add = builder.aiOnnx.sum([r1, r2])

    o = builder.aiOnnx.softmax([add], axis=np.size(lb_data.shape))
    loss = builder.aiGraphcore.nllloss([o, lb])

    dataFlow = popart.DataFlow(1, [
        o, loss,
        popart.reservedGradientPrefix() + o,
        popart.reservedGradientPrefix() + input_, w1, w2
    ])

    opts = popart.SessionOptions()

    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        loss=loss,
        optimizer=popart.ConstSGD(LEARNING_RATE, WEIGHT_DECAY),
        userOptions=opts,
        deviceInfo=popart.DeviceManager().createIpuModelDevice({}))

    session.prepareDevice()
    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO({input_: ip_data, lb: lb_data}, anchors)
    session.weightsFromHost()

    # Torch
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(2, 2, 3, padding=[1, 1], bias=False)
            self.conv2 = nn.Conv2d(2, 2, 3, padding=[1, 1], bias=False)
            self.conv1.weight.data = torch.tensor(w1_data)
            self.conv2.weight.data = torch.tensor(w2_data)
            # PyTorch nll loss expects logsoftmax input
            self.sm = nn.LogSoftmax(dim=np.size(lb_data.shape))
            self.nll = nn.NLLLoss()

        def forward(self, x, y):
            x1 = self.conv1(x)
            x1 = torch.reshape(x1, [Batchsize, Classes])
            if detach_branch_pytorch:
                with torch.no_grad():
                    x2 = self.conv2(x)
            else:
                x2 = self.conv2(x)
            x2 = torch.reshape(x2, [Batchsize, Classes])
            x = x1 + x2
            x = self.sm(x)
            x = self.nll(x, y)
            return x

    net = Net()

    optimizer = optim.SGD(net.parameters(),
                          lr=LEARNING_RATE,
                          weight_decay=WEIGHT_DECAY)

    input_t = torch.tensor(ip_data, requires_grad=True, dtype=torch.float32)
    label_t = torch.tensor(lb_data, requires_grad=False, dtype=torch.long)

    for step in range(4):
        print(f"Step {step + 1}")
        session.run(stepio)

        # Torch
        # optimizer.zero_grad()
        loss = net(input_t, label_t)
        loss.backward()
        optimizer.step()

    print(detach_branch_popart, detach_branch_pytorch)
    print("Popart: w1", np.mean(anchors[w1]))
    print("PyTorch: w1", np.mean(net.conv1.weight.data.numpy()))
    print("Popart: w2", np.mean(anchors[w2]))
    print("PyTorch: w2", np.mean(net.conv2.weight.data.numpy()))

    # Check the weights match if the branches are the same; if not,
    # make sure the right hand branch doesn't match.
    if detach_branch_popart == detach_branch_pytorch:
        assert np.allclose(anchors[w1], net.conv1.weight.data.numpy(), 1e-4)
        assert np.allclose(anchors[w2], net.conv2.weight.data.numpy(), 1e-4)
    else:
        assert not np.allclose(anchors[w2], net.conv2.weight.data.numpy(),
                               1e-4)
def get_model_anchors_model2(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             returnRawInput=False,
                             labelArray=None):
    np.random.seed(1234)
    builder = popart.Builder()
    micro_batch_size = batch_size // gradAcclFactor

    shape_d0 = [micro_batch_size, 2, 4, 4]
    shape_l0 = [batch_size]
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0), "inp")
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0, "weights")

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugContext="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [micro_batch_size, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugContext="sfm")

    label_shape = [micro_batch_size]
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", label_shape),
                                "label")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")
    anchor_map = {nll: art, w0: art, e0: art, s0: art, c0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor

    if doSharding is False:
        numIPUs = 1
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 1)
        builder.virtualGraph(c0, 1)
        builder.virtualGraph(r0, 2)
        builder.virtualGraph(out, 2)
        builder.virtualGraph(nll, 2)

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=nll,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    classes = np.prod(shape_d0) / (micro_batch_size * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of batchesPerStep. We repeat the labels
        # as we want consistency if we have different shape inputs between
        # examples.
        outer_dim *= batchesPerStep
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Divide the batchesPerStep batches into gradAcclFactor *
        # batchesPerStep micro-batches.
        outer_dim *= gradAcclFactor
        label = label.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Add the gradAcclFactor * batchesPerStep dimension into the input.
        shape_d0.insert(0, outer_dim)
    data = np.ones(shape=shape_d0).astype(np.float32)

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)

    session.weightsFromHost()

    for i in range(6):
        session.run(stepio)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
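# A worked example of the label reshaping above (hypothetical values:
# batchesPerStep=2, gradAcclFactor=2, batch_size=4): the labels are repeated
# across the step dimension and then re-split into micro-batch rows.
_label = np.arange(4)                         # one batch of labels
_label = np.repeat(_label[np.newaxis], 2, 0)  # (2, 4): one row per step
_label = _label.reshape([2 * 2, -1])          # (4, 2): one row per micro-batch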
def create_model(batch_size):
    """Create an ONNX protobuf description of a simple model.

    This function uses the popart library builder functions to create the
    ONNX description directly. An alternative would be to load an exported
    ONNX protobuf from a file.
    """
    builder = popart.Builder()

    input_shape = popart.TensorInfo('FLOAT', [batch_size, 1, ROWS, COLS])
    input_t = builder.addInputTensor(input_shape)
    x = input_t

    init_weights = kaiming_init([20, 1, 5, 5], 1 * 5 * 5)
    W1 = builder.addInitializedInputTensor(init_weights)
    init_weights = kaiming_init([20], 1 * 5 * 5, 1, 1)
    b1 = builder.addInitializedInputTensor(init_weights)

    x = builder.aiOnnx.conv([x, W1, b1],
                            dilations=[1, 1],
                            kernel_shape=[5, 5],
                            strides=[1, 1],
                            pads=[0, 0, 0, 0])
    x = builder.aiOnnx.relu([x])
    (x, ) = builder.aiOnnx.maxpool([x],
                                   num_outputs=1,
                                   kernel_shape=[2, 2],
                                   pads=[0, 0, 0, 0],
                                   strides=[2, 2])

    init_weights = kaiming_init([50, 20, 5, 5], 20 * 5 * 5)
    W2 = builder.addInitializedInputTensor(init_weights)
    init_weights = kaiming_init([50], 20 * 5 * 5, 1, 1)
    b2 = builder.addInitializedInputTensor(init_weights)

    x = builder.aiOnnx.conv([x, W2, b2],
                            dilations=[1, 1],
                            kernel_shape=[5, 5],
                            strides=[1, 1],
                            pads=[0, 0, 0, 0])
    x = builder.aiOnnx.relu([x])
    (x, ) = builder.aiOnnx.maxpool([x],
                                   num_outputs=1,
                                   kernel_shape=[2, 2],
                                   pads=[0, 0, 0, 0],
                                   strides=[2, 2])

    shape = builder.aiOnnx.constant(np.asarray([batch_size, 50 * 4**2]))
    x = builder.aiOnnx.reshape([x, shape])

    init_weights = kaiming_init([50 * 4**2, 500], 50 * 4**2)
    W3 = builder.addInitializedInputTensor(init_weights)
    init_weights = kaiming_init([500], 50 * 4**2, 1, 1)
    b3 = builder.addInitializedInputTensor(init_weights)

    x = builder.aiOnnx.matmul([x, W3])
    x = builder.aiOnnx.add([x, b3])
    x = builder.aiOnnx.relu([x])

    init_weights = kaiming_init([500, 10], 500)
    W4 = builder.addInitializedInputTensor(init_weights)
    init_weights = kaiming_init([10], 500, 1, 1)
    b4 = builder.addInitializedInputTensor(init_weights)

    x = builder.aiOnnx.matmul([x, W4])
    output_t = builder.aiOnnx.add([x, b4])
    builder.addOutputTensor(output_t)
    probs = builder.aiOnnx.softmax([output_t])

    label_shape = popart.TensorInfo('INT32', [batch_size])
    label = builder.addInputTensor(label_shape)

    loss = popart.NllLoss(probs, label, 'nllLossVal')

    proto = builder.getModelProto()

    return proto, input_t, label, output_t, loss
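# `kaiming_init` is defined elsewhere in this example. A minimal sketch
# consistent with how it is called above (a shape, a fan-in, and two
# optional scaling parameters); the body is an assumption for illustration,
# not the original helper:
def _kaiming_init_sketch(shape, fan_in, a=5.0, b=3.0):
    # Uniform Kaiming-style initialisation with a bound derived from fan-in.
    stddev = np.sqrt(a) / np.sqrt(fan_in)
    bound = np.sqrt(b) * stddev
    return np.random.uniform(-bound, bound, shape).astype(np.float32)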
def conv_settings(capfd, operation):
    builder = popart.Builder()
    input_shape = popart.TensorInfo("FLOAT", [1, 2, 4, 4])
    weight_shape = popart.TensorInfo("FLOAT", [3, 2, 3, 3])

    weight_data = np.ones(weight_shape.shape(), np.float32)
    input_ = builder.addInputTensor(input_shape)
    weights = builder.addInitializedInputTensor(weight_data)
    act = builder.aiOnnx.conv([input_, weights],
                              dilations=[1, 1],
                              pads=[1, 1, 1, 1],
                              strides=[1, 1])
    o = builder.aiOnnx.relu([act])
    loss = builder.aiGraphcore.identityloss([o])

    operation(builder, act=act, o=o)

    anchor_names = [
        o,
        popart.reservedGradientPrefix() + input_,
        popart.reservedGradientPrefix() + weights
    ]
    training_dataFlow = popart.DataFlow(
        1, {
            anchor_names[0]: popart.AnchorReturnType("All"),
            anchor_names[1]: popart.AnchorReturnType("All"),
            anchor_names[2]: popart.AnchorReturnType("All")
        })

    opts = popart.SessionOptions()
    opts.constantWeights = False  # Allow the weights to be updated

    # Create the device
    device = tu.create_test_device(1, opts={"compileIPUCode": True})
    device.attach()

    # Prepare the input data
    input_data = np.random.random_sample(input_shape.shape()).astype(
        np.float32)

    # Prepare the Training session
    training_session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                              dataFlow=training_dataFlow,
                                              loss=loss,
                                              optimizer=popart.ConstSGD(0.01),
                                              userOptions=opts,
                                              deviceInfo=device)

    # Compile the training graph
    training_session.prepareDevice()

    # Run the training session
    training_session.weightsFromHost()

    training_anchors = training_session.initAnchorArrays()
    training_inputs = {input_: input_data}

    training_session.run(popart.PyStepIO(training_inputs, training_anchors))

    captured = capfd.readouterr()

    return captured.err
def test_basic():
    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1, 2, 32, 32]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1, 2, 32, 32]))

    # Elementwise ops to exercise, in the original order, with their arity.
    ops = [
        ("abs", 1), ("acos", 1), ("acosh", 1), ("add", 2),
        ("logical_and", 2), ("asin", 1), ("asinh", 1), ("atan", 1),
        ("atanh", 1), ("ceil", 1), ("cos", 1), ("cosh", 1), ("div", 2),
        ("elu", 1), ("equal", 2), ("exp", 1), ("floor", 1), ("greater", 2),
        ("identity", 1), ("less", 2), ("log", 1), ("logsoftmax", 1),
        ("max", 2), ("mean", 2), ("min", 2), ("mul", 2), ("neg", 1),
        ("logical_not", 1), ("logical_or", 2), ("pow", 2),
        ("reciprocal", 1), ("relu", 1), ("sigmoid", 1), ("sin", 1),
        ("sinh", 1), ("softsign", 1), ("softmax", 1), ("sqrt", 1),
        ("sub", 2), ("sum", 2), ("tan", 1), ("tanh", 1),
        ("logical_xor", 2),
    ]

    # Each op must return a fresh output tensor whose shape is unchanged.
    old_o = ""
    for name, arity in ops:
        fn = getattr(builder.aiOnnx, name)
        o = fn([i1] if arity == 1 else [i1, i2])
        assert old_o != o
        assert builder.getTensorShape(o) == [1, 2, 32, 32]
        old_o = o

    # Calling an op with an empty input list must raise an exception that
    # names the ONNX operator. The logical_* builder methods map to the
    # ONNX names And/Not/Or/Xor; the remaining names just capitalise.
    onnx_names = {
        "logical_and": "And",
        "logical_not": "Not",
        "logical_or": "Or",
        "logical_xor": "Xor",
    }
    for name, _ in ops:
        if name in ("logsoftmax", "softmax"):
            # No empty-input check for these two ops.
            continue
        with pytest.raises(popart.popart_exception) as e_info:
            getattr(builder.aiOnnx, name)([])
        op_name = onnx_names.get(name, name.capitalize())
        assert e_info.value.args[0].startswith(
            f"{op_name} has invalid number of")

    proto = builder.getModelProto()

    assert len(proto) > 0
    assert len(i1) > 0
    assert len(i2) > 0
    assert len(o) > 0
    assert i1 != i2
    assert i2 != o

    with pytest.raises(TypeError) as e_info:
        builder.aiOnnx.add(0, 0)
    assert e_info.value.args[0].startswith("add(): incompatible function")
    op_tester.atol = 1e-06
    op_tester.rtol = 1e-05
    op_tester.run(init_builder, reference, 'train')


if __name__ == "__main__":
    builder = popart.Builder()

    d1 = np.random.randint(0, 20, size=(2, 2, 3)).astype(np.float32)
    input_size = d1.shape[2]  # (2, 2, 3)
    hidden_size = 7

    d2 = np.random.rand(1, 3 * hidden_size, input_size).astype(np.float32)
    d3 = np.random.rand(1, 3 * hidden_size, hidden_size).astype(np.float32)

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", d1.shape))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", d2.shape))
    i3 = builder.addInputTensor(popart.TensorInfo("FLOAT", d3.shape))
    Y, Y_h = builder.aiOnnx.gru([i1, i2, i3],
                                2,
                                clip=None,
                                direction="bidirectional")
    builder.addOutputTensor(Y)

    dataFlow = popart.DataFlow(1, {Y: popart.AnchorReturnType("All")})

    # Create a session to compile and execute the graph for inference
    # --------------------------------------------------------------------
    inferenceOptions = popart.SessionOptions()
    # Need to compile the inference graph with variable weights so they can
    # be updated before execution
    inferenceOptions.constantWeights = False
def test_is_initializer():
    builder = popart.Builder()
    i0 = builder.addInputTensor(popart.TensorInfo("FLOAT", [10, 9, 8, 7]))
    i1 = builder.addInitializedInputTensor(np.array([1, 6], dtype=np.int64))

    assert builder.isInitializer(i0) == False
    assert builder.isInitializer(i1) == True
def popart_result_and_model(config, mode, weight_transposed, is_bwd=False):
    """Run popart model based on config.

    Args:
        config (BertConfig): Popart config.
        mode (ExecutionMode): Execution mode to build the model for.
        weight_transposed: Construct embedding dict transposed.
        is_bwd (bool, optional): Construct training graph if True, else
            inference graph. Defaults to False.

    Returns:
        Tuple: Gathered numpy data, outputs from model, proto, post_proto
    """
    scope_provider = ScopeProvider()
    user_options = {}
    if mode == ExecutionMode.PHASED:
        builder = popart.Builder()
        indices_len = config.batch_size * config.sequence_length
        sequence_info = popart.TensorInfo("UINT32", [indices_len])
        indices = builder.addInputTensor(sequence_info)
        data = {
            indices:
            np.random.randint(0, config.vocab_length,
                              (indices_len)).astype(np.uint32)
        }
        popart_model = EmbeddingSerialised(
            scope_provider.get_scope('Token'),
            input_dim=config.vocab_length,
            output_dim=config.hidden_size,
            num_splits=config.embedding_serialization_vocab_steps,
            custom=True,
            dtype=config.dtype,
            detach=not config.update_embedding_dict,
            weight_transposed=weight_transposed,
            builder=builder,
            scope_provider=scope_provider)
        user_options = {
            "batchSerializationFactor": 1,
            "executionPhases": popart_model.total_execution_phases
        }
        output = popart_model(indices)
    else:
        popart_model = get_model(config,
                                 mode,
                                 block="embedding",
                                 initializers={})
        builder = popart_model.builder
        indices_len = config.batch_size * config.sequence_length
        sequence_info = popart.TensorInfo("UINT32", [indices_len])
        indices = builder.addInputTensor(sequence_info)
        data = {
            indices:
            np.random.randint(0, config.vocab_length,
                              (indices_len)).astype(np.uint32)
        }
        output = popart_model.word_embedding_serialized(indices, num_splits)

    if is_bwd:
        l1_lambda = 0.1
        if mode == ExecutionMode.PHASED:
            loss_scope = scope_provider.get_scope('Loss', 'prev')
            with popart_model.scope_provider(popart_model.builder,
                                             loss_scope):
                l1_loss = popart_model.builder.aiGraphcore.l1loss(
                    [output],
                    l1_lambda,
                    debugPrefix="l1LossVal",
                    reduction=popart.ReductionType.Sum)
        else:
            l1_loss = popart_model.builder.aiGraphcore.l1loss(
                [output],
                l1_lambda,
                debugPrefix="l1LossVal",
                reduction=popart.ReductionType.Sum)
        proto = builder.getModelProto()
        optimizer = popart.ConstSGD(0.01)

        outputs, post_proto = run_py(proto,
                                     data, (output, l1_loss),
                                     loss=l1_loss,
                                     optimizer=optimizer,
                                     user_options=user_options,
                                     execution_mode=mode)
    else:
        proto = builder.getModelProto()
        outputs, post_proto = run_py(proto,
                                     data,
                                     output,
                                     user_options=user_options,
                                     execution_mode=mode)

    return [data[indices]], outputs, proto, post_proto
def test_embedding_fwd(custom_ops):
    # ------------------- PopART --------------------
    config = BertConfig(task="SQUAD",
                        vocab_length=9728,
                        micro_batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        activation_type='relu',
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        inference=True)
    popart_model = Bert(config)

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = popart_model.builder.addInputTensor(sequence_info)
    positions = popart_model.builder.addInputTensor(sequence_info)
    segments = popart_model.builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(0, config.vocab_length,
                          (config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32),
        positions:
        np.random.randint(0, config.max_positional_length,
                          (config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32),
        segments:
        np.random.randint(0, 2,
                          (config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32)
    }

    user_options = {"enableStochasticRounding": True}
    with popart_model.builder.nameScope("Embedding"):
        output = popart_model.embedding(indices, positions, segments)

    proto = popart_model.builder.getModelProto()
    outputs, post_proto = run_py(proto,
                                 data,
                                 output,
                                 user_options=user_options)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = [
        data[t].reshape(config.micro_batch_size,
                        config.sequence_length).astype(np.int32)
        for t in [indices, positions, segments]
    ]

    # ------------------- PyTorch -------------------------
    torch_model = BertEmbeddings(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps))
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, TORCH_TO_ONNX, {})

    torch_outputs = run_fwd_model(inputs, torch_model)

    check_tensors(torch_outputs, outputs, margin=5e-7)
def test(config, iteration, true_scaling, test_case):
    builder = popart.Builder()

    w0name = "weight_0"
    w1name = "weight_1"
    w2name = "weight_2"

    input0Shape = [1, 1, 1]
    input0 = builder.addInputTensor(popart.TensorInfo("FLOAT", input0Shape),
                                    "input0")

    w0data = np.array([test_case[0][0]], dtype=np.float32)
    w0R = np.empty([1], dtype=np.float32)
    w0Id = builder.addInitializedInputTensor(w0data, w0name)

    w1data = np.array([test_case[1][0]], dtype=np.float32)
    w1R = np.empty([1], dtype=np.float32)
    w1Id = builder.addInitializedInputTensor(w1data, w1name)

    w2data = np.array([test_case[2][0]], dtype=np.float32)
    w2R = np.empty([1], dtype=np.float32)
    w2Id = builder.addInitializedInputTensor(w2data, w2name)

    add0 = builder.aiOnnx.add([w0Id, input0])
    add1 = builder.aiOnnx.add([w1Id, add0])
    add2 = builder.aiOnnx.add([w2Id, add1])

    loss = builder.aiGraphcore.l1loss([add2], 1.0, debugContext="l1LossVal")

    builder.addOutputTensor(add2)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {})
    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    pat = popart.Patterns(popart.PatternsLevel.Default)

    dm = popart.DeviceManager()
    dm.setOnDemandAttachTimeout(int(1e4))
    device = dm.acquireAvailableDevice(
        1,
        connectionType=popart.DeviceConnectionType.OnDemand,
        selectionCriterion=popart.DeviceSelectionCriterion.Random)
    if device is None:
        raise OSError("Failed to acquire IPU.")

    # The stage->tensor map would come from the Bert model in reality
    # (see model.tensors)
    mock_tensor_map = {0: [w0Id], 1: [w1Id], 2: [w2Id]}

    factory = ScheduledOptimizerFactory(config,
                                        iteration,
                                        tensors=mock_tensor_map)
    assert_scaled_lr(factory, true_scaling)

    optimizer_step0 = factory.create()

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     loss=loss,
                                     optimizer=optimizer_step0,
                                     patterns=pat,
                                     deviceInfo=device)
    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()

    input_data = np.array([3.1415], dtype=np.float32)
    stepio = popart.PyStepIO({input0: input_data}, anchors)

    for step in range(iteration.total_steps):
        session.run(stepio)
        session.weightsToHost()
        weightsRead = popart.PyWeightsIO({w0Id: w0R, w1Id: w1R, w2Id: w2R})
        session.readWeights(weightsRead)

        assert np.isclose(test_case[0][step + 1], w0R)
        assert np.isclose(test_case[1][step + 1], w1R)
        assert np.isclose(test_case[2][step + 1], w2R)

        iteration.count += 1

        if factory.should_update(iteration):
            optimizer_step1 = factory.update_and_create(iteration)
            assert_scaled_lr(factory, true_scaling)
            session.updateOptimizerFromHost(optimizer_step1)
def sparse_mm_infer(sparse_mm_type, lhs_dims, vanilla_rhs_dims, block_size,
                    sparsity_level, transpose_rhs, memory_cycle_ratio,
                    inner_group_size):
    """Run a BSMatMul custom op for inference and build a dense reference
    output to compare against."""
    if transpose_rhs:
        matmul_dims = [
            lhs_dims[-2], vanilla_rhs_dims[-1], vanilla_rhs_dims[-2]
        ]
    else:
        matmul_dims = [
            lhs_dims[-2], vanilla_rhs_dims[-2], vanilla_rhs_dims[-1]
        ]

    lhs = create_dense_matrix(lhs_dims)
    if sparse_mm_type == g_sparseMatMulTypeLookup[
            'DENSE_LHS_SPARSE_RHS_DENSE_OUT']:
        bsr_rhs, lengths_per_2d_plane, vanilla_rhs, sparsity_mask = \
            create_sparse_matrix(vanilla_rhs_dims, block_size[1:],
                                 sparsity_level)
        rhs = bsr_rhs
        rhs_dims = bsr_rhs.shape
    elif sparse_mm_type == g_sparseMatMulTypeLookup[
            'DENSE_LHS_DENSE_RHS_SPARSE_OUT']:
        output_dims = lhs_dims[:-1]
        output_dims.append(vanilla_rhs_dims[-1])
        output_block_size = [block_size[0], block_size[2]]
        bsr_output, lengths_per_2d_plane, _, sparsity_mask = \
            create_sparse_matrix(output_dims, output_block_size,
                                 sparsity_level)
        rhs_dims = vanilla_rhs_dims
        rhs = create_dense_matrix(rhs_dims)

    # Create a builder and construct a graph
    builder = popart.Builder()

    lhs_tensorInfo = popart.TensorInfo("FLOAT", lhs_dims)
    rhs_tensorInfo = popart.TensorInfo("FLOAT", rhs_dims)
    lhsTensor = builder.addInputTensor(lhs_tensorInfo)
    rhsTensor = builder.addInputTensor(rhs_tensorInfo)

    outTensor = builder.customOp(
        opName="BSMatMul",
        opVersion=1,
        domain="ai.graphcore",
        inputs=[lhsTensor, rhsTensor],
        attributes={
            "bsr_rhs_lengths_per_2d_plane": lengths_per_2d_plane.tolist(),
            "matrix_dims": matmul_dims,
            "block_size": block_size,
            "sparsity_mask": sparsity_mask.tolist(),
            "bsmatmul_type": sparse_mm_type,
            "transpose_rhs": transpose_rhs,
            "memory_cycle_ratio": memory_cycle_ratio,
            "inner_group_size": inner_group_size,
            "in_type": g_input_data_type,
            "out_type": g_output_data_type,
            "pp_type": g_pp_data_type
        })[0]

    builder.addOutputTensor(outTensor)

    proto = builder.getModelProto()

    # Describe how to run the model
    dataFlow = popart.DataFlow(1, {outTensor: popart.AnchorReturnType("ALL")})

    # Create a session to compile and execute the graph
    session = popart.InferenceSession(
        fnModel=proto,
        dataFlow=dataFlow,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(1))

    # Compile graph
    session.prepareDevice()

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    rhs = np.array(rhs, dtype=g_input_data_type)
    stepio = popart.PyStepIO({lhsTensor: lhs, rhsTensor: rhs}, anchors)
    session.run(stepio)

    ipuOutput = anchors[outTensor]

    if sparse_mm_type == g_sparseMatMulTypeLookup[
            'DENSE_LHS_SPARSE_RHS_DENSE_OUT']:
        if transpose_rhs:
            transpose_indices = list(range(len(vanilla_rhs_dims)))
            transpose_indices[-2], transpose_indices[-1] = \
                transpose_indices[-1], transpose_indices[-2]
            vanilla_rhs = vanilla_rhs.transpose(tuple(transpose_indices))
            goldOutput = mm(lhs, vanilla_rhs)
        else:
            goldOutput = mm(lhs, vanilla_rhs)
    else:
        assert len(lhs.shape) == len(rhs.shape)
        if len(lhs.shape) == 2:
            lhs = np.expand_dims(lhs, 0)
            rhs = np.expand_dims(rhs, 0)

        mmOutput = mm(lhs, rhs)

        totalGroupDims = int(np.prod(lhs_dims[:-2]))

        num_rows_sparsity_mask_2d = output_dims[-2] // block_size[0]
        num_cols_sparsity_mask_2d = output_dims[-1] // block_size[2]

        assert sparsity_mask.shape == (
            totalGroupDims * num_rows_sparsity_mask_2d *
            num_cols_sparsity_mask_2d, )

        mmOutput = mmOutput.reshape(
            (totalGroupDims, lhs_dims[-2], rhs_dims[-1]))

        # Gather the dense blocks that the sparsity mask keeps; each kept
        # block becomes one flattened row of the gold output.
        goldOutput = []
        for dim in range(totalGroupDims):
            offset = num_rows_sparsity_mask_2d * num_cols_sparsity_mask_2d
            mmOutput_2d = mmOutput[dim]
            sliced_sparsity_mask = sparsity_mask[dim * offset:dim * offset +
                                                 offset]
            for sparsity_mask_idx in range(len(sliced_sparsity_mask)):
                if sliced_sparsity_mask[sparsity_mask_idx]:
                    mmOutput_2d_row_start = (
                        sparsity_mask_idx //
                        num_cols_sparsity_mask_2d) * block_size[0]
                    mmOutput_2d_row_end = \
                        mmOutput_2d_row_start + block_size[0]
                    mmOutput_2d_col_start = (
                        sparsity_mask_idx %
                        num_cols_sparsity_mask_2d) * block_size[2]
                    mmOutput_2d_col_end = \
                        mmOutput_2d_col_start + block_size[2]

                    mmOutput_2d_sliced = mmOutput_2d[
                        mmOutput_2d_row_start:mmOutput_2d_row_end,
                        mmOutput_2d_col_start:mmOutput_2d_col_end]
                    goldOutput.append(
                        mmOutput_2d_sliced.reshape(block_size[0] *
                                                   block_size[2]))

        goldOutput = np.array(goldOutput)

    return ipuOutput, goldOutput
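# `create_dense_matrix` is a helper defined elsewhere in this file. A
# minimal sketch consistent with its use above (random dense data of the
# requested dims in the test's input dtype); the body is an assumption:
def _create_dense_matrix_sketch(dims, dtype=np.float32):
    return np.random.rand(*dims).astype(dtype)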
def run(model_file_name, explicit_recompute=True):
    dsize = 10
    builder = popart.Builder()
    ip = builder.addInputTensor(popart.TensorInfo("FLOAT", [dsize, dsize]))
    d__ip = popart.reservedGradientPrefix() + ip

    def add_layer(in_id):
        np.random.seed(1)
        scaler = 0.01
        w = builder.addInitializedInputTensor(
            np.random.randn(dsize, dsize).astype(np.float32) * scaler)
        b = builder.addInitializedInputTensor(
            np.zeros((dsize, 1)).astype(np.float32))
        matmul_id = builder.aiOnnxOpset10.gemm([in_id, w, b])
        return matmul_id

    if explicit_recompute:
        with builder.recomputeOutput(popart.RecomputeType.Recompute):
            m1 = add_layer(ip)
            m2 = add_layer(m1)
            m3 = add_layer(m2)
    else:
        m1 = add_layer(ip)
        m2 = add_layer(m1)
        m3 = add_layer(m2)

    anchorIds = []
    for i in (ip, m1, m2, m3):
        anchorIds.append(popart.reservedGradientPrefix() + i)

    out = builder.aiGraphcore.identityloss([m3])
    builder.addOutputTensor(out)

    device = tu.create_test_device()

    dataflow_anchors = {}
    for anchorId in anchorIds:
        dataflow_anchors.update({anchorId: popart.AnchorReturnType("All")})

    opts = popart.SessionOptions()
    opts.explicitRecomputation = explicit_recompute

    proto = builder.getModelProto()
    session = popart.TrainingSession(
        fnModel=proto,
        dataFlow=popart.DataFlow(1, dataflow_anchors),
        optimizer=popart.ConstSGD(0.01),
        loss=out,
        patterns=popart.Patterns(popart.PatternsLevel.All),
        userOptions=opts,
        deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()

    ip_data = np.ones((dsize, dsize), dtype=np.float32)
    stepio = popart.PyStepIO({ip: ip_data}, anchors)

    session.run(stepio)
    session.modelToHost(str(tmpdir / model_file_name))
def sparse_mm_train(sparse_mm_type, lhs_dims, vanilla_rhs_dims, block_size,
                    sparsity_level, transpose_rhs, memory_cycle_ratio,
                    inner_group_size):
    if transpose_rhs:
        matmul_dims = [
            lhs_dims[-2], vanilla_rhs_dims[-1], vanilla_rhs_dims[-2]
        ]
    else:
        matmul_dims = [
            lhs_dims[-2], vanilla_rhs_dims[-2], vanilla_rhs_dims[-1]
        ]

    lhs = create_dense_matrix(lhs_dims)
    if sparse_mm_type == g_sparseMatMulTypeLookup[
            'DENSE_LHS_SPARSE_RHS_DENSE_OUT']:
        bsr_rhs, lengths_per_2d_plane, vanilla_rhs, sparsity_mask = \
            create_sparse_matrix(vanilla_rhs_dims, block_size[1:],
                                 sparsity_level)
        rhs = bsr_rhs
        rhs_dims = bsr_rhs.shape
    elif sparse_mm_type == g_sparseMatMulTypeLookup[
            'DENSE_LHS_DENSE_RHS_SPARSE_OUT']:
        output_dims = lhs_dims[:-1]
        output_dims.append(vanilla_rhs_dims[-1])
        output_block_size = [block_size[0], block_size[2]]
        bsr_output, lengths_per_2d_plane, vanilla_output, sparsity_mask = \
            create_sparse_matrix(output_dims, output_block_size,
                                 sparsity_level)
        lhs_inv = np.linalg.inv(lhs)
        rhs = np.matmul(lhs_inv, vanilla_output)
        rhs_dims = vanilla_rhs_dims

    # MODEL CREATION
    builder = popart.Builder()
    lhs_tensorInfo = popart.TensorInfo("FLOAT", lhs_dims)
    lhsTensor = builder.addInputTensor(lhs_tensorInfo)
    rhsTensor = builder.addInitializedInputTensor(rhs)

    outTensor = builder.customOp(
        opName="BSMatMul",
        opVersion=1,
        domain="ai.graphcore",
        inputs=[lhsTensor, rhsTensor],
        attributes={
            "bsr_rhs_lengths_per_2d_plane": lengths_per_2d_plane.tolist(),
            "matrix_dims": matmul_dims,
            "block_size": block_size,
            "sparsity_mask": sparsity_mask.tolist(),
            "bsmatmul_type": sparse_mm_type,
            "transpose_rhs": transpose_rhs,
            "memory_cycle_ratio": memory_cycle_ratio,
            "inner_group_size": inner_group_size,
            "in_type": g_input_data_type,
            "out_type": g_output_data_type,
            "pp_type": g_pp_data_type
        })[0]

    builder.addOutputTensor(outTensor)

    probs = builder.aiOnnx.softmax([outTensor], axis=1)

    if sparse_mm_type == g_sparseMatMulTypeLookup[
            'DENSE_LHS_SPARSE_RHS_DENSE_OUT']:
        labels_shape = lhs_dims[:-1]
    elif sparse_mm_type == g_sparseMatMulTypeLookup[
            'DENSE_LHS_DENSE_RHS_SPARSE_OUT']:
        labels_shape = [np.sum(sparsity_mask)]

    label_tensorInfo = popart.TensorInfo("INT32", labels_shape)
    labelTensor = builder.addInputTensor(label_tensorInfo)

    loss = builder.aiGraphcore.nllloss([probs, labelTensor],
                                       debugPrefix="nllLossVal")

    proto = builder.getModelProto()

    # Describe how to run the model
    anchor_desc = {
        outTensor: popart.AnchorReturnType("ALL"),
        loss: popart.AnchorReturnType("ALL")
    }
    dataFlow = popart.DataFlow(1, anchor_desc)
    label_data = g_random_labels.choice(9, labels_shape)

    session = popart.TrainingSession(
        fnModel=proto,
        loss=loss,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(1),
        optimizer=popart.ConstSGD(0.01),
        dataFlow=dataFlow)

    # Compile graph
    session.prepareDevice()

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    # TRAINING
    session.weightsFromHost()

    stepio = popart.PyStepIO({
        lhsTensor: lhs,
        labelTensor: label_data
    }, anchors)
    session.run(stepio)
def get_model_anchors_model1(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             labelArray=None):
    micro_batch_size = batch_size // gradAcclFactor

    builder = popart.Builder()
    input_shape = [micro_batch_size, hidden_size]
    input_ = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape))

    x = input_
    with builder.virtualGraph(0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_0_{i}")
            x = builder.aiOnnx.matmul([x, w])
    with builder.virtualGraph(1 if doSharding else 0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_1_{i}")
            x = builder.aiOnnx.matmul([x, w])
    with builder.virtualGraph(2 if doSharding else 0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_2_{i}")
            if i == 1:
                w0 = w
            x = builder.aiOnnx.matmul([x, w])
        label = builder.addInputTensor("INT32", [micro_batch_size])
        x = builder.aiGraphcore.nllloss([x, label])

    output = x
    builder.addOutputTensor(output)

    art = popart.AnchorReturnType("All")
    anchor_map = {x: art, w0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + x] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + x] = art
            anchor_map[popart.reservedRestoredPrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    if doSharding is False:
        numIPUs = 1
    else:
        numIPUs = 3

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=output,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of batchesPerStep. We repeat the labels
        # as we want consistency if we have different shape inputs between
        # examples.
        outer_dim *= batchesPerStep
        labelArray = np.repeat(labelArray[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Divide the batchesPerStep batches into gradAcclFactor *
        # batchesPerStep micro-batches.
        outer_dim *= gradAcclFactor
        labelArray = labelArray.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Add the gradAcclFactor * batchesPerStep dimension into the input.
        input_shape = [outer_dim] + input_shape

    stepio = popart.PyStepIO(
        {
            input_: np.ones(input_shape, np.float32),
            label: labelArray.astype(np.int32)
        }, anchors)

    session.weightsFromHost()
    session.run(stepio)

    return anchors
def sparse_softmax(dims, block_size, sparsity_level, inner_group_size):
    """Run BsSoftmax on a block-sparse input on the IPU and compute the
    reference result with a dense softmax on the host.

    Returns the pair (ipu_output, gold_output) for comparison.
    """
    sparse_input, lengths_per_2d_plane, dense_input, sparsity_mask = create_sparse_matrix(
        dims, block_size, sparsity_level, -1000)

    # Create a builder and construct a graph
    builder = popart.Builder()

    tensor_info = popart.TensorInfo("FLOAT", sparse_input.shape)
    input_tensor = builder.addInputTensor(tensor_info)

    output_tensor = builder.customOp(opName="BsSoftmax",
                                     opVersion=1,
                                     domain="ai.graphcore",
                                     inputs=[input_tensor],
                                     attributes={
                                         "matrixDims": dims,
                                         "blockSize": block_size,
                                         "sparsity": sparsity_mask.tolist(),
                                         "groupSizes":
                                         lengths_per_2d_plane.tolist(),
                                         "innerGroupSize": inner_group_size,
                                         "subBlockMaskPerGroup":
                                         "None" * len(lengths_per_2d_plane)
                                     })[0]
    builder.addOutputTensor(output_tensor)

    proto = builder.getModelProto()

    # Describe how to run the model
    dataFlow = popart.DataFlow(1, {output_tensor: popart.AnchorReturnType("ALL")})

    # Create a session to compile and execute the graph
    session = popart.InferenceSession(
        fnModel=proto,
        dataFlow=dataFlow,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(1))

    # Compile graph
    session.prepareDevice()

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    sparse_input = np.array(sparse_input, dtype=g_input_data_type)
    stepio = popart.PyStepIO({input_tensor: sparse_input}, anchors)
    session.run(stepio)

    ipu_output = anchors[output_tensor]

    group_dims = dims[:-2]
    mat_dims = dims[-2:]
    blocks_2d = [mat_dims[0] // block_size[0], mat_dims[1] // block_size[1]]
    num_blocks_2d = blocks_2d[0] * blocks_2d[1]
    block_area = block_size[0] * block_size[1]
    total_group_dims = int(np.prod(group_dims))
    assert sparsity_mask.shape == (total_group_dims * num_blocks_2d, )

    cpu_output = softmax(dense_input)

    np.set_printoptions(precision=2)
    np.set_printoptions(suppress=True)

    # Convert the dense reference into blocked layout:
    # [groups, row-blocks, block-rows, col-blocks, block-cols]
    # -> [groups, blocks, block elements]
    cpu_output = cpu_output.reshape([
        total_group_dims, blocks_2d[0], block_size[0], blocks_2d[1],
        block_size[1]
    ])
    cpu_output = np.transpose(cpu_output, [0, 1, 3, 2, 4])
    cpu_output = cpu_output.reshape(total_group_dims, num_blocks_2d, block_area)

    gold_output = []
    offset = 0
    for g in range(total_group_dims):
        cpu_output_2d = cpu_output[g]
        sliced_sparsity_mask = sparsity_mask[offset:offset + num_blocks_2d]
        offset = offset + num_blocks_2d
        for sparsity_mask_idx in range(num_blocks_2d):
            if sliced_sparsity_mask[sparsity_mask_idx]:
                gold_output.append(cpu_output_2d[sparsity_mask_idx])
    gold_output = np.array(gold_output)

    assert ipu_output.shape == gold_output.shape
    return ipu_output, gold_output
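
# A hypothetical call, comparing the IPU result against the host reference;
# the dims and block sizes below are illustrative only and must divide evenly.
ipu_out, gold_out = sparse_softmax(dims=[2, 64, 64],
                                   block_size=[8, 8],
                                   sparsity_level=0.8,
                                   inner_group_size=1)
np.testing.assert_allclose(ipu_out, gold_out, rtol=1e-5, atol=1e-5)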
def bwd_graph(popart_model,
              torch_model,
              popart_loss_fn,
              torch_loss_fn,
              mapping=None,
              transform=None):
    np.random.seed(1984)
    random.seed(1984)
    torch.manual_seed(1984)

    # ------------------- PopART --------------------
    config = popart_model.config
    builder = popart_model.builder

    sequence_info = popart.TensorInfo(
        "UINT32", [config.batch_size * config.sequence_length])
    indices = builder.addInputTensor(sequence_info)
    positions = builder.addInputTensor(sequence_info)
    segments = builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(
            0, config.vocab_length,
            (config.batch_size * config.sequence_length)).astype(np.uint32),
        positions:
        np.random.randint(
            0, config.sequence_length,
            (config.batch_size * config.sequence_length)).astype(np.uint32),
        segments:
        np.random.randint(
            0, 2,
            (config.batch_size * config.sequence_length)).astype(np.uint32)
    }

    output = popart_model.build_graph(indices, positions, segments)
    proto = builder.getModelProto()

    losses = popart_loss_fn(output)
    optimizer = popart.ConstSGD(0.01)

    outputs, post_proto = run_py(
        proto,
        data,
        output,
        loss=losses,
        optimizer=optimizer,
        ipus=math.ceil(config.num_layers / config.layers_per_ipu) +
        popart_model.layer_offset)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = {
        "input_ids":
        data[indices].reshape(config.batch_size,
                              config.sequence_length).astype(np.int32),
        "position_ids":
        data[positions].reshape(config.batch_size,
                                config.sequence_length).astype(np.int32),
        "token_type_ids":
        data[segments].reshape(config.batch_size,
                               config.sequence_length).astype(np.int32)
    }

    torch_to_onnx = get_mapping(config, init=mapping)
    transform_weights = get_transform(config, init=transform)

    # ------------------- PyTorch -------------------------
    # Turn off dropout
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, torch_to_onnx, transform_weights)

    optim = torch.optim.SGD(torch_model.parameters(),
                            0.01,
                            weight_decay=0.0,
                            momentum=0.0)

    torch_outputs = torch_model(
        **{k: torch.from_numpy(t).long() for k, t in inputs.items()})
    torch_loss = torch_loss_fn(torch_outputs)
    torch_loss.backward()
    optim.step()

    check_tensors([output.detach().numpy() for output in torch_outputs],
                  outputs)

    check_model(torch_model,
                post_proto,
                torch_to_onnx,
                transform_weights,
                margin=6e-7)
def test(opt0, opt1, e0, e1, e2):
    builder = popart.Builder()
    input0Shape = [stepSize, batchSize, sampleDim]
    input0 = builder.addInputTensor(popart.TensorInfo("FLOAT", input0Shape),
                                    "input0")

    w0data = np.array([100.0], dtype=np.float32)
    w0R = np.array([-777.0], dtype=np.float32)
    w0Id = builder.addInitializedInputTensor(w0data, w0name)

    w1data = np.array([200.0], dtype=np.float32)
    w1R = np.array([-777.0], dtype=np.float32)
    w1Id = builder.addInitializedInputTensor(w1data, w1name)

    w2data = np.array([300.0], dtype=np.float32)
    w2R = np.array([-777.0], dtype=np.float32)
    w2Id = builder.addInitializedInputTensor(w2data, w2name)

    add0 = builder.aiOnnx.add([w0Id, input0])
    add1 = builder.aiOnnx.add([w1Id, add0])
    add2 = builder.aiOnnx.add([w2Id, add1])
    l1 = builder.aiGraphcore.l1loss([add2], 1.0)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {})

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enableGroupedMatmuls = False

    pat = popart.Patterns(popart.PatternsLevel.Default)

    session = popart.TrainingSession(
        fnModel=proto,
        dataFlow=dataFlow,
        userOptions=opts,
        loss=l1,
        optimizer=opt0,
        patterns=pat,
        deviceInfo=tu.create_test_device(opts={"compileIPUCode": False}))

    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()

    input0Data = np.array([3.1415], dtype=np.float32)
    stepio = popart.PyStepIO({input0: input0Data}, anchors)

    session.run(stepio)

    session.updateOptimizerFromHost(opt1)

    session.run(stepio)

    session.weightsToHost()

    weightsRead = popart.PyWeightsIO({w0Id: w0R, w1Id: w1R, w2Id: w2R})
    session.readWeights(weightsRead)

    assert np.isclose(e0['initalValue'], w0R)
    assert np.isclose(e1['initalValue'], w1R)
    assert np.isclose(e2['initalValue'], w2R)
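
# A hypothetical driver for the test above, assuming stepSize, batchSize,
# sampleDim and the w*name globals are defined alongside it. The e0..e2 dicts
# carry the expected final weight values under the key the asserts use (note
# the 'initalValue' spelling matches the asserts); the numeric values below
# are placeholders, not the true expectations for this optimizer pair.
opt_first = popart.SGD({"defaultLearningRate": (0.1, False)})
opt_second = popart.SGD({"defaultLearningRate": (0.05, False)})
test(opt_first, opt_second,
     {'initalValue': 100.0},   # placeholder expectation for w0
     {'initalValue': 200.0},   # placeholder expectation for w1
     {'initalValue': 300.0})   # placeholder expectation for w2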
def run_test(enablePipelining):
    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    i1 = builder.addInputTensor(
        popart.TensorInfo("FLOAT", input_data.shape[1::]))
    w0 = builder.addInitializedInputTensor(weight_data_0)
    w1 = builder.addInitializedInputTensor(weight_data_1)
    w2 = builder.addInitializedInputTensor(weight_data_2)

    o0 = builder.aiOnnx.matmul([i1, w0])
    if enablePipelining:
        builder.virtualGraph(o0, 0)

    o1 = builder.aiOnnx.matmul([o0, w1])
    if enablePipelining:
        builder.virtualGraph(o1, 1)

    o2 = builder.aiOnnx.matmul([o1, w2])
    if enablePipelining:
        builder.virtualGraph(o2, 2)

    o2l1 = builder.aiGraphcore.l1loss([o2], 0.1)
    if enablePipelining:
        builder.virtualGraph(o2l1, 2)

    proto = builder.getModelProto()

    anchorId = popart.reservedDefaultScaledLearningRate0Prefix() + "FLOAT"

    # Need to anchor the output of the backward pass to stop it being pruned
    dataFlow = popart.DataFlow(bps, [anchorId])

    optimizer = popart.SGD({"defaultLearningRate": (1.0, False)})

    opts = popart.SessionOptions()
    if enablePipelining:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
    opts.enablePipelining = enablePipelining

    numIPUs = 1
    if enablePipelining:
        numIPUs = 3

    session = popart.TrainingSession(
        fnModel=proto,
        dataFlow=dataFlow,
        loss=o2l1,
        optimizer=optimizer,
        userOptions=opts,
        deviceInfo=tu.create_test_device(numIpus=numIPUs))

    session.prepareDevice()

    anchors = session.initAnchorArrays()

    inputs = {i1: input_data}
    stepio = popart.PyStepIO(inputs, anchors)

    session.weightsFromHost()

    # Run 2 steps, changing the optimizer halfway through
    result = []
    session.run(stepio)
    result.append(np.copy(anchors[anchorId]))

    session.updateOptimizerFromHost(
        popart.SGD({"defaultLearningRate": (0.5, False)}))

    session.run(stepio)
    result.append(np.copy(anchors[anchorId]))

    return result
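
# The helper returns the anchored scaled learning rate after each of the two
# runs (1.0 then 0.5), so a typical check is that enabling pipelining does not
# change what the optimizer sees:
without_pipelining = run_test(False)
with_pipelining = run_test(True)
for a, b in zip(without_pipelining, with_pipelining):
    assert np.allclose(a, b)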
def runTest(forceAddOutOfPlace, pipelineRecomputation):
    """
    Test of pipelining with dropout, recomputation, graph replication,
    gradient accumulation
    """
    # Has dependencies on T12562, T12976, T13098 for full support

    seed = 1015
    npr.seed(seed)
    torch.manual_seed(seed)

    # L1 loss value
    lambda1 = 1.0

    # optimizer parameters
    defaultLearningRate0 = 0.001
    defaultMomentum0 = 0.01
    defaultDampening0 = 0.5
    lossScaling0 = 10.0
    defaultVelocityScaling0 = 0.15
    defaultWeightDecay0 = 0.01

    # tensor dimensions and replications
    height = 6
    batchesPerStep = 5
    sampleShape = [height, height]
    accumulationFactor = 4
    samplesPerBatch = 48
    divvyFactor = replicationFactor * accumulationFactor
    if samplesPerBatch % divvyFactor != 0:
        raise RuntimeError("Invalid divvy factor")
    samplesPerMicroBatch = samplesPerBatch // divvyFactor
    stepDataShape = [batchesPerStep, samplesPerBatch, height, height]
    microBatchShape = [samplesPerMicroBatch, height, height]
    stepDataInfo = popart.TensorInfo("FLOAT", stepDataShape)
    microBatchInfo = popart.TensorInfo("FLOAT", microBatchShape)

    # initial weight and input values
    w0vals = np.array(npr.randn(height, height), dtype=np.float32)
    w1vals = np.array(npr.randn(height, height), dtype=np.float32)
    w2vals = np.array(npr.randn(height, height), dtype=np.float32)
    inputVals = np.array(npr.randn(*stepDataShape), dtype=np.float32)

    # Build the ONNX Model
    builder = popart.Builder()
    input0 = builder.addInputTensor(microBatchInfo)
    w0 = builder.addInitializedInputTensor(w0vals)
    w1 = builder.addInitializedInputTensor(w1vals)
    w2 = builder.addInitializedInputTensor(w2vals)

    scaleFactor = 1. / np.sqrt(height + 0.)

    # Model:
    #
    # input  w0                           w1
    #     \  |                           /
    #      matmul -> scale -> dropout -> matmul
    #        \                            |
    #         \                         scale
    #          \                          |
    #           \                      dropout
    #            \                        /\
    #            add -------<---<----<----  \
    #             |                          |
    #          dropout                  scale by 2
    #             |                          |
    # = = = = = = | = = = = IPU barrier = = =|= = = = = =
    #             |                          |
    #             |   w2                     |
    #             |  /                       |
    #           matmul                       /
    #             |                         /
    #           scale                      /
    #             |                       /
    #          dropout                   /
    #             |                      |
    #             ------->---->---->--- add -> L1 loss (lambda1)

    with builder.virtualGraph(0):
        mm0 = builder.aiOnnx.matmul([input0, w0])
        scale0 = builder.aiGraphcore.scale([mm0], scaleFactor)
        ratio0 = 0.35
        [dropout0, mask0] = builder.aiOnnx.dropout([scale0],
                                                   num_outputs=2,
                                                   ratio=ratio0)
        mm1 = builder.aiOnnx.matmul([dropout0, w1])
        scale1 = builder.aiGraphcore.scale([mm1], scaleFactor)
        ratio1 = 0.5
        [dropout1, mask1] = builder.aiOnnx.dropout([scale1],
                                                   num_outputs=2,
                                                   ratio=ratio1)
        dropout1 = builder.aiGraphcore.scale([dropout1], 2.0)
        skipOut = builder.aiOnnx.add([mm0, dropout1])
        # See resolved task T13137
        if forceAddOutOfPlace:
            builder.setInplacePreferences(skipOut, {"AddRhsInplace": -1.0})

        ratioSkip = 0.6
        [dropoutSkip, maskSkip] = builder.aiOnnx.dropout([skipOut],
                                                         num_outputs=2,
                                                         ratio=ratioSkip)

        # see T13142: we do this so that the recomputation does not modify the anchors
        mask0 = builder.aiOnnx.identity([mask0])
        mask1 = builder.aiOnnx.identity([mask1])
        maskSkip = builder.aiOnnx.identity([maskSkip])

    with builder.virtualGraph(1):
        mm2 = builder.aiOnnx.matmul([dropoutSkip, w2])
        scale2 = builder.aiGraphcore.scale([mm2], scaleFactor)
        ratio2 = 0.7
        [dropout2, mask2] = builder.aiOnnx.dropout([scale2],
                                                   num_outputs=2,
                                                   ratio=ratio2)
        out = builder.aiOnnx.add([dropout2, dropout1])
        l1 = builder.aiGraphcore.l1loss([out],
                                        lambda1,
                                        reduction=popart.ReductionType.Sum)

        # see T13142: we do this so that the recomputation does not modify the anchors
        mask2 = builder.aiOnnx.identity([mask2])

    anchors = {
        mask0: popart.AnchorReturnType("All"),
        mask1: popart.AnchorReturnType("All"),
        mask2: popart.AnchorReturnType("All"),
        maskSkip: popart.AnchorReturnType("All"),
    }

    dataFlow = popart.DataFlow(batchesPerStep, anchors)

    device = tu.create_test_device(numIpus=nIPUs)
    assert device

    userOptions = popart.SessionOptions()
    # This requires T12562 to be solved before enabling (TODO)
    userOptions.enableOutlining = False
    userOptions.enablePipelining = True
    userOptions.enableGradientAccumulation = True
    userOptions.accumulationFactor = accumulationFactor

    if pipelineRecomputation:
        userOptions.autoRecomputation = popart.RecomputationType.Pipeline

    if replicationFactor > 1:
        userOptions.enableReplicatedGraphs = True
        userOptions.replicatedGraphCount = replicationFactor

    userOptions.virtualGraphMode = popart.VirtualGraphMode.Manual
    # TODO https://phabricator.sourcevertex.net/T14035
    userOptions.enablePrefetchDatastreams = False
    # passes:
    userOptions.engineOptions = {"exchange.streamBufferOverlap": "any"}
    # fails:
    # userOptions.engineOptions = {"exchange.streamBufferOverlap": "hostRearrangeOnly"}

    patterns = popart.Patterns()
    patterns.InPlace = True

    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        optimizer=popart.SGD({
            "defaultLearningRate": (defaultLearningRate0, False),
            "defaultMomentum": (defaultMomentum0, False),
            "defaultDampening": (defaultDampening0, False),
            "defaultVelocityScaling": (defaultVelocityScaling0, False),
            "lossScaling": (lossScaling0, True),
            "defaultWeightDecay": (defaultWeightDecay0, True)
        }),
        loss=l1,
        patterns=patterns,
        userOptions=userOptions,
        deviceInfo=device)

    anchorArrays = session.initAnchorArrays()

    session.prepareDevice()
    session.setRandomSeed(7)
    session.weightsFromHost()

    stepio = popart.PyStepIO({input0: inputVals}, anchorArrays)
    session.run(stepio)
    session.weightsToHost()

    w0R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    w1R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    w2R = np.array(-777.0 * np.ones(sampleShape), dtype=np.float32)
    weightsRead = popart.PyWeightsIO({w0: w0R, w1: w1R, w2: w2R})
    session.readWeights(weightsRead)

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            # merge replication, accumulation
            flattenedShape = [anchorArrays[mask0].shape[0], -1, height, height]
            self.w0 = torch.nn.Parameter(torch.from_numpy(w0vals.copy()))
            self.mask0 = torch.from_numpy(
                anchorArrays[mask0].reshape(flattenedShape))

            self.w1 = torch.nn.Parameter(torch.from_numpy(w1vals.copy()))
            self.mask1 = torch.from_numpy(
                anchorArrays[mask1].reshape(flattenedShape))

            self.maskSkip = torch.from_numpy(
                anchorArrays[maskSkip].reshape(flattenedShape))

            self.w2 = torch.nn.Parameter(torch.from_numpy(w2vals.copy()))
            self.mask2 = torch.from_numpy(
                anchorArrays[mask2].reshape(flattenedShape))

        def forward(self, x, i):
            mm0 = torch.matmul(x, self.w0)
            dr0 = mm0 * scaleFactor * self.mask0[i].type(
                torch.FloatTensor) / (1 - ratio0)
            mm1 = torch.matmul(dr0, self.w1)
            dr1 = mm1 * scaleFactor * self.mask1[i].type(
                torch.FloatTensor) / (1 - ratio1)
            dr1 = 2 * dr1
            drSkip = (dr1 + mm0) * self.maskSkip[i].type(
                torch.FloatTensor) / (1 - ratioSkip)
            mm2 = torch.matmul(drSkip, self.w2)
            dr2 = mm2 * scaleFactor * self.mask2[i].type(
                torch.FloatTensor) / (1 - ratio2)
            out = dr1 + dr2
            return out

    net = Net()
    optimizer = optim.SGD(net.parameters(),
                          lr=defaultLearningRate0,
                          momentum=defaultMomentum0,
                          dampening=defaultDampening0,
                          weight_decay=defaultWeightDecay0)

    # caveat: alternative work-around for TODO T13098
    for group in optimizer.param_groups:
        for p in group['params']:
            param_state = optimizer.state[p]
            param_state['momentum_buffer'] = p.data * 0

    for i in range(batchesPerStep):
        out = net(torch.from_numpy(inputVals[i]), i)
        loss = lambda1 * torch.sum(torch.abs(out))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    delta0 = np.sum(np.abs(net.w0.detach().numpy() - w0vals))
    delta1 = np.sum(np.abs(net.w1.detach().numpy() - w1vals))
    delta2 = np.sum(np.abs(net.w2.detach().numpy() - w2vals))
    print("pytorch baseline")
    print("Total moved by w0: ", delta0)
    print("Total moved by w1: ", delta1)
    print("Total moved by w2: ", delta2)

    error0 = np.sum(np.abs(w0R - net.w0.detach().numpy())) / delta0
    error1 = np.sum(np.abs(w1R - net.w1.detach().numpy())) / delta1
    error2 = np.sum(np.abs(w2R - net.w2.detach().numpy())) / delta2
    print("without pipelining")
    print("Total moved by w0: ", np.sum(np.abs(w0R - w0vals)))
    print("Total moved by w1: ", np.sum(np.abs(w1R - w1vals)))
    print("Total moved by w2: ", np.sum(np.abs(w2R - w2vals)))
    print("l1 error for w0: ", error0)
    print("l1 error for w1: ", error1)
    print("l1 error for w2: ", error2)

    assert error0 < 1e-5
    assert error1 < 1e-5
    assert error2 < 1e-5
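
# Both switch combinations are exercised the same way; the asserts inside
# runTest check that the IPU weights match the PyTorch baseline. A typical
# parametrisation (assuming replicationFactor and nIPUs are set at module
# level, as the function expects):
runTest(forceAddOutOfPlace=False, pipelineRecomputation=True)
runTest(forceAddOutOfPlace=True, pipelineRecomputation=True)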
# In this example we anchor:
# the l1 loss "out",
# and the input tensor "image0"
anchors = {
    "out": popart.AnchorReturnType("EveryN", 2),
    "image0": popart.AnchorReturnType("All")
}

dataFlow = popart.DataFlow(batchesPerStep, anchors)

# PopART is non-dynamic. All input Tensor shapes and
# types must be fed into the Session constructor.
# In this example there is 1 streamed input, image0.
inputShapeInfo = popart.InputShapeInfo()
inputShapeInfo.add("image0",
                   popart.TensorInfo("FLOAT", [batchSize, nChans, 32, 32]))

inNames = ["image0"]

# outNames: not the same as anchors. These are the outputs of the onnx
# model. In training this is the scalar loss on which 'backward' is called.
outNames = ["out"]

# cifar training data loader: at index 0: image, at index 1: label.
cifarInIndices = {"image0": 0}


class Module0(torch.nn.Module):
def fwd_graph(popart_model,
              torch_model,
              mode,
              mapping=None,
              transform=None,
              replication_factor=1,
              replicated_tensor_sharding=False):
    # ------------------- PopART --------------------
    config = popart_model.config
    builder = popart_model.builder

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = builder.addInputTensor(sequence_info)
    positions = builder.addInputTensor(sequence_info)
    segments = builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(
            0, config.vocab_length,
            (replication_factor,
             config.micro_batch_size * config.sequence_length)).astype(np.uint32),
        positions:
        np.random.randint(
            0, config.sequence_length,
            (replication_factor,
             config.micro_batch_size * config.sequence_length)).astype(np.uint32),
        segments:
        np.random.randint(
            0, 2,
            (replication_factor,
             config.micro_batch_size * config.sequence_length)).astype(np.uint32)
    }

    user_options = {}
    if mode == ExecutionMode.PHASED:
        user_options = {
            "batchSerializationFactor": 1,
            "executionPhases": popart_model.total_execution_phases
        }
        output = popart_model(indices, positions, segments)
        ipus = 2
    else:
        output = popart_model.build_graph(indices, positions, segments)
        ipus = popart_model.total_ipus

    proto = builder.getModelProto()

    outputs, _ = run_py(proto,
                        data,
                        output,
                        user_options=user_options,
                        execution_mode=mode,
                        replication_factor=replication_factor,
                        replicated_tensor_sharding=replicated_tensor_sharding,
                        ipus=ipus)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = {
        "input_ids":
        data[indices].reshape(replication_factor * config.micro_batch_size,
                              config.sequence_length).astype(np.int32),
        "position_ids":
        data[positions].reshape(replication_factor * config.micro_batch_size,
                                config.sequence_length).astype(np.int32),
        "token_type_ids":
        data[segments].reshape(replication_factor * config.micro_batch_size,
                               config.sequence_length).astype(np.int32)
    }

    torch_to_onnx = get_mapping(config, init=mapping)
    transform_weights = get_transform(config, init=transform)

    # ------------------- PyTorch -------------------------
    # Turn off dropout
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, torch_to_onnx, transform_weights)

    torch_outputs = run_fwd_model(inputs, torch_model)

    check_tensors(torch_outputs, outputs)
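
# The reshape from the PopART feed layout to the PyTorch layout above merges
# the replication axis into the batch axis. A small numpy illustration with
# replication_factor=2, micro_batch_size=2, sequence_length=3:
demo = np.arange(2 * 2 * 3, dtype=np.uint32).reshape(2, 2 * 3)
merged = demo.reshape(2 * 2, 3)
assert merged.shape == (4, 3)
assert np.array_equal(merged[0], demo[0, :3])  # replica 0, sample 0
assert np.array_equal(merged[2], demo[1, :3])  # replica 1, sample 0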
def test_train(tmpdir, capfd):
    filt_data = np.array([1., 2., 1., 2.], dtype=np.float32)
    filt_data = np.reshape(filt_data, [1, 1, 2, 2])
    input_data = np.array([1., 2., 3., 4.], dtype=np.float32)
    input_data = np.reshape(input_data, [1, 1, 2, 2])

    builder = popart.Builder()
    shape = popart.TensorInfo("FLOAT", input_data.shape)
    i1 = builder.addInputTensor(shape, "data")
    i2 = builder.addInitializedInputTensor(filt_data, "filter")
    # both i2 and d__i2 will be printed
    p1 = builder.aiGraphcore.printtensor([i2])
    c1 = builder.aiOnnx.conv([i1, p1],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[2, 2])
    # c1 will be printed, but d__c1 will not
    o = builder.aiGraphcore.printtensor([c1], print_gradient=0)
    l1 = builder.aiGraphcore.l1loss([o], 0.1,
                                    reduction=popart.ReductionType.Sum)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, {o: popart.AnchorReturnType("All")})

    opts = popart.SessionOptions()
    opts.enableOutlining = False
    opts.enableOutliningCopyCostPruning = False

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     optimizer=popart.ConstSGD(0.1),
                                     loss=l1,
                                     deviceInfo=tu.create_test_device())

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()

    inputs = {i1: input_data}
    stepio = popart.PyStepIO(inputs, anchors)

    capfd.readouterr()
    session.run(stepio)

    captured = capfd.readouterr()
    output = captured.err

    # Remove ESC characters
    output = re.sub(chr(27), '', output)
    # Remove termcolor sequences
    output = re.sub(r'\[\d\dm', '', output)
    # Remove popart log lines
    output = re.sub(r'\[\d\d\d\d-\d\d-\d\d .*?\n', '', output)
    # Remove all whitespace
    output = re.sub(r'\s+', '', output)

    # Build a regex matching printed tensors of the form
    # name:{{{{float,float},{float,float}}}}
    pattern = 'name:{{{{float,float},{float,float}}}}'
    pattern = re.sub('name', r'[\\w:]+', pattern)
    pattern = re.sub('float', r'\\d(?:\\.\\d+)?', pattern)
    matches = re.findall(pattern, output)

    d__i2 = popart.reservedGradientPrefix() + i2
    assert len(matches) == 3
    assert matches[0] == i2 + ":{{{{1,2},{1,2}}}}"
    assert matches[1] == c1 + ":{{{{2,2},{6,4}}}}"
    assert matches[2] == d__i2 + ":{{{{0.4,0.3},{0.2,0.1}}}}"
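
# The printed values asserted above can be checked by hand. A short sketch,
# using scipy's correlate2d (ONNX Conv is cross-correlation) to reproduce the
# forward output {{2,2},{6,4}} and the filter gradient {{0.4,0.3},{0.2,0.1}}:
from scipy.signal import correlate2d

x = np.array([[1., 2.], [3., 4.]])
w = np.array([[1., 2.], [1., 2.]])

# Forward: pad by 1 on each side, stride 2.
xp = np.pad(x, 1)
fwd = correlate2d(xp, w, mode='valid')[::2, ::2]
assert np.array_equal(fwd, [[2., 2.], [6., 4.]])

# Backward: l1loss with Sum reduction and scale 0.1 gives dL/dy = 0.1 for
# every (positive) output element, so each filter weight's gradient is 0.1
# times the sum of the input values it touched across the four windows.
dy = 0.1 * np.ones_like(fwd)
dw = np.zeros_like(w)
for oi in range(2):
    for oj in range(2):
        dw += dy[oi, oj] * xp[2 * oi:2 * oi + 2, 2 * oj:2 * oj + 2]
assert np.allclose(dw, [[0.4, 0.3], [0.2, 0.1]])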
def bwd_graph(popart_model,
              torch_model,
              mode,
              popart_loss_fn,
              torch_loss_fn,
              mapping=None,
              transform=None,
              replication_factor=1,
              replicated_tensor_sharding=False,
              opt_type="SGD"):
    np.random.seed(1984)
    random.seed(1984)
    torch.manual_seed(1984)

    # ------------------- PopART --------------------
    config = popart_model.config
    builder = popart_model.builder

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = builder.addInputTensor(sequence_info)
    positions = builder.addInputTensor(sequence_info)
    segments = builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(
            0, config.vocab_length,
            (replication_factor,
             config.micro_batch_size * config.sequence_length)).astype(np.uint32),
        positions:
        np.random.randint(
            0, config.sequence_length,
            (replication_factor,
             config.micro_batch_size * config.sequence_length)).astype(np.uint32),
        segments:
        np.random.randint(
            0, 2,
            (replication_factor,
             config.micro_batch_size * config.sequence_length)).astype(np.uint32)
    }
    num_reps = 5

    user_options = {}
    if mode == ExecutionMode.PHASED:
        user_options = {
            "batchSerializationFactor": 1,
            "executionPhases": popart_model.total_execution_phases
        }
        output = popart_model(indices, positions, segments)
        ipus = 2
    else:
        output = popart_model.build_graph(indices, positions, segments)
        ipus = popart_model.total_ipus

    loss = popart_loss_fn(output)

    proto = builder.getModelProto()

    if opt_type == "SGD":
        optimizer = popart.ConstSGD(1e-3)
    elif opt_type == "LAMB":
        optMap = {
            "defaultLearningRate": (1e-3, True),
            "defaultBeta1": (0.9, True),
            "defaultBeta2": (0.999, True),
            "defaultWeightDecay": (0.0, True),
            "maxWeightNorm": (10.0, True),
            "defaultEps": (1e-8, True),
            "lossScaling": (1.0, True),
        }
        optimizer = popart.Adam(optMap, mode=popart.AdamMode.Lamb)
    elif opt_type == "LAMB_NO_BIAS":
        optMap = {
            "defaultLearningRate": (1, False),
            "defaultBeta1": (0, False),
            "defaultBeta2": (0, False),
            "defaultWeightDecay": (0.0, False),
            "defaultEps": (1e-8, False),
            "lossScaling": (1.0, False),
        }
        optimizer = popart.Adam(optMap, mode=popart.AdamMode.LambNoBias)
    else:
        raise ValueError(f"Unknown opt_type={opt_type}")

    patterns = popart.Patterns()
    if mode == ExecutionMode.PHASED:
        patterns.enablePattern("TiedGatherPattern", False)
        patterns.enablePattern("SparseAccumulatePattern", False)

    outputs, post_proto = run_py(
        proto,
        data,
        output,
        loss=loss,
        optimizer=optimizer,
        user_options=user_options,
        execution_mode=mode,
        patterns=patterns,
        replication_factor=replication_factor,
        replicated_tensor_sharding=replicated_tensor_sharding,
        ipus=ipus,
        num_reps=num_reps)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = {
        "input_ids":
        data[indices].reshape(replication_factor * config.micro_batch_size,
                              config.sequence_length).astype(np.int32),
        "position_ids":
        data[positions].reshape(replication_factor * config.micro_batch_size,
                                config.sequence_length).astype(np.int32),
        "token_type_ids":
        data[segments].reshape(replication_factor * config.micro_batch_size,
                               config.sequence_length).astype(np.int32)
    }

    torch_to_onnx = get_mapping(config, init=mapping)
    transform_weights = get_transform(config, init=transform)

    # ------------------- PyTorch -------------------------
    # Turn off dropout
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, torch_to_onnx, transform_weights)

    if opt_type == "SGD":
        optim = torch.optim.SGD(torch_model.parameters(),
                                1e-3,
                                weight_decay=0.0,
                                momentum=0.0)
    elif opt_type == "LAMB":
        optim = torch_lamb.Lamb(torch_model.parameters(),
                                lr=1e-3,
                                weight_decay=0.0,
                                biasCorrection=True)

    for _ in range(num_reps):
        torch_outputs = torch_model(
            **{k: torch.from_numpy(t).long() for k, t in inputs.items()})
        torch_loss = torch_loss_fn(torch_outputs)
        torch_loss.backward()
        optim.step()
        optim.zero_grad()

    check_tensors([output.detach().numpy() for output in torch_outputs],
                  outputs,
                  margin=1.5e-06)

    check_model(torch_model,
                post_proto,
                torch_to_onnx,
                transform_weights,
                margin=5e-5)
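
# A hypothetical invocation, assuming a matching (popart_model, torch_model)
# pair and an ExecutionMode value from the surrounding test utilities are in
# scope; the lambda pair keeps the two losses equivalent (L1, sum-reduced,
# scale 0.1), mirroring the style used elsewhere in these tests.
bwd_graph(popart_model,
          torch_model,
          mode,
          popart_loss_fn=lambda out: popart_model.builder.aiGraphcore.l1loss(
              [out], 0.1, reduction=popart.ReductionType.Sum),
          torch_loss_fn=lambda outs: 0.1 * torch.norm(outs[0], 1),
          opt_type="LAMB")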
def get_model_anchors(doSharding,
                      doPipelining,
                      batchesPerStep,
                      doTraining,
                      doProfiling=False,
                      doDevicex=True,
                      anchorRestoredTensors=False,
                      returnRawInput=False):
    np.random.seed(seed=1)

    builder = popart.Builder()
    batchSize = 2
    shape_d0 = [batchSize, 2, 4, 4]
    shape_l0 = [batchSize]
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0))
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0)
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", shape_l0))

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [batchSize, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugPrefix="sfm")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")
    anchor_map = {nll: art, w0: art, e0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining

    if doSharding is False:
        numIPUs = 1
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 1)
        builder.virtualGraph(c0, 1)
        builder.virtualGraph(r0, 2)
        builder.virtualGraph(out, 2)
        builder.virtualGraph(nll, 2)

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=nll,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs, tilesPerIpu=20))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs, tilesPerIpu=20))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    if batchesPerStep > 1:
        shape_d0.insert(0, batchesPerStep)
        shape_l0.insert(0, batchesPerStep)
    data = np.random.uniform(low=-10.0, high=10.0,
                             size=shape_d0).astype(np.float32)
    classes = np.prod(shape_d0) / (batchSize * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
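
# Typical use of this helper: run the same graph on a single IPU and then
# sharded + pipelined, and compare the anchored tensors. The seeds inside the
# function make the data identical between the two calls; batchesPerStep here
# is illustrative.
single = get_model_anchors(doSharding=False,
                           doPipelining=False,
                           batchesPerStep=4,
                           doTraining=True)
sharded = get_model_anchors(doSharding=True,
                            doPipelining=True,
                            batchesPerStep=4,
                            doTraining=True)
for k in single:
    assert np.allclose(single[k], sharded[k])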
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
import argparse
import ctypes
import os

import numpy as np
import popart


# Define a function to build and run the erf graph with
# specified input tensor data
def build_and_run_graph(input_data, run_on_ipu):
    builder = popart.Builder()
    input_len = len(input_data)

    input_tensor = builder.addInputTensor(popart.TensorInfo("FLOAT", [input_len]))
    print("Shape of {}: {}".format(input_tensor,
                                   builder.getTensorShape(input_tensor)))

    output_tensor = builder.customOp(opName="Erf",
                                     opVersion=1,
                                     domain="ai.graphcore",
                                     inputs=[input_tensor],
                                     attributes={})[0]

    print("Inputs: {}".format(builder.getInputTensorIds()))
    print("Outputs: {}".format(builder.getOutputTensorIds()))
    print("Values: {}".format(builder.getValueTensorIds()))
    print("Shape of {}: {}".format(output_tensor,
                                   builder.getTensorShape(output_tensor)))

    builder.addOutputTensor(output_tensor)
    proto = builder.getModelProto()

    anchors = {output_tensor: popart.AnchorReturnType("FINAL")}
    dataFlow = popart.DataFlow(1, anchors)

    if run_on_ipu:
        device = popart.DeviceManager().acquireAvailableDevice(1)
        print("IPU hardware device acquired")
    else:
        device = popart.DeviceManager().createIpuModelDevice({})
        print("Running on IPU Model")

    session = popart.InferenceSession(proto, dataFlow, device)
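
    # The function breaks off here; a minimal sketch of the remaining steps,
    # assuming the usual PopART inference flow (compile, stream the input in,
    # run one step, then read the anchored result back):
    session.prepareDevice()
    result = session.initAnchorArrays()
    stepio = popart.PyStepIO(
        {input_tensor: np.asarray(input_data, dtype=np.float32)}, result)
    session.run(stepio)
    return result[output_tensor]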