def run(transposed):
    """Build and run a small batch-serialized matmul inference graph.

    Three matmul layers are chained; each uses a fresh all-ones
    ``[dsize, dsize]`` weight, and every layer output is added onto an
    accumulator created with ``aiGraphcore.init`` (zero initialised).
    The accumulated tensor feeds an L1 loss which is the only anchor.
    The session is built with outlining enabled and a batch
    serialization factor of 4, then executed for a single step on
    all-ones input data.

    Args:
        transposed: When truthy, the input is transposed before the first
            layer, the weight becomes the left-hand matmul operand, and
            the accumulator is created with shape ``[dsize, dsize, bsize]``
            and ``2`` as the last ``init`` argument. Otherwise the shape is
            ``[bsize, dsize, dsize]`` and the last ``init`` argument is
            ``0``.
    """
    bsize = 8
    dsize = 10
    num_layers = 3

    builder = popart.Builder()
    ip = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [bsize, dsize, dsize]))

    # Explicitly specify the batch dimension for init: it is the last
    # axis in the transposed layout and the first axis otherwise.
    init_shape = [dsize, dsize, bsize] if transposed else [bsize, dsize, dsize]
    batch_axis = 2 if transposed else 0
    accumulated = builder.aiGraphcore.init(init_shape, popart.DataType.FLOAT,
                                           popart.InitType.Zero, batch_axis)

    def add_layer(in_id):
        # One matmul against a new all-ones weight; operand order depends
        # on whether the graph works on the transposed layout.
        weight = builder.addInitializedInputTensor(
            np.ones([dsize, dsize], np.float32))
        operands = [weight, in_id] if transposed else [in_id, weight]
        return builder.aiOnnx.matmul(operands)

    layer_out = builder.aiOnnx.transpose([ip]) if transposed else ip

    # Chain the layers, adding each layer's output onto the accumulator.
    for _ in range(num_layers):
        layer_out = add_layer(layer_out)
        accumulated = builder.aiOnnx.add([accumulated, layer_out])

    out = builder.aiGraphcore.l1loss([accumulated], 0.1)
    builder.addOutputTensor(out)

    device = tu.create_test_device(1)

    session_options = popart.SessionOptions()
    session_options.enableOutlining = True
    session_options.batchSerializationSettings.factor = 4

    session = popart.InferenceSession(
        fnModel=builder.getModelProto(),
        dataFlow=popart.DataFlow(1, {out: popart.AnchorReturnType("All")}),
        patterns=popart.Patterns(popart.PatternsLevel.All),
        userOptions=session_options,
        deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()

    feed = {ip: np.ones((bsize, dsize, dsize), dtype=np.float32)}
    session.run(popart.PyStepIO(feed, anchors))
def _run_impl(torchWriter, patterns, outputdir, cifarInIndices, device,
              device_hw_id, mode, syntheticData, transformations, epochs,
              printAnchorArrays):
    """Run a torch model and its PopART equivalent on CIFAR-10 and compare.

    The torch model is exported to ONNX, a PopART session is created from
    that file, and for each epoch one step of data is pushed through both
    frameworks. In "train" mode the updated weights of both models are
    written to ``outputdir`` and compared with ``popart.NumericsReport``;
    in "infer" mode the torch outputs are compared with the PopART anchors.

    Args:
        torchWriter: Object providing ``dataFlow``, ``inputShapeInfo``,
            ``samplesPerBatch``, ``outNames``, ``optimizer`` and the
            methods ``saveModel``, ``train`` and ``infer``.
        patterns: Patterns object forwarded to the PopART session.
        outputdir: Directory used for the ONNX files and as the session
            log directory. It is also scanned for existing "runId" files
            (to pick a fresh run id) and for ".dot" files.
        cifarInIndices: Mapping from input tensor id to the index of the
            corresponding entry in one item yielded by the data loader.
        device: One of "cpu", "ipu_model", "sim" or "hw". Any other value
            is forwarded unchanged as the session's ``deviceInfo``.
        device_hw_id: When truthy and ``device == "hw"``, the hardware
            device is acquired by this id.
        mode: Either "infer" or "train".
        syntheticData: When equal to ``True``, the session uses
            ``SyntheticDataMode.RandomNormal``.
        transformations: Optional iterable of graph transformation names;
            "removeUnusedInputs" and "prepareNodesForTraining" are
            recognised.
        epochs: Number of steps to run and compare.
        printAnchorArrays: In "train" mode, print a summary of each anchor
            array after every step.

    Returns:
        The anchor arrays created by ``session.initAnchorArrays()``.

    Raises:
        Exception: If ``mode`` is not "infer" or "train".
        RuntimeError: On an unrecognised transformation, or in "train"
            mode when ``torchWriter.outNames`` does not hold exactly one
            name.
        TestFailureError: If a comparison yields NaN or exceeds the margin.
    """
    # Pick a run id one larger than any already present in outputdir.
    runIds = [-1] + [
        int(x.split("runId")[1].split("_")[0]) for x in os.listdir(outputdir)
        if "runId" in x
    ]
    baseId = 1 + max(runIds)

    def getFnModel(framework, epoch):
        return os.path.join(
            outputdir,
            "runId%d_%sModel_epoch%s.onnx" % (baseId, framework, epoch))

    def getFnPopArt(epoch):
        return getFnModel("PopArt", epoch)

    def getFnTorch(epoch):
        return getFnModel("Torch", epoch)

    def getFnModel0():
        return os.path.join(outputdir, "runId%d_model0.onnx" % (baseId, ))

    dataFlow = torchWriter.dataFlow
    inputShapeInfo = torchWriter.inputShapeInfo
    validModes = ["infer", "train"]
    if mode not in validModes:
        raise Exception("mode must be one of " + str(validModes))

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Determine what the data directory is: an optional c10datadir.py
    # next to this file overrides the default temp-dir location.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    path_c10datadir = os.path.join(dir_path, "c10datadir.py")
    if os.path.exists(path_c10datadir):
        import c10datadir
        datadir = c10datadir.c10datadir
    else:
        tmpdir = tempfile.gettempdir()
        datadir = os.path.abspath(os.path.join(tmpdir, 'cifar10data'))
    print("Using datadir=%s" % (datadir))

    if not os.path.exists(datadir):
        print(
            "Specified datadir %s does not exist. Consider making it here with os.mkdir(datadir)"
            % (datadir, ))

    print("c10driver: getting data from", datadir)
    trainset = datasets.CIFAR10(root=datadir,
                                train=True,
                                download=False,
                                transform=transform)

    fnModel0 = getFnModel0()

    # write ONNX Model to file
    torchWriter.saveModel(fnModel=fnModel0)

    stepLoader = torch.utils.data.DataLoader(
        trainset,
        # the amount of data loaded for each step.
        # note this is not the batch size, it's the "step" size
        # (samples per step)
        batch_size=torchWriter.samplesPerBatch * dataFlow.batchesPerStep(),
        # non-random data loading
        shuffle=False,
        num_workers=0)

    deviceManager = popart.DeviceManager()

    # Create a CPU device
    if device == "cpu":
        device = deviceManager.createCpuDevice()

    # Create an IPU Model device
    elif device == "ipu_model":
        options = {"compileIPUCode": True, 'numIPUs': 1, 'tilesPerIPU': 4}
        device = deviceManager.createIpuModelDevice(options)

    # Create a Simulator
    elif device == "sim":
        options = {"numIpus": 1, "tilesPerIPU": 4}
        device = deviceManager.createSimDevice(options)

    # Get a Hardware Device
    elif device == "hw":
        # Get a hardware device that meets the requirements,
        # may throw if none are available.
        # Will attach to the device
        if device_hw_id:
            device = deviceManager.acquireDeviceById(device_hw_id)
        else:
            device = tu.acquire_ipu()

    # Enumerate available devices
    print("Enumerating devices")
    print("-------------------------------------")
    for idx, d in enumerate(deviceManager.enumerateDevices()):
        print('{0}. {1}'.format(idx, d))
    print("")

    opts = popart.SessionOptions()
    opts.logDir = outputdir
    if syntheticData == True:
        opts.syntheticDataMode = popart.SyntheticDataMode.RandomNormal

    modelProtoX = fnModel0
    if transformations:
        gc = popart.GraphTransformer(fnModel0)

        for transformation in transformations:
            print("Running %s transformation pass" % (transformation, ))
            if transformation == "removeUnusedInputs":
                gc.removeUnusedInputs()

            elif transformation == "prepareNodesForTraining":
                gc.prepareNodesForTraining()

            else:
                raise RuntimeError("Unrecognised transformation %s" %
                                   (transformation, ))

        modelProtoX = gc.getModelProto()

    # Reads ONNX model from file and creates backwards graph,
    # performs Ir optimisations
    if mode == 'infer':
        session = popart.InferenceSession(fnModel=modelProtoX,
                                          inputShapeInfo=inputShapeInfo,
                                          dataFlow=dataFlow,
                                          patterns=patterns,
                                          userOptions=opts,
                                          deviceInfo=device)
    else:
        if len(torchWriter.outNames) != 1:
            raise RuntimeError("Expecting single scalar loss tensor")

        # Append output with an identity loss, to reduce to scalar if
        # necessary
        bder = popart.Builder(modelProtoX)
        loss = bder.aiGraphcore.identityloss(
            [torchWriter.outNames[0]], reduction=popart.ReductionType.Sum)
        session = popart.TrainingSession(fnModel=bder.getModelProto(),
                                         inputShapeInfo=inputShapeInfo,
                                         dataFlow=dataFlow,
                                         loss=loss,
                                         optimizer=torchWriter.optimizer,
                                         patterns=patterns,
                                         userOptions=opts,
                                         deviceInfo=device)

    # get the tensor info for the anchors
    anchorArrays = session.initAnchorArrays()

    allDotPrefixes = [x[0:-4] for x in os.listdir(outputdir) if ".dot" in x]
    print("Will generate graph pdfs for all of:")
    print(allDotPrefixes)
    import subprocess
    # set generateFromDots to True to
    # generate pdf figures of the Ir. It
    # requires the 'dot' program
    generateFromDots = False
    if generateFromDots:
        for name in allDotPrefixes:
            dotfile = os.path.join(outputdir, "%s.dot" % (name, ))
            outputfile = os.path.join(outputdir, "%s.pdf" % (name, ))
            log = subprocess.call(
                ["dot", "-T", "pdf", "-o", outputfile, dotfile])
            print("Exit status on `%s' was: %s" % (name, log))

    print("Setting device to IPU, and preparing it")
    session.prepareDevice()

    if mode == "train":
        print("Writing weights to device")
        session.weightsFromHost()

        print("Writing Optimizer tensors to device, if there are any")

    def addStepDimension(data, batchesPerStep):
        # Reshape [step * batch, ...] into [step, batch, ...]; a single
        # batch per step needs no extra leading dimension.
        if batchesPerStep == 1:
            return data
        dataShape = np.array(np.shape(data))
        dataShape[0] //= batchesPerStep
        dataShape = np.insert(dataShape, 0, batchesPerStep)
        return np.reshape(data, dataShape)

    def reportTensorError(tensorInd, result):
        reportStr = str(tensorInd) + " :\n"
        reportStr += "  |pA - tA|^2 / (|pA||tA| + 1e-8)  = " + str(
            result) + "\n"
        return reportStr

    def subsampleBatches(array, refShape):
        arrayShape = np.shape(array)

        # Every Nth batch
        if len(arrayShape) == len(refShape):
            n = arrayShape[0] // refShape[0]
            return array[n - 1::n]

        # Last batch only
        return array[-1]

    def getTensorError(tA, pA):
        # pA, tA are corresponding tensors from two models
        pA_shape = np.shape(pA)
        tA_shape = np.shape(tA)
        assert (pA_shape == tA_shape), "Arrays must be same shape"

        ss_err = np.sum((np.array(pA) - np.array(tA))**2)
        ss_pA = np.sum(np.array(pA)**2)
        ss_tA = np.sum(np.array(tA)**2)
        return ss_err / (math.sqrt(ss_pA * ss_tA) + 1.0e-8)

    def checkResult(result, margin):
        if np.isnan(result):
            raise TestFailureError(str(result) + " is NaN")
        elif (result > margin):
            raise TestFailureError(
                str(result) + " is greater than " + str(margin))

    margin = 5.0e-7

    for epoch in range(epochs):  # loop over the dataset multiple times
        print("Epoch is %d" % (epoch, ))
        # A fresh iterator over an unshuffled loader: every epoch uses the
        # first step's worth of data.
        stepData = next(iter(stepLoader))

        # Form the input map for one step's worth of data.
        # Note: data from the torch DataLoader has shape:
        #   [stepSize * batchSize, sampleShape]
        # whereas Popart expects input data of the shape:
        #   [stepSize, batchSize, sampleShape]
        # so we reshape the input array before passing to the stepio
        inputs = {}
        for tenId, dataIndex in cifarInIndices.items():
            inputs[tenId] = addStepDimension(
                stepData[dataIndex].numpy(),
                session.dataFlow.batchesPerStep())

        if mode == "train":
            # take batchesPerStep passes (1 step), Torch
            torchWriter.train(inputs)

            # take batchesPerStep passes (1 step), PopArt
            pystepio = popart.PyStepIO(inputs, anchorArrays)
            session.run(pystepio)

            if printAnchorArrays:
                print(
                    "\nAnchor arrays (being printed as printAnchorArrays==True):"
                )
                for name, arr in anchorArrays.items():
                    print("\nAnchored Array Name=", name, " and Size=",
                          arr.size)

                    if (arr.size < 10):
                        print("\nArray (of size < 10) values are")
                        print(arr)

                    if len(arr.shape) > 1:
                        for i, slice0 in enumerate(arr):
                            print("Sum along axis %d is Sum=%.15f" %
                                  (i, slice0.sum()))

                    print("Total Sum is %.15f" % (arr.sum()))

            # write models to file
            fnTorchModel = getFnTorch(epoch)
            fnPopArtModel = getFnPopArt(epoch)
            torchWriter.saveModel(fnTorchModel)
            session.modelToHost(fnPopArtModel)
            print("Writing models to " + fnTorchModel + " and " +
                  fnPopArtModel)

            # Compare parameters from updated Onnx models
            print("Obtaining popart NumericsReport, A: Torch, B: Popart.")
            # Compare by value: identity comparison against an int literal
            # is implementation-dependent and warns on Python >= 3.8.
            if epoch == 0:
                nr = popart.NumericsReport(fnModel0, fnTorchModel, fnModel0,
                                           fnPopArtModel)
            else:
                nr = popart.NumericsReport(getFnTorch(epoch - 1),
                                           fnTorchModel,
                                           getFnPopArt(epoch - 1),
                                           fnPopArtModel)

            print(nr.fullReport())
            # One relative error calculated per weight tensor
            for relerror in nr.getRelativeErrors().values():
                checkResult(relerror, margin)

        elif mode == "infer":
            # take batchesPerStep passes (1 step), Torch
            # returns map of outputs for each sample
            # Note: already are of dimension matching the
            # anchors
            torchOutputs = torchWriter.infer(inputs)

            # take batchesPerStep passes (1 step), PopArt
            pystepio = popart.PyStepIO(inputs, anchorArrays)
            session.run(pystepio)

            # Compare torch outputs tensors with popart output from
            # anchor tensor maps
            for nInd, outName in enumerate(torchWriter.outNames):
                # Torch outputs returned for all samples, whereas
                # anchors are returned as specified by the user.
                # Subsample torch outputs to match dimensions
                torchOutput = subsampleBatches(torchOutputs[outName],
                                               np.shape(anchorArrays[outName]))
                result = getTensorError(torchOutput, anchorArrays[outName])
                print(reportTensorError(nInd, result))
                checkResult(result, margin)

    return anchorArrays
def get_model(input_shape: List[int], weight_array: np.ndarray,
              batches_per_step: int, replication_factor: int, batch_size: int,
              channels: int, data_len: int, synthetic_data: bool,
              buffer_streams: bool) -> Tuple:
    """Get a simple model for comparison with buffer streams on and off.

    Adapted from prefetch_test.py as we require to test the validity of
    streams here as well.

    Args:
        input_shape (List[int]): Shape of the data input tensor for a
            single micro batch
        weight_array (np.ndarray): Initial value of the matmul weight
        batches_per_step (int): Batches to run per step
        replication_factor (int): Replicas to run
        batch_size (int): Number of samples per model run
        channels (int): Number of channels e.g. RGB = 3
        data_len (int): Data size
        synthetic_data (bool): Use synthetic data (zeros in this case)
        buffer_streams (bool): The test option: whether to create ops
            before the stream in order to schedule data loading as part of
            graph scheduling. See T29603.

    Returns:
        Tuple: session, anchors, label_shape required to run the model.
            ``label_shape`` is ``[micro_batch_size]``, prefixed with the
            replication factor and then with batches_per_step when either
            is greater than one.
    """
    micro_batch_size = batch_size // (replication_factor)

    builder = popart.Builder()

    data_shape = popart.TensorInfo("FLOAT", input_shape)
    lbl_shape = popart.TensorInfo("INT32", [micro_batch_size])
    w = builder.addInitializedInputTensor(weight_array)

    ip = builder.addInputTensor(data_shape, "main_input_123")
    lb = builder.addInputTensor(lbl_shape, "label_input_456")

    a = builder.aiOnnx.matmul([ip, w])
    reshaped = builder.reshape_const(
        builder.aiOnnx, [a],
        [micro_batch_size, channels * data_len * data_len])
    relu = builder.aiOnnx.relu([reshaped])
    sm = builder.aiOnnx.softmax([relu], axis=0, debugContext="output")
    builder.addOutputTensor(sm)
    loss = builder.aiGraphcore.nllloss([sm, lb],
                                       reduction=popart.ReductionType.Mean)

    # Anchor the inputs and every intermediate so they can be compared
    # between runs.
    art = popart.AnchorReturnType("All")
    data_flow = popart.DataFlow(batches_per_step, {
        ip: art,
        lb: art,
        loss: art,
        sm: art,
        a: art,
        relu: art
    })

    opts = popart.SessionOptions()
    opts.useHostCopyOps = buffer_streams
    # TODO: Fix outlining
    opts.enableOutlining = False

    ipus = 1

    if replication_factor > 1:
        opts.replicatedGraphCount = replication_factor
        opts.enableReplicatedGraphs = True
        ipus *= replication_factor

    device = tu.create_test_device(ipus)
    assert device

    patterns = popart.Patterns(popart.PatternsLevel.Minimal).enablePattern(
        "MatMulLhsGradOp", True).enablePattern("MatMulRhsGradOp", True)
    patterns.InPlace = False

    if synthetic_data:
        opts.syntheticDataMode = popart.SyntheticDataMode.Zeros

    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=data_flow,
                                     loss=loss,
                                     optimizer=popart.ConstSGD(LR),
                                     userOptions=opts,
                                     deviceInfo=device,
                                     patterns=patterns)

    session.setRandomSeed(0)
    session.prepareDevice()

    # Only the label shape is returned, so only it is extended with the
    # replication and batches-per-step dimensions. The equivalent
    # rebinding of input_shape was never read and has been dropped.
    label_shape = [micro_batch_size]

    if replication_factor > 1:
        label_shape = [replication_factor] + label_shape

    if batches_per_step > 1:
        label_shape = [batches_per_step] + label_shape

    anchors = session.initAnchorArrays()

    return session, anchors, label_shape
def run_test(index, options):
    """Train a stack of attention layers for ten steps and return anchors.

    Each layer multiplies the running activation by its own ``qkv``
    weight and applies ``attention_onnx``; the gradient of every ``qkv``
    weight and the final activation are anchored. After training, the
    model is written to ``tmpdir`` as ``pingpong_attention_<index>.onnx``.

    Args:
        index: Identifier used in the name of the saved ONNX file.
        options: Mapping with the keys "numLayers", "pingPong",
            "outlining", "explicitRecomputation", "aliasZeroCopy" and
            "batchSerialize".

    Returns:
        The anchor arrays filled in by the ten training steps.
    """
    num_layers = options['numLayers']
    ping_pong = options['pingPong']

    builder = popart.Builder(opsets={
        "ai.onnx": 9,
        "ai.onnx.ml": 1,
        "ai.graphcore": 1
    })

    mask = builder.addInputTensor(popart.TensorInfo("FLOAT", mask_shape),
                                  "mask")
    x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                  "x_in")

    anchor_types = {}
    activation = x_in
    for layer in range(num_layers):
        qkv = builder.addInitializedInputTensor(qkv_data, f"qkv_{layer}")
        anchor_types[popart.reservedGradientPrefix() +
                     qkv] = popart.AnchorReturnType("All")

        # With ping-pong the layers alternate between two virtual graphs;
        # otherwise each layer gets a virtual graph of its own.
        if ping_pong:
            layer_vgid = layer % 2
        else:
            layer_vgid = layer
        with builder.virtualGraph(layer_vgid), builder.pingPongPhase(layer):
            activation = builder.aiOnnx.matmul([activation, qkv])
            activation = attention_onnx(builder, activation, mask, batch_size,
                                        sequence_length, hidden_size,
                                        attention_heads, qkv_length)

    # The loss is placed on the virtual graph of the last layer.
    last_layer = num_layers - 1
    if ping_pong:
        loss_vgid = last_layer % 2
    else:
        loss_vgid = last_layer
    with builder.virtualGraph(loss_vgid):
        l1 = builder.aiGraphcore.l1loss([activation], 0.1)

    proto = builder.getModelProto()

    anchor_types[activation] = popart.AnchorReturnType("All")
    data_flow = popart.DataFlow(batches_per_step, anchor_types)

    session_opts = popart.SessionOptions()
    session_opts.pingPongPhases = num_layers if ping_pong else 0
    session_opts.enableOutlining = options["outlining"]

    # PingPong currently does its own recompute annotations
    if options["explicitRecomputation"]:
        session_opts.autoRecomputation = popart.RecomputationType.Standard
    else:
        session_opts.autoRecomputation = popart.RecomputationType.NoRecompute

    session_opts.outlineThreshold = -np.inf
    session_opts.enableOutliningCopyCostPruning = False
    if ping_pong:
        session_opts.virtualGraphMode = popart.VirtualGraphMode.PingPong
    else:
        session_opts.virtualGraphMode = popart.VirtualGraphMode.Manual
    session_opts.explicitRecomputation = options["explicitRecomputation"]
    session_opts.aliasZeroCopy = options["aliasZeroCopy"]
    session_opts.batchSerializationFactor = options["batchSerialize"]

    patterns = popart.Patterns(popart.PatternsLevel.Default)

    num_ipus = 2 if ping_pong else 4
    device = tu.create_test_device(num_ipus, pattern=popart.SyncPattern.Full)

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=data_flow,
                                     userOptions=session_opts,
                                     loss=l1,
                                     optimizer=popart.ConstSGD(0.1),
                                     patterns=patterns,
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO({x_in: input_data, mask: mask_data}, anchors)

    for _step in range(10):
        session.run(stepio)

    session.modelToHost(str(tmpdir / f"pingpong_attention_{index}.onnx"))

    return anchors
def bert_session_options(args, model):
    """Build the ``popart.SessionOptions`` for a BERT run from parsed args.

    Args:
        args: Namespace of command-line options. The attributes read are
            ``floating_point_exceptions``, ``stochastic_rounding``,
            ``no_outlining``, ``execution_mode``,
            ``gradient_accumulation_factor``, ``replication_factor``,
            ``engine_cache``, ``gc_profile``, ``report_hw_cycle_count``,
            ``max_copy_merge_size``, ``disable_fully_connected_pass``,
            ``task``, ``sequence_length``, ``inference`` and
            ``variable_weights_inference``.
        model: Not used by this function; kept for interface
            compatibility.

    Returns:
        The configured ``popart.SessionOptions`` instance.
    """
    options = popart.SessionOptions()
    options.enableVirtualGraphs = True
    options.virtualGraphMode = popart.VirtualGraphMode.Manual
    options.enableFloatingPointChecks = args.floating_point_exceptions
    options.enableStochasticRounding = args.stochastic_rounding
    options.enableGroupedMatmuls = False
    options.enableOutlining = not args.no_outlining
    # Increasing the outlineThreshold prevents creating subgraphs of cheap Ops
    # such as add or reshapeInplace.
    # Instead only reusing ops with a highSubgraphValue such as matmul or normalisation.
    options.outlineThreshold = 10.0
    if args.execution_mode == "PIPELINE":
        options.enablePipelining = True
        options.autoRecomputation = popart.RecomputationType.Pipeline
    if args.gradient_accumulation_factor > 1:
        options.enableGradientAccumulation = True
        options.accumulationFactor = args.gradient_accumulation_factor
    if args.replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = args.replication_factor
    if args.engine_cache is not None:
        options.enableEngineCaching = True
        options.cachePath = args.engine_cache
    if args.gc_profile:
        options.reportOptions = {
            "showVarStorage": "true",
            "showPerIpuMemoryUsage": "true",
            "showExecutionSteps": "true"
        }
    options.instrumentWithHardwareCycleCounter = args.report_hw_cycle_count

    # Addition of momentum tensors causes merged copies to exceed max
    # host translation table entries during the weightsFromHost program.
    # With the addition of disableGradAccumulationTensorStreams no copy merging
    # is needed but may be needed later when gradAccumulationTensorStreams are re-enabled
    # FIXME when T11642 is resolved.
    options.disableGradAccumulationTensorStreams = True
    if args.max_copy_merge_size == -1:
        logger.debug("No copy merge size limit applied")
    else:
        logger.warning(
            f"Workaround for T11642: copy merge size limit set to {args.max_copy_merge_size}"
        )
        options.engineOptions = {
            "opt.maxCopyMergeSize": str(args.max_copy_merge_size),
        }

    # Adding {"fullyConnectedPass", "TRAINING_BWD"} to some matmuls causes large
    # transposes before operations.
    # WARNING: This causes SQuAD 384 12-layer to go OOM
    if args.disable_fully_connected_pass:
        if args.task == "SQUAD" and args.sequence_length == 384:
            logger.warning(
                "Fully connected pass has been disabled. This may cause SQuAD 384 12-layer to go OOM."
            )
        options.enableFullyConnectedPass = False

    if (args.inference and args.engine_cache is not None
            and not args.variable_weights_inference):
        # Logger.warn is a deprecated alias; use warning() like the rest
        # of this function.
        logger.warning(
            "Using engine cache with constant weights. Checkpoint weights will be ignored. "
            "Use the `--variable-weights-inference` flag if checkpoint weights should be used."
        )

    if args.variable_weights_inference:
        options.constantWeights = False

    return options
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[Union[popart.Loss, Iterable[popart.Loss]]] = None,
           optimizer: Optional[popart.Optimizer] = None,
           return_stats: bool = False,
           log_dir: Optional[str] = None,
           ipus: Optional[int] = None,
           batches_per_step: int = 1,
           user_options: Optional[Mapping[str, Any]] = None):
    """Compile and run one step of ``proto`` with PopART on an IPU.

    A training session is created when ``optimizer`` is given, otherwise
    an inference session. After the step, the model is read back from the
    device and the device is detached.

    Args:
        proto: Model passed to the session as ``fnModel``.
        data: Mapping from input tensor name to the array fed for it. When
            ``batches_per_step > 1`` every array is repeated along a new
            leading axis.
        outputs: Name(s) of the tensors to anchor and return.
        loss: Optional loss(es); only used for a training session.
        optimizer: Optional optimizer; its presence selects training.
        return_stats: When true, instrumented engine options are set and
            memory / cycle statistics are returned as well.
        log_dir: When set together with ``return_stats``, reports are saved
            there through ``gcprofile``.
        ipus: Number of IPUs wanted; rounded up to a power of two when the
            device is requested. ``None`` or values below 2 mean one IPU.
        batches_per_step: Batches per step for the data flow.
        user_options: Extra attributes set on the session options by name.

    Returns:
        ``(anchor_generator, post_proto)``, or when ``return_stats`` is
        true ``(anchor_generator, post_proto, total_memory,
        max_tile_memory, cycles)``. ``anchor_generator`` lazily yields the
        anchor array of each requested output.

    Raises:
        Exception: If no device could be acquired.
        popart.session.PrepareDeviceException: Re-raised when compilation
            fails (after saving a report if ``return_stats`` and
            ``log_dir`` are both set).
    """
    outputs = make_tuple(outputs)
    if loss is not None:
        loss = make_tuple(loss)

    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step,
        {output: popart.AnchorReturnType("ALL")
         for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.enableGroupedMatmuls = False
    options.enableStochasticRounding = False
    options.reportOptions = {"showVarStorage": "true"}
    if ipus is not None and ipus > 1:
        options.virtualGraphMode = popart.VirtualGraphMode.Manual
    else:
        ipus = 1

    for key, value in user_options.items():
        setattr(options, key, value)

    # NOTE(review): the original code repeated the option set-up after
    # applying user_options. The effective result is kept here:
    # enableVirtualGraphs is always forced to False (ipus can no longer be
    # None at this point) and, when stats are requested, engineOptions
    # overrides any user-supplied value. Confirm this is intended.
    options.enableVirtualGraphs = False
    if return_stats:
        options.engineOptions = {
            "debug.allowOutOfMemory": "true",
            "debug.instrument": "true"
        }

    request_ipus = pow(2, math.ceil(math.log2(ipus)))
    device = popart.DeviceManager().acquireAvailableDevice(request_ipus)
    # The cycle estimates of MSR_OPS codelets have not been validated
    # so it is incorrect to use the IPU_MODEL.
    if device is None:
        raise Exception("Failed to acquire IPU.")

    # Everything between acquisition and read-back runs under try/finally
    # so the device is released even when compilation or execution fails.
    try:
        print("Compiling graph")
        if optimizer is not None:
            session = popart.TrainingSession(fnModel=proto,
                                             deviceInfo=device,
                                             dataFeed=data_flow,
                                             userOptions=options,
                                             losses=loss,
                                             optimizer=optimizer)
        else:
            session = popart.InferenceSession(fnModel=proto,
                                              deviceInfo=device,
                                              dataFeed=data_flow,
                                              userOptions=options)

        # Compile the Poplar Graph. If it fails, save the memory report
        # (when requested) and re-raise the original error.
        try:
            session.prepareDevice()
        except popart.session.PrepareDeviceException as e:
            if return_stats and log_dir:
                import gcprofile
                os.makedirs(log_dir, exist_ok=True)
                gcprofile.save_popart_report(session,
                                             log_dir=log_dir,
                                             exception=e)
            raise
        print("Compilation complete")

        session.weightsFromHost()
        if optimizer is not None:
            session.optimizerFromHost()
        session.setRandomSeed(1984)

        anchors = session.initAnchorArrays()

        # Add a batches_per_step dimension if needed
        if batches_per_step > 1:
            data = {
                k: np.repeat(v[np.newaxis], batches_per_step, 0)
                for k, v in data.items()
            }

        stepio = popart.PyStepIO(data, anchors)

        session.run(stepio)

        with tempfile.TemporaryDirectory() as tmp:
            file_path = os.path.join(tmp, "model.onnx")
            session.modelToHost(file_path)
            post_proto = onnx.load(file_path)
    finally:
        # Release device
        device.detach()

    if return_stats:
        if log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            reports = gcprofile.save_popart_report(session, log_dir=log_dir)
            graph_report = json.loads(reports["graph"])
            exec_report = json.loads(reports["execution"])
        else:
            graph_report = json.loads(session.getGraphReport())
            exec_report = json.loads(session.getExecutionReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        cycles = exec_report["simulation"]["cycles"]
        return (anchors[output] for output in outputs
                ), post_proto, total_memory, max_tile_memory, cycles
    return (anchors[output] for output in outputs), post_proto
def train(opts, model_file, ckpt_file) -> None:
    """
    Train and evaluate the MNIST model as configured on the command line.

    A training session and a validation session are compiled from the
    same converted model. Every epoch trains on the training loader,
    copies the weights to the validation session through ``ckpt_file``
    and prints the validation loss and accuracy.

    Args:
        opts: The command line options
        model_file: Temporary file for holding the model
        ckpt_file: Temporary file for holding the weights
    """
    SessionBundle = namedtuple("Session", ["session", "anchors"])

    if not opts.test_mode:
        # Never ask for more batches per step than the test set provides.
        step_limit = NUM_TEST_SAMPLES // opts.batch_size
        if step_limit < opts.batches_per_step:
            print(
                "(batches-per-step * batch-size) is larger than test set!\n"
                " Reduced batches-per-step to: {}\n".format(step_limit)
            )
            opts.batches_per_step = step_limit

    # Construct MNIST data loaders
    train_loader = get_data_loader(opts, is_train=True)
    test_loader = get_data_loader(opts, is_train=False)

    print("Creating ONNX model.")
    data_in, output = create_model(opts.batch_size, model_file)
    print("Converting model.")
    proto, label_in, loss = convert_model(
        opts.batch_size, model_file.name, output
    )

    # Describe how to run the model: anchor both the output and the loss.
    return_all = popart.AnchorReturnType("ALL")
    anchor_desc = {output: return_all, loss: popart.AnchorReturnType("ALL")}
    dataFlow = popart.DataFlow(opts.batches_per_step, anchor_desc)
    optimizer = popart.ConstSGD(0.01)

    # Options
    userOpts = popart.SessionOptions()

    # Ensure weight tensors in the validation model are not modified by the IR
    userOpts.constantWeights = False

    # If requested, setup synthetic data
    synthetic_modes = {
        "random_normal": popart.SyntheticDataMode.RandomNormal,
        "zeros": popart.SyntheticDataMode.Zeros,
    }
    if opts.syn_data_type in synthetic_modes:
        print(
            "Running with Synthetic Data Type '{}'".format(opts.syn_data_type)
        )
        userOpts.syntheticDataMode = synthetic_modes[opts.syn_data_type]

    # Select a device
    deviceManager = popart.DeviceManager()
    if opts.simulation:
        print("Running using IPU MODEL")
        model_options = {
            "compileIPUCode": True,
            "numIPUs": 1,
            "tilesPerIPU": TILES_PER_IPU,
        }
        device = deviceManager.createIpuModelDevice(model_options)
    else:
        print("Running using Hardware")
        device = deviceManager.acquireAvailableDevice()
        if device is None:
            print("Failed to acquire IPU. Exiting.")
            return
        if opts.test_mode:
            print(" IPU IDs: {}".format(device.driverIds))

    def init_session(proto, loss, dataFlow, userOpts, device, training, opts):
        # Create a session to compile and execute the graph
        if opts.test_mode:
            userOpts.instrumentWithHardwareCycleCounter = True

        if not training:
            session = popart.InferenceSession(
                fnModel=proto,
                dataFlow=dataFlow,
                userOptions=userOpts,
                deviceInfo=device,
            )
        else:
            session = popart.TrainingSession(
                fnModel=proto,
                loss=loss,
                optimizer=optimizer,
                dataFlow=dataFlow,
                userOptions=userOpts,
                deviceInfo=device,
            )

        graph_kind = "training" if training else "validation"
        print("Compiling the {} graph.".format(graph_kind))

        session.prepareDevice()

        # Create buffers to receive results from the execution
        return SessionBundle(session, session.initAnchorArrays())

    training = init_session(proto, loss, dataFlow, userOpts, device, True, opts)
    validation = init_session(
        proto, loss, dataFlow, userOpts, device, False, opts
    )

    inputs_per_step = opts.batch_size * opts.batches_per_step
    for epoch in range(opts.epochs):
        # Training: from the second epoch on, reload the checkpointed
        # weights before pushing them to the device.
        if epoch > 0:
            training.session.resetHostWeights(ckpt_file.name)
        training.session.weightsFromHost()
        for data, label in train_loader:
            # Incomplete steps are skipped.
            if len(label) != inputs_per_step:
                continue

            data, label = preprocess_data(data, label)
            stepio = popart.PyStepIO(
                {data_in: data, label_in: label}, training.anchors
            )

            timed = opts.test_mode == "training"
            if timed:
                start = time()
            training.session.run(stepio)
            if timed:
                duration = time() - start
                report_string = "{:<8.3} sec/itr.".format(duration)
                report_string += " " + iteration_report(opts, duration)
                print(report_string)
                print(
                    "Hardware cycle count per 'run':",
                    training.session.getCycleCount(),
                )
                print("Total time: {}".format(duration))

        # Evaluation: move the trained weights into the validation session.
        aggregated_loss = 0
        num_correct = 0
        training.session.modelToHost(ckpt_file.name)
        validation.session.resetHostWeights(ckpt_file.name)
        validation.session.weightsFromHost()
        for data, label in test_loader:
            if len(label) != inputs_per_step:
                continue

            data, label = preprocess_data(data, label)
            stepio = popart.PyStepIO(
                {data_in: data, label_in: label}, validation.anchors
            )

            timed = opts.test_mode == "inference"
            if timed:
                start = time()
            validation.session.run(stepio)
            if timed:
                duration = time() - start
                report_string = "{:<8.3} sec/itr.".format(duration)
                report_string += " " + iteration_report(opts, duration)
                print(report_string)
                print(
                    "Hardware cycle count per 'run':",
                    validation.session.getCycleCount(),
                )
                print("Total time: {}".format(duration))

            aggregated_loss += np.mean(validation.anchors[loss])

            # Predicted class per sample versus the flattened labels.
            logits = validation.anchors[output].reshape(
                [inputs_per_step, NUM_CLASSES]
            )
            results = np.argmax(logits, 1)
            score = results == label.reshape([inputs_per_step])
            num_correct += np.sum(score)

        aggregated_loss /= len(test_loader)
        accuracy = num_correct / len(test_loader.dataset)

        # Log statistics
        print("Epoch #{}".format(epoch))
        print(" Loss={0:.4f}".format(aggregated_loss))
        print(" Accuracy={0:.2f}%".format(accuracy * 100))
def run_test(mode=None, verify=None):
    """Train a two-stage pipelined model for ten steps and return anchors.

    Stage 0 holds a matmul followed by two add/gelu pairs, each of which
    ends in a checkpointed output; stage 1 holds only the L1 loss. The
    final stage-0 output and the gradient of ``weight_1`` are anchored.

    Args:
        mode: Optional recomputation type assigned to
            ``opts.autoRecomputation``; left at its default when ``None``.
        verify: Optional callable invoked with the session after the ten
            training steps.

    Returns:
        The anchor arrays filled in by the training steps.
    """
    builder = popart.Builder()

    x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                  "x_in")

    weight_1 = builder.addInitializedInputTensor(weight_data, "weight_1")

    # We want a bwd pass that looks like:
    #
    # restore, op1, restore, op2, restore, op3
    #
    # Where op1, op2 & op3 are gradient operations that
    # have implicit recompute inputs.

    with builder.virtualGraph(0), builder.pipelineStage(0):
        act = builder.aiOnnx.matmul([x_in, weight_1])
        act = builder.checkpointOutput([act])[0]

        for _ in range(2):
            act = builder.aiOnnx.add([act, act])
            # Gelu is a unary operation that takes the fwd input
            # activation. This satisfies our requirement above
            # of needing an implicit recompute input.
            act = builder.aiGraphcore.gelu([act])
            act = builder.checkpointOutput([act])[0]

        o = act

    with builder.virtualGraph(1), builder.pipelineStage(1):
        l1 = builder.aiGraphcore.l1loss([o], 0.1)

    proto = builder.getModelProto()

    anchored = [o, popart.reservedGradientPrefix() + weight_1]
    data_flow = popart.DataFlow(1, anchored)

    session_opts = popart.SessionOptions()
    session_opts.enableOutlining = False
    session_opts.enablePipelining = True
    session_opts.enableGradientAccumulation = True
    session_opts.accumulationFactor = gradient_accumulation
    session_opts.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
    if mode is not None:
        session_opts.autoRecomputation = mode
    session_opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    device = tu.create_test_device(numIpus=2,
                                   opts={"compileIPUCode": False})
    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=data_flow,
                                     userOptions=session_opts,
                                     loss=l1,
                                     optimizer=popart.Adam({}),
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO({x_in: input_data}, anchors)

    for _step in range(10):
        session.run(stepio)

    if verify is not None:
        verify(session)

    return anchors
def test_final_stage_recompute_0():
    """Check recompute annotations of matmuls in the final pipeline stage.

    The final stage contains a matmul whose output is checkpointed and a
    second matmul that feeds the loss directly. The serialized IR is
    inspected: the op producing the first matmul's output must carry
    ``recompute == "YES"`` and the op producing the second matmul's
    output must carry ``recompute == "NO"``.
    """
    np.random.seed(0)

    gradient_accumulation = 5
    batch_size = 1
    hidden_size = 16

    input_shape = [batch_size, hidden_size]

    weight_data = np.random.normal(0, 0.02, [hidden_size, hidden_size]).astype(
        np.float32)

    # Generated so the random stream matches the original test; it is not
    # fed to the session because the test only inspects the IR.
    input_data = np.random.normal(
        0, 0.02, [gradient_accumulation] + input_shape).astype(np.float32)

    builder = popart.Builder()

    x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                  "x_in")

    with builder.virtualGraph(0), builder.pipelineStage(0):
        weight_1 = builder.addInitializedInputTensor(weight_data, "weight_1")
        x = builder.aiOnnx.matmul([x_in, weight_1])

    with builder.virtualGraph(1), builder.pipelineStage(1):
        weight_2 = builder.addInitializedInputTensor(weight_data, "weight_2")
        x_recomp = builder.aiOnnx.matmul([x, weight_2])
        # This MatMul should be recomputed
        x = builder.checkpointOutput([x_recomp])[0]

        weight_3 = builder.addInitializedInputTensor(weight_data, "weight_3")
        # This MatMul should not be recomputed
        x_no_recomp = builder.aiOnnx.matmul([x, weight_3])
        l1 = builder.aiGraphcore.l1loss([x_no_recomp], 0.1)

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(1, [l1])

    opts = popart.SessionOptions()
    opts.enableOutlining = False
    opts.enablePipelining = True
    opts.enableGradientAccumulation = True
    opts.accumulationFactor = gradient_accumulation
    opts.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
    opts.autoRecomputation = popart.RecomputationType.Pipeline
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     loss=l1,
                                     optimizer=popart.Adam({}),
                                     deviceInfo=tu.create_test_device(
                                         numIpus=2,
                                         opts={"compileIPUCode": False}))

    # Verify that the matmuls in the main graph carry the expected
    # recompute attribute.
    ir = json.loads(session._serializeIr(popart.IrSerializationFormat.JSON))

    for op in ir["maingraph"]:
        output_names = [out["name"] for out in op["outputs"]]
        if x_recomp in output_names:
            assert op["attributes"]["recompute"] == "YES"
        elif x_no_recomp in output_names:
            assert op["attributes"]["recompute"] == "NO"
def get_model_anchors(doSharding,
                      doPipelining,
                      batchesPerStep,
                      doTraining,
                      replicated_graph_count=1,
                      doProfiling=False,
                      doDropout=False,
                      doGradientAccl=False,
                      acclSteps=1,
                      doDevicex=True,
                      anchorRestoredTensors=False,
                      returnRawInput=False):
    """Build a small sin/exp/conv/softmax/nll model, run one step, return anchors.

    Args:
        doSharding: when not ``False``, use manual virtual graphs across two
            IPUs per replica.
        doPipelining: forwarded to ``SessionOptions.enablePipelining``.
        batchesPerStep: batches per step for the ``DataFlow``.
        doTraining: when ``True`` a ``TrainingSession`` is created (and the
            gradient of the data input is anchored), otherwise an
            ``InferenceSession``.
        replicated_graph_count: number of replicas; values above 1 enable
            replicated graphs.
        doProfiling: when ``True``, save a report through ``gcprofile``.
        doDropout: insert a dropout op before the softmax.
        doGradientAccl: forwarded to ``enableGradientAccumulation``.
        acclSteps: accumulation factor; also divides the batch of 16 into
            micro-batches.
        doDevicex: when ``False``, return ``None`` right after the session is
            constructed.
        anchorRestoredTensors: with training and pipelining both ``True``,
            additionally anchor the restored tensors.
        returnRawInput: when ``True``, store the generated input data under
            ``anchors["input_raw"]``.

    Returns:
        The anchor dictionary, or ``None`` when ``doDevicex`` is ``False``.
    """
    np.random.seed(seed=1)

    builder = popart.Builder()
    batchSize = 16
    microBatchSize = batchSize // acclSteps

    shape_d0 = [microBatchSize, 2, 4, 4]
    shape_l0 = [microBatchSize]

    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0))
    w0 = builder.addInitializedInputTensor(
        np.ones(shape=[2, 2, 3, 3]).astype(np.float32))
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", shape_l0))

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugContext="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [microBatchSize, 32])

    if doDropout:
        do0 = builder.aiOnnx.dropout([r0], num_outputs=1, ratio=0.2)[0]
        softmax_in = do0
    else:
        softmax_in = r0
    out = builder.aiOnnx.softmax([softmax_in], axis=1, debugContext="sfm")
    nll = builder.aiGraphcore.nllloss([out, l0],
                                      reduction=popart.ReductionType.Sum)

    art = popart.AnchorReturnType("All")
    anchor_map = dict.fromkeys([nll, w0, e0], art)
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradientAccl
    opts.accumulationFactor = acclSteps
    opts.enableStochasticRounding = False

    if doSharding is False:
        numIpus = 1 * replicated_graph_count
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIpus = 2 * replicated_graph_count
        # First half of the model on virtual graph 0, the rest on 1.
        for tensor_id in (s0, e0, c0):
            builder.virtualGraph(tensor_id, 0)
        builder.virtualGraph(r0, 1)
        if doDropout:
            builder.virtualGraph(do0, 1)
        builder.virtualGraph(out, 1)
        builder.virtualGraph(nll, 1)

    if replicated_graph_count > 1:
        opts.replicatedGraphCount = replicated_graph_count
        opts.enableReplicatedGraphs = True

    device = tu.create_test_device(numIpus=numIpus)

    shared_kwargs = dict(fnModel=builder.getModelProto(),
                         dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
                         userOptions=opts,
                         deviceInfo=device)
    if doTraining is True:
        session = popart.TrainingSession(loss=nll,
                                         optimizer=popart.ConstSGD(0.01),
                                         **shared_kwargs)
    else:
        session = popart.InferenceSession(**shared_kwargs)

    if doDevicex is False:
        return None

    session.prepareDevice()
    anchors = session.initAnchorArrays()
    session.setRandomSeed(0)

    classes = np.prod(shape_d0) // (batchSize * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    # With all options enabled return anchors are of the shape:
    # [batches_per_step, accl_factor, repl_factor, micro_batch, *data_shape]
    if acclSteps > 1:
        shape_d0.insert(0, acclSteps)
        label = label.reshape([acclSteps, -1])
    if batchesPerStep > 1:
        shape_d0.insert(0, batchesPerStep)
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)

    data = np.random.random_sample(shape_d0).astype(np.float32)

    # This is a slightly odd case - we want the same data to be input for both
    # replicated graphs, but the dimension we need to repeat on is either the
    # first or second (the replication dimension) depending on whether we
    # have gradient accumulation enabled.
    # If we are not testing, this is a lot simpler as we can split samples
    # however we want.
    if replicated_graph_count > 1:
        repeat_axis = 2 if acclSteps > 1 else 1
        data = np.repeat(data[np.newaxis], replicated_graph_count, repeat_axis)
        label = label.reshape([replicated_graph_count, -1])

    stepio = popart.PyStepIO({d0: data, l0: label}, anchors)
    stepio.enableRuntimeAsserts(False)

    session.weightsFromHost()
    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
# Let's create a known tensor, but with an undefined shape y = builder.aiOnnx.conv([leaky_relu, w], dilations=[1, 1], pads=[padding] * 4, strides=[1, 1]) l1 = builder.aiGraphcore.l1loss([y], 1.0) proto = builder.getModelProto() art = popart.AnchorReturnType("All") # Describe how to run the model dataflow = popart.DataFlow(1, {y: art, leaky_relu: art, w: art, l1: art}) # Create a session to compile and execute the graph options = popart.SessionOptions() device = popart.DeviceManager().createIpuModelDevice({}) session = popart.TrainingSession(fnModel=proto, dataFlow=dataflow, loss=l1, optimizer=popart.ConstSGD(0.001), userOptions=options, deviceInfo=device) # Compile graph session.prepareDevice() # Create buffers to receive results from the execution anchors = session.initAnchorArrays() # Copy weights onto the IPU session.weightsFromHost() # Generate some random input data.
def run_test(enablePipelining):
    """Run two training steps, swapping the SGD learning rate in between.

    A chain of three MatMuls followed by an L1 loss is trained for one step
    with learning rate 1.0 and one step with learning rate 0.5.  When
    ``enablePipelining`` is truthy, the MatMuls are placed on virtual graphs
    0, 1 and 2 and three IPUs are requested.

    Reads ``input_data``, ``weight_data_0..2`` and ``bps`` from the enclosing
    scope.

    Returns:
        A two-element list with a copy of the scaled-learning-rate anchor
        after each step.
    """
    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    def place(tensor_id, vgraph):
        # Virtual graphs are only assigned in the pipelined configuration.
        if enablePipelining:
            builder.virtualGraph(tensor_id, vgraph)

    i1 = builder.addInputTensor(
        popart.TensorInfo("FLOAT", input_data.shape[1::]))
    w0 = builder.addInitializedInputTensor(weight_data_0)
    w1 = builder.addInitializedInputTensor(weight_data_1)
    w2 = builder.addInitializedInputTensor(weight_data_2)

    o0 = builder.aiOnnx.matmul([i1, w0])
    place(o0, 0)
    o1 = builder.aiOnnx.matmul([o0, w1])
    place(o1, 1)
    o2 = builder.aiOnnx.matmul([o1, w2])
    place(o2, 2)
    o2l1 = builder.aiGraphcore.l1loss([o2], 0.1)
    place(o2l1, 2)

    proto = builder.getModelProto()

    anchorId = popart.reservedDefaultScaledLearningRate0Prefix() + "FLOAT"

    # Need to anchor the output of the backward pass to stop it being pruned
    dataFlow = popart.DataFlow(bps, [anchorId])
    optimizer = popart.SGD({"defaultLearningRate": (1.0, False)})

    opts = popart.SessionOptions()
    if enablePipelining:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
    opts.enablePipelining = enablePipelining

    numIPUs = 3 if enablePipelining else 1

    session = popart.TrainingSession(
        fnModel=proto,
        dataFlow=dataFlow,
        loss=o2l1,
        optimizer=optimizer,
        userOptions=opts,
        deviceInfo=tu.create_test_device(numIpus=numIPUs))

    session.prepareDevice()
    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO({i1: input_data}, anchors)
    session.weightsFromHost()

    # run 2 steps, changing the optimizer halfway through
    session.run(stepio)
    first = np.copy(anchors[anchorId])

    session.updateOptimizerFromHost(
        popart.SGD({"defaultLearningRate": (0.5, False)}))

    session.run(stepio)
    second = np.copy(anchors[anchorId])

    return [first, second]
def test_virtual_graph4():
    """Train one step of a model whose ops are split over virtual graphs 2 and 3.

    Three L1 losses are computed (two on virtual graph 3, one on virtual
    graph 2) and summed on virtual graph 3 to form the training loss.  The
    forward outputs and the gradients of all three inputs are anchored.
    """
    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i3 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))

    with builder.virtualGraph(3):
        o1 = builder.aiOnnx.add([i1, i2])
        o1l1 = builder.aiGraphcore.l1loss([o1], 0.1)
        o2 = builder.aiOnnx.add([i3, o1])
        o2l1 = builder.aiGraphcore.l1loss([o2], 0.1)

    with builder.virtualGraph(2):
        o3 = builder.aiOnnx.mul([i1, i3])
        o3l1 = builder.aiGraphcore.l1loss([o3], 0.1)

    with builder.virtualGraph(3):
        loss = builder.aiOnnx.sum([o1l1, o2l1, o3l1])

    proto = builder.getModelProto()

    # Need to anchor the output of the backward pass to stop it being pruned
    dataFlow = popart.DataFlow(
        1, {
            o1: popart.AnchorReturnType("All"),
            o2: popart.AnchorReturnType("All"),
            o3: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i1: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i2: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i3: popart.AnchorReturnType("All")
        })

    optimizer = popart.ConstSGD(0.01)

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    s = popart.TrainingSession(fnModel=proto,
                               dataFlow=dataFlow,
                               loss=loss,
                               optimizer=optimizer,
                               userOptions=opts,
                               deviceInfo=tu.create_test_device(numIpus=4))

    s.prepareDevice()

    anchors = s.initAnchorArrays()

    data1 = np.ones([1], dtype=np.float32)
    data2 = np.ones([1], dtype=np.float32)
    data3 = np.ones([1], dtype=np.float32)

    inputs = {i1: data1, i2: data2, i3: data3}
    stepio = popart.PyStepIO(inputs, anchors)

    # Transfer host state to the device before executing the step, matching
    # the prepareDevice -> weightsFromHost -> run order used by the other
    # sessions in this file (the original called weightsFromHost after run).
    s.weightsFromHost()
    s.run(stepio)
def test_virtual_graph3():
    """Train one step of an add-only model split over virtual graphs 3 and 2.

    Two adds run on virtual graph 3; their results are combined, added to
    ``i1`` and reduced to an L1 loss on virtual graph 2.  The loss and the
    gradients of all four inputs are anchored.
    """
    popart.getLogger().setLevel("TRACE")

    builder = popart.Builder()

    i1 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i2 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i3 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))
    i4 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1]))

    with builder.virtualGraph(3):
        o1 = builder.aiOnnx.add([i1, i2])
        o2 = builder.aiOnnx.add([i3, i4])

    with builder.virtualGraph(2):
        o3 = builder.aiOnnx.add([o1, o2])
        o = builder.aiOnnx.add([i1, o3])
        o = builder.aiGraphcore.l1loss([o], 0.1)

    proto = builder.getModelProto()

    # Need to anchor the output of the backward pass to stop it being pruned
    dataFlow = popart.DataFlow(
        1, {
            o: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i1: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i2: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i3: popart.AnchorReturnType("All"),
            popart.reservedGradientPrefix() + i4: popart.AnchorReturnType("All")
        })

    optimizer = popart.SGD({"defaultLearningRate": (0.01, True)})

    opts = popart.SessionOptions()
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    s = popart.TrainingSession(fnModel=proto,
                               dataFlow=dataFlow,
                               loss=o,
                               optimizer=optimizer,
                               userOptions=opts,
                               deviceInfo=tu.create_test_device(numIpus=4))

    s.prepareDevice()

    anchors = s.initAnchorArrays()

    data1 = np.ones([1], dtype=np.float32)
    data2 = np.ones([1], dtype=np.float32)
    data3 = np.ones([1], dtype=np.float32)
    data4 = np.ones([1], dtype=np.float32)

    inputs = {i1: data1, i2: data2, i3: data3, i4: data4}
    stepio = popart.PyStepIO(inputs, anchors)

    # Transfer host state to the device before executing the step, matching
    # the prepareDevice -> weightsFromHost -> run order used by the other
    # sessions in this file (the original called weightsFromHost after run).
    s.weightsFromHost()
    s.run(stepio)
def run_model(tmpdir, batches_per_step, accum_factor, replicas, tile_set,
              exchange_strategy):
    """Train the model from ``get_model`` and measure compute/IO overlap.

    Args:
        tmpdir: directory handed to ``tu.set_autoreport_options`` for the
            profiling report.
        batches_per_step: batches per step, forwarded to ``get_model`` and
            used to size the generated input data.
        accum_factor: gradient accumulation factor; enabled when above 1.
        replicas: replica count; replicated graphs are enabled when above 1.
        tile_set: when equal to ``popart.TileSet.IO``, 128 IO tiles are
            reserved, otherwise none.
        exchange_strategy: forwarded unchanged to ``get_model``.

    Returns:
        ``(overlapPercentage, weights_data)`` where ``weights_data`` maps each
        weight id to the values read back from the session.
    """
    size = 64

    # The last element returned by get_model is not needed here.
    proto, inputs, weights, labels, dataFlow, loss, _ = get_model(
        size, batches_per_step, 4, 1, tile_set, exchange_strategy)

    opts = popart.SessionOptions()
    opts.enableExplicitMainLoops = True
    opts.useHostCopyOps = True
    opts.instrumentWithHardwareCycleCounter = False
    opts.virtualGraphMode = popart.VirtualGraphMode.Auto

    # Both true & false should work - testing with false to avoid
    # host-cycle-overhead
    opts.rearrangeAnchorsOnHost = False
    opts.rearrangeStreamsOnHost = False

    # Set session options to generate the report
    tu.set_autoreport_options(opts, tmpdir, output_execution_profile=True)

    if accum_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accum_factor

    opts.numIOTiles = 128 if tile_set == popart.TileSet.IO else 0

    if replicas > 1:
        opts.enableReplicatedGraphs = True
        opts.replicatedGraphCount = replicas

    pat = popart.Patterns(popart.PatternsLevel.Default)

    # Trying to use less than all the tiles throw an error like
    #   popart_core.poplar_exception: Trying to access tile 72 on IPU
    #   0 but the virtual graph only covers the following tiles on
    #   that IPU: 0-63
    # The error happens in a call to poplar made by gcl::perIPUTiles.
    device = tu.create_test_device(numIpus=replicas,
                                   tilesPerIPU=tu.USE_ALL_TILES)

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     loss=loss,
                                     optimizer=popart.ConstSGD(1e-6),
                                     patterns=pat,
                                     deviceInfo=device)

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    np.random.seed(224488)

    session.weightsFromHost()

    warmup_iterations = 1
    calc_iterations = 1

    samples = replicas * batches_per_step * accum_factor
    for _ in range(warmup_iterations + calc_iterations):
        feed = {}
        for tensor_id in inputs:
            feed[tensor_id] = np.random.normal(
                0, 0.05, (samples, 1, size, size)).astype(np.float32)
        feed[labels] = np.random.randint(0, size, (samples, 1, size))

        session.run(popart.PyStepIO(feed, anchors))

    session.weightsToHost()
    weights_data = {
        w: np.zeros((1, size, size), dtype=np.float32)
        for w in weights
    }
    session.readWeights(popart.PyWeightsIO(weights_data))

    # None of the weights read back may contain a NaN.
    for values in weights_data.values():
        assert np.count_nonzero(np.isnan(values)) == 0

    report = session.getReport()
    overlapPercentage = get_compute_io_overlap_percentage(
        report, warmup_iterations)

    return overlapPercentage, weights_data
def run_test(mode=None, verify=None):
    """Train a two-stage pipelined model with checkpointed group-norm outputs.

    Reads ``hidden_size``, ``input_shape``, ``weight_data``,
    ``gradient_accumulation`` and ``input_data`` from the enclosing scope.

    Args:
        mode: value for ``SessionOptions.autoRecomputation``; left at the
            session default when ``None``.
        verify: optional callback invoked as ``verify(session, x_0)`` after
            the ten training steps, where ``x_0`` is the first checkpointed
            tensor.

    Returns:
        The anchor arrays holding the output and the three weight gradients.
    """
    builder = popart.Builder()

    def norm(input_x):
        # Group normalisation with fresh unit-scale / zero-shift parameters;
        # only the first output is returned.
        gamma = builder.addInitializedInputTensor(
            np.ones(hidden_size, np.float32), "Gamma")
        beta = builder.addInitializedInputTensor(
            np.zeros(hidden_size, np.float32), "Beta")
        outputs = builder.aiGraphcore.groupnormalization(
            [input_x, gamma, beta], 1)
        return outputs[0]

    x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape),
                                  "x_in")

    weight_1 = builder.addInitializedInputTensor(weight_data, "weight_1")
    weight_2 = builder.addInitializedInputTensor(weight_data, "weight_2")
    weight_3 = builder.addInitializedInputTensor(weight_data, "weight_3")

    with builder.virtualGraph(0), builder.pipelineStage(0):
        x_0 = norm(builder.aiOnnx.matmul([x_in, weight_1]))

        # If recomputeOutputs was used directly on `x_0` all 3 outputs
        # of groupnormalization would be stashed.
        # By using a checkpointOutput only 1 output will be stashed and the
        # rest will be recomputed.
        x_0 = builder.checkpointOutput([x_0])[0]

        x_1 = norm(builder.aiOnnx.matmul([x_0, weight_2]))
        x_1 = builder.aiOnnx.add([x_0, x_1])

        # This checkpoint should be redundant as x_1 will be stashed
        # at the start of stage1 on ipu1.
        x_1 = builder.checkpointOutput([x_1])[0]

    with builder.virtualGraph(1), builder.pipelineStage(1):
        o = builder.aiOnnx.matmul([x_1, weight_3])
        l1 = builder.aiGraphcore.l1loss([o], 0.1)

    proto = builder.getModelProto()

    grad_anchors = [
        popart.reservedGradientPrefix() + weight_id
        for weight_id in (weight_1, weight_2, weight_3)
    ]
    dataFlow = popart.DataFlow(1, [o] + grad_anchors)

    opts = popart.SessionOptions()
    opts.enableOutlining = False
    opts.enablePipelining = True
    opts.enableGradientAccumulation = True
    opts.accumulationFactor = gradient_accumulation
    opts.optimizerStateTensorLocationSettings.location.storage = popart.TensorStorage.OffChip
    if mode is not None:
        opts.autoRecomputation = mode
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    device = tu.create_test_device(numIpus=2, opts={"compileIPUCode": False})
    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     loss=l1,
                                     optimizer=popart.Adam({}),
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()
    stepio = popart.PyStepIO({x_in: input_data}, anchors)

    steps = 10
    for _ in range(steps):
        session.run(stepio)

    if verify is not None:
        verify(session, x_0)

    return anchors
def matmul_avail_memory(capfd, apply_to_conv=True, avail_mem_prop=0.9):
    """Train a MatMul+ReLU model once and return the captured stderr.

    ``POPLIBS_LOG_LEVEL`` is set to ``DEBUG`` for the duration of the call and
    always set back to ``NONE`` afterwards, including when building or running
    the session raises.

    Args:
        capfd: pytest ``capfd`` fixture used to capture the process output.
        apply_to_conv: when true, ``setAvailableMemoryProportion`` is applied
            to the MatMul output; otherwise it is applied to the ReLU output
            (the parameter name is kept for backward compatibility).
        avail_mem_prop: proportion passed to ``setAvailableMemoryProportion``.

    Returns:
        The text written to stderr while the model was built and run.
    """
    os.environ["POPLIBS_LOG_LEVEL"] = "DEBUG"
    try:
        builder = popart.Builder()

        input_shape = popart.TensorInfo("FLOAT", [2, 4])
        weight_shape = popart.TensorInfo("FLOAT", [4, 8])

        weight_data = np.ones(weight_shape.shape(), np.float32)
        input_ = builder.addInputTensor(input_shape)
        weights = builder.addInitializedInputTensor(weight_data)
        act = builder.aiOnnx.matmul([input_, weights])
        o = builder.aiOnnx.relu([act])
        loss = builder.aiGraphcore.identityloss([o])

        if apply_to_conv:
            # Apply the setAvailableMemoryProportion to the matmul
            builder.setAvailableMemoryProportion(act, avail_mem_prop)
        else:
            # Deliberately target the relu op defined above rather than the
            # matmul; callers using this path expect an error.
            builder.setAvailableMemoryProportion(o, avail_mem_prop)

        anchor_names = [
            o,
            popart.reservedGradientPrefix() + input_,
            popart.reservedGradientPrefix() + weights
        ]
        training_dataFlow = popart.DataFlow(
            1, {
                anchor_names[0]: popart.AnchorReturnType("All"),
                anchor_names[1]: popart.AnchorReturnType("All"),
                anchor_names[2]: popart.AnchorReturnType("All")
            })

        opts = popart.SessionOptions()
        opts.constantWeights = False  # Allow the weights to be updated

        # Create the device
        device = tu.create_test_device(1, opts={"compileIPUCode": True})
        device.attach()

        # Prepare the input data
        input_data = np.random.random_sample(input_shape.shape()).astype(
            np.float32)

        # Prepare the Training session
        training_session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=training_dataFlow,
            loss=loss,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=device)

        # Compile the training graph
        training_session.prepareDevice()

        # Run the training session
        training_session.weightsFromHost()

        training_anchors = training_session.initAnchorArrays()
        training_inputs = {input_: input_data}

        training_session.run(
            popart.PyStepIO(training_inputs, training_anchors))

        captured = capfd.readouterr()
    finally:
        # Always switch the verbose logging off again, so a failure here
        # does not leave DEBUG logging enabled for whatever runs next.
        os.environ["POPLIBS_LOG_LEVEL"] = "NONE"

    return captured.err
def test_pipeline_stage_merging():
    """Check that consecutive pipeline stages on the same IPU are merged.

    Three pipeline stages are declared, the first two on virtual graph 0.
    The serialized IR is then inspected and the only stashed tensor is
    expected to be ``x_in``.
    """
    np.random.seed(0)

    # With 3 stages the minimum pipeline cycles is 5
    # With 2 stages the minimum pipeline cycles is 3
    # So if the consecutive stages aren't fused an error will be thrown.
    accl_factor = 3
    micro_batch = 1
    hidden = 16
    in_shape = [micro_batch, hidden]

    w_init = np.random.normal(0, 0.02, [hidden, hidden]).astype(np.float32)
    # Not consumed below: this test only inspects the IR, it never runs a step.
    input_data = np.random.normal(0, 0.02,
                                  [accl_factor] + in_shape).astype(np.float32)

    builder = popart.Builder()
    x_in = builder.addInputTensor(popart.TensorInfo("FLOAT", in_shape), "x_in")

    weight_1 = builder.addInitializedInputTensor(w_init, "weight_1")
    weight_2 = builder.addInitializedInputTensor(w_init, "weight_2")
    weight_3 = builder.addInitializedInputTensor(w_init, "weight_3")

    # Pipelining should combine stage 0 and 1.
    with builder.virtualGraph(0), builder.pipelineStage(0):
        x_0 = builder.aiOnnx.matmul([x_in, weight_1])

    with builder.virtualGraph(0), builder.pipelineStage(1):
        x_1 = builder.aiOnnx.matmul([x_0, weight_2])

    with builder.virtualGraph(1), builder.pipelineStage(2):
        o = builder.aiOnnx.matmul([x_1, weight_3])
        l1 = builder.aiGraphcore.l1loss([o], 0.1)

    proto = builder.getModelProto()
    dataFlow = popart.DataFlow(1, [o])

    opts = popart.SessionOptions()
    opts.enableOutlining = False
    opts.enablePipelining = True
    opts.enableGradientAccumulation = True
    opts.accumulationFactor = accl_factor
    opts.autoRecomputation = popart.RecomputationType.Pipeline
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    device = tu.create_test_device(numIpus=2, opts={"compileIPUCode": False})
    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     userOptions=opts,
                                     loss=l1,
                                     optimizer=popart.ConstSGD(1e-9),
                                     deviceInfo=device)

    ir = json.loads(session._serializeIr(popart.IrSerializationFormat.JSON))

    # Collect the first input of every Stash op in the main graph.
    stashedTensors = []
    for op in ir["maingraph"]:
        if op["type"] == "Stash":
            stashedTensors.append(op["inputs"][0]["name"])

    assert {'x_in'} == set(stashedTensors)
def test_reset_host_weights_with_extra_tensor_in_onnx_model():
    """
    1. Create a training session, and a corresponding validation session
    2. The training session must contain some feature that means when writing
       the ONNX model back to the host, it contains extra initializers compared
       with the original (builder-generated) model. In this case we achieve this
       by using an SGD optimizer with momentum.
    3. Try resetting the weights of the validation session using the ONNX model
       with the additional momentum tensor (call resetHostWeights)
    4. Observe that a PopART exception is thrown
    5. Try again, but with ignoreWeightsInModelWithoutCorrespondingHostWeight.
    6. Observe that it succeeds
    """

    def getModelWithRandomWeights():
        # A single MatMul with a randomly initialised weight and an L1 loss
        # registered as the model output.
        builder = popart.Builder()
        dShape = [2, 2]
        i0 = builder.addInputTensor(popart.TensorInfo("FLOAT", dShape))
        wData = np.random.rand(*dShape).astype(np.float32)
        w0 = builder.addInitializedInputTensor(wData)
        o = builder.aiOnnx.matmul([i0, w0])
        loss = builder.aiGraphcore.l1loss([o], 0.1)
        builder.addOutputTensor(loss)
        return builder

    device = tu.create_test_device()
    tr_builder = getModelWithRandomWeights()
    o = tr_builder.getOutputTensorIds()[0]

    # 1. & 2.
    # Training
    tr_opt = popart.SGD({"defaultMomentum": (0.01, True)})
    tr_sess = popart.TrainingSession(fnModel=tr_builder.getModelProto(),
                                     dataFlow=popart.DataFlow(1, []),
                                     loss=o,
                                     optimizer=tr_opt,
                                     deviceInfo=device)
    tr_sess.prepareDevice()

    # The exported model is only needed while this test runs; the context
    # manager removes the directory afterwards instead of leaking it.
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpfile = os.path.join(tmpdir, "tr_model.onnx")
        tr_sess.modelToHost(tmpfile)

        # Validation (with different model proto weights)
        va_builder = getModelWithRandomWeights()
        va_opts = popart.SessionOptions()
        va_opts.constantWeights = False
        va_sess = popart.InferenceSession(fnModel=va_builder.getModelProto(),
                                          dataFlow=popart.DataFlow(1, [o]),
                                          deviceInfo=device,
                                          userOptions=va_opts)
        va_sess.prepareDevice()

        # 3. Try reset validation weights with training weights
        wId = [
            w for w in va_builder.getInputTensorIds()
            if va_builder.isInitializer(w)
        ][0]
        missing_tensor_name = popart.reservedAcclToAccumulatorPrefix(
        ) + popart.reservedGradientPrefix() + wId
        with pytest.raises(popart.popart_exception) as e_info:
            va_sess.resetHostWeights(tmpfile)
        # 4.
        assert e_info.value.args[
            0] == "resetWeights, no tensor '" + missing_tensor_name + "' in tensors"

        # 5. & 6. Try again, but this time ignore the missing tensor
        va_sess.resetHostWeights(
            tmpfile, ignoreWeightsInModelWithoutCorrespondingHostWeight=True)
def test_manual_serialization():
    """Compare a manually serialised MatMul against a PyTorch reference.

    The PopART model computes ``relu(matmul(X, W))`` as a sum of ``f`` smaller
    MatMuls over slices of the reduction dimension, takes one SGD step on an
    L1 loss, and the updated weights are compared with those produced by an
    equivalent single-MatMul PyTorch model.
    """
    # Basic model:
    #
    #  X: data input of shape (N, C0)
    #  W: weight input of shape (C0, C1)
    #
    #  Y    = matmul(X, W)
    #  Z    = relu(Y)
    #  loss = l1Loss(Z)
    #
    # With array dimensions
    N = 12
    C0 = 244
    C1 = 286

    # In this test, we manually serialise the matmul, converting
    #   matmul ((N,C0) , (C0,C1))
    #
    # into a sequence of factor-f smaller matmuls
    #   matmul (N,C0/f),(C0/f,C1))
    #
    # repeated and accumulated f times, where f is
    f = 4
    assert (C0 % f == 0)
    slice_width = C0 // f

    # Constructing the model
    builder = popart.Builder()

    # NOTE: T22702 For some seeds this test fails.
    np.random.seed(0)
    wVals = np.array(npr.randn(C0, C1), dtype=np.float32)
    W = builder.addInitializedInputTensor(wVals)
    X = builder.addInputTensor(popart.TensorInfo("FLOAT", [N, C0]))

    axes = builder.addInitializedInputTensor(
        np.array([0, 1]).astype(np.int32))

    def int32_initializer(values):
        # Slice bounds are supplied to the graph as int32 initializers.
        return builder.addInitializedInputTensor(
            np.array(values).astype(np.int32))

    for i in range(f):
        # lower and upper index of the i'th slice
        lwr = i * slice_width
        upp = (i + 1) * slice_width

        # Take a slice of size (N,C0/f) out of X
        s0 = int32_initializer([0, lwr])
        e0 = int32_initializer([N, upp])
        X_slice = builder.aiOnnx.slice([X, s0, e0, axes])

        # Take a slice of size (C0/f,C1) out of W
        s1 = int32_initializer([lwr, 0])
        e1 = int32_initializer([upp, C1])
        W_slice = builder.aiOnnx.slice([W, s1, e1, axes])

        # Multiply the slices together, and accumulate as necessary
        mm_part = builder.aiOnnx.matmul([X_slice, W_slice])
        Y = mm_part if i == 0 else builder.aiOnnx.add([mm_part, Y])

    # Finally, the non-linearity
    Z = builder.aiOnnx.relu([Y])

    # This boiler-plate is currently necessary with opset-10 slice
    graph_transformer = popart.GraphTransformer(builder.getModelProto())
    graph_transformer.convertAllFixedPointInitializersToConstants()
    builder = popart.Builder(graph_transformer.getModelProto())

    l1 = builder.aiGraphcore.l1loss([Z], 0.2)

    dataFlow = popart.DataFlow(1, {})
    device = tu.create_test_device()
    userOptions = popart.SessionOptions()

    # To obtain the final dot graph, uncomment this:
    # userOptions.dotChecks = {"Final"};

    patterns = popart.Patterns()

    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=dataFlow,
        optimizer=popart.SGD({"defaultLearningRate": (0.1, True)}),
        loss=l1,
        patterns=patterns,
        userOptions=userOptions,
        deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()

    inputVals = np.array(npr.randn(1 * N * C0), dtype=np.float32)
    session.run(popart.PyStepIO({X: inputVals}, {}))
    session.weightsToHost()

    # Sentinel-filled buffer that readWeights overwrites with W's values.
    w0R = np.array(-777.0 * np.ones(C0 * C1), dtype=np.float32)
    session.readWeights(popart.PyWeightsIO({W: w0R}))

    # A pytorch version to confirm numerical correctness:
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.w0 = torch.nn.Parameter(torch.from_numpy(wVals.copy()))

        def forward(self, x):
            return torch.relu(torch.matmul(x, self.w0))

    net = Net()
    optimizer = optim.SGD(net.parameters(), lr=0.1)

    out = net(torch.from_numpy(inputVals.reshape([N, C0])))
    loss = 0.2 * torch.mean(torch.abs(out))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    torch_weights = net.w0.detach().numpy().flatten()
    initial_weights = wVals.flatten()

    # Size of each framework's update, used to normalise the disagreement.
    baseline0 = np.sum(np.abs(torch_weights - initial_weights))
    baseline1 = np.sum(np.abs(w0R - initial_weights))

    error = np.sum(np.abs(torch_weights - w0R))
    assert (error / (baseline0 + baseline1) < 1e-6)
def main(argv):
    """Compile the inference graph, execute it, and print per-step timings.

    The model and its data come from ``graph_builder``.  With
    ``FLAGS.synthetic`` set, the same data object is fed
    ``FLAGS.iterations`` times; otherwise ``data`` is iterated and each item
    is fed as one step.  If device preparation raises
    ``popart.PrepareDeviceException`` a report is saved through ``gcprofile``
    and the process exits with status 1.

    Args:
        argv: command-line arguments (unused; options are read from
            ``flags.FLAGS``).

    Raises:
        Exception: if no device could be acquired.
    """
    FLAGS = flags.FLAGS
    print(f"micro batch size is {FLAGS.micro_batch_size}")
    print(f"batch size is {FLAGS.batch_size}")
    print(f"batches_per_step is {FLAGS.batches_per_step}")

    proto, data, outputs, output_id = graph_builder()

    print(f"Model: {FLAGS.model_name}")
    if FLAGS.synthetic:
        print("Using synthetic data")
    else:
        print(f"Data_dir: {FLAGS.data_dir}")
    print(f"Data_sub_dir for this process: {FLAGS.data_sub_dir}")
    print(f"num_workers: {FLAGS.num_workers}")
    print(f"batches per step: {FLAGS.batches_per_step}")

    dataFlow = popart.DataFlow(FLAGS.batches_per_step, outputs)

    # Create a session to compile and execute the graph
    options = popart.SessionOptions()
    if FLAGS.synthetic:
        options.syntheticDataMode = popart.SyntheticDataMode.Zeros
    options.instrumentWithHardwareCycleCounter = FLAGS.report_hw_cycle_count

    # Configure precision of convolutions and MatMuls
    if FLAGS.half_partials:
        options.convolutionOptions = {'partialsType': 'half'}
        options.partialsTypeMatMuls = "half"

    # Select a device
    device = popart.DeviceManager().acquireAvailableDevice(1)
    print(f"{device}\n")
    if device is None:
        raise Exception("Not enough IPUs available.")

    session = popart.InferenceSession(fnModel=proto,
                                      deviceInfo=device,
                                      dataFlow=dataFlow,
                                      userOptions=options)

    print("Compiling...")
    compile_start = time.time()
    try:
        session.prepareDevice()
    except popart.PrepareDeviceException as e:
        import gcprofile
        gcprofile.save_popart_report(session, exception=e)
        sys.exit(1)
    compilation_duration = time.time() - compile_start
    print("Time to compile: {:.3f} seconds\n".format(compilation_duration))

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()
    # Copy weights and optimisation parameters onto the device
    session.weightsFromHost()

    def report_time(duration, data_duration=None, compute_duration=None):
        # Print one line with the total time, optional preprocessing/compute
        # breakdown and throughput, then the cycle count if requested.
        pieces = ["Total {:<8.3} sec.".format(duration)]
        if data_duration:
            pieces.append(" Preprocessing {:<8.3} sec ({:4.3}%).".format(
                data_duration, 100 * (data_duration / duration)))
        if compute_duration:
            pieces.append(" Compute {:<8.3} sec ({:4.3}%).".format(
                compute_duration, 100 * (compute_duration / duration)))
        pieces.append(" {:5f} images/sec.".format(
            int(FLAGS.micro_batch_size * FLAGS.batches_per_step / duration)))
        print("".join(pieces))
        if FLAGS.report_hw_cycle_count:
            print("Hardware cycle count per 'run':", session.getCycleCount())

    print("Executing...")

    # Synthetic runs feed the same data object a fixed number of times;
    # otherwise every item produced by `data` is one step.
    if FLAGS.synthetic:
        feeds = (data for _ in range(FLAGS.iterations))
    else:
        feeds = data

    # Run
    start = time.time()
    durations = []
    for feed in feeds:
        stepio = popart.PyStepIO(feed, anchors)
        # Calc data duration
        data_time = time.time()
        data_d = data_time - start
        # Run compute
        session.run(stepio)
        # Calc compute duration
        results = anchors[output_id]
        comp_d = time.time() - data_time
        # Calc total duration
        t = time.time() - start
        report_time(t, data_d, comp_d)
        durations.append(t)
        start = time.time()
    duration = np.mean(durations)
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[str] = None,
           optimizer: Optional[popart.Optimizer] = None,
           patterns: Optional[popart.Patterns] = None,
           user_options: Optional[Mapping[str, Any]] = None,
           skip_execution: bool = False):
    """Compile and run ``proto`` for one step on a test device.

    A ``TrainingSession`` is created when ``optimizer`` is given, otherwise an
    ``InferenceSession``.  The device is detached before this function
    returns or raises, whichever path is taken.

    Args:
        proto: the ONNX model to run.
        data: input arrays keyed by tensor id.  They are repeated along a new
            leading axis for replication and for gradient accumulation when
            the corresponding option is above 1.
        outputs: tensor id(s) to anchor; normalised with ``make_tuple``.
        loss: loss tensor id, used only for training.
        optimizer: optimizer; its presence selects training.
        patterns: patterns passed to the session.
        user_options: ``SessionOptions`` attributes to set.  The keys
            ``batchSerializationFactor`` and ``executionPhases`` are skipped.
        skip_execution: when true, return right after session construction.

    Returns:
        The session itself when ``skip_execution`` is true; otherwise a tuple
        ``(anchor_values, post_proto, outputs)`` where ``anchor_values`` is a
        lazy generator over the anchors of ``outputs`` and ``post_proto`` is
        the model written back by ``modelToHost``.
    """
    batches_per_step = 1

    outputs = make_tuple(outputs)

    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step,
        {output: popart.AnchorReturnType("ALL")
         for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.reportOptions = {"showVarStorage": "true"}
    options.enableStochasticRounding = False
    options.constantWeights = True
    options.outlineThreshold = 10.0

    for key, value in user_options.items():
        if key not in ["batchSerializationFactor", "executionPhases"]:
            setattr(options, key, value)

    # Round the replica count up to the next power of two.
    replicas = user_options.get("replicatedGraphCount", 1)
    request_ipus = pow(2, math.ceil(math.log2(replicas)))

    device = tu.create_test_device(numIpus=request_ipus)
    try:
        print("Compiling graph")
        if optimizer is not None:
            session = popart.TrainingSession(fnModel=proto,
                                             deviceInfo=device,
                                             dataFlow=data_flow,
                                             userOptions=options,
                                             loss=loss,
                                             optimizer=optimizer,
                                             patterns=patterns)
        else:
            session = popart.InferenceSession(fnModel=proto,
                                              deviceInfo=device,
                                              dataFlow=data_flow,
                                              userOptions=options,
                                              patterns=patterns)

        if skip_execution:
            return session

        # Compile the Poplar Graph. Any failure (including out-of-memory)
        # propagates to the caller once the device has been released.
        session.prepareDevice()
        print("Compilation complete")

        session.weightsFromHost()
        # NOTE: If we ever use a model with random ops, we would need to call
        # this here, using the same seed given to numpy.
        # session.setRandomSeed(1984)

        anchors = session.initAnchorArrays()

        # Add a replication dimension if needed
        rf = user_options.get("replicatedGraphCount")
        if rf is not None and rf > 1:
            data = {k: np.repeat(v[np.newaxis], rf, 0) for k, v in data.items()}

        # Add a gradient accumulation factor dimension if needed
        af = user_options.get("accumulationFactor")
        if af is not None and af > 1:
            data = {k: np.repeat(v[np.newaxis], af, 0) for k, v in data.items()}

        stepio = popart.PyStepIO(data, anchors)
        session.run(stepio)

        with tempfile.TemporaryDirectory() as tmp:
            file_path = os.path.join(tmp, "model.onnx")
            session.modelToHost(file_path)
            post_proto = onnx.load(file_path)
    finally:
        # Release device on every exit path, not only the ones that succeed.
        device.detach()

    return (anchors[output] for output in outputs), post_proto, outputs
def train(opts):
    """Train the model for `opts.epochs` epochs and print evaluation statistics.

    A training session and a validation session are created on one shared
    device. After each training epoch the weights are written to a
    temporary ONNX file, which the validation session (and the next
    training epoch) loads via `resetHostWeights`.

    Args:
        opts: Options object. This function reads `syn_data_type`,
            `test_mode`, `batch_size`, `batches_per_step`,
            `samples_per_device`, `num_ipus`, `replication_factor`,
            `pipeline`, `simulation`, `epochs` and `validation_final_epoch`.
            `opts.batches_per_step` may be reduced in place when the test
            set is too small.
    """
    # Do not require the mnist data to be present if running with synthetic data
    train_data, train_labels, test_data, test_labels = load_dummy(opts) \
        if opts.syn_data_type in ["random_normal", "zeros"] else load_mnist()

    if not opts.test_mode:
        max_value = len(test_data) // opts.batch_size
        if max_value < opts.batches_per_step:
            print("(batches-per-step * batch-size) is larger than test set!\n"
                  " Reduced batches-per-step to: {}\n".format(max_value))
            opts.batches_per_step = max_value
    training_set = DataSet(opts.batch_size, opts.batches_per_step, train_data,
                           train_labels)
    test_set = DataSet(opts.batch_size, opts.batches_per_step, test_data,
                       test_labels)

    print("Creating ONNX model.")
    proto, data_in, labels_in, output, loss = create_model(
        opts.samples_per_device)

    # Describe how to run the model
    anchor_desc = {
        output: popart.AnchorReturnType("ALL"),
        loss: popart.AnchorReturnType("ALL")
    }
    dataFlow = popart.DataFlow(opts.batches_per_step, anchor_desc)

    # Options
    userOpts = popart.SessionOptions()

    # The validation graph by default will be optimized to change all variables to constants
    # This prevents that, which allows for checkpoints to be loaded into the model without recompiling
    userOpts.constantWeights = False

    # If requested, setup synthetic data
    if opts.syn_data_type in ["random_normal", "zeros"]:
        print("Running with Synthetic Data Type '{}'".format(
            opts.syn_data_type))
        if opts.syn_data_type == "random_normal":
            userOpts.syntheticDataMode = popart.SyntheticDataMode.RandomNormal
        elif opts.syn_data_type == "zeros":
            userOpts.syntheticDataMode = popart.SyntheticDataMode.Zeros

    # Enable auto-sharding
    if opts.num_ipus > opts.replication_factor:
        userOpts.virtualGraphMode = popart.VirtualGraphMode.Auto

    # Enable pipelining
    if opts.pipeline:
        userOpts.enablePipelining = True

    # Enable replication
    if opts.replication_factor > 1:
        userOpts.enableReplicatedGraphs = True
        userOpts.replicatedGraphCount = opts.replication_factor

    # A single device is shared between training and validation sessions
    device = get_device(opts.num_ipus, opts.simulation)

    training = init_session(proto,
                            loss,
                            dataFlow,
                            userOpts,
                            device,
                            training=True)
    validation = init_session(proto,
                              loss,
                              dataFlow,
                              userOpts,
                              device,
                              training=False)

    # Make weight transfer file. mkstemp returns an already-open OS-level
    # descriptor alongside the path; only the path is used, so close the
    # descriptor immediately instead of leaking it.
    fd, onnx_file_name = tempfile.mkstemp()
    os.close(fd)

    try:
        print("Running training loop.")
        for i in range(opts.epochs):
            # Training
            if i > 0:
                # Reload the weights saved at the end of the previous epoch
                training.session.resetHostWeights(onnx_file_name)
            training.session.weightsFromHost()
            for step, (data, labels) in enumerate(training_set):
                stepio = popart.PyStepIO({
                    data_in: data,
                    labels_in: labels
                }, training.anchors)

                start = time()
                training.session.run(
                    stepio,
                    'Epoch ' + str(i) + ' training step' + str(step))
                if opts.test_mode == "training":
                    log_run_info(training, start, opts)

            training.session.modelToHost(onnx_file_name)

            if not opts.validation_final_epoch or i == opts.epochs - 1:
                aggregated_loss = 0
                aggregated_accuracy = 0

                validation.session.resetHostWeights(onnx_file_name)
                validation.session.weightsFromHost()

                # Evaluation
                for step, (data, labels) in enumerate(test_set):
                    stepio = popart.PyStepIO({
                        data_in: data,
                        labels_in: labels
                    }, validation.anchors)

                    start = time()
                    validation.session.run(
                        stepio,
                        'Epoch ' + str(i) + ' evaluation step ' + str(step))
                    if opts.test_mode == "inference":
                        log_run_info(validation, start, opts)

                    # Loss
                    aggregated_loss += np.mean(validation.anchors[loss])

                    # Accuracy
                    results = np.argmax(
                        validation.anchors[output].reshape(
                            [test_set.inputs_per_step, 10]), 1)
                    num_correct = np.sum(
                        results == labels.reshape([test_set.inputs_per_step]))
                    aggregated_accuracy += num_correct / test_set.inputs_per_step

                # Log statistics
                aggregated_loss /= len(test_set)
                aggregated_accuracy /= len(test_set)
                print("Epoch #{}".format(i + 1))
                print(" Loss={0:.4f}".format(aggregated_loss))
                print(" Accuracy={0:.2f}%".format(aggregated_accuracy * 100))
    finally:
        # Remove weight transfer file, even when training fails part-way
        os.remove(onnx_file_name)
def run(benchmark, opts):
    """Compile and execute a benchmark graph, reporting per-step timings.

    Args:
        benchmark: Object providing `graph_builder(opts)` and
            `iteration_report(opts, duration)`.
        opts: Options object controlling graph saving, synthetic data,
            sharding, pipelining, device selection, mode, step count and
            reporting.

    Returns:
        `(compilation_duration, average_batches_per_sec)`; or, when
        `opts.report` is set, the result of `save_reports(opts, session)`
        after the first step.

    Raises:
        OSError: if no hardware device could be acquired.
    """
    proto, data, outputs, losses, optimizer = benchmark.graph_builder(opts)

    if opts.save_graph:
        with open('model.onnx', "wb") as model_file:
            model_file.write(proto)
        print("Written to file: model.onnx")

    data_flow = popart.DataFlow(opts.batches_per_step, outputs)

    # Session options used to compile and execute the graph
    session_opts = popart.SessionOptions()
    if not opts.use_generated_data:
        session_opts.syntheticDataMode = popart.SyntheticDataMode.Zeros
    session_opts.instrumentWithHardwareCycleCounter = opts.report_hw_cycle_count
    if opts.report:
        instrument_compute = "true"
    else:
        instrument_compute = "false"
    session_opts.engineOptions = {"debug.instrumentCompute": instrument_compute}
    if opts.convolution_options:
        session_opts.convolutionOptions = json.loads(opts.convolution_options)

    if opts.shards > 1:
        session_opts.virtualGraphMode = (popart.VirtualGraphMode.Auto
                                         if opts.auto_sharding else
                                         popart.VirtualGraphMode.Manual)

    session_opts.enablePipelining = opts.pipeline

    # Pick either an IPU model (simulation) or real hardware
    manager = popart.DeviceManager()
    if opts.simulation:
        device = manager.createIpuModelDevice({
            "compileIPUCode": True,
            'numIPUs': opts.shards,
            "tilesPerIPU": 1216
        })
    else:
        device = manager.acquireAvailableDevice(opts.shards)
        if device is None:
            raise OSError("Failed to acquire IPU.")

    # Arguments shared by both session flavours
    common_kwargs = dict(fnModel=proto,
                         deviceInfo=device,
                         dataFlow=data_flow,
                         userOptions=session_opts)
    if opts.mode == 'train':
        session = popart.TrainingSession(loss=losses,
                                         optimizer=optimizer,
                                         **common_kwargs)
    else:
        session = popart.InferenceSession(**common_kwargs)

    print("Compiling...")
    compile_start = time.time()
    session.prepareDevice()
    compilation_duration = time.time() - compile_start
    print("Duration: {:.3f} seconds\n".format(compilation_duration))

    if opts.tensor_tile_mapping:
        with open("tile_mapping.json", 'w') as mapping_file:
            json.dump(session.getTensorTileMap(), mapping_file)
        print("Written to file: tile_mapping.json")

    # Host buffers that receive the anchored results
    anchors = session.initAnchorArrays()

    # Push weights and optimizer parameters to the device
    session.weightsFromHost()

    # Prepend a batches_per_step dimension when more than one batch per step
    bps = opts.batches_per_step
    if bps > 1:
        data = {
            name: np.repeat(array[np.newaxis], bps, 0)
            for name, array in data.items()
        }

    stepio = popart.PyStepIO(data, anchors)

    print("Executing...")
    average_batches_per_sec = 0
    for _step in range(opts.steps):
        step_start = time.time()
        session.run(stepio)
        duration = time.time() - step_start

        # When profiling, stop after the first step and hand back the reports
        if opts.report:
            return save_reports(opts, session)

        average_batches_per_sec += (opts.batches_per_step /
                                    duration) / opts.steps
        print("{:<8.3} sec/itr.".format(duration) + " " +
              benchmark.iteration_report(opts, duration))

    if opts.report_hw_cycle_count:
        print("Hardware cycle count per 'run':", session.getCycleCount())

    return compilation_duration, average_batches_per_sec
# Build the graph: gemm -> relu -> softmax -> nllloss, with the weight and
# bias both initialised to ones (float16).
ip = builder.addInputTensor(data_shape)
lb = builder.addInputTensor(lbl_shape)
w = builder.addInitializedInputTensor(np.ones([2, 2], np.float16))
b = builder.addInitializedInputTensor(np.ones([2], np.float16))
o = builder.aiOnnx.gemm([ip, w, b], 1., 1., False, False)
o = builder.aiOnnx.relu([o])
o = builder.aiOnnx.softmax([o])
o = builder.aiGraphcore.nllloss([o, lb])

# Anchor the loss tensor and return it for every batch.
dataFlow = popart.DataFlow(1, {o: popart.AnchorReturnType("All")})

# Create a session to compile and execute the graph for inference
#------------------------------------------------------------------------------
inferenceOptions = popart.SessionOptions()
# Need to compile the inference graph with variable weights so they can be
# updated before execution
inferenceOptions.constantWeights = False
inferenceSession = popart.InferenceSession(
    fnModel=builder.getModelProto(),
    dataFlow=dataFlow,
    userOptions=inferenceOptions,
    deviceInfo=popart.DeviceManager().createIpuModelDevice({}))

# Compile graph
inferenceSession.prepareDevice()

# Create buffers to receive results from the execution
inferenceAnchors = inferenceSession.initAnchorArrays()
def test(config, iteration, true_scaling, test_case):
    """Check weight values after each training step under a scheduled optimizer.

    Three scalar weights are chained through `add` ops onto a single input,
    one weight per stage of a mock stage->tensor map. After every step the
    weights are read back and compared with the expected values.

    Args:
        config: Passed through to `ScheduledOptimizerFactory`.
        iteration: Object exposing `total_steps` and a mutable `count`.
        true_scaling: Expected scaling, forwarded to `assert_scaled_lr`.
        test_case: Indexable per weight; `test_case[k][0]` is the initial
            value of weight k and `test_case[k][step + 1]` the expected
            value after `step`.

    Raises:
        OSError: if no IPU could be acquired.
    """
    num_weights = 3
    builder = popart.Builder()

    input0 = builder.addInputTensor(popart.TensorInfo("FLOAT", [1, 1, 1]),
                                    "input0")

    # Create each weight together with the host buffer it is read back into
    weight_ids = []
    readback = []
    for idx in range(num_weights):
        initial = np.array([test_case[idx][0]], dtype=np.float32)
        readback.append(np.empty([
            1,
        ], dtype=np.float32))
        weight_ids.append(
            builder.addInitializedInputTensor(initial, "weight_%d" % idx))

    # input0 + w0 + w1 + w2, added one weight at a time
    summed = input0
    for weight_id in weight_ids:
        summed = builder.aiOnnx.add([weight_id, summed])

    loss = builder.aiGraphcore.l1loss([summed], 1.0, debugPrefix="l1LossVal")

    builder.addOutputTensor(summed)

    proto = builder.getModelProto()
    data_flow = popart.DataFlow(1, {})
    session_opts = popart.SessionOptions()
    session_opts.reportOptions = {"showExecutionSteps": "true"}
    session_opts.enableGroupedMatmuls = False
    patterns = popart.Patterns(popart.PatternsLevel.Default)
    device = popart.DeviceManager().acquireAvailableDevice(1)
    if device is None:
        raise OSError("Failed to acquire IPU.")

    # The stage->tensor map would come from the Bert model in reality
    # (see model.tensors)
    mock_tensor_map = {
        stage: [weight_id]
        for stage, weight_id in enumerate(weight_ids)
    }

    factory = ScheduledOptimizerFactory(config,
                                        iteration,
                                        tensors=mock_tensor_map)
    assert_scaled_lr(factory, true_scaling)

    initial_optimizer = factory.create()

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=data_flow,
                                     userOptions=session_opts,
                                     loss=loss,
                                     optimizer=initial_optimizer,
                                     patterns=patterns,
                                     deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()

    stepio = popart.PyStepIO(
        {input0: np.array([3.1415], dtype=np.float32)}, anchors)

    for step in range(iteration.total_steps):
        session.run(stepio)
        session.weightsToHost()
        weights_io = popart.PyWeightsIO(dict(zip(weight_ids, readback)))
        session.readWeights(weights_io)

        for idx, observed in enumerate(readback):
            assert (np.isclose(test_case[idx][step + 1], observed))

        iteration.count += 1

        if not factory.should_update(iteration):
            continue

        # The schedule changed: build the new optimizer and push it to device
        updated_optimizer = factory.update_and_create(iteration)
        assert_scaled_lr(factory, true_scaling)
        session.updateOptimizerFromHost(updated_optimizer)
def main(args):
    """Build the model, run one training step on synthetic data, and optionally report.

    Args:
        args: Parsed options. This function reads `recomputing` ('ON'
            forces recomputation in the model, 'AUTO' enables standard
            auto-recomputation), `export` (path to write the model proto
            to, if set), `report` and `test`.

    Returns:
        The training session when `args.test` is truthy, otherwise None.

    Raises:
        OSError: if no IPU could be acquired.
    """
    # Model parameters
    np.random.seed(1971)
    input_rows = 28
    input_columns = 28
    num_classes = 10
    batch_size = 2048
    input_shape = [batch_size, input_rows * input_columns]
    labels_shape = [batch_size]

    # Create model
    x0, labels, model_proto, anchor_map, loss = create_model(
        num_features=input_columns * input_rows,
        num_classes=num_classes,
        batch_size=batch_size,
        force_recompute=args.recomputing == 'ON')

    # Save model (optional)
    if args.export:
        with open(args.export, 'wb') as model_path:
            model_path.write(model_proto)

    # Session options
    num_ipus = 1
    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.engineOptions = {"debug.instrument": "true"}

    if args.recomputing == 'AUTO':
        opts.autoRecomputation = popart.RecomputationType.Standard

    # Fail with a clear message when no device is available, instead of
    # handing None to the session constructor.
    device = popart.DeviceManager().acquireAvailableDevice(num_ipus)
    if device is None:
        raise OSError("Failed to acquire IPU.")

    # Create session
    session = popart.TrainingSession(fnModel=model_proto,
                                     dataFeed=popart.DataFlow(1, anchor_map),
                                     losses=[loss],
                                     optimizer=popart.ConstSGD(0.01),
                                     userOptions=opts,
                                     deviceInfo=device)

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    # Synthetic data input
    data_in = np.random.uniform(low=0.0, high=1.0,
                                size=input_shape).astype(np.float32)

    labels_in = np.random.randint(low=0, high=num_classes,
                                  size=labels_shape).astype(np.int32)

    # Run session
    inputs = {x0: data_in, labels: labels_in}
    stepio = popart.PyStepIO(inputs, anchors)
    session.weightsFromHost()
    session.optimizerFromHost()
    session.run(stepio)

    # Save report and return session object (optional)
    if args.report:
        # Imported lazily so gcprofile is only required when reporting
        from gcprofile import save_popart_report
        save_popart_report(session)
    if args.test:
        return session
def get_ir(model_file_name='model.onnx',
           enable_executionphases=True,
           enable_matmul_serialization=False,
           enable_outlining=False,
           activation_tensor_location_settings=None,
           weight_tensor_location_settings=None,
           optimizer_state_tensor_location_settings=None,
           accumulator_tensor_location_settings=None,
           tensor_location_setting_override={},
           num_layers=3,
           dsize=48,
           batch_size=1,
           num_iterations=1,
           num_replicas=1,
           accumulation_factor=2,
           optimizer=popart.SGD({"defaultLearningRate": (0.5, False)})):
    """Train a stack of matmul layers and return the session's IR as parsed JSON.

    Each layer is a `dsize` x `dsize` matmul with random weights, placed in
    its own execution phase. The tensor-location arguments are copied onto
    the session options only when they are not None; the override mapping
    is always applied. `model_file_name` is not used by this function.

    Returns:
        The object produced by `json.loads` on the serialized IR.
    """
    np.random.seed(10911)
    serialization_mode = 'output_channels'
    serialization_factor = 2

    builder = popart.Builder()
    ip = builder.addInputTensor(
        popart.TensorInfo("FLOAT", [batch_size, dsize, dsize]))

    def matmul_layer(layer_idx, input_id):
        # Random square weight named after the layer it belongs to
        weight = builder.addInitializedInputTensor(
            np.random.rand(dsize, dsize).astype(np.float32), f"W{layer_idx}")
        result_id = builder.aiOnnx.matmul([input_id, weight])
        if enable_matmul_serialization:
            builder.setSerializeMatMul({result_id}, serialization_mode,
                                       serialization_factor)
        return result_id

    out = ip
    for layer in range(num_layers):
        with builder.executionPhase(layer):
            out = matmul_layer(layer, out)

    l1 = builder.aiGraphcore.l1loss([out], 0.1)

    # No tensors are anchored, so the anchor map below ends up empty
    anchor_ids = []

    builder.addOutputTensor(out)

    ipus_per_replica = 2 if enable_executionphases else 1
    device = tu.create_test_device(num_replicas * ipus_per_replica,
                                   pattern=popart.SyncPattern.Full)

    df_anchors = {
        anchor_id: popart.AnchorReturnType("All")
        for anchor_id in anchor_ids
    }

    opts = popart.SessionOptions()
    opts.enableOutlining = enable_outlining
    opts.enableReplicatedGraphs = num_replicas > 1
    opts.replicatedGraphCount = num_replicas

    if accumulation_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accumulation_factor

    # Only override the location settings the caller actually supplied
    if activation_tensor_location_settings is not None:
        opts.activationTensorLocationSettings = \
            activation_tensor_location_settings
    if weight_tensor_location_settings is not None:
        opts.weightTensorLocationSettings = weight_tensor_location_settings
    if optimizer_state_tensor_location_settings is not None:
        opts.optimizerStateTensorLocationSettings = \
            optimizer_state_tensor_location_settings
    if accumulator_tensor_location_settings is not None:
        opts.accumulatorTensorLocationSettings = \
            accumulator_tensor_location_settings

    opts.tensorLocationSettingsOverride = tensor_location_setting_override

    if enable_executionphases:
        opts.executionPhaseSettings.phases = num_layers
        opts.autoRecomputation = popart.RecomputationType.NoRecompute
        opts.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        opts.explicitRecomputation = False

    session = popart.TrainingSession(
        fnModel=builder.getModelProto(),
        dataFlow=popart.DataFlow(1, df_anchors),
        optimizer=optimizer,
        loss=l1,
        patterns=popart.Patterns(popart.PatternsLevel.All),
        userOptions=opts,
        deviceInfo=device)

    session.prepareDevice()
    session.weightsFromHost()
    anchors = session.initAnchorArrays()

    input_shape = (num_replicas, accumulation_factor, batch_size, dsize,
                   dsize)
    for _ in range(num_iterations):
        ip_data = np.random.rand(*input_shape).astype(np.float32)
        session.run(popart.PyStepIO({ip: ip_data}, anchors))

    serialized = session._serializeIr(popart.IrSerializationFormat.JSON)
    return json.loads(serialized)
def create_model_pipelined(bufferStreams: bool = False,
                           pipelining: bool = False) -> Dict:
    """Create a simple model with optional pipelining to test buffering streams.

    The graph is gemm -> relu -> softmax -> nllloss. When pipelining is
    enabled, gemm and relu go on pipeline stage / virtual graph 0, softmax
    and nllloss on stage / virtual graph 1, and two IPUs are requested
    instead of one.

    Args:
        bufferStreams (bool, optional): Value assigned to the session's
            `useHostCopyOps` option and forwarded to `check_ops`.
            Defaults to False.
        pipelining (bool, optional): Whether to pipeline the model in 2
            parts. Defaults to False.

    Returns:
        Dict: The prepared session, the stepio, the anchors and the output
        (softmax) tensor name under the keys "session", "stepio",
        "anchors" and "out".
    """
    builder = popart.Builder()

    ip = builder.addInputTensor(popart.TensorInfo("FLOAT16", [8, 2]),
                                "input_data")
    lb = builder.addInputTensor(popart.TensorInfo("INT32", [8]), "label")

    weights = builder.addInitializedInputTensor(w_init)
    bias = builder.addInitializedInputTensor(bias_init)
    gemm = builder.aiOnnx.gemm([ip, weights, bias], 1., 1., False, False)
    relu = builder.aiOnnx.relu([gemm])
    sm = builder.aiOnnx.softmax([relu])
    nll = builder.aiGraphcore.nllloss([sm, lb])

    builder.addOutputTensor(sm)

    return_all = popart.AnchorReturnType("All")
    data_flow = popart.DataFlow(BPS, {sm: return_all, nll: return_all})

    opts = popart.SessionOptions()
    opts.enableOutlining = True
    opts.useHostCopyOps = bufferStreams

    if pipelining:
        opts.enablePipelining = True
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        # Each op's pipeline stage and virtual graph share the same index
        for tensor, stage in ((gemm, 0), (relu, 0), (sm, 1), (nll, 1)):
            builder.pipelineStage(tensor, stage)
            builder.virtualGraph(tensor, stage)
        num_ipus = 2
    else:
        num_ipus = 1

    device = tu.create_test_device(num_ipus)

    session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                     dataFlow=data_flow,
                                     loss=nll,
                                     optimizer=popart.ConstSGD(0.1),
                                     userOptions=opts,
                                     deviceInfo=device)

    session.prepareDevice()

    # 2 host load ops for input
    check_ops(session, bufferStreams, 2)

    anchors = session.initAnchorArrays()

    feed = {ip: trainingData, lb: trainingDataLables}
    stepio = popart.PyStepIO(feed, anchors)

    result = {}
    result["session"] = session
    result["stepio"] = stepio
    result["anchors"] = anchors
    result["out"] = sm
    return result
def test_groupHostSync():
    """Check that `groupHostSync` moves the stream copies to host to the end.

    Runs a one-op inference graph (add followed by an l1 loss) with the
    option enabled, then scans the summary report and records the order in
    which the Add, Absolute, ReduceExpression and StreamCopy entries
    appear, starting from the Add.
    """
    ADD, ABSOLUTE, REDUCE, STREAM_COPY = 1, 2, 3, 4

    builder = popart.Builder()

    a = builder.addInputTensor(popart.TensorInfo("FLOAT16", [1]))
    w = builder.addInitializedInputTensor(np.ones([1], np.float16))
    o = builder.aiOnnx.add([w, a])
    l1 = builder.aiGraphcore.l1loss([o], 0.1)

    return_all = popart.AnchorReturnType("All")
    data_flow = popart.DataFlow(1, {
        o: return_all,
        l1: popart.AnchorReturnType("All")
    })

    options = popart.SessionOptions()
    options.engineOptions = {
        "debug.instrumentCompute": "true",
        "debug.instrumentExternalExchange": "true"
    }
    options.groupHostSync = True  # The option we are testing
    options.reportOptions = {
        "showVarStorage": "true",
        "showPerIpuMemoryUsage": "true",
        "showExecutionSteps": "true"
    }

    session = popart.InferenceSession(fnModel=builder.getModelProto(),
                                      dataFlow=data_flow,
                                      deviceInfo=tu.create_test_device(),
                                      userOptions=options)

    session.prepareDevice()
    session.weightsFromHost()

    anchors = session.initAnchorArrays()

    stepio = popart.PyStepIO({a: np.array([1.4], dtype=np.float16)}, anchors)
    session.run(stepio)

    report_lines = session.getSummaryReport().split('\n')

    # Analyse a sequence:
    # default order :
    #     StreamCopy (FromHost) x2
    #     Add
    #     StreamCopy(ToHost) x2
    #     Absolute
    #     Reduce
    #     StreamCopy(ToHost) x2
    # with the option:
    #     StreamCopy (FromHost) x2
    #     Add
    #     Absolute
    #     Reduce
    #     StreamCopy(ToHost) x2
    order = []
    add_seen = False
    stream_copies = 0
    sequences = 0
    for line in report_lines:
        if "Sequence" in line:
            sequences += 1
            # Only the first few sequences of the report are inspected
            if sequences >= 7:
                break
        if "OnTileExecute: 104/Op/Add" in line:
            order.append(ADD)
            add_seen = True
        if "OnTileExecute: 101/abs/Op/Absolute" in line:
            order.append(ABSOLUTE)
        if "101/add/ReduceExpression" in line:
            order.append(REDUCE)
        # Stream copies before the Add (the copies from host) are ignored
        if add_seen and "StreamCopy" in line:
            order.append(STREAM_COPY)
            stream_copies += 1

    # The streamcopy to host should only happen at the end (after ReduceExpression)
    # Expected list with the option enabled: [1,2,3,4,4]
    # Expected list without the option: [1,4,4,2,3,4,4]
    assert (order[1] == ABSOLUTE)
    assert (order[2] == REDUCE)
    assert (order[3] == STREAM_COPY)
    # The number of Streamcopies happening in total
    # (start counting from Add) should be 2.
    assert (stream_copies == 2)