Example #1
def bert_process_data(args, session, labels, data, anchors,
                      losses, predictions, iteration: Iteration,
                      optimizer_factory: ScheduledOptimizerFactory):
    labels_data = [data[label] for label in labels]
    if not np.any([np.any(label) for label in labels_data]):
        # Labels may be all padding if args.vocab_length is smaller than the vocabulary used when the data was generated
        return

    stepio = popart.PyStepIO(data, anchors)

    start = time.time()
    session.run(stepio)
    duration = time.time() - start

    if args.gc_profile:
        import gcprofile
        gcprofile.save_popart_report(session)
        sys.exit(0)

    iteration.add_stats(duration, labels_data, anchors, losses, predictions)

    if (iteration.count % iteration.steps_per_log) == 0:
        iteration.report_stats()

    # The following will only be true if either:
    #   - the learning rate mode is STEP and the current total step count is in the schedule, or
    #   - the learning rate mode is EPOCH and the epoch has just changed to one in the schedule
    if optimizer_factory.should_update(iteration):
        optimizer = optimizer_factory.update_and_create(iteration)
        session.updateOptimizer(optimizer)
        session.optimizerFromHost()

    iteration.count += 1
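A minimal driver loop for this function might look like the sketch below; the dataset and all argument objects are assumptions standing in for setup code that is not part of this example.

# Hypothetical usage -- every name here comes from the assumed setup, not this snippet.
for data in dataset:  # each item maps input tensor names to numpy arrays
    bert_process_data(args, session, labels, data, anchors,
                      losses, predictions, iteration, optimizer_factory)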
Example #2
def bert_process_infer_data(args, session, data, anchors,
                            logits, iteration: Iteration):
    start_times = defaultdict(list)
    end_times = defaultdict(list)

    if args.low_latency_inference and args.task == "SQUAD":
        stepio = create_callback_stepio(data, anchors, start_times, end_times)
    else:
        stepio = popart.PyStepIO(data, anchors)

    start = time.perf_counter()
    session.run(stepio)
    duration = time.perf_counter() - start

    if args.gc_profile:
        import gcprofile
        gcprofile.save_popart_report(session)
        sys.exit(0)

    iteration.durations.append(duration)

    mean_latency, min_latency, max_latency = compute_latency(args, start_times, end_times)

    if (iteration.count % iteration.steps_per_log) == 0:
        status_string = \
            f"Iteration: {iteration.count:6} " \
            f"Duration: {np.average(iteration.durations):6.4f} s " \
            f"Throughput: {np.average(iteration.throughput):6.1f} samples/s"
        if mean_latency is not None:
            status_string += f" Per-sample Latency: {mean_latency} {min_latency} {max_latency} seconds (mean min max)"
        logger.info(status_string)

    iteration.count += 1

    return [anchors[logit] for logit in logits]
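`compute_latency` is not shown in this snippet. A sketch consistent with how `start_times` and `end_times` are filled above (dicts of per-sample timestamp lists, populated by the callback stepio) might be:

def compute_latency(args, start_times, end_times):
    # Sketch only: per-sample latencies from the callback timestamps above.
    # Returns a None triple when latency was not measured (non-callback stepio).
    if not (start_times and end_times):
        return None, None, None
    starts = np.concatenate([np.asarray(v) for v in start_times.values()])
    ends = np.concatenate([np.asarray(v) for v in end_times.values()])
    latencies = ends - starts
    return np.mean(latencies), np.min(latencies), np.max(latencies)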
Example #3
def bert_process_infer_data(args, session, data, anchors, logits,
                            iteration: Iteration, start_times, end_times,
                            stepio):
    if stepio is None:
        stepio = popart.PyStepIO(data, anchors)

    start = time.perf_counter()
    session.run(stepio)
    duration = time.perf_counter() - start
    hw_cycles = session.getCycleCount() if args.report_hw_cycle_count else None

    if args.gc_profile:
        import gcprofile
        gcprofile.save_popart_report(session)
        sys.exit(0)

    iteration.durations.append(duration)

    mean_latency, min_latency, max_latency = compute_latency(
        args, start_times, end_times)

    if (iteration.count % iteration.steps_per_log) == 0:
        status_string = \
            f"Iteration: {iteration.count:6} " \
            f"Duration: {np.average(iteration.durations):6.4f} s " \
            f"Throughput: {np.average(iteration.throughput):6.1f} samples/s"
        if mean_latency is not None:
            status_string += f" Per-sample Latency: {mean_latency} {min_latency} {max_latency} seconds (mean min max)"
        if hw_cycles is not None:
            status_string += f" Cycles: {hw_cycles}"
        logger.info(status_string)

    iteration.count += 1

    return [anchors[logit] for logit in logits]
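Note that `session.getCycleCount()` only returns a meaningful value when cycle instrumentation was enabled in the session options before compilation, as Example #16 below does:

options = popart.SessionOptions()
options.instrumentWithHardwareCycleCounter = True  # must be set before prepareDevice()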
Example #4
def fetch_reports(args, session=None, exception=None, execution=False):
    if session is None and exception is None:
        raise Exception("Must provide session or exception to 'fetch_reports'")

    should_exit = False

    if args.gc_profile:
        import gcprofile
        gcprofile.save_popart_report(session, exception=exception)
        should_exit = execution

    if args.graph_report:
        with open(args.graph_report, "wb") as f:
            if exception is not None:
                graph_report = exception.getGraphReport()
            else:
                graph_report = session.getGraphReport()
            f.write(graph_report)

    if args.execution_report and execution and session is not None:
        with open(args.execution_report, "wb") as f:
            exec_report = session.getExecutionReport()
            f.write(exec_report)
        should_exit = True

    if should_exit:
        sys.exit(0)
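A typical call site wraps compilation and the first run along these lines; this is a sketch, with `session` and `stepio` assumed to be prepared elsewhere:

# Save compile-time reports on an out-of-memory failure, execution reports after a run.
try:
    session.prepareDevice()
except popart.OutOfMemoryException as e:
    fetch_reports(args, exception=e)
    raise
session.run(stepio)
fetch_reports(args, session=session, execution=True)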
Example #5
def compile_graph_checked(args, session):
    try:
        start_time = time.time()
        session.prepareDevice()
        end_time = time.time()
        logger.info(f"Compiled. Duration {end_time - start_time} seconds")
    except popart.PrepareDeviceException as e:
        if args.gc_profile:
            import gcprofile
            gcprofile.save_popart_report(session, exception=e)
        raise e
Example #6
def init_session(proto,
                 losses,
                 device,
                 dataFlow,
                 options,
                 training,
                 optimizer=None,
                 gcpLogDir=None):

    # Create a session to compile and execute the graph
    if training:
        session_type = "training"
        session = popart.TrainingSession(fnModel=proto,
                                         losses=losses,
                                         deviceInfo=device,
                                         optimizer=optimizer,
                                         dataFeed=dataFlow,
                                         userOptions=options)
    else:
        session_type = "validation"
        session = popart.InferenceSession(fnModel=proto,
                                          losses=losses,
                                          deviceInfo=device,
                                          dataFeed=dataFlow,
                                          userOptions=options)

    try:
        print("Preparing the {} graph".format(session_type))
        with Timer() as prepareTimer:
            session.prepareDevice()
    except popart.PrepareDeviceException as e:
        print("Caught PrepareDeviceException")
        if (gcpLogDir is not None):
            from gcprofile import save_popart_report
            save_popart_report(session, log_dir=gcpLogDir, exception=e)
        raise

    print("{0} graph preparation complete. Duration: {1:.3f} seconds".format(
        session_type.capitalize(), prepareTimer.interval()))

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    return session, anchors
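`Timer` is not defined in this snippet; a minimal context manager matching its use here (and in Example #9), with `interval()` returning elapsed seconds, could be:

class Timer:
    # Sketch of the timing helper assumed by init_session.
    def __enter__(self):
        self._start = time.perf_counter()
        return self

    def __exit__(self, *exc):
        self._end = time.perf_counter()

    def interval(self):
        return self._end - self._start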
Example #7
def create_session_anchors(proto,
                           loss,
                           device,
                           dataFlow,
                           options,
                           training,
                           optimizer=None,
                           profile=False):
    """ Create the desired session and compile the graph """

    if training:
        session_type = "training"
        session = popart.TrainingSession(fnModel=proto,
                                         loss=loss,
                                         deviceInfo=device,
                                         optimizer=optimizer,
                                         dataFlow=dataFlow,
                                         userOptions=options)
    else:
        session_type = "validation"
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFlow=dataFlow,
                                          userOptions=options)

    try:
        logger.info("Preparing the {} graph".format(session_type))
        session.prepareDevice()
        logger.info("{0} graph preparation complete.".format(
            session_type.capitalize(), ))

    except popart.OutOfMemoryException as e:
        logger.warn("Caught Exception while Preparing Device")
        # Dump the profiled result before raising exception and exit
        if profile:
            from gcprofile import save_popart_report
            save_popart_report(session, exception=e)
        raise

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()

    return session, anchors
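A hypothetical call mirroring the training path; `proto`, `loss`, `device`, `dataFlow` and `options` are assumed to be built by the surrounding code:

session, anchors = create_session_anchors(proto, loss, device, dataFlow,
                                          options, training=True,
                                          optimizer=popart.ConstSGD(0.01))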
Example #8
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[str] = None,
           optimizer: Optional[popart.Optimizer] = None,
           patterns: Optional[popart.Patterns] = None,
           return_stats: bool = False,
           log_dir: Optional[str] = None,
           ipus: Optional[int] = None,
           batches_per_step: int = 1,
           user_options: Optional[Mapping[str, Any]] = None,
           skip_execution: bool = False,
           execution_mode: str = 'DEFAULT',
           replication_factor: int = 1,
           replicated_weight_sharding: bool = False,
           num_reps: int = 1):
    outputs = make_tuple(outputs)

    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step, {output: popart.AnchorReturnType("ALL")
                           for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.reportOptions = {"showVarStorage": "true"}
    if replicated_weight_sharding:
        # Shard both weight and optimizer-state tensors across replicas.
        options.weightTensorLocationSettings.location.replicatedTensorSharding = \
            popart.ReplicatedTensorSharding.On
        options.optimizerStateTensorLocationSettings.location.replicatedTensorSharding = \
            popart.ReplicatedTensorSharding.On
    if replication_factor > 1:
        options.enableReplicatedGraphs = True
        options.replicatedGraphCount = replication_factor
    if execution_mode == 'PHASED':
        options.enableOutlining = True
        options.outlineThreshold = -np.inf
        options.enableOutliningCopyCostPruning = False
        options.autoRecomputation = popart.RecomputationType.Standard
        options.virtualGraphMode = popart.VirtualGraphMode.ExecutionPhases
        options.explicitRecomputation = True
        options.aliasZeroCopy = True
        options.batchSerializationSettings.factor = user_options[
            "batchSerializationFactor"]
        options.executionPhaseSettings.phases = user_options["executionPhases"]
        ipus = 2
    else:
        options.enableGroupedMatmuls = False
        options.enableStochasticRounding = False
        options.constantWeights = True
        options.outlineThreshold = 10.0
        if ipus is not None and ipus > 1:
            options.virtualGraphMode = popart.VirtualGraphMode.Manual
        else:
            ipus = 1

    for key, value in user_options.items():
        if key not in ["batchSerializationFactor", "executionPhases"]:
            setattr(options, key, value)

    if return_stats:
        options.engineOptions = {
            "debug.allowOutOfMemory": "true",
            "debug.instrument": "true",
            "opt.internalExchangeOptimisationTarget": "balanced",
        }

    request_ipus = pow(2, math.ceil(math.log2(ipus)))
    request_ipus *= replication_factor
    dm = popart.DeviceManager()
    dm.setOnDemandAttachTimeout(int(1e4))
    device = dm.acquireAvailableDevice(
        request_ipus,
        connectionType=popart.DeviceConnectionType.OnDemand,
        selectionCriterion=popart.DeviceSelectionCriterion.Random)
    if device is None:
        raise Exception("Failed to acquire IPU.")

    print("Compiling graph")
    if optimizer is not None:
        session = popart.TrainingSession(fnModel=proto,
                                         deviceInfo=device,
                                         dataFlow=data_flow,
                                         userOptions=options,
                                         loss=loss,
                                         optimizer=optimizer,
                                         patterns=patterns)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFlow=data_flow,
                                          userOptions=options,
                                          patterns=patterns)

    if skip_execution:
        device.detach()
        return session

    # Compile the Poplar Graph. If it fails, return the memory stats
    try:
        session.prepareDevice()
    except popart.session.OutOfMemoryException as e:
        if return_stats and log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            gcprofile.save_popart_report(session,
                                         log_dir=log_dir,
                                         exception=e)
        device.detach()
        raise e
    print("Compilation complete")

    session.weightsFromHost()
    session.setRandomSeed(1984)

    anchors = session.initAnchorArrays()

    # Add a gradient accumulation factor dimension if needed
    af = user_options.get("accumulationFactor")
    if af is not None and af > 1:
        data = {k: np.repeat(v[np.newaxis], af, 0)
                for k, v in data.items()}

    # Add a batches_per_step dimension if needed
    if batches_per_step > 1:
        data = {k: np.repeat(v[np.newaxis], batches_per_step, 0)
                for k, v in data.items()}

    for _ in range(num_reps):
        stepio = popart.PyStepIO(data, anchors)
        session.run(stepio)

    with tempfile.TemporaryDirectory() as tmp:
        file_path = os.path.join(tmp, "model.onnx")
        session.modelToHost(file_path)
        post_proto = onnx.load(file_path)

    # Release device
    device.detach()

    if return_stats:
        if log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            reports = gcprofile.save_popart_report(session, log_dir=log_dir)
            graph_report = json.loads(reports["graph"])
            exec_report = json.loads(reports["execution"])
        else:
            graph_report = json.loads(session.getGraphReport())
            exec_report = json.loads(session.getExecutionReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        cycles = exec_report["simulation"]["cycles"]
        return ((anchors[output] for output in outputs), post_proto,
                total_memory, max_tile_memory, cycles)
    return (anchors[output] for output in outputs), post_proto
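`make_tuple` is an undefined helper in this snippet; a sketch consistent with its use on a single name, an iterable of names, or None:

def make_tuple(value):
    # Sketch: normalise the outputs argument to a tuple of tensor names.
    if value is None:
        return ()
    if isinstance(value, str):
        return (value,)
    return tuple(value)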
Example #9
def train_process(opts):

    builder = popart.Builder()

    # Create the data set
    training_dataset = load_dataset(opts, training=True)
    validation_dataset = load_dataset(opts, training=False)

    # Calculate the learning rate schedule for training
    steps_per_epoch = len(training_dataset)
    lrs, lr_drops = calulate_learning_rate(opts, steps_per_epoch)
    current_lr = lrs.pop(0)
    next_drop = lr_drops.pop(0)

    # Create the resnet model
    image, label = create_inputs(builder, opts)

    # Get the popart session options
    options = get_options(opts)

    # Get the device to run on
    device = get_device(opts.num_ipus, opts.simulation)

    # Create the training session
    proto, loss, argmax, outputs = create_model(builder, opts, image, label)

    (training_session, training_anchors) = init_session(
        proto, [loss],
        device,
        dataFlow=popart.DataFlow(opts.batches_per_step, outputs),
        options=options,
        training=True,
        optimizer=popart.SGD({
            "defaultLearningRate": (current_lr, False),
            "defaultWeightDecay": (opts.weight_decay, True)
        }),
        gcpLogDir=opts.gc_profile_log_dir)

    if not opts.no_validation:
        # Create the validation session

        (validation_session, validation_anchors) = init_session(
            proto, [loss],
            device,
            dataFlow=popart.DataFlow(opts.batches_per_step, outputs),
            options=options,
            training=False,
            gcpLogDir=opts.gc_profile_log_dir)

    # Copy weights and optimization parameters onto the device
    training_session.weightsFromHost()
    training_session.optimizerFromHost()

    batch_losses = deque(maxlen=opts.steps_per_log)
    batch_accs = deque(maxlen=opts.steps_per_log)
    batch_run_duration = deque(maxlen=opts.steps_per_log)
    total_samples = 0

    validation_losses = deque(maxlen=opts.steps_per_log)
    validation_accs = deque(maxlen=opts.steps_per_log)

    # Iterations
    for e in range(opts.epochs):

        # Set the timing start point for training
        training_start_point = time.time()

        print("Executing epoch ", e)
        for step, data in enumerate(training_dataset):

            total_steps = (e * steps_per_epoch) + step
            epoch = e + (step / steps_per_epoch)

            # Follow Learning Rate Schedule
            if total_steps > next_drop:
                current_lr = lrs.pop(0)
                if len(lr_drops) > 0:
                    next_drop = lr_drops.pop(0)
                else:
                    next_drop = np.inf
                training_session.updateOptimizer(
                    popart.SGD({"defaultLearningRate": (current_lr, False)}))
                training_session.optimizerFromHost()
                print("Learning_rate change to {}".format(current_lr))

            images = data[0]
            labels = data[1]

            stepio = popart.PyStepIO({
                image: images,
                label: labels
            }, training_anchors)

            # Train
            with Timer() as t1:
                training_session.run(stepio)

            batch_run_duration.append(t1.interval())

            # Get the loss and 'learnt' labels
            # - Sum the losses across replication & batch size
            nll_loss_anch = training_anchors["loss"]
            arg_max_anch = training_anchors[argmax]

            batch_losses.append(nll_loss_anch)
            batch_accs.append(100 * np.mean(arg_max_anch == labels))

            total_samples += (opts.batches_per_step * opts.batch_size)

            if not total_steps % opts.steps_per_log or total_steps == 0:

                training_duration = time.time() - training_start_point

                print_format = ("step: {step:6d}, epoch: {epoch:6.2f}, "
                                "lr: {lr:6.2g}, loss: {loss:6.3f}, "
                                "accuracy: {train_acc:6.3f}%, "
                                "img/sec: {img_per_sec:6.2f} "
                                "step_time: {duration:6.2f} sec "
                                "ipu_execution_time: {run_duration:6.2f}")

                stats = {
                    'step': total_steps,
                    'epoch': epoch,
                    'lr': current_lr,
                    'loss': np.mean(batch_losses),
                    'train_acc': np.mean(batch_accs),
                    'img_per_sec': total_samples / training_duration,
                    'duration': training_duration,
                    'run_duration': np.mean(batch_run_duration),
                }

                print(print_format.format(**stats))

                # Reset the metrics
                batch_accs.clear()
                batch_losses.clear()
                batch_run_duration.clear()
                total_samples = 0

                # Reset the training start point
                training_start_point = time.time()

        # Evaluation
        if not opts.no_validation:

            # The name of the ONNX file created with the current training
            # state and used by the validation session.
            onnx_file_name = "ckpt.onnx"

            training_session.modelToHost(onnx_file_name)

            # Copy weights and optimization parameters onto the device
            validation_session.resetHostWeights(onnx_file_name)
            validation_session.weightsFromHost()

            validation_start_point = time.time()
            for validation_data in validation_dataset:

                validation_images = validation_data[0]
                validation_labels = validation_data[1]

                validation_stepio = popart.PyStepIO(
                    {
                        image: validation_images,
                        label: validation_labels
                    }, validation_anchors)

                validation_session.run(validation_stepio)

                # Get the loss and 'predicted' labels

                validation_nll_loss_anch = validation_anchors["loss"]
                validation_arg_max_anch = validation_anchors[argmax]

                validation_losses.append(validation_nll_loss_anch)
                validation_accs.append(
                    100 *
                    np.mean(validation_arg_max_anch == validation_labels))

            print("Validation accuracy epoch {:6.2f}, img/sec:{:6.2f} "
                  "accuracy: {:6.3f}% loss: {:6.3f}".format(
                      epoch, (len(validation_dataset) * opts.batch_size *
                              opts.batches_per_step /
                              (time.time() - validation_start_point)),
                      np.mean(validation_accs), np.mean(validation_losses)))

            training_session.resetHostWeights(onnx_file_name)

            # Write the training weights to the device
            training_session.weightsFromHost()
            training_session.optimizerFromHost()

    # Save the popart training report
    if opts.gc_profile_log_dir is not None:
        from gcprofile import save_popart_report
        save_popart_report(training_session)
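`calulate_learning_rate` (sic) is defined elsewhere; its contract here is a list of learning rates plus the step counts at which to drop to the next one. A sketch under the assumption of hypothetical `opts.learning_rate`, `opts.lr_schedule` (epoch boundaries) and `opts.lr_decay` fields:

def calulate_learning_rate(opts, steps_per_epoch):
    # Sketch only: one extra rate so lrs.pop(0) never empties before the last drop.
    lr_drops = [int(epoch * steps_per_epoch) for epoch in opts.lr_schedule]
    lrs = [opts.learning_rate * (opts.lr_decay ** i)
           for i in range(len(lr_drops) + 1)]
    return lrs, lr_drops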
Example #10
def get_model_anchors(doSharding,
                      doPipelining,
                      batchesPerStep,
                      doTraining,
                      doProfiling=False,
                      doDevicex=True,
                      anchorRestoredTensors=False,
                      returnRawInput=False):
    np.random.seed(seed=1)

    builder = popart.Builder()
    batchSize = 2
    shape_d0 = [batchSize, 2, 4, 4]
    shape_l0 = [batchSize]
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0))
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0)
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", shape_l0))

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [batchSize, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugPrefix="sfm")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")

    anchor_map = {nll: art, w0: art, e0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining

    if doSharding is False:
        numIPUs = 1
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 1)
        builder.virtualGraph(c0, 1)
        builder.virtualGraph(r0, 2)
        builder.virtualGraph(out, 2)
        builder.virtualGraph(nll, 2)

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=nll,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs, tilesPerIpu=20))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs, tilesPerIpu=20))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    if batchesPerStep > 1:
        shape_d0.insert(0, batchesPerStep)
        shape_l0.insert(0, batchesPerStep)
    data = np.random.uniform(low=-10.0, high=10.0,
                             size=shape_d0).astype(np.float32)
    classes = np.prod(shape_d0) / (batchSize * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
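With `AnchorReturnType("All")` and `batchesPerStep > 1`, each returned anchor gains a leading batches-per-step dimension, so in the test environment these snippets come from a caller could sanity-check shapes along these lines (a sketch, not verified output):

anchors = get_model_anchors(doSharding=False, doPipelining=False,
                            batchesPerStep=3, doTraining=False)
for name, value in anchors.items():
    assert value.shape[0] == 3  # leading batches-per-step dimension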
Example #11
def run_py(proto: onnx.ModelProto,
           data: Mapping[str, np.ndarray],
           outputs: Optional[Union[str, Iterable[str]]],
           loss: Optional[Union[popart.Loss, Iterable[popart.Loss]]] = None,
           optimizer: Optional[popart.Optimizer] = None,
           return_stats: bool = False,
           log_dir: Optional[str] = None,
           ipus: Optional[int] = None,
           batches_per_step: int = 1,
           user_options: Optional[Mapping[str, Any]] = None):
    outputs = make_tuple(outputs)
    if loss is not None:
        loss = make_tuple(loss)
    # Setting up the Session
    data_flow = popart.DataFlow(
        batches_per_step,
        {output: popart.AnchorReturnType("ALL")
         for output in outputs})

    if user_options is None:
        user_options = {}
    options = popart.SessionOptions()
    options.enableGroupedMatmuls = False
    options.enableStochasticRounding = False
    options.constantWeights = True
    options.outlineThreshold = 10.0
    options.reportOptions = {"showVarStorage": "true"}
    if ipus is not None and ipus > 1:
        options.virtualGraphMode = popart.VirtualGraphMode.Manual
    else:
        ipus = 1
    if return_stats:
        options.engineOptions = {
            "debug.allowOutOfMemory": "true",
            "debug.instrument": "true"
        }
    for key, value in user_options.items():
        setattr(options, key, value)

    request_ipus = pow(2, math.ceil(math.log2(ipus)))
    device = popart.DeviceManager().acquireAvailableDevice(request_ipus)
    if device is None:
        raise Exception("Failed to acquire IPU.")

    print("Compiling graph")
    if optimizer is not None:
        session = popart.TrainingSession(fnModel=proto,
                                         deviceInfo=device,
                                         dataFeed=data_flow,
                                         userOptions=options,
                                         losses=loss,
                                         optimizer=optimizer)
    else:
        session = popart.InferenceSession(fnModel=proto,
                                          deviceInfo=device,
                                          dataFeed=data_flow,
                                          userOptions=options)

    # Compile the Poplar Graph. If it fails, return the memory stats
    try:
        session.prepareDevice()
    except popart.session.PrepareDeviceException as e:
        if return_stats:
            if log_dir:
                import gcprofile
                os.makedirs(log_dir, exist_ok=True)
                reports = gcprofile.save_popart_report(session,
                                                       log_dir=log_dir,
                                                       exception=e)
                graph_report = json.loads(reports["graph"])
            else:
                graph_report = json.loads(e.getGraphReport())
            max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
            total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
            raise e
        else:
            raise e
    print("Compilation complete")

    session.weightsFromHost()
    if optimizer is not None:
        session.optimizerFromHost()
    session.setRandomSeed(1984)

    anchors = session.initAnchorArrays()

    # Add a batches_per_step dimension if needed
    if batches_per_step > 1:
        data = {
            k: np.repeat(v[np.newaxis], batches_per_step, 0)
            for k, v in data.items()
        }

    stepio = popart.PyStepIO(data, anchors)

    session.run(stepio)

    with tempfile.TemporaryDirectory() as tmp:
        file_path = os.path.join(tmp, "model.onnx")
        session.modelToHost(file_path)
        post_proto = onnx.load(file_path)

    # Release device
    device.detach()

    if return_stats:
        if log_dir:
            import gcprofile
            os.makedirs(log_dir, exist_ok=True)
            reports = gcprofile.save_popart_report(session, log_dir=log_dir)
            graph_report = json.loads(reports["graph"])
            exec_report = json.loads(reports["execution"])
        else:
            graph_report = json.loads(session.getGraphReport())
            exec_report = json.loads(session.getExecutionReport())
        max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
        total_memory = np.sum(graph_report["memory"]["byTile"]["total"])
        cycles = exec_report["simulation"]["cycles"]
        return ((anchors[output] for output in outputs), post_proto,
                total_memory, max_tile_memory, cycles)
    return (anchors[output] for output in outputs), post_proto
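A hypothetical invocation of this older `run_py` variant; "input" and "logits" are placeholder tensor names, `x` is an assumed numpy array, and the first return value is a generator over the requested anchors:

outs, post_proto = run_py(proto, data={"input": x}, outputs="logits")
(logits_vals,) = outs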
Example #12
def get_model_anchors_model2(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             returnRawInput=False,
                             labelArray=None):

    np.random.seed(1234)
    builder = popart.Builder()
    micro_batch_size = batch_size // gradAcclFactor

    shape_d0 = [micro_batch_size, 2, 4, 4]
    shape_l0 = [batch_size]
    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0), "inp")
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0, "weights")

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugPrefix="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [micro_batch_size, 32])
    out = builder.aiOnnx.softmax([r0], axis=1, debugPrefix="sfm")

    label_shape = [micro_batch_size]
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", label_shape),
                                "label")
    nll = builder.aiGraphcore.nllloss([out, l0])

    art = popart.AnchorReturnType("All")

    anchor_map = {nll: art, w0: art, e0: art, s0: art, c0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art
        if doGradAccl is True:
            anchor_map[popart.reservedAcclToUpdatePrefix() +
                       popart.reservedGradientPrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor

    if doSharding is False:
        numIPUs = 1
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIPUs = 3
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 1)
        builder.virtualGraph(c0, 1)
        builder.virtualGraph(r0, 2)
        builder.virtualGraph(out, 2)
        builder.virtualGraph(nll, 2)

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=nll,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    classes = np.prod(shape_d0) / (micro_batch_size * batchesPerStep)
    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of batchesPerStep. We repeat the labels
        # as we want consistency if we have different shape inputs between examples.
        outer_dim *= batchesPerStep
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Divide up the batches per step batches into gradAcclFactor * batchesPerStep
        # samples.
        outer_dim *= gradAcclFactor
        label = label.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Add the gradAcclFactor * batchesPerStep dimension into the input.
        shape_d0.insert(0, outer_dim)
    data = np.ones(shape=shape_d0).astype(np.float32)

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)

    session.weightsFromHost()

    for i in range(6):
        session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
Example #13
def get_model_anchors_model1(doSharding,
                             doPipelining,
                             batchesPerStep,
                             doTraining,
                             doGradAccl=False,
                             gradAcclFactor=1,
                             doProfiling=False,
                             doDevicex=True,
                             anchorRestoredTensors=False,
                             labelArray=None):
    micro_batch_size = batch_size // gradAcclFactor
    builder = popart.Builder()

    input_shape = [micro_batch_size, hidden_size]
    input_ = builder.addInputTensor(popart.TensorInfo("FLOAT", input_shape))

    x = input_
    with builder.virtualGraph(0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_0_{i}")
            x = builder.aiOnnx.matmul([x, w])
    with builder.virtualGraph(1 if doSharding else 0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_1_{i}")
            x = builder.aiOnnx.matmul([x, w])
    with builder.virtualGraph(2 if doSharding else 0):
        for i in range(2):
            w = builder.addInitializedInputTensor(
                np.ones([hidden_size, hidden_size]).astype(np.float32),
                f"weight_2_{i}")
            if i == 1:
                w0 = w
            x = builder.aiOnnx.matmul([x, w])
        label = builder.addInputTensor("INT32", [micro_batch_size])
        x = builder.aiGraphcore.nllloss([x, label])

    output = x

    builder.addOutputTensor(output)

    art = popart.AnchorReturnType("All")
    anchor_map = {x: art, w0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + x] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + x] = art
            anchor_map[popart.reservedRestoredPrefix() + w0] = art
        if doGradAccl is True:
            anchor_map[popart.reservedAcclToUpdatePrefix() +
                       popart.reservedGradientPrefix() + w0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradAccl
    opts.accumulationFactor = gradAcclFactor
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual

    if doSharding is False:
        numIPUs = 1
    else:
        numIPUs = 3

    if doTraining is True:
        session = popart.TrainingSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            loss=output,
            optimizer=popart.ConstSGD(0.01),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))
    else:
        session = popart.InferenceSession(
            fnModel=builder.getModelProto(),
            dataFlow=popart.DataFlow(batchesPerStep, anchor_map),
            userOptions=opts,
            deviceInfo=tu.create_test_device(numIpus=numIPUs))

    if doDevicex is False:
        return None

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    outer_dim = 1
    if batchesPerStep > 1:
        # Add an outer dimension of batchesPerStep. We repeat the labels
        # as we want consistency if we have different shape inputs between examples.
        outer_dim *= batchesPerStep
        labelArray = np.repeat(labelArray[np.newaxis], batchesPerStep, 0)
    if gradAcclFactor > 1:
        # Divide up the batches per step batches into gradAcclFactor * batchesPerStep
        # samples.
        outer_dim *= gradAcclFactor
        labelArray = labelArray.reshape([gradAcclFactor * batchesPerStep, -1])
    if outer_dim > 1:
        # Add the gradAcclFactor * batchesPerStep dimension into the input.
        input_shape = [outer_dim] + input_shape

    stepio = popart.PyStepIO(
        {
            input_: np.ones(input_shape, np.float32),
            label: labelArray.astype(np.int32)
        }, anchors)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    return anchors
Example #14
def main(argv):
    FLAGS = flags.FLAGS
    FLAGS.samples_per_device = int(FLAGS.batch_size / FLAGS.replication_factor)

    proto, data, outputs, output_id = graph_builder()
    print(f"Model: {FLAGS.model_name}")
    if not FLAGS.synthetic:
        print(f"Data_dir: {FLAGS.data_dir}")
    else:
        print(f"Using synthetic data")
    print(f"Data_sub_dir for this process: {FLAGS.data_sub_dir}")
    print(f"num_workers: {FLAGS.num_workers}")
    print(f"batches per step: {FLAGS.batches_per_step}")
    dataFlow = popart.DataFlow(FLAGS.batches_per_step, outputs)
    # Create a session to compile and execute the graph
    options = popart.SessionOptions()
    if FLAGS.synthetic:
        options.ignoreData = True
    options.engineOptions = {
        "debug.instrument": "true" if FLAGS.profile else "false",
        "target.syncMethod": "polling"
    }
    # Select a device
    deviceManager = popart.DeviceManager()
    device = deviceManager.acquireAvailableDevice(1)
    print(f"{device}\n")
    if device is None:
        raise Exception("Not enough IPUs available.")

    session = popart.InferenceSession(fnModel=proto,
                                      deviceInfo=device,
                                      dataFeed=dataFlow,
                                      userOptions=options)

    print("Compiling...")
    start = time.time()
    try:
        session.prepareDevice()
    except popart.PrepareDeviceException as e:
        import gcprofile
        gcprofile.save_popart_report(session, exception=e)
        sys.exit(1)
    compilation_duration = time.time() - start
    print("Time to compile: {:.3f} seconds\n".format(compilation_duration))

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()
    # Copy weights and optimisation parameters onto the device
    session.weightsFromHost()

    def report_time(duration, data_duration=None, compute_duration=None):
        report_string = "Total {:<8.3} sec.".format(duration)
        if data_duration:
            report_string += "   Preprocessing {:<8.3} sec ({:4.3}%).".format(
                data_duration, 100 * (data_duration / duration))
        if compute_duration:
            report_string += "   Compute {:<8.3} sec ({:4.3}%).".format(
                compute_duration, 100 * (compute_duration / duration))
        report_string += "   {:5f} images/sec.".format(
            int(FLAGS.batch_size * FLAGS.batches_per_step / duration))
        print(report_string)

    print("Executing...")
    average_batches_per_sec = 0

    # Run
    start = time.time()
    durations = []
    if FLAGS.synthetic:
        for i in range(FLAGS.iterations):
            stepio = popart.PyStepIO(data, anchors)
            data_time = time.time()
            data_d = data_time - start
            # Run compute
            session.run(stepio)
            # Calc compute duration
            results = anchors[output_id]
            comp_d = time.time() - data_time
            # Calc total duration
            t = time.time() - start
            report_time(t, data_d, comp_d)
            durations.append(t)
            start = time.time()
        duration = np.mean(durations)
    else:
        for d in data:
            stepio = popart.PyStepIO(d, anchors)
            # Calc data duration
            data_time = time.time()
            data_d = data_time - start
            # Run compute
            session.run(stepio)
            # Calc compute duration
            results = anchors[output_id]
            comp_d = time.time() - data_time
            # Calc total duration
            t = time.time() - start
            report_time(t, data_d, comp_d)
            durations.append(t)
            start = time.time()
        duration = np.mean(durations)

    if FLAGS.profile:
        import gcprofile
        return gcprofile.save_popart_report(session)
Example #15
def get_model_anchors(doSharding,
                      doPipelining,
                      batchesPerStep,
                      doTraining,
                      replicated_graph_count=1,
                      doProfiling=False,
                      doDropout=False,
                      doGradientAccl=False,
                      acclSteps=1,
                      doDevicex=True,
                      anchorRestoredTensors=False,
                      returnRawInput=False):
    np.random.seed(seed=1)

    builder = popart.Builder()
    batchSize = 16
    microBatchSize = batchSize // acclSteps

    shape_d0 = [microBatchSize, 2, 4, 4]
    shape_l0 = [microBatchSize]

    d0 = builder.addInputTensor(popart.TensorInfo("FLOAT", shape_d0))
    data_w0 = np.ones(shape=[2, 2, 3, 3]).astype(np.float32)
    w0 = builder.addInitializedInputTensor(data_w0)
    l0 = builder.addInputTensor(popart.TensorInfo("INT32", shape_l0))

    s0 = builder.aiOnnx.sin([d0], "s0")
    e0 = builder.aiOnnx.exp([s0], "e0")
    c0 = builder.aiOnnx.conv([e0, w0],
                             dilations=[1, 1],
                             pads=[1, 1, 1, 1],
                             strides=[1, 1],
                             debugContext="c0")
    r0 = builder.reshape_const(builder.aiOnnx, [c0], [microBatchSize, 32])
    if doDropout:
        do0 = builder.aiOnnx.dropout([r0], num_outputs=1, ratio=0.2)[0]
        out = builder.aiOnnx.softmax([do0], axis=1, debugContext="sfm")
    else:
        out = builder.aiOnnx.softmax([r0], axis=1, debugContext="sfm")
    nll = builder.aiGraphcore.nllloss([out, l0],
                                      reduction=popart.ReductionType.Sum)

    art = popart.AnchorReturnType("All")

    anchor_map = {nll: art, w0: art, e0: art}
    if doTraining is True:
        anchor_map[popart.reservedGradientPrefix() + d0] = art
        if doPipelining is True and anchorRestoredTensors is True:
            anchor_map[popart.reservedRestoredPrefix() + e0] = art
            anchor_map[d0] = art
            anchor_map[popart.reservedRestoredPrefix() + d0] = art

    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.enablePipelining = doPipelining
    opts.enableGradientAccumulation = doGradientAccl
    opts.accumulationFactor = acclSteps
    opts.enableStochasticRounding = False

    if doSharding is False:
        numIpus = 1 * replicated_graph_count
    else:
        opts.virtualGraphMode = popart.VirtualGraphMode.Manual
        numIpus = 2 * replicated_graph_count
        builder.virtualGraph(s0, 0)
        builder.virtualGraph(e0, 0)
        builder.virtualGraph(c0, 0)
        builder.virtualGraph(r0, 1)
        if doDropout:
            builder.virtualGraph(do0, 1)
        builder.virtualGraph(out, 1)
        builder.virtualGraph(nll, 1)

    if replicated_graph_count > 1:
        opts.replicatedGraphCount = replicated_graph_count
        opts.enableReplicatedGraphs = True

    device = tu.create_test_device(numIpus=numIpus)

    if doTraining is True:
        session = popart.TrainingSession(fnModel=builder.getModelProto(),
                                         dataFlow=popart.DataFlow(
                                             batchesPerStep, anchor_map),
                                         loss=nll,
                                         optimizer=popart.ConstSGD(0.01),
                                         userOptions=opts,
                                         deviceInfo=device)
    else:
        session = popart.InferenceSession(fnModel=builder.getModelProto(),
                                          dataFlow=popart.DataFlow(
                                              batchesPerStep, anchor_map),
                                          userOptions=opts,
                                          deviceInfo=device)

    if doDevicex is False:
        return None

    session.prepareDevice()
    anchors = session.initAnchorArrays()
    session.setRandomSeed(0)

    classes = np.prod(shape_d0) // (batchSize * batchesPerStep)

    label = np.random.randint(low=0, high=classes,
                              size=shape_l0).astype(np.int32)

    # With all options enabled return anchors are of the shape:
    # [batches_per_step, accl_factor, repl_factor, micro_batch, *data_shape]
    if acclSteps > 1:
        shape_d0.insert(0, acclSteps)
        label = label.reshape([acclSteps, -1])
    if batchesPerStep > 1:
        shape_d0.insert(0, batchesPerStep)
        label = np.repeat(label[np.newaxis], batchesPerStep, 0)

    data = np.random.random_sample(shape_d0).astype(np.float32)

    # This is a slightly odd case - we want the same data to be input for both
    # replicated graphs, but the dimension we need to repeat on is either the
    # first or second (the replication dimension) depending on whether we
    # have gradient accumulation enabled.
    # If we are not testing, this is a lot simpler as we can split samples however
    # we want.
    if replicated_graph_count > 1:
        if acclSteps > 1:
            data = np.repeat(data[np.newaxis], replicated_graph_count, 2)
            label = label.reshape([replicated_graph_count, -1])
        else:
            data = np.repeat(data[np.newaxis], replicated_graph_count, 1)
            label = label.reshape([replicated_graph_count, -1])

    inputs = {d0: data, l0: label}
    stepio = popart.PyStepIO(inputs, anchors)
    stepio.enableRuntimeAsserts(False)

    session.weightsFromHost()

    session.run(stepio)

    if doProfiling is True:
        from gcprofile import save_popart_report
        save_popart_report(session)

    if returnRawInput is True:
        anchors["input_raw"] = data

    return anchors
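To make the anchor-layout comment above concrete: with batchesPerStep=4, acclSteps=2, replicated_graph_count=2 and microBatchSize=8, an anchored tensor of micro-batch shape [8, 32] would come back as [4, 2, 2, 8, 32] (batches per step, accumulation factor, replication factor, micro batch, data), assuming all of those options are enabled at once.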
Example #16
def main(argv):
    FLAGS = flags.FLAGS
    print(f"micro batch size is {FLAGS.micro_batch_size}")
    print(f"batch size is {FLAGS.batch_size}")
    print(f"batches_per_step is {FLAGS.batches_per_step}")
    proto, data, outputs, output_id = graph_builder()
    print(f"Model: {FLAGS.model_name}")
    if not FLAGS.synthetic:
        print(f"Data_dir: {FLAGS.data_dir}")
    else:
        print(f"Using synthetic data")
    print(f"Data_sub_dir for this process: {FLAGS.data_sub_dir}")
    print(f"num_workers: {FLAGS.num_workers}")
    print(f"batches per step: {FLAGS.batches_per_step}")
    dataFlow = popart.DataFlow(FLAGS.batches_per_step, outputs)

    # Create a session to compile and execute the graph
    options = popart.SessionOptions()
    if FLAGS.synthetic:
        options.syntheticDataMode = popart.SyntheticDataMode.Zeros
    options.instrumentWithHardwareCycleCounter = FLAGS.report_hw_cycle_count

    # Configure precision of convolutions and MatMuls
    if FLAGS.half_partials:
        options.convolutionOptions = {'partialsType': 'half'}
        options.partialsTypeMatMuls = "half"

    # Select a device
    deviceManager = popart.DeviceManager()
    device = deviceManager.acquireAvailableDevice(1)
    print(f"{device}\n")
    if device is None:
        raise Exception("Not enough IPUs available.")

    session = popart.InferenceSession(fnModel=proto,
                                      deviceInfo=device,
                                      dataFlow=dataFlow,
                                      userOptions=options)

    print("Compiling...")
    start = time.time()
    try:
        session.prepareDevice()
    except popart.PrepareDeviceException as e:
        import gcprofile
        gcprofile.save_popart_report(session, exception=e)
        sys.exit(1)
    compilation_duration = time.time() - start
    print("Time to compile: {:.3f} seconds\n".format(compilation_duration))

    # Create buffers to receive results from the execution
    anchors = session.initAnchorArrays()
    # Copy weights and optimisation parameters onto the device
    session.weightsFromHost()

    def report_time(duration, data_duration=None, compute_duration=None):
        report_string = "Total {:<8.3} sec.".format(duration)
        if data_duration:
            report_string += "   Preprocessing {:<8.3} sec ({:4.3}%).".format(
                data_duration, 100 * (data_duration / duration))
        if compute_duration:
            report_string += "   Compute {:<8.3} sec ({:4.3}%).".format(
                compute_duration, 100 * (compute_duration / duration))
        report_string += "   {:5f} images/sec.".format(
            int(FLAGS.micro_batch_size * FLAGS.batches_per_step / duration))
        print(report_string)
        if FLAGS.report_hw_cycle_count:
            print("Hardware cycle count per 'run':", session.getCycleCount())

    print("Executing...")
    average_batches_per_sec = 0

    # Run
    start = time.time()
    durations = []
    if FLAGS.synthetic:
        for i in range(FLAGS.iterations):
            stepio = popart.PyStepIO(data, anchors)
            data_time = time.time()
            data_d = data_time - start
            # Run compute
            session.run(stepio)
            # Calc compute duration
            results = anchors[output_id]
            comp_d = time.time() - data_time
            # Calc total duration
            t = time.time() - start
            report_time(t, data_d, comp_d)
            durations.append(t)
            start = time.time()
        duration = np.mean(durations)
    else:
        for d in data:
            stepio = popart.PyStepIO(d, anchors)
            # Calc data duration
            data_time = time.time()
            data_d = data_time - start
            # Run compute
            session.run(stepio)
            # Calc compute duration
            results = anchors[output_id]
            comp_d = time.time() - data_time
            # Calc total duration
            t = time.time() - start
            report_time(t, data_d, comp_d)
            durations.append(t)
            start = time.time()
        duration = np.mean(durations)
Example #17
def train_process(opts):
    net = getattr(model, opts.model_name)(
        pretrained=False,
        progress=True,
        num_classes=10 if opts.dataset == "CIFAR-10" else 1000)

    # The models lack the softmax layer our NLLLoss expects,
    # so append one.
    net = nn.Sequential(net, nn.Softmax(dim=1))

    criterion = nn.NLLLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=opts.learning_rate,
                          momentum=opts.momentum,
                          weight_decay=opts.weight_decay)

    trainset, testset, trainloader, testloader = get_dataset(opts)

    inputs, labels = next(iter(trainloader))

    sessionOpts = get_options(opts)

    patterns = popart.Patterns()
    patterns.InPlace = not opts.no_inplacing

    start = time.time()
    # Pass all the pytorch stuff to the session
    torchSession = popart.torch.TrainingSession(
        torchModel=net,
        inputs=inputs,
        targets=labels,
        optimizer=optimizer,
        losses=criterion,
        batch_size=opts.batch_size,
        batches_per_step=opts.batches_per_step,
        deviceInfo=get_device(opts.num_ipus, opts.simulation),
        userOptions=sessionOpts,
        passes=patterns)
    print("Converting pytorch model took {:.2f}s".format(time.time() - start))

    # Prepare for training.
    start = time.time()
    print("Compiling model...")
    anchors = torchSession.initAnchorArrays()

    torchSession.prepareDevice()
    torchSession.optimizerFromHost()
    torchSession.weightsFromHost()

    torchSession.setRandomSeed(0)
    print("Compiling popart model took {:.2f}s".format(time.time() - start))
    for epoch in range(opts.epochs):  # loop over the dataset multiple times
        run_training(opts, epoch, torchSession, trainloader, trainset, anchors)
        if (not opts.no_validation) and ((epoch + 1) % opts.valid_per_epoch
                                         == 0):
            run_validation(opts, epoch, torchSession, testloader, testset)

    print('Finished Training')
    # Save the popart training report
    if opts.gc_profile_log_dir is not None:
        from gcprofile import save_popart_report
        save_popart_report(torchSession)
Example #18
def main(args):

    # Model parameters
    np.random.seed(1971)
    input_rows = 28
    input_columns = 28
    num_classes = 10
    batch_size = 8
    input_shape = [batch_size, input_rows * input_columns]
    labels_shape = [batch_size]

    # Create model
    x0, labels, model_proto, anchor_map, loss = create_pipelined_model(
        num_features=input_columns * input_rows,
        num_classes=num_classes,
        batch_size=batch_size)

    # Save model (optional)
    if args.export:
        with open(args.export, 'wb') as model_path:
            model_path.write(model_proto)

    # Session options
    opts = popart.SessionOptions()
    opts.enablePipelining = not args.no_pipelining
    opts.virtualGraphMode = popart.VirtualGraphMode.Manual
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.engineOptions = {"debug.instrument": "true"}
    pipeline_depth = 64
    num_ipus = 2

    # Create session
    session = popart.TrainingSession(
        fnModel=model_proto,
        dataFlow=popart.DataFlow(pipeline_depth, anchor_map),
        loss=loss,
        optimizer=popart.ConstSGD(0.01),
        userOptions=opts,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(num_ipus))

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    # Extra data feed for pipeline
    if pipeline_depth > 1:
        labels_shape.insert(0, pipeline_depth)
        input_shape.insert(0, pipeline_depth)

    # Synthetic data input
    data_in = np.random.uniform(
        low=0.0, high=1.0, size=input_shape).astype(np.float32)

    labels_in = np.random.randint(
        low=0, high=num_classes, size=labels_shape).astype(np.int32)

    # Run session
    inputs = {x0: data_in, labels: labels_in}
    stepio = popart.PyStepIO(inputs, anchors)
    session.weightsFromHost()
    session.run(stepio)

    # Save report and return session object (optional)
    if args.report:
        from gcprofile import save_popart_report
        save_popart_report(session)
    if args.test:
        return session
Example #19
def main(args):

    # Model parameters
    np.random.seed(1971)
    input_rows = 28
    input_columns = 28
    num_classes = 10
    batch_size = 2048
    input_shape = [batch_size, input_rows * input_columns]
    labels_shape = [batch_size]

    # Create model
    x0, labels, model_proto, anchor_map, loss = create_model(
        num_features=input_columns * input_rows,
        num_classes=num_classes,
        batch_size=batch_size,
        force_recompute=(args.recomputing == 'ON'))

    # Save model (optional)
    if args.export:
        with open(args.export, 'wb') as model_path:
            model_path.write(model_proto)

    # Session options
    num_ipus = 1
    opts = popart.SessionOptions()
    opts.reportOptions = {"showExecutionSteps": "true"}
    opts.engineOptions = {"debug.instrument": "true"}

    if args.recomputing == 'AUTO':
        opts.autoRecomputation = popart.RecomputationType.Standard

    # Create session
    session = popart.TrainingSession(
        fnModel=model_proto,
        dataFeed=popart.DataFlow(1, anchor_map),
        losses=[loss],
        optimizer=popart.ConstSGD(0.01),
        userOptions=opts,
        deviceInfo=popart.DeviceManager().acquireAvailableDevice(num_ipus))

    anchors = session.initAnchorArrays()
    session.prepareDevice()

    # Synthetic data input
    data_in = np.random.uniform(low=0.0, high=1.0,
                                size=input_shape).astype(np.float32)

    labels_in = np.random.randint(low=0, high=num_classes,
                                  size=labels_shape).astype(np.int32)

    # Run session
    inputs = {x0: data_in, labels: labels_in}
    stepio = popart.PyStepIO(inputs, anchors)
    session.weightsFromHost()
    session.optimizerFromHost()
    session.run(stepio)

    # Save report and return session object (optional)
    if args.report:
        from gcprofile import save_popart_report
        save_popart_report(session)
    if args.test:
        return session
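A closing note on the API drift visible across these examples: older PopART releases take dataFeed= and losses=[...] and require session.optimizerFromHost() after updating the optimizer (Examples #1, #9, #19), while later releases take dataFlow= and a single loss=, replace the updateOptimizer/optimizerFromHost pair with a single updateOptimizerFromHost call, and rename debugPrefix to debugContext (Examples #7, #15, #18). When adapting any snippet, match it to the PopART version actually installed.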