Example #1
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10*1024*1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False):
    """Builds ipu_options"""

    profile_exec_modes = {"NO_PROFILE": ExecutionProfileType.NO_PROFILE,
                          "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
                          "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
                          "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE}

    config = utils.create_ipu_config(max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size,
                                     merge_infeed_io_copies=merge_infeed_io_copies,
                                     always_rearrange_copies_on_the_host=False,
                                     profiling=profile is not None,
                                     profile_execution=profile_exec_modes[profile] if profile else None)

    if "GCL_REAL_COLLECTIVES" in os.environ:
        config = utils.set_gcl_options(config, num_io_tiles=128, gcl_options={"useGclCollectives": "true", })

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas*shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(config, {
        "device.clearAtomicFlagAfterExchange": "false",
        "prng.enable": "true" if prng else "false",
        "target.deterministicWorkers": "false" if seed is None else "true",
    })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(config, {
            "availableMemoryProportion": str(availableMemoryProportion)
        })

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config, inv=fp_exceptions, div0=fp_exceptions,
                                                        oflo=fp_exceptions, esr=prng, nanoo=True)

    return config
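
The builders in these examples only construct the options object; as Examples #4 and #5 below show, the result still has to be applied with utils.configure_ipu_system. A minimal, hypothetical driver for Example #1 (argument values are illustrative only, not taken from the original):

from tensorflow.python.ipu import utils

# Build options for two replicas on auto-selected IPUs, then apply them to
# the system and seed the hardware RNG, mirroring how Examples #4 and #5
# consume the returned config.
config = get_config(prng=True, number_of_replicas=2, shards=1)
utils.configure_ipu_system(config)
utils.reset_ipu_seed(42)  # illustrative seed value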
Example #2
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               xla_recompute=False,
               seed=None,
               profile=False,
               availableMemoryProportion=None):
    """Builds ipu_options"""
    config = utils.create_ipu_config(
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size,
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile,
        profile_execution=profile)
    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(
        config, {
            "device.clearAtomicFlagAfterExchange": "false",
            "prng.enable": "true" if prng else "false",
            "target.deterministicWorkers": "false" if seed is None else "true",
        })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(
            config,
            {"availableMemoryProportion": str(availableMemoryProportion)})

    if xla_recompute:
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)

    return config
Example #3
def get_ipu_config(fp_exceptions=True,
                   stochastic_rounding=True,
                   xla_recompute=False,
                   available_memory_proportion=None,
                   disable_graph_outlining=False,
                   num_ipus_required=0,
                   max_cross_replica_sum_buffer_size=0,
                   scheduler_selection='',
                   compile_only=False,
                   partials_type="half"):
    """Builds ipu_options"""
    config = utils.create_ipu_config(
        max_report_size=3001819596000,
        merge_infeed_io_copies=True,
        always_rearrange_copies_on_the_host=False,
        selection_order=utils.SelectionOrder.AUTO,
        disable_graph_outlining=disable_graph_outlining,
        max_cross_replica_sum_buffer_size=max_cross_replica_sum_buffer_size,
        scheduler_selection=scheduler_selection)

    config = utils.auto_select_ipus(config, num_ipus_required)

    config = utils.set_matmul_options(config, clear_pass_type=True)

    if available_memory_proportion is not None:
        config = utils.set_convolution_options(
            config, {
                "availableMemoryProportion": str(available_memory_proportion),
                "partialsType": partials_type
            })
        config = utils.set_matmul_options(
            config, {
                "availableMemoryProportion": str(available_memory_proportion),
                "partialsType": partials_type
            })

    config = utils.set_norm_options(config, use_stable_statistics=True)

    config = utils.set_recomputation_options(config,
                                             allow_recompute=xla_recompute)

    if compile_only:
        config = utils.set_ipu_connection_type(
            config,
            utils.DeviceConnectionType.NEVER,
            ipu_version=2,
            enable_remote_buffers=True)

    config = utils.set_floating_point_behaviour_options(
        config,
        inv=fp_exceptions,
        div0=fp_exceptions,
        oflo=fp_exceptions,
        esr=stochastic_rounding,
        nanoo=fp_exceptions)
    return config
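
With compile_only=True, the connection type utils.DeviceConnectionType.NEVER makes the runtime compile the Poplar executable without ever attaching to a physical device. A hedged sketch of driving this builder for offline compilation (the argument values are assumptions):

from tensorflow.python.ipu import utils

# Offline compilation: build a 4-IPU config that compiles but never
# attaches to hardware, then apply it as in the other examples.
config = get_ipu_config(num_ipus_required=4, compile_only=True)
utils.configure_ipu_system(config)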
Example #4
def generic_train_graph(opts, is_training):
    data_type = 'float32'
    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type, shape=[])
        # seed here refers to a module-level variable in the original source file.
        uid_embedding, mid_embedding, cat_embedding = id_embedding(opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_train = get_synthetic_dataset(opts)
        else:
            dataset_train = get_dataset_embed(opts, is_training=True)

        infeed_train = ipu_infeed_queue.IPUInfeedQueue(dataset_train, feed_name='DIN_dataset_infeed_train', replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy, uids, mids, cats, mid_his, cat_his, mid_mask, target, seqlen):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(opts, uid_embedding, mid_embedding, cat_embedding, placeholders['learning_rate'], uids, mids, cats, mid_his, cat_his, mid_mask, target, seqlen, use_negsampling=False)

                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_aux_loss + aux_loss, total_accuracy + accuracy

                return loops.repeat(opts['batches_per_step'], body, [tf.constant(0, np.float32)] * 3, infeed_train)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [x / opts['batches_per_step'] for x in outputs_train]
            outfeed = None

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()

    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(ipu_options,
                                                 combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options, allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_train = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.compat.v1.Session(graph=train_graph)

    return GraphOps(sess,
                    init,
                    ops_train,
                    placeholders,
                    infeed_train,
                    outfeed,
                    saver), uid_embedding, mid_embedding, cat_embedding
Example #5
def run_language_model(opts):
    if opts.random_seed is not None:
        utils.reset_ipu_seed(opts.random_seed)

    # Setup and acquire an IPU device:
    logging.info("Acquiring devices")
    if not opts.pipeline:
        opts.num_shards = 1  # FIXME: enable sparse models using multiple shards

    # No matter how many shards/stages are required, always acquire a
    # power-of-2 number of IPUs (otherwise device attachment will fail).
    k = 0
    while 2**k < opts.num_shards:
        k += 1
    num_ipus = 2**k
    logger.info(f"Need {opts.num_shards} IPUs, requesting {num_ipus}")
    config = utils.create_ipu_config()

    if opts.compile_only:
        if opts.compile_only_ipu_version is None:
            raise AttributeError(
                "Must provide --compile-only-ipu-version if --compile-only is set."
            )

        config = utils.set_ipu_connection_type(
            config,
            utils.DeviceConnectionType.NEVER,
            ipu_version=opts.compile_only_ipu_version,
            enable_remote_buffers=True)

    config = utils.auto_select_ipus(config, num_ipus)
    config = utils.set_recomputation_options(config,
                                             allow_recompute=opts.recompute)
    # Enable stochastic rounding
    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=False,
                                                        div0=False,
                                                        oflo=False,
                                                        esr=True,
                                                        nanoo=False)
    config = sparse.set_system_config(
        config, custom_op_debug_printing=opts.debug_dense_grad)
    utils.configure_ipu_system(config)

    transformer = DynsparseTransformer(opts)
    if opts.mode in ["all", "train"]:
        run_training(opts, transformer)

    if opts.mode in ["all", "test"]:
        run_testing(opts, transformer)
Example #6
def get_config(fp_exceptions,
               xla_recompute,
               disable_graph_outlining,
               num_required_ipus,
               enable_stochastic_rounding,
               max_cross_replica_sum_buffer_size,
               scheduler_selection,
               compile_only,
               ipu_id):

    # Builds ipu_options
    config = utils.create_ipu_config(
        merge_infeed_io_copies=True,
        always_rearrange_copies_on_the_host=False,
        disable_graph_outlining=disable_graph_outlining,
        selection_order=utils.SelectionOrder.AUTO,
        scheduler_selection=scheduler_selection
    )

    # NOTE: an ipu_id of 0 is falsy and falls through to auto-selection.
    if ipu_id:
        config = utils.select_ipus(config, [ipu_id])
    else:
        config = utils.auto_select_ipus(config, num_required_ipus)

    config = utils.set_recomputation_options(
        config, allow_recompute=xla_recompute)
    # simple way to skip the big `Transpose` operation due to bad allocation
    # config = utils.set_matmul_options(config, clear_pass_type=True)
    config = utils.set_norm_options(config, use_stable_statistics=True)
    config = utils.set_floating_point_behaviour_options(
        config,
        inv=fp_exceptions,
        div0=fp_exceptions,
        oflo=fp_exceptions,
        esr=enable_stochastic_rounding,
        nanoo=fp_exceptions)
    config = utils.set_optimization_options(
        config,
        merge_remote_buffers=True,
        max_cross_replica_sum_buffer_size=max_cross_replica_sum_buffer_size)

    # Do not acquire a device, compile only.
    if compile_only:
        config = utils.set_ipu_connection_type(
            config, utils.DeviceConnectionType.NEVER, ipu_version=2, enable_remote_buffers=True)

    return config
Example #7
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10 * 1024 * 1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None,
               limitVertexState=None):
    """Builds ipu_options"""

    profile_exec_modes = {
        "NO_PROFILE": ExecutionProfileType.NO_PROFILE,
        "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
        "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
        "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE
    }

    config = utils.create_ipu_config(
        merge_infeed_io_copies=merge_infeed_io_copies,
        always_rearrange_copies_on_the_host=False,
        profiling=profile is not None,
        profile_execution=profile_exec_modes[profile] if profile else None)

    config = utils.set_optimization_options(
        config,
        max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size)

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas * shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(
        config, {
            "device.clearAtomicFlagAfterExchange": "false",
            "prng.enable": "true" if prng else "false",
            "target.deterministicWorkers":
            "false" if seed is None else "portable",
        })

    if internalExchangeOptimisationTarget is not None:
        config = utils.set_compilation_options(
            config, {
                "opt.internalExchangeOptimisationTarget": internalExchangeOptimisationTarget
            })

    if limitVertexState is not None:
        config = utils.set_compilation_options(
            config, {
                "opt.limitVertexStateToLower256K":
                "true" if limitVertexState else "false"
            })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(
            config,
            {"availableMemoryProportion": str(availableMemoryProportion)})

    if half_partials:
        config = utils.set_convolution_options(config,
                                               {"partialsType": 'half'})
        config = utils.set_matmul_options(config, {"partialsType": 'half'})

    if conv_dithering:
        config = utils.set_convolution_options(config,
                                               {"enableConvDithering": "true"})

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config,
                                                        inv=fp_exceptions,
                                                        div0=fp_exceptions,
                                                        oflo=fp_exceptions,
                                                        esr=prng,
                                                        nanoo=True)

    return config
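
The profile argument selects one of the four ExecutionProfileType keys declared in profile_exec_modes; passing None disables profiling entirely. An illustrative call (the parameter values are assumptions, not taken from the original):

# Request a per-tile execution profile and cap convolution scratch memory;
# valid profile strings are the keys of profile_exec_modes ("NO_PROFILE",
# "TILE_PROFILE", "DEVICE_PROFILE", "IPU_PROFILE").
config = get_config(profile="TILE_PROFILE", availableMemoryProportion=0.6)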
Example #8
def generic_infer_graph(opts, is_training):
    data_type = 'float32'
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.compat.v1.placeholder(data_type,
                                                                 shape=[])
        # seed is a module-level variable in the original source file.
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, is_training, seed)

        if opts['use_synthetic_data']:
            dataset_val = get_synthetic_dataset(opts)
        else:
            dataset_val = get_dataset_embed(opts, is_training=False)

        infeed_val = ipu_infeed_queue.IPUInfeedQueue(
            dataset_val,
            feed_name='DIN_dataset_infeed_val',
            replication_factor=(opts['replicas']))

        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="DIN_validation_outfeed",
            replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):

            def comp_fn_validate():
                def body(uids, mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen):
                    prob, loss_total, _, accuracy, _ = graph_builder(
                        opts,
                        uid_embedding,
                        mid_embedding,
                        cat_embedding,
                        placeholders['learning_rate'],
                        uids,
                        mids,
                        cats,
                        mid_his,
                        cat_his,
                        mid_mask,
                        target,
                        seqlen,
                        use_negsampling=False)
                    outfeed_op = outfeed_queue.enqueue(
                        (prob, target, accuracy))
                    return outfeed_op

                return loops.repeat(opts['batches_per_step'], body, [],
                                    infeed_val)

            outputs_val = ipu_compiler.compile(comp_fn_validate, [])
            outfeed = outfeed_queue.dequeue()

        saver = tf.compat.v1.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.compat.v1.global_variables_initializer()
    if opts['use_ipu_model']:
        os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"
    ipu_options = utils.create_ipu_config()
    ipu_options = utils.set_optimization_options(
        ipu_options, combine_embedding_lookups=True)
    ipu_options = utils.set_recomputation_options(ipu_options,
                                                  allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    if seed is not None:
        utils.reset_ipu_seed(seed)

    ops_val = [outputs_val]

    sess = tf.compat.v1.Session(graph=infer_graph)

    return GraphOps(sess, init, ops_val, placeholders, infeed_val, outfeed,
                    saver), uid_embedding, mid_embedding, cat_embedding
Example #9
def get_config(prng=False,
               ipu_id=-1,
               shards=1,
               number_of_replicas=1,
               max_cross_replica_buffer_size=10*1024*1024,
               merge_infeed_io_copies=True,
               fp_exceptions=True,
               half_partials=False,
               conv_dithering=False,
               xla_recompute=False,
               seed=None,
               profile=None,
               availableMemoryProportion=None,
               stable_norm=False,
               internalExchangeOptimisationTarget=None):
    """Builds ipu_options"""

    profile_exec_modes = {"NO_PROFILE": ExecutionProfileType.NO_PROFILE,
                          "TILE_PROFILE": ExecutionProfileType.TILE_PROFILE,
                          "DEVICE_PROFILE": ExecutionProfileType.DEVICE_PROFILE,
                          "IPU_PROFILE": ExecutionProfileType.IPU_PROFILE}

    config = utils.create_ipu_config(merge_infeed_io_copies=merge_infeed_io_copies,
                                     always_rearrange_copies_on_the_host=False,
                                     profiling=profile is not None,
                                     profile_execution=profile_exec_modes[profile] if profile else None)

    config = utils.set_optimization_options(config,
                                            max_cross_replica_sum_buffer_size=max_cross_replica_buffer_size)

    if "GCL_REAL_COLLECTIVES" in os.environ:
        # The GCL_NUM_IO_TILES environment variable sets how many IPU tiles
        # are reserved for Graphcore Communication Library (GCL) collectives.
        iotiles = int(os.environ['GCL_NUM_IO_TILES'])
        if iotiles % 2 or iotiles < 32 or iotiles > 192:
            raise ValueError(
                'GCL IO tiles must be a multiple of 2 between 32 and 192, got {}.'.format(iotiles))

        config = utils.set_gcl_options(config, num_io_tiles=iotiles, gcl_options={
                                       "useGclCollectives": "true", })

    if ipu_id == -1:
        config = utils.auto_select_ipus(config, number_of_replicas*shards)
    else:
        config = utils.select_ipus(config, [ipu_id])
    config = utils.set_compilation_options(config, {
        "device.clearAtomicFlagAfterExchange": "false",
        "prng.enable": "true" if prng else "false",
        "target.deterministicWorkers": "false" if seed is None else "portable",
    })

    if internalExchangeOptimisationTarget is not None:
        config = utils.set_compilation_options(config, {
            "opt.internalExchangeOptimisationTarget": internalExchangeOptimisationTarget
        })

    if availableMemoryProportion is not None:
        config = utils.set_convolution_options(config, {
            "availableMemoryProportion": str(availableMemoryProportion)
        })

    if half_partials:
        config = utils.set_convolution_options(config, {
            "partialsType": 'half'
        })
        config = utils.set_matmul_options(config, {
            "partialsType": 'half'
        })

    if conv_dithering:
        config = utils.set_convolution_options(config, {
            "enableConvDithering": "true"
        })

    if stable_norm:
        config = utils.set_norm_options(config, use_stable_statistics=True)

    if xla_recompute:
        config = utils.set_recomputation_options(config, allow_recompute=True)

    config = utils.set_floating_point_behaviour_options(config, inv=fp_exceptions, div0=fp_exceptions,
                                                        oflo=fp_exceptions, esr=prng, nanoo=True)

    return config
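
Unlike the keyword arguments, the GCL path in this variant is driven entirely by environment variables: GCL_REAL_COLLECTIVES enables it (only its presence is checked) and GCL_NUM_IO_TILES must be an even value between 32 and 192. A hypothetical setup (values illustrative):

import os

# Reserve 128 tiles for GCL collectives; both variables are read inside
# get_config before any IPUs are selected.
os.environ["GCL_REAL_COLLECTIVES"] = "1"
os.environ["GCL_NUM_IO_TILES"] = "128"
config = get_config(number_of_replicas=4)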
Example #10
def generic_graph(opts):
    data_type = get_tf_datatype(opts)
    graph = tf.Graph()
    with graph.as_default():
        placeholders = {}
        placeholders["learning_rate"] = tf.placeholder(data_type, shape=[])
        uid_embedding, mid_embedding, cat_embedding = id_embedding(
            opts, True, opts['seed'])
        if opts['use_synthetic_data']:
            dataset = get_synthetic_dataset(opts, return_neg=True)
            feed_dict_values = {}
        else:
            dataset, feed_dict_values = get_dataset_embed_from_tensors(
                opts, data_type)
        infeed = ipu_infeed_queue.IPUInfeedQueue(
            dataset,
            feed_name='DIEN_dataset_infeed',
            replication_factor=(opts['replicas']))

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_loss, total_aux_loss, total_accuracy, uids,
                         mids, cats, mid_his, cat_his, mid_mask, target,
                         seqlen, noclk_mids, noclk_cats):
                    prob, loss, aux_loss, accuracy, grad_op = graph_builder(
                        opts,
                        uid_embedding,
                        mid_embedding,
                        cat_embedding,
                        placeholders['learning_rate'],
                        uids,
                        mids,
                        cats,
                        mid_his,
                        cat_his,
                        mid_mask,
                        target,
                        seqlen,
                        noclk_mids,
                        noclk_cats,
                        use_negsampling=True)
                    with tf.control_dependencies([grad_op]):
                        return total_loss + loss, total_aux_loss + aux_loss, total_accuracy + accuracy

                return loops.repeat(opts['batches_per_step'], body,
                                    [tf.constant(0, data_type)] * 3, infeed)

            outputs_train = ipu_compiler.compile(comp_fn, [])
            avg_loss, avg_aux_loss, avg_accuracy = [
                x / opts['batches_per_step'] for x in outputs_train
            ]

        saver = tf.train.Saver()
        utils.move_variable_initialization_to_cpu()
        init = tf.global_variables_initializer()
        if opts['use_ipu_model']:
            os.environ["TF_POPLAR_FLAGS"] = "--use_ipu_model"

    ipu_options = utils.create_ipu_config(
        profiling=False,
        profile_execution=False,
        max_cross_replica_sum_buffer_size=10000000,
        max_inter_ipu_copies_buffer_size=10000000)
    ipu_options = utils.set_recomputation_options(ipu_options,
                                                  allow_recompute=True)
    ipu_options = utils.auto_select_ipus(ipu_options, [opts['replicas']])
    utils.configure_ipu_system(ipu_options)
    utils.reset_ipu_seed(opts['seed'])

    graph_outputs = [avg_loss, avg_aux_loss, avg_accuracy]
    sess = tf.Session(graph=graph)

    return GraphOps(
        sess, init, graph_outputs, placeholders, infeed, saver,
        feed_dict_values), uid_embedding, mid_embedding, cat_embedding
Example #11
    def pipeline_on_ipu(stages,
                        inputs_fn,
                        input_values,
                        repeat_count,
                        gradient_accumulation_count,
                        dataset_fn,
                        optimizer,
                        test_wrapper,
                        expected_max_tile_memory,
                        recomp,
                        schedule,
                        device_mapping=None,
                        batch_serialization_iterations=1):

        g = ops.Graph()
        with g.as_default(), test_wrapper.test_session(graph=g) as session:
            dataset = dataset_fn()
            inputs = inputs_fn()
            infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
                dataset, next_feed_id())
            outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

            with variable_scope.variable_scope("ipu",
                                               use_resource=True,
                                               reuse=False):

                def optimizer_function(loss):
                    return pipelining_ops.OptimizerFunctionOutput(
                        optimizer, loss)

                def my_net(*args):
                    return pipelining_ops.pipeline(
                        stages,
                        gradient_accumulation_count,
                        repeat_count=repeat_count,
                        batch_serialization_iterations=batch_serialization_iterations,
                        inputs=args,
                        optimizer_function=optimizer_function,
                        infeed_queue=infeed_queue,
                        outfeed_queue=outfeed_queue,
                        pipeline_schedule=schedule,
                        device_mapping=device_mapping)

            with ops.device("/device:IPU:0"):
                compiled_model_pipeline = ipu_compiler.compile(my_net,
                                                               inputs=inputs)

            # Execution profiles of code with dynamic control flow are not supported
            # on real HW.
            profiling = utils.running_on_ipu_model()
            cfg = utils.create_ipu_config(profiling=profiling,
                                          profile_execution=profiling)
            cfg = utils.set_ipu_model_options(cfg,
                                              compile_ipu_code=True,
                                              tiles_per_ipu=128)
            num_ipus = get_num_ipus(device_mapping) if device_mapping else 4
            cfg = utils.auto_select_ipus(cfg, num_ipus)
            if recomp:
                cfg = utils.set_recomputation_options(cfg,
                                                      allow_recompute=True)
            utils.configure_ipu_system(cfg)
            utils.move_variable_initialization_to_cpu()

            outfeed_op = outfeed_queue.dequeue()
            report = tu.ReportJSON(test_wrapper,
                                   session,
                                   configure_device=False)

            session.run(variables.global_variables_initializer())
            session.run(infeed_queue.initializer)
            report.reset()
            session.run(compiled_model_pipeline,
                        feed_dict=dict(zip(inputs, input_values)))
            out = session.run(outfeed_op)[0]
            if profiling:
                report.parse_log()
                if not device_mapping:
                    device_mapping = [
                        i - (i % 4) + ((i % 4) if (i % 4) < 2 else 5 - (i % 4))
                        for i in range(len(stages))
                    ]
                report.assert_pipeline_stages_on_expected_ipu(device_mapping)
                report.assert_max_tile_memory(expected_max_tile_memory,
                                              tolerance=0.3)
            return out
Example #12
    def _sharded_on_ipu(stages, inputs_fn, input_values, repeat_count,
                        num_batches_to_accumulate, dataset_fn, optimizer,
                        test_wrapper, recomp, device_mapping):

        g = ops.Graph()
        with g.as_default(), test_wrapper.test_session(graph=g) as session:
            dataset = dataset_fn()
            inputs = inputs_fn()
            infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
                dataset, next_feed_id())
            outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(next_feed_id())

            with variable_scope.variable_scope("ipu_sharded",
                                               use_resource=True,
                                               reuse=False):
                if device_mapping is None:
                    device_mapping = range(len(stages))

                def pipeline(*args):
                    outputs = args
                    for i, stage in zip(device_mapping, stages):
                        with scopes.ipu_shard(i):
                            outputs = stage(
                                *functional_ops._convert_to_list(outputs))  # pylint: disable=W0212
                    loss = outputs
                    enqueue_op = outfeed_queue.enqueue(loss)
                    opt = gradient_accumulation_optimizer.GradientAccumulationOptimizer(
                        optimizer, num_batches_to_accumulate)
                    outs = list(args[:len(args) -
                                     infeed_queue.number_of_tuple_elements])
                    outs.append(enqueue_op)
                    outs.append(opt.minimize(loss))
                    return outs

                def my_net(*args):
                    return loops.repeat(num_batches_to_accumulate,
                                        pipeline,
                                        inputs=args,
                                        infeed_queue=infeed_queue)

            with ops.device("/device:IPU:0"):
                compiled_model_pipeline = ipu_compiler.compile(my_net,
                                                               inputs=inputs)

            outfeed_op = outfeed_queue.dequeue()

            # Execution profiles of code with dynamic control flow are not supported on real HW
            profiling = utils.running_on_ipu_model()

            cfg = utils.create_ipu_config(profiling=profiling,
                                          profile_execution=profiling)
            cfg = utils.set_ipu_model_options(cfg,
                                              compile_ipu_code=True,
                                              tiles_per_ipu=128)
            num_ipus = get_num_ipus(device_mapping) if device_mapping else 4
            cfg = utils.auto_select_ipus(cfg, num_ipus)
            if recomp:
                cfg = utils.set_recomputation_options(cfg,
                                                      allow_recompute=True)
            utils.configure_ipu_system(cfg)
            utils.move_variable_initialization_to_cpu()

            session.run(variables.global_variables_initializer())
            session.run(infeed_queue.initializer)
            for _ in range(repeat_count):
                session.run(compiled_model_pipeline,
                            feed_dict=dict(zip(inputs, input_values)))
            return session.run(outfeed_op)