Example #1
 def override_log(key, old, new):
     # closure excerpted from get_test_matrix (see Example #7); ttype,
     # test, jtype and stype are captured from the enclosing scope
     logging.info(
         'Replacing {} for test type: {}. Old value:'
         ' ({}), New value: ({})'.format(
             key,
             stringify_args(
                 [ttype, test['eval-type'], jtype, stype],
                 joiner='.'), stringify_args(listify(old)),
             stringify_args(listify(new))))
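Example #1 relies on two pyjac.utils helpers used throughout this page. Below is a minimal sketch of their behavior, reconstructed only from the call sites in these examples (the joiner=, use_quotes=, and kwd= keywords); the real pyJac implementations may differ.

def listify(value):
    # wrap a scalar (or string) in a list; pass other iterables through
    if isinstance(value, str):
        return [value]
    try:
        return list(value)
    except TypeError:
        return [value]

def stringify_args(arglist, kwd=False, joiner=', ', use_quotes=False):
    # join the arguments into a single display string
    if kwd:
        # dict-like input, rendered as key=value pairs
        # (cf. Example #5: stringify_args(vars(loopy_opts), kwd=True))
        return joiner.join('{}={}'.format(k, v) for k, v in arglist.items())
    if use_quotes:
        # quote each entry (cf. Example #5: quoting .npy file paths)
        return joiner.join('"{}"'.format(a) for a in arglist)
    return joiner.join(str(a) for a in arglist)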
Example #2
 def _raise(desc, inp, nameref, shape):
     # closure excerpted from get_driver's shape check (see Example #8);
     # `shape` is the expected shape, accumulated from array `nameref`
     logger = logging.getLogger(__name__)
     logger.debug('{} array for driver kernel {} does not '
                  'match expected shape (from array {}).  '
                  'Expected: ({}), got: ({})'.format(
                     desc, inp.name, nameref,
                     stringify_args(shape),
                     stringify_args(inp.shape))
                  )
     raise InvalidInputSpecificationException(inp.name)
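Note that the diagnostic above is logged at DEBUG level before the exception propagates, so it is only visible when debug logging is configured; a minimal configuration sketch:

import logging
# without this (or an equivalent handler), callers see only the raised
# InvalidInputSpecificationException, not the shape diagnostic
logging.basicConfig(level=logging.DEBUG)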
Example #3
 def __init__(self, bad_inputs):
     from pyjac.utils import stringify_args, listify
     self.message = (
        'Inputs: ({}) were incorrectly or conflictingly specified. '
         'See debug output for more information'.format(
             stringify_args(listify(bad_inputs))))
     super(InvalidInputSpecificationException, self).__init__(self.message)
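A hedged usage sketch of the class above; the input names are purely illustrative:

try:
    raise InvalidInputSpecificationException(['phi', 'jac'])
except InvalidInputSpecificationException as err:
    # err.message reads: Inputs: (phi, jac) were incorrectly or
    # conflictingly specified. See debug output for more information
    print(err.message)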
Example #4
    def __internal_validator(self,
                             field,
                             valuelist,
                             valid,
                             message,
                             necessary=True):
        # `valid` may be a callable predicate or a container of allowed
        # values; normalize the input to a list first
        valuelist = listify(valuelist)
        if six.callable(valid):
            badvals = [x for x in valuelist if not valid(x)]
        else:
            badvals = [x for x in valuelist if x not in valid]
        if badvals and necessary:
            args = (badvals, )
            if not six.callable(valid):
                args = (badvals, valid)

            self._error(
                field, message.format(*tuple(stringify_args(x) for x in args)))
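The `valid` argument above is either a callable predicate or a container of allowed values; a small sketch of the two filtering modes (the self._error reporting is framework-internal and omitted):

valuelist = ['C', 'F', 'X']
# container mode: flag anything not in the allowed set
badvals = [x for x in valuelist if x not in ('C', 'F')]
assert badvals == ['X']
# callable mode: flag anything failing the predicate
badvals = [x for x in valuelist if not str.isupper(x)]
assert badvals == []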
Example #5
    def test_lockstep_driver(self):
        # get rate info
        rate_info = determine_jac_inds(self.store.reacs, self.store.specs,
                                       RateSpecialization.fixed)
        mod_test = get_run_source()

        for kind, loopy_opts in OptionLoopWrapper.from_get_oploop(
                self, do_ratespec=False, langs=get_test_langs(),
                do_vector=True, yield_index=True):

            # make namestore
            namestore = arc.NameStore(loopy_opts, rate_info)

            # kernel 1 - need the jacobian reset kernel
            reset = reset_arrays(loopy_opts, namestore)
            # kernel 2 - incrementer
            # make mapstore, arrays and kernel info
            mapstore = arc.MapStore(loopy_opts, namestore.phi_inds, None)

            # use arrays of 2 & 3 dimensions to test the driver's copying
            base_phi_shape = namestore.n_arr.shape
            P_lp, P_str = mapstore.apply_maps(namestore.P_arr,
                                              arc.global_ind)
            phi_lp, phi_str = mapstore.apply_maps(namestore.n_arr,
                                                  arc.global_ind,
                                                  arc.var_name)
            inputs = [P_lp.name, phi_lp.name]
            base_jac_shape = namestore.jac.shape
            jac_lp, jac_str = mapstore.apply_maps(namestore.jac,
                                                  arc.global_ind,
                                                  arc.var_name,
                                                  arc.var_name)
            outputs = [jac_lp.name]
            kernel_data = [P_lp, phi_lp, jac_lp]
            kernel_data.extend(arc.initial_condition_dimension_vars(
                loopy_opts, None))
            instructions = Template("""
                ${phi_str} = ${phi_str} + ${P_str} {id=0, dep=*}
                ${jac_str} = ${jac_str} + ${phi_str} {id=1, dep=0, nosync=0}
            """).safe_substitute(**locals())

            # handle atomicity
            can_vec, vec_spec = ic.get_deep_specializer(
                loopy_opts, atomic_ids=['1'])
            barriers = []
            if loopy_opts.depth:
                # need a barrier between the reset & the kernel
                barriers = [(0, 1, 'global')]

            inner_kernel = k_gen.knl_info(
                name='inner',
                instructions=instructions,
                mapstore=mapstore,
                var_name=arc.var_name,
                kernel_data=kernel_data,
                silenced_warnings=['write_race(0)', 'write_race(1)'],
                can_vectorize=can_vec,
                vectorization_specializer=vec_spec)

            # put it in a generator
            generator = k_gen.make_kernel_generator(
                loopy_opts, kernel_type=KernelType.dummy,
                name='inner_kernel', kernels=[reset, inner_kernel],
                namestore=namestore,
                input_arrays=inputs[:],
                output_arrays=outputs[:],
                is_validation=True,
                driver_type=DriverType.lockstep,
                barriers=barriers)

            # use a "weird" (non-evenly divisibly by vector width) test-size to
            # properly test the copy-in / copy-out
            test_size = self.store.test_size - 37
            if test_size <= 0:
                test_size = self.store.test_size - 1
                assert test_size > 0
            # and make
            with temporary_build_dirs() as (build, obj, lib):

                numpy_arrays = []

                def __save(shape, name, zero=False):
                    data = np.zeros(shape)
                    if not zero:
                        # make it a simple range
                        data.flat[:] = np.arange(np.prod(shape))
                    # save
                    myname = pjoin(lib, name + '.npy')
                    # need to split inputs / answer
                    np.save(myname, data.flatten('K'))
                    numpy_arrays.append(data.flatten('K'))

                # write 'data'
                import loopy as lp
                for arr in kernel_data:
                    if not isinstance(arr, lp.ValueArg):
                        __save((test_size,) + arr.shape[1:], arr.name,
                               arr.name in outputs)

                # and a parameter
                param = np.zeros((test_size,))
                param[:] = np.arange(test_size)

                # build code
                generator.generate(build,
                                   data_order=loopy_opts.order,
                                   data_filename='data.bin',
                                   for_validation=True)

                # write header
                write_aux(build, loopy_opts, self.store.specs, self.store.reacs)

                # generate wrapper
                pywrap(loopy_opts.lang, build,
                       obj_dir=obj, out_dir=lib,
                       ktype=KernelType.dummy,
                       file_base=generator.name,
                       additional_inputs=inputs[:],
                       additional_outputs=outputs[:])

                # and calling script
                test = pjoin(lib, 'test.py')

                # note: rebinds `inputs` from the list of array names to a
                # string of quoted .npy paths for the test script
                inputs = utils.stringify_args(
                    [pjoin(lib, inp + '.npy') for inp in inputs], use_quotes=True)
                str_outputs = utils.stringify_args(
                    [pjoin(lib, inp + '.npy') for inp in outputs], use_quotes=True)

                num_threads = _get_test_input(
                    'num_threads', psutil.cpu_count(logical=False))
                with open(test, 'w') as file:
                    file.write(mod_test.safe_substitute(
                        package='pyjac_{lang}'.format(
                            lang=utils.package_lang[loopy_opts.lang]),
                        input_args=inputs,
                        test_arrays=str_outputs,
                        output_files=str_outputs,
                        looser_tols='[]',
                        loose_rtol=0,
                        loose_atol=0,
                        rtol=0,
                        atol=0,
                        non_array_args='{}, {}'.format(
                            test_size, num_threads),
                        kernel_name=generator.name.title(),))

                try:
                    utils.run_with_our_python([test])
                except subprocess.CalledProcessError:
                    logger = logging.getLogger(__name__)
                    logger.debug(utils.stringify_args(vars(loopy_opts), kwd=True))
                    assert False, 'lockstep_driver error'

                # calculate answers
                ns = base_jac_shape[1]
                # pressure is added to phi
                phi = numpy_arrays[1].reshape((test_size, ns),
                                              order=loopy_opts.order)
                p_arr = numpy_arrays[0]
                phi = phi + p_arr[:, np.newaxis]
                jac = numpy_arrays[2].reshape((test_size, ns, ns),
                                              order=loopy_opts.order)
                # and the diagonal of the jacobian has the updated pressure added
                jac[:, range(ns), range(ns)] += phi[:, range(ns)]
                # and read in outputs
                test = np.load(pjoin(lib, outputs[0] + '.npy')).reshape(
                    jac.shape, order=loopy_opts.order)
                assert np.array_equal(test, jac)
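The test saves arrays with flatten('K') (i.e., in memory order) and reads results back with an explicit reshape order. A minimal round-trip showing why the two agree, assuming a Fortran-ordered array as one of the tested layouts:

import numpy as np
a = np.asfortranarray(np.arange(6).reshape(2, 3))
flat = a.flatten('K')                 # elements in memory (column-major) order
b = flat.reshape((2, 3), order='F')   # reinterpret with the matching order
assert np.array_equal(a, b)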
Example #6
 def __init__(self, otype, value, allowed):
     from pyjac.utils import stringify_args
     self.message = ('Value "{}" for override type "{}" is not allowed. '
                     'Allowed values are: {}'.format(
                         value, otype, stringify_args(allowed)))
     super(InvalidOverrideException, self).__init__(self.message)
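A hedged usage sketch of the class above (with the corrected argument order in the message formatting); the values are illustrative:

try:
    raise InvalidOverrideException('order', 'X', ['C', 'F'])
except InvalidOverrideException as err:
    # err.message reads: Value "X" for override type "order" is not
    # allowed. Allowed values are: C, F
    print(err.message)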
Example #7
def get_test_matrix(work_dir,
                    test_type,
                    test_matrix,
                    for_validation,
                    raise_on_missing=True,
                    langs=get_test_langs()):
    """Runs a set of mechanisms and an ordered dictionary for
    performance and functional testing

    Parameters
    ----------
    work_dir : str
        Working directory with mechanisms and for data
    test_type: :class:`build_type`
        Controls some testing options (e.g., whether to test a sparse
        Jacobian or not)
    test_matrix: str
        The test matrix file to load
    for_validation: bool
        Determines which test type to load from the test matrix:
        validation or performance
    raise_on_missing: bool
        Raise an exception if the specified :param:`test_matrix` file is not found
    langs: list of str
        The allowed languages, modifiable by the :envvar:`TEST_LANGS` or test_langs
        in :file:`test_setup.py`
    Returns
    -------
    mechanisms : dict
        A dictionary indicating which mechanisms are available for testing,
        The structure is as follows:
            mech_name : {'mech' : file path to the Cantera mechanism
                         'ns' : number of species in the mechanism
                         'limits' : {'full': XXX, 'sparse': XXX}}: a dictionary of
                            limits on the number of conditions that can be evaluated
                            for this mechanism (full & sparse jacobian respectively)
                            due to memory constraints
    params : OrderedDict
        The parameters to put in an oploop
    max_vec_width : int
        The maximum vector width to test

    """
    work_dir = abspath(work_dir)

    # validate the test matrix
    matrix_name = test_matrix
    test_matrix = build_and_validate('test_matrix_schema.yaml', test_matrix)

    # check that we have the working directory
    if not exists(work_dir):
        raise Exception('Work directory {} for '.format(work_dir) +
                        'testing not found, exiting...')

    # load the models
    models = load_models(work_dir, test_matrix)
    assert isinstance(test_type, build_type)

    # load tests
    tests = load_tests(test_matrix, matrix_name)
    # filter those that match the test type
    valid_str = 'validation' if for_validation else 'performance'
    tests = [test for test in tests if test['test-type'] == valid_str]
    tests = [
        test for test in tests
        if test['eval-type'] == enum_to_string(test_type)
        or test['eval-type'] == 'both'
    ]
    # and dictify
    tests = [OrderedDict(test) for test in tests]
    if not tests:
        raise Exception('No tests found in matrix {} for {} test of {}, '
                        'exiting...'.format(matrix_name, valid_str,
                                            enum_to_string(test_type)))

    # get defaults we haven't migrated to schema yet
    rate_spec = ['fixed', 'hybrid'] if test_type != build_type.jacobian \
        else ['fixed']
    sparse = ([
        enum_to_string(JacobianFormat.sparse),
        enum_to_string(JacobianFormat.full)
    ] if test_type == build_type.jacobian else
              [enum_to_string(JacobianFormat.full)])
    jac_types = [
        enum_to_string(JacobianType.exact),
        enum_to_string(JacobianType.finite_difference)
    ] if (test_type == build_type.jacobian
          and not for_validation) else [enum_to_string(JacobianType.exact)]
    split_kernels = [False]

    # and the default number of cores; this may be overridden
    default_num_cores, can_override_cores = num_cores_default()

    # load platforms
    platforms = load_platforms(test_matrix,
                               langs=langs,
                               raise_on_empty=raise_on_missing)
    platforms = [OrderedDict(platform) for platform in platforms]
    out_params = []
    logger = logging.getLogger(__name__)
    for test in tests:
        # filter platforms
        plats = [p.copy() for p in platforms]
        if 'platforms' in test:
            plats = [
                plat for plat in plats if plat['platform'] in test['platforms']
            ]
            if len(plats) < len(platforms):
                logger.info(
                    'Platforms ({}) filtered out for test type: {}'.format(
                        ', '.join([
                            p['platform'] for p in platforms if p not in plats
                        ]), ' - '.join([test['test-type'],
                                        test['eval-type']])))
        if not plats:
            logger.warning('No platforms found for test {}, skipping...'.format(
                ' - '.join([test['test-type'], test['eval-type']])))
            continue

        for plookup in plats:
            clean = plookup.copy()
            # get default number of cores
            cores = default_num_cores[:]
            # get default vector widths
            widths = plookup['width']
            is_wide = widths is not None
            depths = plookup['depth']
            is_deep = depths is not None
            if is_deep and not is_wide:
                widths = depths[:]
            # sanity check
            if is_wide or is_deep:
                assert widths is not None
            # special gpu handling for cores
            is_gpu = False
            # test platform type
            if platform_is_gpu(plookup['platform']):
                # set cores to 1
                is_gpu = True
                cores = [1]

            def apply_vectypes(lookup,
                               widths,
                               is_wide=is_wide,
                               is_deep=is_deep):
                if is_wide or is_deep:
                    # set vec widths
                    use_par = None in widths or (is_wide and is_deep)
                    lookup['vecsize'] = [x for x in widths[:] if x is not None]
                    base = [True] if not use_par else [True, False]
                    if is_wide:
                        lookup['wide'] = base[:]
                        base.pop()
                    if is_deep:
                        lookup['deep'] = base[:]
                else:
                    lookup['vecsize'] = [None]
                    lookup['wide'] = [False]
                    lookup['deep'] = [False]
                del lookup['width']
                del lookup['depth']

            apply_vectypes(plookup, widths)

            # default is both conp / conv
            conp = [True, False]
            order = ['C', 'F']

            # loop over possible overrides
            oploop = OptionLoop(
                OrderedDict([('ttype', [enum_to_string(test_type)]),
                             ('jtype', jac_types), ('stype', sparse)]))
            for i, state in enumerate(oploop):
                ttype = state['ttype']
                jtype = state['jtype']
                stype = state['stype']

                def override_log(key, old, new):
                    logging.info(
                        'Replacing {} for test type: {}. Old value:'
                        ' ({}), New value: ({})'.format(
                            key,
                            stringify_args(
                                [ttype, test['eval-type'], jtype, stype],
                                joiner='.'), stringify_args(listify(old)),
                            stringify_args(listify(new))))

                # copy defaults
                icores = cores[:]
                iorder = order[:]
                iconp = conp[:]
                ivecsizes = widths[:] if widths is not None else [None]
                imodels = tuple(models.keys())
                # load overrides
                overrides = get_overrides(test, ttype, jtype, stype)

                # check that we can apply
                if 'num_cores' in overrides and not can_override_cores:
                    raise InvalidTestEnivironmentException(
                        ttype, 'num_cores', matrix_name, 'num_threads')
                elif 'num_cores' in overrides and is_gpu:
                    logger = logging.getLogger(__name__)
                    logger.info(
                        'Discarding unused "num_cores" override for GPU '
                        'platform {}'.format(plookup['platform']))
                    del overrides['num_cores']

                # 'num_cores', 'order', 'conp', 'vecsize', 'vectype'
                # now apply overrides
                outplat = plookup.copy()
                ivectypes_override = None
                for override in overrides:
                    if override == 'num_cores':
                        override_log('num_cores', icores,
                                     overrides[override])
                        icores = overrides[override]
                    elif override == 'order' and not is_gpu:
                        override_log('order', iorder, overrides[override])
                        iorder = overrides[override]
                    elif override == 'gpuorder' and is_gpu:
                        override_log('order', iorder, overrides[override])
                        iorder = overrides[override]
                    elif override == 'conp':
                        iconp_save = iconp[:]
                        iconp = []
                        if 'conp' in overrides[override]:
                            iconp.append(True)
                        if 'conv' in overrides[override]:
                            iconp.append(False)
                        override_log('conp', iconp_save, iconp)
                    elif override == 'vecsize' and not is_gpu:
                        override_log('vecsize', ivecsizes,
                                     overrides[override])
                        outplat['vecsize'] = listify(overrides[override])
                    elif override == 'gpuvecsize' and is_gpu:
                        override_log('gpuvecsize', ivecsizes,
                                     overrides[override])
                        outplat['vecsize'] = listify(overrides[override])
                    elif override == 'vectype' and not is_gpu:
                        # applied after the loop, once vecsize is known
                        ivectypes_override = overrides[override]
                    elif override == 'gpuvectype' and is_gpu:
                        ivectypes_override = overrides[override]
                    elif override == 'models':
                        # check that all models are valid
                        for model in overrides[override]:
                            if model not in imodels:
                                raise InvalidOverrideException(
                                    override, model, imodels)
                        # and replace
                        override_log('models', stringify_args(imodels),
                                     stringify_args(overrides[override]))
                        imodels = tuple(overrides[override])

                if ivectypes_override is not None:
                    c = clean.copy()
                    apply_vectypes(c,
                                   outplat['vecsize'],
                                   is_wide='wide' in ivectypes_override,
                                   is_deep='deep' in ivectypes_override)
                    # and copy into working
                    outplat['wide'] = c['wide'] if 'wide' in c else [False]
                    outplat['deep'] = c['deep'] if 'deep' in c else [False]
                    outplat['vecsize'] = c['vecsize']
                    old = ['']
                    if is_wide:
                        old += ['wide']
                    if is_deep:
                        old += ['deep']
                    elif not is_wide:
                        old += ['par']
                    override_log('vecsize', old, ivectypes_override)

                # and finally, convert back to an option loop format
                out_params.append(
                    [('num_cores', icores), ('order', iorder),
                     ('rate_spec', rate_spec),
                     ('split_kernels', split_kernels), ('conp', iconp),
                     ('sparse', [stype]), ('jac_type', [jtype]),
                     ('models', [imodels])] +
                    [(key, value) for key, value in six.iteritems(outplat)])

    max_vec_width = 1
    vector_params = [
        dict(p)['vecsize'] for p in out_params
        if 'vecsize' in dict(p) and dict(p)['vecsize'] != [None]
    ]
    if vector_params:
        max_vec_width = max(max_vec_width,
                            max([max(x) for x in vector_params]))
    from . import reduce_oploop
    loop = reduce_oploop(out_params)
    return models, loop, max_vec_width
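A hedged sketch of consuming the three return values; the state keys are taken from the out_params construction above, while the iteration protocol of the reduced option loop is an assumption:

models, loop, max_vec_width = get_test_matrix(
    'performance_tests', build_type.jacobian, 'test_matrix.yaml',
    for_validation=False)
for state in loop:
    # each state is one parameterization to build and run
    print(state['platform'], state['order'], state['num_cores'])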
Example #8
def get_driver(loopy_opts, namestore, inputs, outputs, driven,
               test_size=None):
    """
    Implements a driver function for kernel evaluation.
    This allows pyJac to utilize a smaller working-buffer (sized to the
    global work size), and implements a static(like) scheduling algorithm

    Notes
    -----
    Currently Loopy doesn't have the machinery to enable native calling of other
    loopy kernels, so we have to fudge this a bit (and this can't be used for
    unit-tests).  Future versions will allow us to natively wrap test functions
    (i.e., once the new function calling interface is in place in Loopy)

    :see:`driver-function` for more information

    Parameters
    ----------
    loopy_opts: :class:`loopy_options`
        The loopy options specifying how to create this kernel
    namestore: :class:`NameStore`
        The namestore class that owns our arrays
    inputs: list of :class:`lp.KernelArgument`
        The arrays that should be copied into internal working buffers
        before calling subfunctions
    outputs: list of :class:`lp.KernelArgument`
        The arrays that should be copied back into global memory after
        calling subfunctions
    driven: :class:`kernel_generator`
        The kernel generator to wrap in the driver
    test_size: int, optional
        If supplied, the fixed problem size used when building the copy
        kernels' map stores

    Returns
    -------
    knl_list : list of :class:`knl_info`
        The generated infos for feeding into the kernel generator

    """

    # we have to do some shenanigans here to get this to work in loopy:
    #
    # 1. Loopy currently doesn't allow you to alter the for-loop increment size,
    #    so for OpenCL where we must increment by the global work size, we have to
    #    put a dummy for-loop in, and teach the kernel generator to work around it
    #
    # 2. Additionally, the OpenMP target in Loopy is Coming Soon (TM), hence we
    #    need our own dynamic scheduling preamble for the driver loop (if we're
    #    operating in queue-form)
    #
    # 3. Finally, Loopy is just now supporting the ability to natively call other
    #    kernels, so for the moment we still need to utilize the dummy function
    #    calling we have set up for the finite difference Jacobian

    # first, get our input / output arrays
    arrays = {}
    to_find = set(listify(inputs)) | set(listify(outputs))
    # create mapping of array names
    array_names = {v.name: v for k, v in six.iteritems(vars(namestore))
                   if isinstance(v, arc.creator) and not (
                    v.fixed_indicies or v.affine)}
    for arr in to_find:
        arr_creator = array_names.get(arr)
        if arr_creator is None:
            continue
        arrays[arr] = arr_creator

    if len(arrays) != len(to_find):
        missing = to_find - set(arrays.keys())
        logger = logging.getLogger(__name__)
        logger.debug('Input/output arrays for queue_driver kernel {} not found.'
                     .format(stringify_args(missing)))
        raise InvalidInputSpecificationException(missing)

    def arr_non_ic(array_input):
        return len(array_input.shape) > 1

    # ensure the inputs and outputs are all identically sized (among those
    # that have a non-initial-condition dimension)

    def __check(check_input):
        shape = ()

        def _raise(desc, inp, nameref, shape):
            logger = logging.getLogger(__name__)
            logger.debug('{} array for driver kernel {} does not '
                         'match expected shape (from array {}).  '
                         'Expected: ({}), got: ({})'.format(
                            desc, inp.name, nameref,
                            stringify_args(shape),
                            stringify_args(inp.shape))
                         )
            raise InvalidInputSpecificationException(inp.name)

        nameref = None
        desc = 'Input' if check_input else 'Output'
        for inp in [arrays[x] for x in (inputs if check_input else outputs)]:
            if not arr_non_ic(inp):
                # only the initial condition dimension, fine
                continue
            if shape:
                if inp.shape != shape and len(inp.shape) == len(shape):
                    # allow different shapes in the last index
                    if not all(x == y for x, y in zip(*(
                            inp.shape[:-1], shape[:-1]))):
                        _raise(desc, inp, nameref, shape)
                    # otherwise, take the maximum of the shape entry
                    shape = shape[:-1] + (max(shape[-1], inp.shape[-1]),)

                elif inp.shape != shape:
                    _raise(desc, inp, nameref, shape)
            else:
                nameref = inp.name
                shape = inp.shape[:]
        if not shape:
            logger = logging.getLogger(__name__)
            logger.debug('No {} arrays supplied to driver that require '
                         'copying to working buffer!'.format(desc))
            raise InvalidInputSpecificationException('Driver ' + desc + ' arrays')
        return shape

    def create_interior_kernel(for_input):
        shape = __check(for_input)
        name = 'copy_{}'.format('in' if for_input else 'out')
        # get arrays
        arrs = [arrays[x] for x in (inputs if for_input else outputs)]

        # create a dummy map and store
        map_shape = np.arange(shape[1], dtype=arc.kint_type)
        mapper = arc.creator(name, arc.kint_type, map_shape.shape, 'C',
                             initializer=map_shape)
        mapstore = arc.MapStore(loopy_opts, mapper, test_size)

        # determine what other inames we need, if any
        namer = UniqueNameGenerator(set([mapstore.iname]))
        extra_inames = []
        for i in six.moves.range(2, len(shape)):
            iname = namer(mapstore.iname)
            extra_inames.append((iname, '0 <= {} < {}'.format(
                iname, shape[i])))

        indicies = [arc.global_ind, mapstore.iname] + [
            ex[0] for ex in extra_inames]
        global_indicies = indicies[:]
        global_indicies[0] += ' + ' + driver_offset.name

        # bake in SIMD pre-split
        vec_spec = None
        split_spec = None
        conditional_index = get_problem_index(loopy_opts)

        def __build(arr, local, **kwargs):
            inds = global_indicies if not local else indicies
            if isinstance(arr, arc.jac_creator) and arr.is_sparse:
                # this is a sparse Jacobian, hence we have to override the default
                # indexing (as we're doing a straight copy)
                kwargs['ignore_lookups'] = True
            if arr_non_ic(arr):
                return mapstore.apply_maps(arr, *inds, **kwargs)
            else:
                return mapstore.apply_maps(arr, inds[0], **kwargs)

        # create working buffer version of arrays
        working_buffers = []
        working_strs = []
        for arr in arrs:
            arr_lp, arr_str = __build(arr, True, use_local_name=True)
            working_buffers.append(arr_lp)
            working_strs.append(arr_str)

        # create global versions of arrays
        buffers = []
        strs = []
        for arr in arrs:
            arr_lp, arr_str = __build(arr, False, reshape_to_working_buffer=False)
            buffers.append(arr_lp)
            strs.append(arr_str)

        # now create the instructions
        instruction_template = Template("""
            if ${ind} < ${problem_size} ${shape_check}
                ${local_buffer} = ${global_buffer} {id=copy_${name}}
            end
        """) if for_input else Template("""
            if ${ind} < ${problem_size} ${shape_check}
                ${global_buffer} = ${local_buffer} {id=copy_${name}}
            end
        """)

        warnings = []
        instructions = []
        for i, arr in enumerate(arrs):
            # get shape check
            shape_check = ''
            if arr.shape[-1] != shape[-1] and len(arr.shape) == len(shape):
                shape_check = ' and {} < {}'.format(
                    indicies[-1], arr.shape[-1])

            instructions.append(instruction_template.substitute(
                local_buffer=working_strs[i],
                global_buffer=strs[i],
                ind=conditional_index,
                problem_size=arc.problem_size.name,
                name=arr.name,
                shape_check=shape_check))
            warnings.append('write_race(copy_{})'.format(arr.name))
        if loopy_opts.is_simd:
            warnings.append('vectorize_failed')
            warnings.append('unrolled_vector_iname_conditional')
        instructions = '\n'.join(instructions)

        kwargs = {}
        if loopy_opts.lang == 'c':
            # override the number of copies in this function to 1
            # (i.e., 1 per-thread)
            kwargs['iname_domain_override'] = [(arc.global_ind, '0 <= {} < 1'.format(
                arc.global_ind))]

        priorities = ([arc.global_ind + '_outer'] if loopy_opts.pre_split
                      else [arc.global_ind]) + [arc.var_name]
        # and return the kernel info
        return knl_info(name=name,
                        instructions=instructions,
                        mapstore=mapstore,
                        var_name=arc.var_name,
                        extra_inames=extra_inames,
                        kernel_data=buffers + working_buffers + [
                          arc.work_size, arc.problem_size, driver_offset],
                        silenced_warnings=warnings,
                        vectorization_specializer=vec_spec,
                        split_specializer=split_spec,
                        unrolled_vector=True,
                        loop_priority=set([tuple(priorities + [
                          iname[0] for iname in extra_inames])]),
                        **kwargs)

    copy_in = create_interior_kernel(True)
    # create a dummy kernel info that simply calls our internal function
    instructions = driven.name + '()'
    # create mapstore
    call_name = driven.name
    repeats = 1
    if loopy_opts.depth:
        # we need 'var_name' to have a non-unity size
        repeats = loopy_opts.vector_width

    map_shape = np.arange(repeats, dtype=arc.kint_type)
    mapper = arc.creator(call_name, arc.kint_type, map_shape.shape, 'C',
                         initializer=map_shape)
    mapstore = arc.MapStore(loopy_opts, mapper, test_size)
    mangler = lp_pregen.MangleGen(call_name, tuple(), tuple())
    kwargs = {}
    if loopy_opts.lang == 'c':
        # override the number of calls to the driven function in the driver, this
        # is currently fixed to 1 (i.e., 1 per-thread)
        kwargs['iname_domain_override'] = [(arc.global_ind, '0 <= {} < 1'.format(
            arc.global_ind))]

    func_call = knl_info(name='driver',
                         instructions=instructions,
                         mapstore=mapstore,
                         kernel_data=[arc.work_size, arc.problem_size],
                         var_name=arc.var_name,
                         extra_inames=copy_in.extra_inames[:],
                         manglers=[mangler],
                         **kwargs)
    copy_out = create_interior_kernel(False)

    # and return
    return [copy_in, func_call, copy_out]
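In pyJac itself these infos are consumed by the kernel generator when a driver type is requested (cf. driver_type=DriverType.lockstep in Example #5); a hedged sketch of direct use, where inner_generator is a hypothetical stand-in for the wrapped kernel generator and the other arguments are assumed to be set up as in that test:

copy_in, func_call, copy_out = get_driver(
    loopy_opts, namestore, ['P_arr', 'phi'], ['jac'], inner_generator)
# the three infos bracket the driven kernel: copy the inputs into the
# working buffer, call the driven kernel, copy the outputs back out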