Exemplo n.º 1
0
def _declareArgs(argList, terminator):
    """Render argList as a typed C++ parameter list, one parameter per line.

    Every entry becomes four spaces followed by entry[0], a space and
    entry[1]; entries are separated by a comma and a newline, and the last
    one is followed by terminator.  An empty argList yields an empty string
    (no terminator at all).
    """
    if len(argList) == 0:
        return ""
    return ",\n".join(
        "    %s %s" % (arg[0], arg[1]) for arg in argList) + terminator


def _passArgs(argList, indent, separator):
    """Render the entry[1] fields of argList as the arguments of a C++ call.

    Each argument is prefixed with indent, arguments are joined with
    separator, and the call is closed with a closing parenthesis, semicolon
    and newline.  An empty argList yields an empty string.
    """
    if len(argList) == 0:
        return ""
    return separator.join(
        "%s%s" % (indent, arg[1]) for arg in argList) + ");\n"


def _isGenericSchedule(schedule):
    """Return True when the schedule's device list is the generic one."""
    deviceNames = schedule[1]
    return deviceNames == ["fallback"] or deviceNames == ["Device 0000"]


def _returnScheduleCall(indent, returnType, scheduleName, problemType,
                        argList):
    """Render a C++ return statement forwarding to a per-schedule function."""
    call = "%sreturn tensileGetSolution%s_%s_%s(" \
        % (indent, returnType, scheduleName, problemType)
    return call + _passArgs(argList, "", ", ")


def writeLogic(outputPath, logicData, solutionWriter):
    """Emit the C++ solution-selection logic for every problem type.

    Files written under outputPath:
      * Tensile.h          - public declarations for each problem type
      * TensileInternal.h  - problem-size key, hash function and map typedefs
      * Tensile.cpp        - all definitions, when MergeFiles is set
      * Logic/Tensile_<problemType>.cpp - one file per problem type otherwise

    Args:
        outputPath: directory that receives the generated files.
        logicData: mapping from problem type to a sequence of schedule
            tuples; each tuple is read by index as (scheduleName,
            deviceNames, solutions, indexOrder, exactLogic, rangeLogic).
        solutionWriter: object providing getArgList(problemType, ...) and
            getSolutionName(solution).  Entries returned by getArgList are
            indexed as [0] and [1]; presumably (C++ type, parameter name) -
            confirm against SolutionWriter.
    """
    print1("# Writing Library Logic")

    if not globalParameters["MergeFiles"]:
        ensurePath(os.path.join(outputPath, "Logic"))

    # Tensile.h
    h = ""
    h += "#pragma once\n"
    h += "#include \"TensileTypes.h\"\n"

    # TensileInternal.h
    ih = ""
    ih += "#include \"Tensile.h\"\n"
    ih += "#include \"SolutionHelper.h\"\n"
    if globalParameters["SolutionMapHash"]:
        ih += "#include <unordered_map>\n"
    else:
        ih += "#include <map>\n"
    ih += "#include <tuple>\n"

    # problem type Key
    problemSizeTemplate = "unsigned int, unsigned int, unsigned int"
    if globalParameters["RuntimeLanguage"] == "OCL":
        problemSizeTemplate += ", cl_command_queue"
    ih += "typedef std::tuple<%s> ProblemSizeKey;\n" \
        % (problemSizeTemplate)

    # hash function
    # NOTE(review): the emitted mask (1<<22)-1 keeps 22 bits although the
    # emitted comments and the 21-bit shifts talk about 21 bits.  The text is
    # left untouched here because changing it alters the generated hash.
    ih += "\n"
    ih += "size_t tensileProblemSizeHasher( const ProblemSizeKey & problemSize ) {\n"
    ih += "  size_t hash = 0;\n"
    ih += "  // ignore lowest 4 bits; keep next 21 bits\n"
    ih += "  size_t hash0 = (std::get<0>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of size0\n"
    ih += "  size_t hash1 = (std::get<1>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of size1\n"
    ih += "  size_t hashU = (std::get<2>(problemSize) >> 4) & ((1<<22)-1); // 21 bits of sizeU\n"
    ih += "  // 21+21+21 = 63 bit hash\n"
    ih += "  hash |= hash0;\n"
    ih += "  hash |= hash1<<21;\n"
    ih += "  hash |= hashU<<42;\n"
    ih += "  return hash;\n"
    ih += "}\n"
    ih += "\n"

    # Tensile.cpp (only survives in MergeFiles mode; otherwise it is replaced
    # by the per-problem-type preamble inside the loop below)
    s = ""
    s += "#include \"Tensile.h\"\n"
    s += "#include \"TensileInternal.h\"\n"
    s += "#include \"Solutions.h\"\n"
    s += "#include \"SolutionMapper.h\"\n"

    ########################################
    # problemType
    for problemType in logicData:

        # function argument lists
        argListSizes = solutionWriter.getArgList(problemType, False, False,
                                                 False)
        argListStream = solutionWriter.getArgList(problemType, False, False,
                                                  True)
        argListData = solutionWriter.getArgList(problemType, True, True, True)

        # declare tensile_ProblemType
        h += "\n// enqueue solution\n"
        h += "TensileStatus tensile_%s(\n" % problemType
        h += _declareArgs(argListData, ");\n\n")

        # declare TensileSolutionPointer_ProblemType
        h += "\n// solution pointer\n"
        h += "typedef TensileStatus (*TensileSolutionPointer_%s)(\n" \
            % problemType
        h += _declareArgs(argListData, ");\n\n")

        numSizes = problemType["TotalIndices"]
        h += "typedef ProblemSizes<%u, %u, %u> ProblemSizes_%s;\n" \
            % (numSizes, problemType["IndicesSummation"][-1], problemType["IndicesFree"][0], problemType)

        # declare tensileGetSolutionPointer_ProblemType
        h += "\n// get solution pointer\n"
        h += "TensileSolutionPointer_%s tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        h += _declareArgs(argListStream, ");\n\n")

        # declare tensileGetSolutionName_ProblemType
        h += "// get solution name\n"
        h += "const char * tensileGetSolutionName_%s(\n" \
            % (problemType)
        h += _declareArgs(argListStream, ");\n\n")

        # unique solutions used by any schedule of this problem type,
        # in first-seen order
        solutionsForProblemType = []
        for scheduleTuple in logicData[problemType]:
            for solution in scheduleTuple[2]:
                if solution not in solutionsForProblemType:
                    solutionsForProblemType.append(solution)

        # solution names for problem type
        solutionNamesForProblemType = [
            solutionWriter.getSolutionName(solution)
            for solution in solutionsForProblemType]

        # reset problemType source
        if not globalParameters["MergeFiles"]:
            filePrefix = "Tensile_%s" % (problemType)
            # The original code also assigned TensileTypes.h and Tensile.h
            # includes to `s` but overwrote them immediately, so they never
            # reached the file.  Both headers still arrive transitively:
            # TensileInternal.h includes Tensile.h, which includes
            # TensileTypes.h.
            s = "#include \"SolutionMapper.h\"\n"
            s += "#include \"TensileInternal.h\"\n"
            for solutionName in solutionNamesForProblemType:
                s += "#include \"%s.h\"\n" % solutionName

        ########################################
        # implement per-Schedule functions in source
        s += "/*******************************************************************************\n * Per-Schedule Functions\n *******************************************************************************/"
        for scheduleTuple in logicData[problemType]:

            # get logic parameters for problem type
            scheduleName = scheduleTuple[0]
            solutionsForSchedule = scheduleTuple[2]
            indexOrder = scheduleTuple[3]
            exactLogic = scheduleTuple[4]
            rangeLogic = scheduleTuple[5]

            # solution names for schedule
            solutionNamesForSchedule = [
                solutionWriter.getSolutionName(solution)
                for solution in solutionsForSchedule]

            s += "\n\n"
            schedProbName = "%s_%s" % (scheduleName, problemType)
            s += writeSolutionAndExactTable(schedProbName, problemType, \
                    solutionsForSchedule, solutionNamesForSchedule, exactLogic)

            # Two functions per schedule with the same body shape:
            #   tensileGetSolutionPointerUncached_Schedule_ProblemType
            #   tensileGetSolutionName_Schedule_ProblemType
            # The flag is forwarded to the logic writers as their last
            # argument (True for the pointer variant, False for the name).
            for wantPointer in (True, False):
                if wantPointer:
                    s += "\n// problem size -> solution logic\n"
                    s += "TensileSolutionPointer_%s tensileGetSolutionPointerUncached_%s(\n" \
                        % (problemType, schedProbName)
                else:
                    s += "\n// get solution name for problem size\n"
                    s += "const char * tensileGetSolutionName_%s(\n" \
                        % (schedProbName)
                s += _declareArgs(argListSizes, ") {\n\n")
                s += writeSolutionAssertionCheckHeader(problemType)

                exactLogicStr = writeExactLogic(schedProbName, problemType, indexOrder, \
                                                solutionsForSchedule, exactLogic, \
                                                solutionNamesForSchedule, wantPointer)
                if rangeLogic is not None:
                    rangeLogicStr = writeRangeLogicRec(0, indexOrder, rangeLogic, \
                        solutionsForSchedule, solutionNamesForSchedule, problemType, wantPointer)
                else:
                    rangeLogicStr = "  return NULL; // none\n"
                s += "  /* exact mappings */\n"
                s += exactLogicStr
                s += "\n  /* range mappings */\n"
                s += rangeLogicStr
                s += "\n}\n"

        ########################################
        # implement problem-type functions in source
        s += "/*******************************************************************************\n * Per-ProblemType Functions\n *******************************************************************************/"

        if globalParameters["SolutionMapHash"]:
            ih += "typedef std::unordered_map<ProblemSizeKey, TensileSolutionPointer_%s, std::function<size_t (ProblemSizeKey)>> Map_%s;\n" \
                % (problemType, problemType )
        else:
            ih += "typedef std::map<ProblemSizeKey, TensileSolutionPointer_%s> Map_%s;\n" \
                % (problemType, problemType)

        ih += "extern Map_%s solutionMap_%s;\n" % (problemType, problemType)

        # implement tensileGetSolutionPointerUncached_ProblemType and
        # tensileGetSolutionName_ProblemType
        for wantPointer in (True, False):
            returnType = "PointerUncached" if wantPointer else "Name"
            s += "\n// return solution %s\n" % returnType
            s += ("TensileSolutionPointer_%s " %
                  problemType) if wantPointer else "const char *"
            s += "tensileGetSolution%s_%s(\n" \
                % (returnType, problemType)
            s += _declareArgs(argListStream, ") {\n")

            # choose from schedules based on device name
            schedules = logicData[problemType]
            numSchedules = len(schedules)
            if numSchedules > 1:

                # device-specific schedules first, generic ones last, so the
                # final entry of the if/else chain is the catch-all
                reorderedSchedules = \
                    [sched for sched in schedules
                     if not _isGenericSchedule(sched)] \
                    + [sched for sched in schedules
                       if _isGenericSchedule(sched)]
                lastScheduleName = reorderedSchedules[-1][0]

                # get device name
                if globalParameters["RuntimeLanguage"] == "OCL":
                    # NOTE(review): this emits a bare placeholder line rather
                    # than real OpenCL device-query code.
                    s += "get device name opencl;\n"
                else:
                    s += "\n//  get device name hip;\n"
                    s += "    int deviceId;\n"
                    s += "    hipGetDevice(&deviceId);\n"
                    s += "    hipDeviceProp_t deviceProperties;\n"
                    s += "    hipGetDeviceProperties(&deviceProperties, deviceId);\n"
                    s += "    std::string name = deviceProperties.name;\n"

                if problemType["DataType"].isDouble():
                    s += "\n"
                    s += "//  intercept schedule selection and call HIP (source) kernel\n"
                    s += "    if((strideA2K == 0) || (strideB2K == 0))\n"
                    s += "    {\n"
                    s += _returnScheduleCall("        ", returnType,
                                             lastScheduleName, problemType,
                                             argListSizes)
                    s += "    }\n"
                    s += "\n"

                if problemType["DataType"].isHalf():
                    # "first" free index, usually the letter "I"
                    free0Index = problemType["IndicesFree"][0]
                    free0Char = globalParameters["IndexChars"][free0Index]
                    # "second" free index, usually the letter "J"
                    free1Index = problemType["IndicesFree"][1]
                    free1Char = globalParameters["IndexChars"][free1Index]
                    s += "\n"
                    s += "//  intercept schedule selection and call HIP (source) kernel\n"
                    s += "//  if either the summation size or the 'first' free index size\n"
                    s += "//  is odd or the 'second' free index size is 1\n"
                    s += "    if (((sizeL & 1) == 1) || ((size%s & 1) == 1)" % (
                        free0Char)
                    s += " || (size%s == 1))\n" % (free1Char)
                    s += "    {\n"
                    s += _returnScheduleCall("        ", returnType,
                                             lastScheduleName, problemType,
                                             argListSizes)
                    s += "    }\n"
                    s += "\n"

                # if / else-if chain on the device name; the last schedule
                # gets an unconditional else
                for scheduleIdx in range(0, numSchedules):
                    schedule = reorderedSchedules[scheduleIdx]
                    scheduleName = schedule[0]
                    deviceNames = schedule[1]
                    if scheduleIdx > 0:
                        s += "    else "
                    if scheduleIdx < numSchedules - 1:
                        s += "if ("
                        s += " || ".join(
                            "name == \"%s\"" % deviceName
                            for deviceName in deviceNames)
                        s += ")"
                    s += "\n    {\n"
                    s += _returnScheduleCall("        ", returnType,
                                             scheduleName, problemType,
                                             argListSizes)
                    s += "    }\n"
            else:  # == 1
                scheduleName = schedules[0][0]
                s += _returnScheduleCall("  ", returnType, scheduleName,
                                         problemType, argListSizes)
            s += "\n}\n"

        # implement tensileGetSolutionPointer_ProblemType
        s += "\n// return solution pointer; user calls it\n"
        s += "Map_%s solutionMap_%s%s;\n" % (
            problemType, problemType, "(1024, tensileProblemSizeHasher)"
            if globalParameters["SolutionMapHash"] else "")
        s += "TensileSolutionPointer_%s tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        s += _declareArgs(argListStream, ") {\n")
        # create key
        s += "  ProblemSizeKey key = std::make_tuple( size%s, size%s, size%s%s );\n" \
            % ( \
            globalParameters["IndexChars"][problemType["Index0"]], \
            globalParameters["IndexChars"][problemType["Index1"]], \
            globalParameters["IndexChars"][problemType["IndexUnroll"]], \
            ", stream" if globalParameters["RuntimeLanguage"] == "OCL" else "")
        # check for key in map
        s += "  static std::mutex findKernelMutex;\n"
        s += "  std::lock_guard<std::mutex> findKernelLock(findKernelMutex);\n"
        s += "  Map_%s::iterator iter = solutionMap_%s.find(key);\n" \
            % (problemType, problemType)
        s += "  if (iter != solutionMap_%s.end()) {\n" % problemType
        s += "    return iter->second;\n"
        s += "  } else {\n"
        s += "    TensileSolutionPointer_%s ptr = tensileGetSolutionPointerUncached_%s(\n" \
            % (problemType, problemType)
        s += _passArgs(argListStream, "        ", ",\n")
        s += "    solutionMap_%s[key] = ptr;\n" % problemType
        s += "    return ptr;\n"
        s += "  }\n"
        s += "}\n"

        # define tensile_ProblemType
        s += "\n// main call to solution; enqueues a kernel\n"
        s += "TensileStatus tensile_%s(\n" % problemType
        s += _declareArgs(argListData, ") {\n")
        s += "    TensileSolutionPointer_%s ptr = tensileGetSolutionPointer_%s(\n" \
            % (problemType, problemType)
        # separator keeps the trailing space the generator has always emitted
        s += _passArgs(argListStream, "        ", ", \n")
        s += "    if ( ptr ) {\n"
        s += "      return ptr("
        s += _passArgs(argListData, "", ", ")
        s += "    } else {\n"
        s += "      return tensileStatusFailure; // no solution found\n"
        s += "    }\n"
        s += "}\n"

        # write the per-problemType source file
        if not globalParameters["MergeFiles"]:
            logicSourcePath = os.path.join(outputPath, "Logic", \
                "%s.cpp" % filePrefix)
            with open(logicSourcePath, "w") as logicSourceFile:
                logicSourceFile.write(s)

    # write the merged source file
    if globalParameters["MergeFiles"]:
        with open(os.path.join(outputPath, "Tensile.cpp"), "w") \
                as logicSourceFile:
            logicSourceFile.write(s)

    with open(os.path.join(outputPath, "Tensile.h"), "w") as logicHeaderFile:
        logicHeaderFile.write(h)

    with open(os.path.join(outputPath, "TensileInternal.h"), "w") \
            as internalHeaderFile:
        internalHeaderFile.write(ih)
Exemplo n.º 2
0
def writeClientParameters(forBenchmark, solutions, problemSizes, stepName, \
    functionList):
    h = ""

    ##############################################################################
    # Min Naming
    ##############################################################################
    if forBenchmark:
        kernels = []
        for solution in solutions:
            solutionKernels = solution.getKernels()
            for kernel in solutionKernels:
                if kernel not in kernels:
                    kernels.append(kernel)

        solutionSerialNaming = Solution.getSerialNaming(solutions)
        kernelSerialNaming = Solution.getSerialNaming(kernels)
        solutionMinNaming = Solution.getMinNaming(solutions)
        kernelMinNaming = Solution.getMinNaming(kernels)
        solutionWriter = SolutionWriter( \
            solutionMinNaming, solutionSerialNaming, \
            kernelMinNaming, kernelSerialNaming)

    if forBenchmark:
        if globalParameters["MergeFiles"]:
            h += "#include \"Solutions.h\"\n"
        else:
            for solution in solutions:
                solutionName = solutionWriter.getSolutionName(solution)
                h += "#include \"" + solutionName + ".h\"\n"
        h += "\n"
    else:
        h += "#include \"Tensile.h\"\n"

    h += "typedef enum {\n"
    h += "    enum_float,\n"
    h += "    enum_double,\n"
    h += "    enum_TensileComplexFloat,\n"
    h += "    enum_TensileComplexDouble\n"
    h += "#ifdef Tensile_ENABLE_HALF\n"
    h += "    ,enum_TensileHalf\n"
    h += "#endif\n"
    h += "} DataTypeEnum;\n"
    h += "\n"

    h += "// Debug Params\n"
    h += "const unsigned printTensorA=%x;\n" % int(
        globalParameters["PrintTensorA"])
    h += "const unsigned printTensorB=%x;\n" % int(
        globalParameters["PrintTensorB"])
    h += "const unsigned printTensorC=%x;\n" % int(
        globalParameters["PrintTensorC"])

    h += "const bool printWinnersOnly=%s;\n" % toCppBool(
        globalParameters["PrintWinnersOnly"])
    h += "\n"

    h += "const char indexChars[%u] = \"%s" \
        % (len(globalParameters["IndexChars"])+1, \
        globalParameters["IndexChars"][0])
    for i in range(1, len(globalParameters["IndexChars"])):
        h += globalParameters["IndexChars"][i]
    h += "\";\n"

    h += "unsigned int functionIdx;\n"
    h += "unsigned int dataTypeIdx;\n"
    h += "unsigned int problemTypeIdx;\n"
    h += "\n"

    ##############################################################################
    # Problem Types
    ##############################################################################
    #dataTypes = []
    #problemTypes = []
    #functionSerialToDataTypeAndIdx = []
    dataTypes = []
    problemTypes = []
    problemTypesForDataType = {}  # for data type
    schedulesForProblemType = {}  # for problem type
    functionInfo = [
    ]  # dataTypeIdx, problemTypeIdx, idxWithinDataType, idxWithinProblemType

    if forBenchmark:
        problemType = solutions[0]["ProblemType"]
        dataType = problemType["DataType"]
        dataTypes.append(dataType)
        problemTypes.append(problemType)
        problemTypesForDataType[dataType] = [problemType]
        schedulesForProblemType[problemType] = solutions
        numProblemTypes = 1
        for solution in solutions:
            functionInfo.append([0, 0, 0, 0, 0, 0])
    else:
        for functionIdx in range(0, len(functionList)):
            function = functionList[functionIdx]
            scheduleName = function[0]
            problemType = function[1]
            dataType = problemType["DataType"]
            if dataType not in dataTypes:
                dataTypes.append(dataType)
                problemTypesForDataType[dataType] = []
            if problemType not in problemTypesForDataType[dataType]:
                problemTypesForDataType[dataType].append(problemType)
                schedulesForProblemType[problemType] = []
            schedulesForProblemType[problemType].append(scheduleName)

        # sort
        dataTypes = sorted(dataTypes)
        for dataType in dataTypes:
            problemTypesForDataType[dataType] = \
                sorted(problemTypesForDataType[dataType])
            for problemType in problemTypesForDataType[dataType]:
                schedulesForProblemType[problemType] = \
                    sorted(schedulesForProblemType[problemType])

        # assign info
        functionIdxSerial = 0
        problemTypeIdxSerial = 0
        for dataTypeIdxSerial in range(0, len(dataTypes)):
            dataType = dataTypes[dataTypeIdxSerial]
            functionIdxForDataType = 0
            for problemTypeIdxForDataType in range(0, \
                len(problemTypesForDataType[dataType])):
                problemType = \
                    problemTypesForDataType[dataType][problemTypeIdxForDataType]
                problemTypes.append(problemType)
                functionIdxForProblemType = 0
                for functionIdxForProblemType in range(0, \
                    len(schedulesForProblemType[problemType])):
                    functionInfo.append([ \
                        dataTypeIdxSerial, \
                        problemTypeIdxForDataType, \
                        problemTypeIdxSerial, \
                        functionIdxSerial,\
                        functionIdxForDataType,\
                        functionIdxForProblemType, \
                        ])
                    functionIdxForProblemType += 1
                    functionIdxForDataType += 1
                    functionIdxSerial += 1
                problemTypeIdxSerial += 1
        numProblemTypes = problemTypeIdxSerial
        numFunctions = functionIdxSerial
        h += "const unsigned int numFunctions = %u;\n" % numFunctions

    ##############################################################################
    # Data Types
    ##############################################################################
    h += "/* data types */\n"
    numDataTypes = len(dataTypes)
    h += "const unsigned int numDataTypes = %u;\n" % numDataTypes
    h += "const DataTypeEnum dataTypeEnums[numDataTypes] = { enum_%s" \
        % dataTypes[0].toCpp()
    for dataTypeIdx in range(1, numDataTypes):
        h += ", enum_%s" % dataTypes[dataTypeIdx].toCpp()
    h += " };\n"
    # bytes per elements
    h += "const unsigned int bytesPerElement[numDataTypes] = { %u" \
        % (dataTypes[0].numBytes())
    for dataTypeIdx in range(1, numDataTypes):
        dataType = dataTypes[dataTypeIdx]
        h += ", %u" % dataType.numBytes()
    h += " };\n"
    # flops per mac
    h += "const unsigned int numFlopsPerMac[numDataTypes] = { %u" \
        % (2 if dataTypes[0].isReal() else 8)
    for dataTypeIdx in range(1, numDataTypes):
        dataType = dataTypes[dataTypeIdx]
        h += ", %u" % (2 if dataType.isReal() else 8)
    h += " };\n"
    for dataTypeIdx in range(0, numDataTypes):
        h += "#define Tensile_DATA_TYPE_%s\n" \
            % dataTypes[dataTypeIdx].toCpp().upper()

    ##############################################################################
    # Problem Types
    ##############################################################################
    h += "/* problem types */\n"
    h += "const unsigned int numProblemTypes = %u;\n" % numProblemTypes
    # Num C Indices
    h += "const unsigned int numIndicesC[numProblemTypes] = { %u" \
        % problemTypes[0]["NumIndicesC"]
    for problemTypeIdx in range(1, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        h += ", %u" % problemType["NumIndicesC"]
    h += " };\n"

    # Num AB Indices
    maxNumIndicesAB = len(problemTypes[0]["IndexAssignmentsA"])
    h += "const unsigned int numIndicesAB[numProblemTypes] = { %u" \
        % len(problemTypes[0]["IndexAssignmentsA"])
    for problemTypeIdx in range(1, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        numIndicesAB = len(problemType["IndexAssignmentsA"])
        h += ", %u" % numIndicesAB
        maxNumIndicesAB = max(numIndicesAB, maxNumIndicesAB)
    h += " };\n"
    h += "const unsigned int maxNumIndicesAB = %u;\n" % maxNumIndicesAB
    # Index Assignments A
    h += "const unsigned int indexAssignmentsA[numProblemTypes][maxNumIndicesAB] = {\n"
    for problemTypeIdx in range(0, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        indices = problemType["IndexAssignmentsA"]
        h += "  { %u" % indices[0]
        for i in range(1, maxNumIndicesAB):
            if i < len(indices):
                h += ", %u" % indices[i]
            else:
                h += ", static_cast<unsigned int>(-1)"
        if problemTypeIdx < numProblemTypes - 1:
            h += " },\n"
        else:
            h += " }\n"
    h += "};\n"
    # Index Assignments B
    h += "const unsigned int indexAssignmentsB[numProblemTypes][maxNumIndicesAB] = {\n"
    for problemTypeIdx in range(0, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        indices = problemType["IndexAssignmentsB"]
        h += "  { %u" % indices[0]
        for i in range(1, maxNumIndicesAB):
            if i < len(indices):
                h += ", %u" % indices[i]
            else:
                h += ", static_cast<unsigned int>(-1)"
        if problemTypeIdx < numProblemTypes - 1:
            h += " },\n"
        else:
            h += " }\n"
    h += "};\n"
    # beta
    h += "bool useBeta[numProblemTypes] = { %s" \
        % ("true" if problemTypes[0]["UseBeta"] else "false")
    for problemTypeIdx in range(1, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        h += ", %s" % ("true" if problemType["UseBeta"] else "false")
    h += " };\n"
    # Complex Conjugates
    h += "const bool complexConjugateA[numProblemTypes] = { %s" \
        % ("true" if problemTypes[0]["ComplexConjugateA"] else "false" )
    for problemTypeIdx in range(1, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        h += ", %s" % ("true"
                       if problemTypes[0]["ComplexConjugateA"] else "false")
    h += " };\n"
    h += "const bool complexConjugateB[numProblemTypes] = { %s" \
        % ("true" if problemTypes[0]["ComplexConjugateB"] else "false" )
    for problemTypeIdx in range(1, numProblemTypes):
        problemType = problemTypes[problemTypeIdx]
        h += ", %s" % ("true"
                       if problemTypes[0]["ComplexConjugateB"] else "false")
    h += " };\n"
    h += "\n"

    if not forBenchmark:
        h += "// dataTypeIdxSerial, problemTypeIdxForDataType, problemTypeIdxSerial, functionIdxSerial, functionIdxForDataType, functionIdxForProblemType\n"
        first = True
        h += "const unsigned int functionInfo[numFunctions][6] = {\n"
        for info in functionInfo:
            h += "%s{ %u, %u, %u, %u, %u, %u }" % ("  " if first else ",\n  ", \
                info[0], info[1], info[2], info[3], info[4], info[5] )
            first = False
        h += " };\n"

    ##############################################################################
    # Problem Sizes
    ##############################################################################
    maxNumIndices = problemTypes[0]["TotalIndices"]
    if not forBenchmark:
        for problemType in problemTypes:
            maxNumIndices = max(problemType["TotalIndices"], maxNumIndices)
    h += "const unsigned int maxNumIndices = %u;\n" % maxNumIndices
    h += "const unsigned int totalIndices[numProblemTypes] = { %u" \
        % problemTypes[0]["TotalIndices"]
    for problemTypeIdx in range(1, numProblemTypes):
        h += ", %u" % problemTypes[problemTypeIdx]["TotalIndices"]
    h += " };\n"
    if forBenchmark:
        h += "const unsigned int numProblems = %u;\n" \
            % problemSizes.totalProblemSizes
        h += "const unsigned int problemSizes[numProblems][%u] = {\n" \
            % problemTypes[0]["TotalIndices"]
        for i in range(0, problemSizes.totalProblemSizes):
            line = "  {%5u" % problemSizes.sizes[i][0]
            for j in range(1, problemTypes[0]["TotalIndices"]):
                line += ",%5u" % problemSizes.sizes[i][j]
            line += " }"
            h += line
            if i < problemSizes.totalProblemSizes - 1:
                h += ","
            else:
                h += "};"
            h += "\n"
        h += "const unsigned int minStrides[%u] = {" \
            % problemTypes[0]["TotalIndices"]
        for i in range(0, len(problemSizes.minStrides)):
            if (i != 0):
                h += ", "
            h += str(problemSizes.minStrides[i])
        h += "};\n"
    else:
        h += "unsigned int userSizes[maxNumIndices];\n"
        h += "unsigned int minStrides[%u] = {" \
            % maxNumIndices
        for i in range(0, maxNumIndices):
            if (i != 0):
                h += ", "
            h += str(0)
            # this branch (forBenchmark is False) always emits 0 for every minStrides entry
        h += "};\n"

    if forBenchmark:
        h += "/* problem sizes */\n"
        """
    h += "const bool indexIsSized[maxNumIndices] = {"
    for i in range(0, problemSizes.totalIndices):
      h += " %s" % ("true" if problemSizes.indexIsSized[i] else "false")
      if i < problemSizes.totalIndices-1:
        h += ","
    h += " };\n"

    h += "const unsigned int numIndicesSized = %u;\n" \
        % len(problemSizes.indicesSized)
    h += "const unsigned int indicesSized[numIndicesSized][4] = {\n"
    h += "// { min, stride, stride_incr, max }\n"
    for i in range(0, len(problemSizes.indicesSized)):
      r = problemSizes.indicesSized[i]
      h += "  { %u, %u, %u, %u }" % (r[0], r[1], r[2], r[3])
      if i < len(problemSizes.indicesSized)-1:
        h += ","
      h += "\n"
    h += "  };\n"

    numIndicesMapped = len(problemSizes.indicesMapped)
    h += "const unsigned int numIndicesMapped = %u;\n" % numIndicesMapped
    if numIndicesMapped > 0:
      h += "#define Tensile_INDICES_MAPPED 1\n"
      h += "const unsigned int indicesMapped[numIndicesMapped] = {"
      for i in range(0, numIndicesMapped):
        h += " %u" % problemSizes.indicesMapped[i]
        if i < numIndicesMapped-1:
          h += ","
      h += " };\n"
    else:
      h += "#define Tensile_INDICES_MAPPED 0\n"
    """

    ##############################################################################
    # Max Problem Sizes
    ##############################################################################
    if forBenchmark:
        h += "size_t maxSizeC = %u;\n" % (problemSizes.maxC)
        h += "size_t maxSizeA = %u;\n" % (problemSizes.maxA)
        h += "size_t maxSizeB = %u;\n" % (problemSizes.maxB)
        h += "\n"
    else:
        h += "size_t maxSizeC;\n"
        h += "size_t maxSizeA;\n"
        h += "size_t maxSizeB;\n"
        h += "\n"

    ##############################################################################
    # Current Problem Size
    ##############################################################################
    h += "/* current problem size */\n"
    #h += "unsigned int fullSizes[maxNumIndices];\n"
    #h += "unsigned int currentSizedIndexSizes[numIndicesSized];\n"
    #h += "unsigned int currentSizedIndexIncrements[numIndicesSized];\n"
    h += "\n"

    ##############################################################################
    # Solutions
    ##############################################################################
    if forBenchmark:
        # Solution Ptrs
        h += "typedef TensileStatus (*SolutionFunctionPointer)(\n"
        argList = solutionWriter.getArgList(solutions[0]["ProblemType"], True,
                                            True, True)
        for i in range(0, len(argList)):
            h += "  %s %s%s" % (argList[i][0], argList[i][1], \
                ",\n" if i < len(argList)-1 else ");\n\n")

        h += "struct ClientSolutionInfo {\n"
        h += "  SolutionFunctionPointer functionPtr;\n"
        h += "  const char *            name;\n"
        # These are the assertions used to generate the solution.
        # They must be checked by the runtime before launching the solution.
        h += "  int                     assertSummationElementMultiple;\n"
        h += "  int                     assertFree0ElementMultiple;\n"
        h += "};\n"

        h += "/* solutions */\n"
        # Problem Type Indices
        h += "const unsigned int maxNumSolutions = %u;\n" % len(solutions)
        h += "float solutionPerf[numProblems][maxNumSolutions]; // milliseconds\n"
        h += "\n"

        h += "static const ClientSolutionInfo solutions[maxNumSolutions] = {\n"
        for i in range(0, len(solutions)):
            solution = solutions[i]
            solutionName = solutionWriter.getSolutionName(solution)
            # add trailing ~ for some reason to the function name
            h += "  {%s, \"%s~\", %d, %d}" % \
              (solutionName, solutionName,
                solution["AssertSummationElementMultiple"],
                solution["AssertFree0ElementMultiple"])
            if i < len(solutions) - 1:
                h += ","
            h += "\n"
        h += " };\n"
        h += "\n"

    else:
        # Function Names
        functionNames = []
        for dataType in dataTypes:
            for problemType in problemTypesForDataType[dataType]:
                # example scheduleName is fiji, vega10, etc
                for scheduleName in schedulesForProblemType[problemType]:
                    functionNames.append("tensile_%s" % (problemType))
        h += "const char *functionNames[numFunctions] = {\n"
        for functionIdx in range(0, len(functionNames)):
            functionName = functionNames[functionIdx]
            h += "    \"%s\"%s\n" % (functionName, \
                "," if functionIdx < len(functionNames)-1 else "" )
        h += " };\n"

    ##############################################################################
    # Runtime Structures
    ##############################################################################
    h += "/* runtime structures */\n"
    h += "TensileStatus status;\n"
    if globalParameters["RuntimeLanguage"] == "OCL":
        h += "cl_platform_id platform;\n"
        h += "cl_device_id device;\n"
        h += "cl_context context;\n"
        h += "cl_command_queue stream;\n"
    else:
        h += "hipStream_t stream;\n"
        #h += "int deviceIdx = %u;\n" \
        #    % (globalParameters["Device"])
    h += "\n"
    h += "void *deviceC;\n"
    h += "void *deviceA;\n"
    h += "void *deviceB;\n"

    ##############################################################################
    # Benchmarking and Validation Parameters
    ##############################################################################
    h += "\n/* benchmarking parameters */\n"
    #h += "const bool measureKernelTime = %s;\n" \
    #    % ("true" if globalParameters["KernelTime"] else "false")
    #h += "const unsigned int numEnqueuesPerSync = %u;\n" \
    #    % (globalParameters["EnqueuesPerSync"])
    #h += "const unsigned int numSyncsPerBenchmark = %u;\n" \
    #    % (globalParameters["SyncsPerBenchmark"])
    #h += "unsigned int numElementsToValidate = %s;\n" \
    #    % (str(globalParameters["NumElementsToValidate"]) \
    #    if globalParameters["NumElementsToValidate"] >= 0 \
    #    else "0xFFFFFFFF" )
    #h += "unsigned int validationMaxToPrint = %u;\n" \
    #    % globalParameters["ValidationMaxToPrint"]
    #h += "bool validationPrintValids = %s;\n" \
    #    % ("true" if globalParameters["ValidationPrintValids"] else "false")
    h += "size_t validationStride;\n"
    if problemType["HighPrecisionAccumulate"]:
        h += "static bool useHighPrecisionAccumulate = true;\n"
    else:
        h += "static bool useHighPrecisionAccumulate = false;\n"
    #h += "unsigned int dataInitTypeC = %s;\n" % globalParameters["DataInitTypeC"]
    #h += "unsigned int dataInitTypeAB = %s;\n" % globalParameters["DataInitTypeAB"]
    h += "\n"

    ##############################################################################
    # Generated Call to Reference
    ##############################################################################
    h += "/* generated call to reference */\n"
    h += "template<typename DataType>\n"
    h += "TensileStatus generatedCallToReferenceCPU(\n"
    h += "    const unsigned int *sizes,\n"
    h += "    const unsigned int *minStrides,\n"
    h += "    DataType *referenceC,\n"
    h += "    DataType *initialA,\n"
    h += "    DataType *initialB,\n"
    h += "    const unsigned int stride_a,\n"
    h += "    const unsigned int stride_b,\n"
    h += "    const unsigned int stride_c,\n"
    h += "    DataType alpha,\n"
    h += "    DataType beta,\n"
    h += "    bool useHighPrecisionAccumulate) {\n"
    h += "  return tensileReferenceCPU(\n"
    h += "      referenceC,\n"
    h += "      initialA,\n"
    h += "      initialB,\n"
    h += "      stride_a,\n"
    h += "      stride_b,\n"
    h += "      stride_c,\n"
    h += "      alpha,\n"
    h += "      beta,\n"
    h += "      totalIndices[problemTypeIdx],\n"
    h += "      sizes,\n"
    h += "      minStrides,\n"
    h += "      numIndicesC[problemTypeIdx],\n"
    h += "      numIndicesAB[problemTypeIdx],\n"
    h += "      indexAssignmentsA[problemTypeIdx],\n"
    h += "      indexAssignmentsB[problemTypeIdx],\n"
    h += "      complexConjugateA[problemTypeIdx],\n"
    h += "      complexConjugateB[problemTypeIdx],\n"
    h += "      validationStride,\n"
    h += "      useHighPrecisionAccumulate);\n"
    h += "};\n"
    h += "\n"

    ##############################################################################
    # Generated Call to Solution
    ##############################################################################
    if forBenchmark:
        problemType = solutions[0]["ProblemType"]
        h += "/* generated call to solution */\n"
        h += "template<typename DataType>\n"
        h += "TensileStatus generatedCallToSolution(\n"
        h += "    unsigned int solutionIdx,\n"
        h += "    const unsigned int *sizes,\n"
        h += "    const unsigned int *minStrides,\n"
        h += "    DataType alpha,\n"
        h += "    DataType beta, \n"
        h += "    unsigned int numEvents = 0, \n"
        if globalParameters["RuntimeLanguage"] == "OCL":
            h += "    cl_event *event_wait_list = NULL,\n"
            h += "    cl_event *outputEvent = NULL ) {\n"
        else:
            h += "    hipEvent_t *startEvent = NULL,\n"
            h += "    hipEvent_t *stopEvent = NULL ) {\n"

        h += "  // calculate parameters assuming packed data\n"
        # strides
        indexChars = globalParameters["IndexChars"]
        firstStride = 1
        if problemType["UseInitialStrides"]:
            firstStride = 0
        lastStrideC = problemType["NumIndicesC"]
        lastStrideA = len(problemType["IndexAssignmentsA"])
        lastStrideB = len(problemType["IndexAssignmentsB"])

        # calculate strides
        for i in range(0, lastStrideC):
            h += "  unsigned int strideC%u%s = 1" % (i, indexChars[i])
            for j in range(0, i):
                h += "* std::max(minStrides[%i], sizes[%i])" % (j, j)
            h += ";\n"
        for i in range(0, lastStrideA):
            h += "  unsigned int strideA%u%s = 1" % (i, \
                indexChars[problemType["IndexAssignmentsA"][i]])
            for j in range(0, i):
                h += "* std::max(minStrides[%i], sizes[%i])" % \
                  (problemType["IndexAssignmentsA"][j],
                   problemType["IndexAssignmentsA"][j])
            h += ";\n"
        for i in range(0, lastStrideB):
            h += "  unsigned int strideB%u%s = 1" % (i, \
                indexChars[problemType["IndexAssignmentsB"][i]])
            for j in range(0, i):
                h += "* std::max(minStrides[%i], sizes[%i])" % \
                  (problemType["IndexAssignmentsB"][j],
                   problemType["IndexAssignmentsB"][j])
            h += ";\n"
        for i in range(0, problemType["TotalIndices"]):
            h += "  unsigned int size%s = sizes[%u];\n" % (indexChars[i], i)
        h += "\n"

        # function call
        h += "  // Check assertions,\n"
        h += writeSolutionAssertionCheckHeader(problemType)
        h += "if (!(\n  "
        h += writeSolutionAssertionChecks(
            "solutions[solutionIdx].assertSummationElementMultiple",
            "solutions[solutionIdx].assertFree0ElementMultiple", "\n  ")
        h += "\n)) { return tensileStatusAssertFailure; } // failed solution requirements\n"
        h += "\n"

        h += "  // call solution function\n"
        h += "  auto f = solutions[solutionIdx].functionPtr;\n"
        if globalParameters["RuntimeLanguage"] == "OCL":
            h += "  return f( static_cast<cl_mem>(deviceC), static_cast<cl_mem>(deviceA), static_cast<cl_mem>(deviceB),\n"
        else:
            typeName = dataTypes[0].toCpp()
            h += "  return f( static_cast<%s *>(deviceC), static_cast<%s *>(deviceA), static_cast<%s *>(deviceB),\n" \
                % (typeName, typeName, typeName)
        h += "      alpha,\n"
        if problemType["UseBeta"]:
            h += "      beta,\n"
        h += "      0, 0, 0, // offsets\n"
        for i in range(firstStride, lastStrideC):
            h += "      strideC%u%s,\n" % (i, indexChars[i])
        for i in range(firstStride, lastStrideA):
            h += "      strideA%u%s,\n" % (i, \
                indexChars[problemType["IndexAssignmentsA"][i]])
        for i in range(firstStride, lastStrideB):
            h += "      strideB%u%s,\n" % (i, \
                indexChars[problemType["IndexAssignmentsB"][i]])
        for i in range(0, problemType["TotalIndices"]):
            h += "      size%s,\n" % indexChars[i]
        h += "      stream,\n"
        if globalParameters["RuntimeLanguage"] == "OCL":
            h += "      numEvents, event_wait_list, outputEvent ); // events\n"
        else:
            h += "      numEvents, startEvent, stopEvent); // events\n"

        h += "};\n"
        h += "\n"
    else:
        ############################################################################
        # Generated Call to Function
        ############################################################################
        for enqueue in [True, False]:
            functionName = "tensile" if enqueue else "tensileGetSolutionName"
            returnName = "TensileStatus" if enqueue else "const char *"
            h += "/* generated call to function */\n"
            h += "template<typename DataType>\n"
            h += "%s generatedCallTo_%s(\n" % (returnName, functionName)
            h += "    unsigned int *sizes,\n"
            h += "    unsigned int *minStrides,\n"
            h += "    DataType alpha,\n"
            h += "    DataType beta, \n"
            h += "    unsigned int strideA, \n"
            h += "    unsigned int strideB, \n"
            h += "    unsigned int strideC, \n"
            h += "    unsigned int numEvents = 0, \n"

            if globalParameters["RuntimeLanguage"] == "OCL":
                h += "    cl_event *event_wait_list = NULL,\n"
                h += "    cl_event *outputEvent = NULL );\n\n"
            else:
                h += "    hipEvent_t *startEvent = NULL,\n"
                h += "    hipEvent_t *stopEvent = NULL );\n\n"

            for dataType in dataTypes:
                typeName = dataType.toCpp()
                functionsForDataType = []
                for problemType in problemTypesForDataType[dataType]:
                    for scheduleName in schedulesForProblemType[problemType]:
                        functionsForDataType.append(
                            [scheduleName, problemType])
                h += "template<>\n"
                h += "inline %s generatedCallTo_%s<%s>(\n" \
                    % (returnName, functionName, typeName)
                h += "    unsigned int *sizes,\n"
                h += "    unsigned int *minStrides,\n"
                h += "    %s alpha,\n" % typeName
                h += "    %s beta,\n" % typeName
                h += "    unsigned int strideA, \n"
                h += "    unsigned int strideB, \n"
                h += "    unsigned int strideC, \n"
                h += "    unsigned int numEvents, \n"

                if globalParameters["RuntimeLanguage"] == "OCL":
                    h += "    cl_event *event_wait_list,\n"
                    h += "    cl_event *outputEvent ) {\n\n"
                else:
                    h += "    hipEvent_t *startEvent,\n"
                    h += "    hipEvent_t *stopEvent ) {\n\n"

                h += "  unsigned int functionIdxForDataType = functionInfo[functionIdx][4];\n"

                for functionIdx in range(0, len(functionsForDataType)):
                    function = functionsForDataType[functionIdx]
                    scheduleName = function[0]
                    problemType = function[1]
                    if len(functionsForDataType) > 1:
                        if functionIdx == 0:
                            h += "  if (functionIdxForDataType == %u) {\n" % functionIdx
                        elif functionIdx == len(functionsForDataType) - 1:
                            h += "  } else {\n"
                        else:
                            h += "  } else if (functionIdxForDataType == %u) {\n" \
                                % functionIdx

                    # strides
                    indexChars = globalParameters["IndexChars"]
                    firstStride = 1
                    if problemType["UseInitialStrides"]:
                        firstStride = 0
                    lastStrideC = problemType["NumIndicesC"]
                    lastStrideA = len(problemType["IndexAssignmentsA"])
                    lastStrideB = len(problemType["IndexAssignmentsB"])

                    # calculate strides
                    for i in range(0, lastStrideC):
                        h += "    unsigned int strideC%u%s = 1" % (
                            i, indexChars[i])
                        for j in range(0, i):
                            h += "*sizes[%i]" % j
                        h += ";\n"
                    h += "    if (strideC != std::numeric_limits<unsigned int>::max())  strideC%u%s = strideC;\n" % (
                        lastStrideC - 1, indexChars[lastStrideC - 1])

                    for i in range(0, lastStrideA):
                        h += "    unsigned int strideA%u%s = 1" % (i, \
                            indexChars[problemType["IndexAssignmentsA"][i]])
                        for j in range(0, i):
                            h += "*sizes[%i]" % \
                              problemType["IndexAssignmentsA"][j]
                        h += ";\n"
                    h += "    if (strideA != std::numeric_limits<unsigned int>::max())  strideA%u%s = strideA;\n" % (
                        lastStrideA - 1, indexChars[
                            problemType["IndexAssignmentsA"][lastStrideA - 1]])
                    for i in range(0, lastStrideB):
                        h += "    unsigned int strideB%u%s = 1" % (i, \
                            indexChars[problemType["IndexAssignmentsB"][i]])
                        for j in range(0, i):
                            h += "*sizes[%i]" % \
                              problemType["IndexAssignmentsB"][j]
                        h += ";\n"
                    h += "    if (strideB != std::numeric_limits<unsigned int>::max())  strideB%u%s = strideB;\n" % (
                        lastStrideB - 1, indexChars[
                            problemType["IndexAssignmentsB"][lastStrideB - 1]])
                    for i in range(0, problemType["TotalIndices"]):
                        h += "    unsigned int size%s = sizes[%u];\n" % (
                            indexChars[i], i)

                    # function call
                    h += "    // call solution function\n"
                    h += "    return %s_%s(\n" % (functionName, problemType)
                    if enqueue:
                        if globalParameters["RuntimeLanguage"] == "OCL":
                            h += "        static_cast<cl_mem>(deviceC),\n"
                            h += "        static_cast<cl_mem>(deviceA),\n"
                            h += "        static_cast<cl_mem>(deviceB),\n"
                        else:
                            h += "        static_cast<%s *>(deviceC),\n" % typeName
                            h += "        static_cast<%s *>(deviceA),\n" % typeName
                            h += "        static_cast<%s *>(deviceB),\n" % typeName
                        h += "        alpha,\n"
                        if problemType["UseBeta"]:
                            h += "        beta,\n"
                        h += "        0, 0, 0, // offsets\n"
                    for i in range(firstStride, lastStrideC):
                        h += "        strideC%u%s,\n" % (i, indexChars[i])
                    for i in range(firstStride, lastStrideA):
                        h += "        strideA%u%s,\n" % (i, \
                            indexChars[problemType["IndexAssignmentsA"][i]])
                    for i in range(firstStride, lastStrideB):
                        h += "        strideB%u%s,\n" % (i, \
                            indexChars[problemType["IndexAssignmentsB"][i]])
                    for i in range(0, problemType["TotalIndices"]):
                        h += "        size%s,\n" % indexChars[i]
                    h += "        stream"
                    if enqueue:
                        if globalParameters["RuntimeLanguage"] == "OCL":
                            h += ",\n        numEvents, event_wait_list, outputEvent"
                        else:
                            h += ",\n        numEvents, startEvent, stopEvent"
                    h += ");\n"

                if len(functionsForDataType) > 1:
                    h += "  }\n"  # close last if
                h += "};\n"  # close callToFunction

    ##############################################################################
    # Results File Name
    ##############################################################################
    if forBenchmark:
        h += "/* results file name */\n"
        resultsFileName = os.path.join(globalParameters["WorkingPath"], \
            "../../Data","%s.csv" % stepName)
        resultsFileName = resultsFileName.replace("\\", "\\\\")
        h += "const char *resultsFileName = \"%s\";\n" % resultsFileName

    ##############################################################################
    # Write File
    ##############################################################################
    clientParametersFile = open(os.path.join(globalParameters["WorkingPath"], \
        "ClientParameters.h"), "w")
    clientParametersFile.write(CHeader)
    clientParametersFile.write(h)
    clientParametersFile.close()