def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, verbose):
    """runs the kernel once and checks the result against answer"""
    logging.debug('check_kernel_output')

    #if not using a custom verify function, check if the length is the same
    if not verify and len(instance.arguments) != len(answer):
        raise TypeError("The length of argument list and provided results do not match.")

    #re-copy original contents of output arguments to GPU memory, to overwrite any changes
    #by earlier kernel runs
    for i, arg in enumerate(instance.arguments):
        if (verify or answer[i] is not None) and isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor)):
            self.dev.memcpy_htod(gpu_args[i], arg)

    #run the kernel
    check = self.run_kernel(func, gpu_args, instance)
    if not check:
        return True    #runtime failure occurred that should be ignored, skip correctness check

    #retrieve gpu results to host memory
    result_host = []
    for i, arg in enumerate(instance.arguments):
        if (verify or answer[i] is not None) and isinstance(arg, (np.ndarray, cp.ndarray)):
            result_host.append(np.zeros_like(arg))
            self.dev.memcpy_dtoh(result_host[-1], gpu_args[i])
        elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor):
            if not answer[i].is_cuda:
                #if the answer is on the host, copy gpu output to host as well
                result_host.append(torch.zeros_like(answer[i]))
                self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor)
            else:
                result_host.append(gpu_args[i].tensor)
        else:
            result_host.append(None)

    #if the user has specified a custom verify function, call it, else use the default based on numpy.allclose
    if verify:
        correct = verify(answer, result_host, atol=atol)
    else:
        correct = _default_verify_function(instance, answer, result_host, atol, verbose)

    if not correct:
        raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params))
    return True
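# Sketch of a verify callable compatible with check_kernel_output above: it is
# called with the full answer list and the list of host-side results, plus an
# atol keyword, and should return a boolean. The function name and the choice
# to compare only the first argument are illustrative assumptions, not part of
# the library.
def example_verify(answer, result_host, atol=None):
    #compare only the first output argument; remaining answer entries are None
    return np.allclose(answer[0], result_host[0], atol=atol)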
def run(self, parameter_space, kernel_options, tuning_options):
    """ Iterate through the entire parameter space using a single Python process

    :param parameter_space: The parameter space as an iterable.
    :type parameter_space: iterable

    :param kernel_options: A dictionary with all options for the kernel.
    :type kernel_options: kernel_tuner.interface.Options

    :param tuning_options: A dictionary with all options regarding the tuning process.
    :type tuning_options: kernel_tuner.interface.Options

    :returns: A list of dictionaries for executed kernel configurations and their
        execution times. And a dictionary that contains information about the
        hardware/software environment on which the tuning took place.
    :rtype: list(dict()), dict()
    """
    logging.debug('sequential runner started for ' + kernel_options.kernel_name)

    results = []

    #iterate over parameter space
    for element in parameter_space:
        params = OrderedDict(zip(tuning_options.tune_params.keys(), element))

        time = self.dev.compile_and_benchmark(self.gpu_args, params, kernel_options, tuning_options)
        if time is None:
            logging.debug('received time is None, kernel configuration was skipped silently due to compile or runtime failure')
            continue

        #print and append to results
        params['time'] = time
        output_string = get_config_string(params, self.units)
        logging.debug(output_string)
        if not self.quiet:
            print(output_string)
        results.append(params)

    return results, self.dev.get_environment()
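# A minimal sketch of how a strategy could drive this runner, assuming the
# runner and option objects have already been constructed by tune_kernel.
# Enumerating the full parameter space with itertools mirrors a brute-force
# strategy; restriction checking is omitted here for brevity.
import itertools

def brute_force_sketch(runner, kernel_options, tuning_options):
    parameter_space = itertools.product(*tuning_options.tune_params.values())
    return runner.run(parameter_space, kernel_options, tuning_options)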
def tune_kernel(kernel_name, kernel_string, problem_size, arguments, tune_params,
                grid_div_x=None, grid_div_y=None, grid_div_z=None, restrictions=None,
                answer=None, atol=1e-6, verify=None, verbose=False, lang=None,
                device=0, platform=0, cmem_args=None, num_threads=1, use_noodles=False,
                sample_fraction=False, compiler=None, compiler_options=None, log=None,
                iterations=7, times=False, block_size_names=None, quiet=False,
                strategy=None, method=None):

    if log:
        logging.basicConfig(filename=kernel_name + datetime.now().strftime('%Y%m%d-%H:%M:%S') + '.log', level=log)

    _check_user_input(kernel_name, kernel_string, arguments, block_size_names)

    # check for forbidden names in tune parameters
    util.check_tune_params_list(tune_params)

    # check whether block_size_names are used as expected
    util.check_block_size_params_names_list(block_size_names, tune_params)

    if iterations < 1:
        raise ValueError("Iterations should be at least one!")

    #sort all the options into separate dicts
    opts = locals()
    kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys()])
    tuning_options = Options([(k, opts[k]) for k in _tuning_options.keys()])
    device_options = Options([(k, opts[k]) for k in _device_options.keys()])

    logging.debug('tune_kernel called')
    logging.debug('kernel_options: %s', util.get_config_string(kernel_options))
    logging.debug('tuning_options: %s', util.get_config_string(tuning_options))
    logging.debug('device_options: %s', util.get_config_string(device_options))

    #select strategy based on user options
    if sample_fraction and not strategy in [None, 'sample_fraction']:
        raise ValueError("It is not possible to use sample_fraction in combination with other strategies. "
                         'Please set strategy=None or strategy="sample_fraction" when using sample_fraction')

    if strategy in [None, 'sample_fraction', 'brute_force']:
        if sample_fraction:
            use_strategy = random_sample
        else:
            use_strategy = brute_force
    elif strategy in ["minimize", "basinhopping"]:
        if method:
            if not (method in ["Nelder-Mead", "Powell", "CG", "BFGS", "L-BFGS-B",
                               "TNC", "COBYLA", "SLSQP"] or callable(method)):
                raise ValueError("method option not recognized")
        else:
            method = "L-BFGS-B"
        if strategy == "minimize":
            use_strategy = minimize
        else:
            use_strategy = basinhopping
    elif strategy == "diff_evo":
        use_strategy = diff_evo
        if method:
            if not method in ["best1bin", "best1exp", "rand1exp", "randtobest1exp",
                              "best2exp", "rand2exp", "randtobest1bin", "best2bin",
                              "rand2bin", "rand1bin"]:
                raise ValueError("method option not recognized")
    else:
        raise ValueError("strategy option not recognized")
    strategy = use_strategy

    #select runner based on user options
    if num_threads == 1 and not use_noodles:
        from kernel_tuner.runners.sequential import SequentialRunner
        runner = SequentialRunner(kernel_options, device_options, iterations)
    elif num_threads > 1 and not use_noodles:
        raise ValueError("Using multiple threads requires the Noodles runner, use use_noodles=True")
    elif use_noodles:
        #check if the Python version matches that required by Noodles
        if sys.version_info[0] < 3 or (sys.version_info[0] == 3 and sys.version_info[1] < 5):
            raise ValueError("Using multiple threads requires Noodles, Noodles requires Python 3.5 or higher")
        #check if Noodles is installed in a way that works with Python 3.4 or newer
        noodles_installed = importlib.util.find_spec("noodles") is not None
        if not noodles_installed:
            raise ValueError("Using multiple threads requires Noodles, please use 'pip install noodles'")
        #import the NoodlesRunner
        from kernel_tuner.runners.noodles import NoodlesRunner
        runner = NoodlesRunner(device_options, num_threads)
    else:
        raise ValueError("Somehow no runner was selected, this should not happen, please file a bug report")

    #call the strategy to execute the tuning process
    results, env = strategy.tune(runner, kernel_options, device_options, tuning_options)

    #finished iterating over search space
    if not device_options.quiet:
        if results:    #checks if results is not empty
            best_config = min(results, key=lambda x: x['time'])
            units = getattr(runner, "units", None)
            print("best performing configuration:", util.get_config_string(best_config, units=units))
        else:
            print("no results to report")

    del runner.dev

    return results, env
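# A hedged usage sketch of the strategy and method options accepted by the
# tune_kernel variant above. The CUDA kernel source, data sizes, and tunable
# values below are illustrative assumptions, not taken from the library.
import numpy
from collections import OrderedDict

def example_tune_with_minimize():
    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            c[i] = a[i] + b[i];
        }
    }
    """
    size = 1000000
    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(a)
    args = [c, a, b, numpy.int32(size)]

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32 * i for i in range(1, 33)]

    #use the scipy.optimize based strategy with one of the supported methods
    return tune_kernel("vector_add", kernel_string, size, args, tune_params,
                       strategy="minimize", method="L-BFGS-B")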
def _default_verify_function(instance, answer, result_host, atol, verbose):
    """default verify function based on numpy.allclose"""

    #first check if the length is the same
    if len(instance.arguments) != len(answer):
        raise TypeError("The length of argument list and provided results do not match.")
    #for each element in the argument list, check if the types match
    for i, arg in enumerate(instance.arguments):
        if answer[i] is not None:    #skip None elements in the answer list
            if isinstance(answer[i], numpy.ndarray) and isinstance(arg, numpy.ndarray):
                if answer[i].dtype != arg.dtype:
                    raise TypeError("Element " + str(i) + " of the expected results list is not of the same dtype as the kernel output: "
                                    + str(answer[i].dtype) + " != " + str(arg.dtype) + ".")
                if answer[i].size != arg.size:
                    raise TypeError("Element " + str(i) + " of the expected results list has a size different from "
                                    + "the kernel argument: " + str(answer[i].size) + " != " + str(arg.size) + ".")
            elif isinstance(answer[i], numpy.number) and isinstance(arg, numpy.number):
                if answer[i].dtype != arg.dtype:
                    raise TypeError("Element " + str(i) + " of the expected results list is not the same as the kernel output: "
                                    + str(answer[i].dtype) + " != " + str(arg.dtype) + ".")
            else:
                #either answer[i] and argument have different types or answer[i] is not a numpy type
                if not isinstance(answer[i], (numpy.ndarray, numpy.number)):
                    raise TypeError("Element " + str(i) + " of expected results list is not a numpy array or numpy scalar.")
                else:
                    raise TypeError("Element " + str(i) + " of expected results list and kernel arguments have different types.")

    def _ravel(a):
        if hasattr(a, 'ravel') and len(a.shape) > 1:
            return a.ravel()
        return a

    def _flatten(a):
        if hasattr(a, 'flatten'):
            return a.flatten()
        return a

    correct = True
    for i, arg in enumerate(instance.arguments):
        expected = answer[i]
        if expected is not None:
            result = _ravel(result_host[i])
            expected = _flatten(expected)
            output_test = numpy.allclose(expected, result, atol=atol)
            if not output_test and verbose:
                print("Error: " + util.get_config_string(instance.params) + " detected during correctness check")
                print("this error occurred when checking value of the %dth kernel argument" % (i,))
                print("Printing kernel output and expected result, set verbose=False to suppress this debug print")
                numpy.set_printoptions(edgeitems=50)
                print("Kernel output:")
                print(result)
                print("Expected:")
                print(expected)
            correct = correct and output_test

    if not correct:
        logging.debug('correctness check has found a correctness issue')
    return correct
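# A brief sketch of an answer list that passes the type checks performed by the
# default verify function above: the expected output has the same dtype and size
# as the corresponding kernel argument, and None skips an argument entirely.
# The array sizes and the cumulative-sum reference are illustrative assumptions.
import numpy

example_input = numpy.random.randn(1024).astype(numpy.float32)
example_output = numpy.zeros_like(example_input)
example_args = [example_output, example_input, numpy.int32(example_input.size)]
example_answer = [numpy.cumsum(example_input, dtype=numpy.float32), None, None]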
def tune_kernel(kernel_name, kernel_string, problem_size, arguments, tune_params,
                grid_div_x=None, grid_div_y=None, grid_div_z=None, restrictions=None,
                answer=None, atol=1e-6, verify=None, verbose=False, lang=None,
                device=0, platform=0, cmem_args=None, texmem_args=None,
                compiler=None, compiler_options=None, log=None, iterations=7,
                block_size_names=None, quiet=False, strategy=None,
                strategy_options=None, cache=None):

    if log:
        logging.basicConfig(filename=kernel_name + datetime.now().strftime('%Y%m%d-%H:%M:%S') + '.log', level=log)

    kernel_source = core.KernelSource(kernel_string, lang)

    _check_user_input(kernel_name, kernel_source, arguments, block_size_names)

    # check for forbidden names in tune parameters
    util.check_tune_params_list(tune_params)

    # check whether block_size_names are used as expected
    util.check_block_size_params_names_list(block_size_names, tune_params)

    if iterations < 1:
        raise ValueError("Iterations should be at least one!")

    #sort all the options into separate dicts
    opts = locals()
    kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys()])
    tuning_options = Options([(k, opts[k]) for k in _tuning_options.keys()])
    device_options = Options([(k, opts[k]) for k in _device_options.keys()])

    logging.debug('tune_kernel called')
    logging.debug('kernel_options: %s', util.get_config_string(kernel_options))
    logging.debug('tuning_options: %s', util.get_config_string(tuning_options))
    logging.debug('device_options: %s', util.get_config_string(device_options))

    if strategy:
        if strategy in strategy_map:
            strategy = strategy_map[strategy]
        else:
            raise ValueError("Strategy %s not recognized" % strategy)

        #make strategy_options into an Options object
        if tuning_options.strategy_options:
            if not isinstance(strategy_options, Options):
                tuning_options.strategy_options = Options(strategy_options)

            #select strategy based on user options
            if "fraction" in tuning_options.strategy_options and not tuning_options.strategy == 'random_sample':
                raise ValueError('It is not possible to use fraction in combination with strategies other than "random_sample". '
                                 'Please set strategy="random_sample" when using "fraction" in strategy_options')

            #check if method is supported by the selected strategy
            if "method" in tuning_options.strategy_options:
                method = tuning_options.strategy_options.method
                if not method in strategy.supported_methods:
                    raise ValueError('Method %s is not supported for strategy %s' % (method, tuning_options.strategy))

        #if no strategy_options dict has been passed, create an empty dictionary
        else:
            tuning_options.strategy_options = Options({})

    #if no strategy selected
    else:
        strategy = brute_force

    runner = SequentialRunner(kernel_source, kernel_options, device_options, iterations)

    #the user-specified function may or may not have an optional atol argument;
    #we normalize it so that it always accepts atol.
    tuning_options.verify = util.normalize_verify_function(tuning_options.verify)

    #process cache
    if cache:
        if cache[-5:] != ".json":
            cache += ".json"
        util.process_cache(cache, kernel_options, tuning_options, runner)
    else:
        tuning_options.cache = {}
        tuning_options.cachefile = None

    #call the strategy to execute the tuning process
    results, env = strategy.tune(runner, kernel_options, device_options, tuning_options)

    #finished iterating over search space
    if not device_options.quiet:
        if results:    #checks if results is not empty
            best_config = min(results, key=lambda x: x['time'])
            units = getattr(runner, "units", None)
            print("best performing configuration:",
                  util.get_config_string(best_config, list(tune_params.keys()) + ['time'], units=units))
        else:
            print("no results to report")

    if cache:
        util.close_cache(cache)

    del runner.dev

    return results, env
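# A hedged usage sketch for the tune_kernel variant above, exercising output
# verification, a sampling strategy with strategy_options, and the cachefile.
# The CUDA kernel source, data sizes, strategy option values, and the cache
# filename are illustrative assumptions, not part of the library.
import numpy
from collections import OrderedDict

def example_tune_with_verification():
    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            c[i] = a[i] + b[i];
        }
    }
    """
    size = 10000000
    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(a)
    args = [c, a, b, numpy.int32(size)]

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [128 + 64 * i for i in range(15)]

    #None entries in answer skip verification for the corresponding argument
    answer = [a + b, None, None, None]

    return tune_kernel("vector_add", kernel_string, size, args, tune_params,
                       answer=answer, atol=1e-6,
                       strategy="random_sample",
                       strategy_options={"fraction": 0.1},
                       cache="vector_add_cache.json")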
def tune():
    with open('reduction.cl', 'r') as f:
        kernel_string = f.read()

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5, 11)]
    tune_params["vector"] = [2**i for i in range(3)]
    tune_params["num_blocks"] = [2**i for i in range(5, 11)]

    problem_size = "num_blocks"
    size = 80000000
    max_blocks = max(tune_params["num_blocks"])
    x = numpy.random.rand(size).astype(numpy.float32)
    sum_x = numpy.zeros(max_blocks).astype(numpy.float32)
    n = numpy.int32(size)
    args = [sum_x, x, n]

    #prepare output verification with custom function
    reference = [numpy.sum(x), None, None]

    def verify_partial_reduce(cpu_result, gpu_result, atol=None):
        return numpy.isclose(cpu_result, numpy.sum(gpu_result), atol=atol)

    #tune the first kernel
    first_kernel, _ = tune_kernel("sum_floats", kernel_string, problem_size, args,
                                  tune_params, grid_div_x=[], verbose=True,
                                  answer=reference, verify=verify_partial_reduce)

    #tune the second kernel for different input sizes
    #depending on the number of blocks used in the first kernel

    #store the parameter list used in the first kernel
    num_blocks = tune_params["num_blocks"]
    #fix num_blocks parameter to only 1 for the second kernel
    tune_params["num_blocks"] = [1]

    second_kernel = dict()
    for nblocks in num_blocks:
        #change the input size to nblocks
        args = [sum_x, x, numpy.int32(nblocks)]

        #tune the second kernel with n=nblocks
        result, _ = tune_kernel("sum_floats", kernel_string, problem_size, args,
                                tune_params, grid_div_x=[], verbose=True)
        with open("reduce-kernel2-" + str(nblocks) + ".json", 'w') as fp:
            json.dump(result, fp)

        #only keep the best performing config
        second_kernel[nblocks] = min(result, key=lambda x: x['time'])

    #combine the results from the first kernel with the best
    #second kernel that uses the same num_blocks
    for i, instance in enumerate(first_kernel):
        first_kernel[i]["total"] = instance["time"] + second_kernel[instance["num_blocks"]]["time"]

    best_config = min(first_kernel, key=lambda x: x['total'])
    print("Best performing config: \n" + get_config_string(best_config))
    print("uses the following config for the secondary kernel:")
    print(get_config_string(second_kernel[best_config["num_blocks"]]))

    with open("reduce.json", 'w') as fp:
        json.dump(first_kernel, fp)

    return first_kernel, second_kernel
def _default_verify_function(instance, answer, result_host, atol, verbose):
    """default verify function based on numpy.allclose"""

    #first check if the length is the same
    if len(instance.arguments) != len(answer):
        raise TypeError("The length of argument list and provided results do not match.")
    #for each element in the argument list, check if the types match
    for i, arg in enumerate(instance.arguments):
        if answer[i] is not None:    #skip None elements in the answer list
            if isinstance(answer[i], numpy.ndarray) and isinstance(arg, numpy.ndarray):
                if answer[i].dtype != arg.dtype:
                    raise TypeError("Element " + str(i) + " of the expected results list is not of the same dtype as the kernel output: "
                                    + str(answer[i].dtype) + " != " + str(arg.dtype) + ".")
                if answer[i].size != arg.size:
                    raise TypeError("Element " + str(i) + " of the expected results list has a size different from "
                                    + "the kernel argument: " + str(answer[i].size) + " != " + str(arg.size) + ".")
            elif isinstance(answer[i], numpy.number) and isinstance(arg, numpy.number):
                if answer[i].dtype != arg.dtype:
                    raise TypeError("Element " + str(i) + " of the expected results list is not the same as the kernel output: "
                                    + str(answer[i].dtype) + " != " + str(arg.dtype) + ".")
            else:
                #either answer[i] and argument have different types or answer[i] is not a numpy type
                if not isinstance(answer[i], (numpy.ndarray, numpy.number)):
                    raise TypeError("Element " + str(i) + " of expected results list is not a numpy array or numpy scalar.")
                else:
                    raise TypeError("Element " + str(i) + " of expected results list and kernel arguments have different types.")

    def _ravel(a):
        if hasattr(a, 'ravel') and len(a.shape) > 1:
            return a.ravel()
        return a

    def _flatten(a):
        if hasattr(a, 'flatten'):
            return a.flatten()
        return a

    correct = True
    for i, arg in enumerate(instance.arguments):
        expected = answer[i]
        if expected is not None:
            result = _ravel(result_host[i])
            expected = _flatten(expected)
            output_test = numpy.allclose(expected, result, atol=atol)
            if not output_test and verbose:
                print("Error: " + util.get_config_string(instance.params) + " detected during correctness check")
                print("this error occurred when checking value of the %dth kernel argument" % (i,))
                print("Printing kernel output and expected result, set verbose=False to suppress this debug print")
                numpy.set_printoptions(edgeitems=50)
                print("Kernel output:")
                print(result)
                print("Expected:")
                print(expected)
            correct = correct and output_test

    if not correct:
        logging.debug('correctness check has found a correctness issue')
        raise Exception("Error: " + util.get_config_string(instance.params) + " failed correctness check")

    return correct
def tune_hyper_params(target_strategy, hyper_params, *args, **kwargs):
    """ Tune hyperparameters for a given strategy and kernel

    This function is to be called just like tune_kernel, except that you specify
    a strategy and a dictionary with hyperparameters in front of the arguments
    you pass to tune_kernel.

    The arguments to tune_kernel should contain a cachefile. To compute the
    optimum the hyperparameter tuner first tunes the kernel with a brute force
    search. If your cachefile is not yet complete this may take a very long time.

    :param target_strategy: Specify the strategy for which to tune hyperparameters
    :type target_strategy: string

    :param hyper_params: A dictionary with the hyperparameters as keys and a list of possible values per key
    :type hyper_params: dict(string: list)

    :param args: all positional arguments used to call tune_kernel
    :type args: various

    :param kwargs: other keyword arguments to pass to tune_kernel
    :type kwargs: dict
    """
    if not "cache" in kwargs:
        raise ValueError("Please specify a cachefile to store benchmarking data when tuning hyperparameters")

    def put_if_not_present(d, key, value):
        d[key] = value if not key in d else d[key]

    put_if_not_present(kwargs, "verbose", False)
    put_if_not_present(kwargs, "quiet", True)
    put_if_not_present(kwargs, "simulation_mode", True)

    #last positional argument is tune_params
    tune_params = args[-1]

    #find the optimum with a brute force search
    kwargs["strategy"] = "brute_force"
    results, env = kernel_tuner.tune_kernel(*args, **kwargs)
    optimum = min(results, key=lambda p: p["time"])["time"]

    #could throw a warning for the kwargs that will be overwritten, strategy(_options)
    kwargs["strategy"] = target_strategy

    parameter_space = itertools.product(*hyper_params.values())
    all_results = []

    for params in parameter_space:
        strategy_options = dict(zip(hyper_params.keys(), params))
        kwargs["strategy_options"] = strategy_options

        fevals = []
        p_of_opt = []
        for _ in range(100):
            #measure
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                results, env = kernel_tuner.tune_kernel(*args, **kwargs)

            #get unique function evaluations
            unique_fevals = {",".join([str(v) for k, v in record.items() if k in tune_params])
                             for record in results}

            fevals.append(len(unique_fevals))
            # p_of_opt.append(optimum / min(results, key=lambda p: p["time"])["time"] * 100)
            p_of_opt.append(min(results, key=lambda p: p["time"])["time"] / optimum * 100)

        strategy_options["fevals"] = np.average(fevals)
        strategy_options["fevals_std"] = np.std(fevals)
        strategy_options["p_of_opt"] = np.average(p_of_opt)
        strategy_options["p_of_opt_std"] = np.std(p_of_opt)

        print(get_config_string(strategy_options))
        all_results.append(strategy_options)

    return all_results
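# A hedged sketch of calling tune_hyper_params for a given kernel setup. The
# target strategy name and the hyperparameter names/values below are
# illustrative assumptions and must correspond to strategy_options that the
# chosen strategy actually supports; the kernel setup is passed in unchanged.
from collections import OrderedDict

def example_tune_hyper_params(kernel_string, size, args, tune_params, cachefile):
    hyper_params = OrderedDict()
    hyper_params["popsize"] = [10, 20, 30]
    hyper_params["maxiter"] = [50, 100, 200]
    return tune_hyper_params("genetic_algorithm", hyper_params,
                             "vector_add", kernel_string, size, args, tune_params,
                             cache=cachefile)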
def tune(algorithm, do_strategy):
    result_summary = {}

    tune_func = algorithms[algorithm]['method']
    test_methods = strategy_options[do_strategy]

    for method in test_methods:
        if method:
            experiment_name = do_strategy + "_" + method
        else:
            experiment_name = do_strategy

        summary = OrderedDict()
        summary["best"] = []
        summary["best_times"] = []
        summary["execution_time"] = []

        try:
            #test all methods multiple times because some methods are stochastic
            for i in range(32 if do_strategy != "brute_force" else 1):

                outfile = algorithm + "/" + algorithm + "_" + experiment_name
                if do_strategy != "brute_force":
                    outfile += "_" + str(i)
                outfile += ".json"

                if os.path.isfile(outfile):
                    print("output file %s already exists, skipping this experiment" % outfile)
                    continue

                start = time.time()
                if 'options' in algorithms[algorithm]:
                    results, env = tune_func(do_strategy, method, algorithms[algorithm]['options'])
                else:
                    results, env = tune_func(do_strategy, method)
                end = time.time()
                env['execution_time'] = end - start
                gc.collect()

                with open(outfile, 'w') as fp:
                    json.dump(results, fp)

                best_config = min(results, key=lambda x: x['time'])
                summary["best"].append(best_config)
                summary["best_times"].append(best_config['time'])
                summary["execution_time"].append(env['execution_time'])
        finally:
            if len(summary["best"]) > 0:
                result_summary[experiment_name] = summary
                update_results_db(algorithm, result_summary)

    #print some output at end of run, not strictly necessary
    with open(algorithm + "/" + algorithm + "_summary.json", 'r') as fp:
        result_summary = json.load(fp)

    total_ops = algorithms[algorithm]['total_ops']
    unit = algorithms[algorithm]['unit']

    for k, d in result_summary.items():
        print(k)
        for i, config in enumerate(d["best"]):
            print(get_config_string(config),
                  str(total_ops / (config['time'] / 1e3)) + " " + unit,
                  str(d["execution_time"][i]) + " sec")
        print("average best performance: " + str(numpy.average(d["best_times"])))
        print("average execution_time: " + str(numpy.average(d["execution_time"])))
def check_kernel_correctness(self, func, gpu_args, instance, answer, atol, verify, verbose):
    """runs the kernel once and checks the result against answer"""
    logging.debug('check_kernel_correctness')
    params = instance.params

    #zero GPU memory for output arguments
    for i, arg in enumerate(instance.arguments):
        if answer[i] is not None:
            self.dev.memset(gpu_args[i], 0, arg.nbytes)

    #run the kernel
    if not self.run_kernel(func, gpu_args, instance):
        return True    #runtime failure occurred that should be ignored, skip correctness check

    def _ravel(a):
        if hasattr(a, 'ravel') and len(a.shape) > 1:
            return a.ravel()
        return a

    def _flatten(a):
        if hasattr(a, 'flatten'):
            return a.flatten()
        return a

    #check correctness of each output argument
    correct = True
    for i, arg in enumerate(instance.arguments):
        expected = answer[i]
        if expected is not None:
            result_host = numpy.zeros_like(arg)
            self.dev.memcpy_dtoh(result_host, gpu_args[i])

            result_host = _ravel(result_host)
            expected = _flatten(expected)

            if verify is None:
                output_test = numpy.allclose(expected, result_host, atol=atol)
            else:
                try:
                    output_test = verify(expected, result_host, atol=atol)
                except TypeError:
                    output_test = verify(expected, result_host)

            if not output_test and verbose:
                print("Error: " + util.get_config_string(params) + " detected during correctness check")
                print("Printing kernel output and expected result, set verbose=False to suppress this debug print")
                numpy.set_printoptions(edgeitems=50)
                print("Kernel output:")
                print(result_host)
                print("Expected:")
                print(expected)

            correct = correct and output_test
            del result_host

    if not correct:
        logging.debug('correctness check has found a correctness issue')
        raise Exception("Error: " + util.get_config_string(params) + " failed correctness check")

    return correct
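# Sketch of a user verify function compatible with check_kernel_correctness
# above: it is called once per output argument with the expected value and the
# flattened host-side result, and may optionally accept atol (the library falls
# back to calling it without atol). The comparison used here is an illustrative
# assumption.
def example_verify_argument(expected, result_host, atol=1e-6):
    return numpy.allclose(expected, result_host, atol=atol)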