def _benchmark_algo(
    benchmark,
    name,
    dataset_name,
    n_samples=10000,
    n_features=100,
    input_type='numpy',
    data_kwargs=None,
    algo_args=None,
):
    """Simplest benchmark wrapper to time algorithm 'name' on dataset
    'dataset_name'.

    Parameters
    ----------
    benchmark : callable
        Invoked once with a zero-argument function that runs the algorithm.
        NOTE(review): presumably the pytest-benchmark fixture -- confirm
        against the callers.
    name : str
        Name handed to ``algorithms.algorithm_by_name``.
    dataset_name : str
        Dataset name handed to ``datagen.gen_data``.
    n_samples, n_features : int
        Forwarded to ``datagen.gen_data`` as keyword arguments.
    input_type : str
        Data format handed to ``datagen.gen_data``.
    data_kwargs : dict, optional
        Extra keyword arguments for ``datagen.gen_data``. ``None`` (the
        default) means no extra arguments.
    algo_args : dict, optional
        Extra keyword arguments for ``algo.run_cuml``. ``None`` (the
        default) means no extra arguments.
    """
    # ``None`` sentinels replace the former ``{}`` defaults so that no
    # mutable object is shared between calls.
    data_kwargs = {} if data_kwargs is None else data_kwargs
    algo_args = {} if algo_args is None else algo_args

    algo = algorithms.algorithm_by_name(name)
    # The data is generated once, outside of the function handed to
    # ``benchmark``, so only the algorithm run itself is measured.
    data = datagen.gen_data(
        dataset_name,
        input_type,
        n_samples=n_samples,
        n_features=n_features,
        **data_kwargs
    )

    def _benchmark_inner():
        algo.run_cuml(data, **algo_args)

    benchmark(_benchmark_inner)
def _run_one_size(
    self,
    algo_pair,
    n_samples,
    n_features,
    param_overrides=None,
    cuml_param_overrides=None,
    cpu_param_overrides=None,
    run_cpu=True,
):
    """Time one cuML run (and optionally one CPU run) on generated data.

    Parameters
    ----------
    algo_pair : object
        Must provide ``run_cuml``, ``run_cpu`` and a ``cpu_class`` attribute.
    n_samples, n_features : int
        Size of the dataset requested from ``datagen.gen_data``.
    param_overrides : dict, optional
        Keyword arguments passed to both the cuML and the CPU run.
    cuml_param_overrides : dict, optional
        Keyword arguments passed to the cuML run only.
    cpu_param_overrides : dict, optional
        Keyword arguments passed to the CPU run only.
    run_cpu : bool
        When False, or when ``algo_pair.cpu_class`` is None, the CPU run is
        skipped and ``cpu_time`` is reported as 0.0.

    Returns
    -------
    dict
        ``cu_time``, ``cpu_time``, ``speedup`` (cpu_time / cu_time),
        ``n_samples``, ``n_features`` plus every entry of
        ``param_overrides`` and ``cuml_param_overrides``.
    """
    # ``None`` sentinels replace the former shared ``{}`` defaults.
    param_overrides = {} if param_overrides is None else param_overrides
    cuml_param_overrides = (
        {} if cuml_param_overrides is None else cuml_param_overrides
    )
    cpu_param_overrides = (
        {} if cpu_param_overrides is None else cpu_param_overrides
    )

    data = datagen.gen_data(
        self.dataset_name, self.input_type, n_samples, n_features
    )
    print("data type: ", data[0].__class__)

    # perf_counter is monotonic and high-resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    cu_start = time.perf_counter()
    algo_pair.run_cuml(data, **param_overrides, **cuml_param_overrides)
    cu_elapsed = time.perf_counter() - cu_start

    if run_cpu and algo_pair.cpu_class is not None:
        cpu_start = time.perf_counter()
        # CPU-specific overrides were previously accepted but never used.
        algo_pair.run_cpu(data, **param_overrides, **cpu_param_overrides)
        cpu_elapsed = time.perf_counter() - cpu_start
    else:
        cpu_elapsed = 0.0

    return dict(
        cu_time=cu_elapsed,
        cpu_time=cpu_elapsed,
        speedup=cpu_elapsed / float(cu_elapsed),
        n_samples=n_samples,
        n_features=n_features,
        **param_overrides,
        **cuml_param_overrides
    )
def test_training_data_to_numpy(input_type):
    """`_training_data_to_numpy` must return NumPy arrays for X and y."""
    generated = datagen.gen_data(
        'blobs', input_type, n_samples=100, n_features=10
    )
    # Only the first two entries (features, labels) are needed here.
    features, labels, *_ = generated

    features_np, labels_np = _training_data_to_numpy(features, labels)

    for converted in (features_np, labels_np):
        assert isinstance(converted, np.ndarray)
def test_data_generator_split():
    """Check the train/test shapes produced when a test fraction is given.

    With ``n_samples=100`` and ``test_fraction=0.20`` the training split is
    expected to hold 100 rows and the test split 25 rows.
    """
    splits = datagen.gen_data(
        'blobs',
        'numpy',
        n_samples=100,
        n_features=10,
        test_fraction=0.20,
    )
    # Exactly four entries are expected; the label arrays are not inspected.
    train_features, _train_labels, test_features, _test_labels = splits

    assert train_features.shape == (100, 10)
    assert test_features.shape == (25, 10)
def test_data_generator_types(input_type):
    """The generated feature container must match the requested format."""
    features, *_ = datagen.gen_data(
        'blobs', input_type, n_samples=100, n_features=10
    )

    # Each checker is a lambda so that a library is only touched for the
    # format that is actually under test.
    type_checks = {
        'numpy': lambda obj: isinstance(obj, np.ndarray),
        'cudf': lambda obj: isinstance(obj, cudf.DataFrame),
        'pandas': lambda obj: isinstance(obj, pd.DataFrame),
        'gpuarray': lambda obj: cuda.is_cuda_array(obj),
    }

    checker = type_checks.get(input_type)
    # An unknown input type is a test failure.
    assert checker is not None
    assert checker(features)
def _run_one_size(
    self,
    algo_pair,
    n_samples,
    n_features,
    param_overrides=None,
    cuml_param_overrides=None,
    cpu_param_overrides=None,
    run_cpu=True,
    verbose=False,
):
    """Benchmark one algorithm pair on one generated dataset size.

    The cuML implementation (and, optionally, the CPU implementation) is
    set up once and then executed under a ``BenchmarkTimer`` built with
    ``self.n_reps``; the minimum recorded timing is reported.

    Parameters
    ----------
    algo_pair : object
        Must provide ``setup_cuml``/``run_cuml``, ``setup_cpu``/``run_cpu``,
        and the ``cpu_class`` and ``name`` attributes.
    n_samples, n_features : int
        Size of the dataset requested from ``datagen.gen_data``.
    param_overrides : dict, optional
        Keyword arguments passed to both implementations.
    cuml_param_overrides : dict, optional
        Keyword arguments passed to the cuML implementation only.
    cpu_param_overrides : dict, optional
        Keyword arguments passed to the CPU implementation only.
    run_cpu : bool
        When False, or when ``algo_pair.cpu_class`` is None, the CPU run is
        skipped and ``cpu_time`` is reported as 0.0.
    verbose : bool
        When True, print the measured speedup.

    Returns
    -------
    dict
        ``cu_time``, ``cpu_time``, ``speedup`` (cpu_time / cu_time),
        ``n_samples``, ``n_features`` plus every entry of
        ``param_overrides`` and ``cuml_param_overrides``.
    """
    # ``None`` sentinels replace the former shared ``{}`` defaults.
    param_overrides = {} if param_overrides is None else param_overrides
    cuml_param_overrides = (
        {} if cuml_param_overrides is None else cuml_param_overrides
    )
    cpu_param_overrides = (
        {} if cpu_param_overrides is None else cpu_param_overrides
    )

    data = datagen.gen_data(
        self.dataset_name, self.input_type, n_samples, n_features
    )

    # The setup step returns extra keyword arguments for the timed run.
    setup_overrides = algo_pair.setup_cuml(
        data, **param_overrides, **cuml_param_overrides
    )
    cuml_timer = BenchmarkTimer(self.n_reps)
    for _ in cuml_timer.benchmark_runs():
        algo_pair.run_cuml(
            data,
            **param_overrides,
            **cuml_param_overrides,
            **setup_overrides
        )
    cu_elapsed = np.min(cuml_timer.timings)

    if run_cpu and algo_pair.cpu_class is not None:
        # CPU-specific overrides were previously accepted but never used.
        setup_overrides = algo_pair.setup_cpu(
            data, **param_overrides, **cpu_param_overrides
        )
        cpu_timer = BenchmarkTimer(self.n_reps)
        for _ in cpu_timer.benchmark_runs():
            algo_pair.run_cpu(
                data,
                **param_overrides,
                **cpu_param_overrides,
                **setup_overrides
            )
        cpu_elapsed = np.min(cpu_timer.timings)
    else:
        cpu_elapsed = 0.0

    speedup = cpu_elapsed / float(cu_elapsed)
    if verbose:
        print("%s Speedup (n_samples=%s, n_features=%s) = %s" %
              (algo_pair.name, n_samples, n_features, speedup))

    return dict(
        cu_time=cu_elapsed,
        cpu_time=cpu_elapsed,
        speedup=speedup,
        n_samples=n_samples,
        n_features=n_features,
        **param_overrides,
        **cuml_param_overrides
    )
def test_data_generators(dataset):
    """Every generator must yield a NumPy feature array with 100 rows."""
    generated = datagen.gen_data(
        dataset, "numpy", n_samples=100, n_features=10
    )
    features = generated[0]

    assert isinstance(features, np.ndarray)
    n_rows = features.shape[0]
    assert n_rows == 100
def _run_one_size(
    self,
    algo_pair,
    n_samples,
    n_features,
    param_overrides=None,
    cuml_param_overrides=None,
    cpu_param_overrides=None,
    run_cpu=True,
    verbose=False,
):
    """Benchmark and score one algorithm pair on one dataset size.

    The cuML implementation (and, optionally, the CPU implementation) is
    set up once and executed under a ``BenchmarkTimer`` built with
    ``self.n_reps``; the minimum recorded timing is reported. When
    ``algo_pair.accuracy_function`` is set, the model returned by the last
    run is scored on the held-out part of the data (``data[2:]``).

    Parameters
    ----------
    algo_pair : object
        Must provide ``setup_cuml``/``run_cuml``, ``setup_cpu``/``run_cpu``,
        and the ``cpu_class``, ``accuracy_function``,
        ``cuml_data_prep_hook`` and ``cpu_data_prep_hook`` attributes.
    n_samples, n_features : int
        Size of the dataset requested from ``datagen.gen_data``.
    param_overrides : dict, optional
        Keyword arguments passed to both implementations.
    cuml_param_overrides : dict, optional
        Keyword arguments for cuML only; they win over ``param_overrides``
        when a key appears in both.
    cpu_param_overrides : dict, optional
        Keyword arguments for the CPU implementation only; they win over
        ``param_overrides`` when a key appears in both.
    run_cpu : bool
        When False, or when ``algo_pair.cpu_class`` is None, the CPU run is
        skipped and ``cpu_time``/``cpu_acc`` are reported as 0.0.
    verbose : bool
        Accepted for interface compatibility; not used here.

    Returns
    -------
    dict
        ``cu_time``, ``cpu_time``, ``cuml_acc``, ``cpu_acc``, ``speedup``
        (cpu_time / cu_time), ``n_samples``, ``n_features`` plus the merged
        entries of ``param_overrides`` and ``cuml_param_overrides``.
    """
    # ``None`` sentinels replace the former shared ``{}`` defaults.
    param_overrides = {} if param_overrides is None else param_overrides
    cuml_param_overrides = (
        {} if cuml_param_overrides is None else cuml_param_overrides
    )
    cpu_param_overrides = (
        {} if cpu_param_overrides is None else cpu_param_overrides
    )

    data = datagen.gen_data(
        self.dataset_name,
        self.input_type,
        n_samples,
        n_features,
        test_fraction=self.test_fraction,
    )

    def _score(model, prep_hook):
        # Score `model` on the held-out split, optionally pre-processed.
        held_out = data[2:]
        if prep_hook is not None:
            X_test, y_test = prep_hook(held_out)
        else:
            X_test, y_test = held_out
        # Models without `predict` are scored on their `transform` output.
        if hasattr(model, "predict"):
            y_pred = model.predict(X_test)
        else:
            y_pred = model.transform(X_test)
        return algo_pair.accuracy_function(y_test, np.asarray(y_pred))

    # Merging (instead of unpacking each dict separately) lets the more
    # specific overrides replace shared keys rather than raise TypeError.
    cuml_args = {**param_overrides, **cuml_param_overrides}
    setup_override = algo_pair.setup_cuml(data, **cuml_args)
    cuml_run_args = {**cuml_args, **setup_override}
    cuml_timer = BenchmarkTimer(self.n_reps)
    for _ in cuml_timer.benchmark_runs():
        cuml_model = algo_pair.run_cuml(data, **cuml_run_args)
    cu_elapsed = np.min(cuml_timer.timings)

    if algo_pair.accuracy_function:
        cuml_accuracy = _score(cuml_model, algo_pair.cuml_data_prep_hook)
    else:
        cuml_accuracy = 0.0

    cpu_accuracy = 0.0
    if run_cpu and algo_pair.cpu_class is not None:
        # CPU-specific overrides were previously accepted but never used,
        # and the CPU call did not merge its keyword dicts.
        cpu_args = {**param_overrides, **cpu_param_overrides}
        setup_override = algo_pair.setup_cpu(data, **cpu_args)
        cpu_run_args = {**cpu_args, **setup_override}
        cpu_timer = BenchmarkTimer(self.n_reps)
        for _ in cpu_timer.benchmark_runs():
            cpu_model = algo_pair.run_cpu(data, **cpu_run_args)
        cpu_elapsed = np.min(cpu_timer.timings)

        if algo_pair.accuracy_function:
            cpu_accuracy = _score(cpu_model, algo_pair.cpu_data_prep_hook)
    else:
        cpu_elapsed = 0.0

    # `cuml_args` is used here so a key shared by both override dicts does
    # not raise after the benchmark has already been run.
    return dict(
        cu_time=cu_elapsed,
        cpu_time=cpu_elapsed,
        cuml_acc=cuml_accuracy,
        cpu_acc=cpu_accuracy,
        speedup=cpu_elapsed / float(cu_elapsed),
        n_samples=n_samples,
        n_features=n_features,
        **cuml_args
    )
def _run_one_size(
    self,
    algo_pair,
    n_samples,
    n_features,
    param_overrides=None,
    cuml_param_overrides=None,
    cpu_param_overrides=None,
    dataset_param_overrides=None,
    run_cpu=True,
    verbose=False,
):
    """Benchmark one algorithm pair on one generated dataset size.

    The cuML implementation (and, optionally, the CPU implementation) is
    set up once and executed under a ``BenchmarkTimer`` built with
    ``self.n_reps``; the minimum recorded timing is reported.

    Parameters
    ----------
    algo_pair : object
        Must provide ``setup_cuml``/``run_cuml``, ``setup_cpu``/``run_cpu``,
        and the ``cpu_class`` and ``name`` attributes.
    n_samples, n_features : int
        Size of the dataset requested from ``datagen.gen_data``.
    param_overrides : dict, optional
        Keyword arguments passed to both implementations.
    cuml_param_overrides : dict, optional
        Keyword arguments passed to the cuML implementation only.
    cpu_param_overrides : dict, optional
        Keyword arguments passed to the CPU implementation only.
    dataset_param_overrides : dict, optional
        Extra keyword arguments passed to ``datagen.gen_data``.
    run_cpu : bool
        When True but ``algo_pair.cpu_class`` is None, a warning is issued
        and the CPU run is skipped.
    verbose : bool
        When True, print the CPU time, GPU time and speedup.

    Returns
    -------
    dict
        ``cu_time``, ``cpu_time``, ``speedup`` (cpu_time / cu_time),
        ``n_samples``, ``n_features`` plus every entry of the four
        override dictionaries.
    """
    # ``None`` sentinels replace the former shared ``{}`` defaults.
    param_overrides = {} if param_overrides is None else param_overrides
    cuml_param_overrides = (
        {} if cuml_param_overrides is None else cuml_param_overrides
    )
    cpu_param_overrides = (
        {} if cpu_param_overrides is None else cpu_param_overrides
    )
    dataset_param_overrides = (
        {} if dataset_param_overrides is None else dataset_param_overrides
    )

    data = datagen.gen_data(
        self.dataset_name,
        self.input_type,
        n_samples,
        n_features,
        **dataset_param_overrides
    )

    # The setup step returns extra keyword arguments for the timed run.
    setup_overrides = algo_pair.setup_cuml(
        data, **param_overrides, **cuml_param_overrides
    )
    cuml_timer = BenchmarkTimer(self.n_reps)
    for _ in cuml_timer.benchmark_runs():
        algo_pair.run_cuml(
            data,
            **param_overrides,
            **cuml_param_overrides,
            **setup_overrides
        )
    cu_elapsed = np.min(cuml_timer.timings)

    if run_cpu and algo_pair.cpu_class is not None:
        setup_overrides = algo_pair.setup_cpu(
            data, **param_overrides, **cpu_param_overrides
        )
        cpu_timer = BenchmarkTimer(self.n_reps)
        for _ in cpu_timer.benchmark_runs():
            algo_pair.run_cpu(
                data,
                **param_overrides,
                **cpu_param_overrides,
                **setup_overrides
            )
        cpu_elapsed = np.min(cpu_timer.timings)
    else:
        if run_cpu:
            # A CPU run was requested but there is nothing to run.
            warnings.warn("run_cpu argument is set to True but no CPU "
                          "implementation was provided. It's possible "
                          "an additional library is needed but one could "
                          "not be found. Benchmark will be executed with "
                          "run_cpu=False")
        cpu_elapsed = 0.0

    speedup = cpu_elapsed / float(cu_elapsed)
    if verbose:
        print(
            "%s (n_samples=%s, n_features=%s) [cpu=%s, gpu=%s, speedup=%s]"
            % (algo_pair.name, n_samples, n_features, cpu_elapsed,
               cu_elapsed, speedup)
        )

    return dict(
        cu_time=cu_elapsed,
        cpu_time=cpu_elapsed,
        speedup=speedup,
        n_samples=n_samples,
        n_features=n_features,
        **param_overrides,
        **cuml_param_overrides,
        **cpu_param_overrides,
        **dataset_param_overrides
    )
args.dataset_type = params['dataset_type'] if 'n_samples' in params: args.n_samples = params['n_samples'] if 'n_features' in params: args.n_features = params['n_features'] if 'dataset_format' in params: args.dataset_format = params['dataset_format'] if 'data_kwargs' in params: args.data_kwargs = params['data_kwargs'] if 'setup_kwargs' in params: args.setup_kwargs = params['setup_kwargs'] if 'training_kwargs' in params: args.training_kwargs = params['training_kwargs'] if 'inference_kwargs' in params: args.inference_kwargs = params['inference_kwargs'] if len(args.json): parse_json(args) dataset = datagen.gen_data(args.dataset_type, args.dataset_format, n_samples=args.n_samples, n_features=args.n_features, **args.data_kwargs) algo = algorithms.algorithm_by_name(args.algo_name) cuml_setup = setup_bench('cuml', algo, 'inference', dataset, args.setup_kwargs, args.training_kwargs) algo.run_cuml(dataset, bench_args=args.inference_kwargs, **cuml_setup)
def _run_one_size(
    self,
    algo_pair,
    n_samples,
    n_features,
    param_overrides=None,
    cuml_param_overrides=None,
    cpu_param_overrides=None,
    run_cpu=True,
):
    """Time and score one cuML run (and optionally one CPU run).

    When ``algo_pair.accuracy_function`` is set, the model returned by each
    run is scored on the held-out part of the data (``data[2:]``).

    Parameters
    ----------
    algo_pair : object
        Must provide ``run_cuml``, ``run_cpu`` and the ``cpu_class`` and
        ``accuracy_function`` attributes.
    n_samples, n_features : int
        Size of the dataset requested from ``datagen.gen_data``.
    param_overrides : dict, optional
        Keyword arguments passed to both implementations.
    cuml_param_overrides : dict, optional
        Keyword arguments for cuML only; they win over ``param_overrides``
        when a key appears in both.
    cpu_param_overrides : dict, optional
        Keyword arguments for the CPU implementation only; they win over
        ``param_overrides`` when a key appears in both.
    run_cpu : bool
        When False, or when ``algo_pair.cpu_class`` is None, the CPU run is
        skipped and ``cpu_time``/``cpu_acc`` are reported as 0.0.

    Returns
    -------
    dict
        ``cu_time``, ``cpu_time``, ``cuml_acc``, ``cpu_acc``, ``speedup``
        (cpu_time / cu_time), ``n_samples``, ``n_features`` plus the merged
        entries of ``param_overrides`` and ``cuml_param_overrides``.
    """
    # ``None`` sentinels replace the former shared ``{}`` defaults.
    param_overrides = {} if param_overrides is None else param_overrides
    cuml_param_overrides = (
        {} if cuml_param_overrides is None else cuml_param_overrides
    )
    cpu_param_overrides = (
        {} if cpu_param_overrides is None else cpu_param_overrides
    )

    data = datagen.gen_data(
        self.dataset_name,
        self.input_type,
        n_samples,
        n_features,
        test_fraction=self.test_fraction,
    )

    def _score(model):
        # The held-out split is unpacked only when a score is requested,
        # so a run without an accuracy function does not need a test set.
        X_test, y_test = data[2:]
        # Models without `predict` are scored on their `transform` output.
        if hasattr(model, 'predict'):
            y_pred = model.predict(X_test)
        else:
            y_pred = model.transform(X_test)
        return algo_pair.accuracy_function(y_test, np.asarray(y_pred))

    cuml_args = {**param_overrides, **cuml_param_overrides}

    # perf_counter is monotonic and high-resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    cu_start = time.perf_counter()
    cuml_model = algo_pair.run_cuml(data, **cuml_args)
    cu_elapsed = time.perf_counter() - cu_start

    if algo_pair.accuracy_function:
        cuml_accuracy = _score(cuml_model)
    else:
        cuml_accuracy = 0.0

    cpu_accuracy = 0.0
    if run_cpu and algo_pair.cpu_class is not None:
        # CPU-specific overrides were previously accepted but never used.
        cpu_args = {**param_overrides, **cpu_param_overrides}
        cpu_start = time.perf_counter()
        cpu_model = algo_pair.run_cpu(data, **cpu_args)
        cpu_elapsed = time.perf_counter() - cpu_start

        if algo_pair.accuracy_function:
            cpu_accuracy = _score(cpu_model)
    else:
        cpu_elapsed = 0.0

    # `cuml_args` is used here so a key shared by both override dicts does
    # not raise after the benchmark has already been run.
    return dict(
        cu_time=cu_elapsed,
        cpu_time=cpu_elapsed,
        cuml_acc=cuml_accuracy,
        cpu_acc=cpu_accuracy,
        speedup=cpu_elapsed / float(cu_elapsed),
        n_samples=n_samples,
        n_features=n_features,
        **cuml_args
    )