def _run_one_size(self, algo, n_samples, n_features, param_overrides=None,
                  dataset_param_overrides=None):
    """Benchmark one (n_samples, n_features) problem size for *algo*.

    Generates a dataset, times the algorithm once with stock scikit-learn
    and once with the daal4py patch applied (taking the minimum over
    ``self.n_reps`` repetitions each), prints and returns the speedup.

    Parameters
    ----------
    algo : benchmark algorithm wrapper
        Must provide ``reload()``, ``run(data, **kwargs)`` and ``name``.
    n_samples, n_features : int
        Shape of the dataset passed to ``gen_data``.
    param_overrides : dict, optional
        Extra keyword arguments forwarded to ``algo.run``.
    dataset_param_overrides : dict, optional
        Extra keyword arguments forwarded to ``gen_data``.

    Returns
    -------
    dict
        Timings (``skl_time``, ``d4p_time``), ``speedup``, the problem
        size, and all overrides used.
    """
    # None-sentinel instead of mutable `{}` defaults: a dict default would
    # be shared (and mutable) across calls.
    param_overrides = {} if param_overrides is None else param_overrides
    dataset_param_overrides = ({} if dataset_param_overrides is None
                               else dataset_param_overrides)

    data = gen_data(self.dataset_name, n_samples, n_features,
                    **dataset_param_overrides)

    # Stock scikit-learn timing.
    algo.reload()
    skl_timer = BenchmarkTimer(self.n_reps)
    for _ in skl_timer.benchmark_runs():
        algo.run(data, **param_overrides)
    skl_elapsed = np.min(skl_timer.timings)

    # Patched (daal4py) scikit-learn timing. Unpatch in a finally block so
    # a failure here cannot leave sklearn patched for later benchmarks.
    d4p.patch_sklearn()
    try:
        algo.reload()
        d4p_timer = BenchmarkTimer(self.n_reps)
        for _ in d4p_timer.benchmark_runs():
            algo.run(data, **param_overrides)
        d4p_elapsed = np.min(d4p_timer.timings)
    finally:
        d4p.unpatch_sklearn()

    speedup = skl_elapsed / d4p_elapsed
    print(f"{algo.name} (n_samples={n_samples}, n_features={n_features})"
          f" [skl={skl_elapsed}, d4p={d4p_elapsed} speedup={speedup}]")
    return dict(
        skl_time=skl_elapsed,
        d4p_time=d4p_elapsed,
        speedup=speedup,
        n_samples=n_samples,
        n_features=n_features,
        **param_overrides,
        **dataset_param_overrides,
    )
def parse_args(parser, size=None, loop_types=(),
               n_jobs_supported=True, prefix='sklearn'):
    '''
    Add common arguments useful for most benchmarks and parse.

    Parameters
    ----------
    parser : argparse.ArgumentParser
        Parser to which the arguments should be added.
    size : tuple of int, optional
        Enable '--size' argument with this default size.
        If None (default), no '--size' argument will be added.
    loop_types : iterable of str, optional
        Add arguments like '--fit-inner-loops' and '--fit-outer-loops',
        useful for tweaking runtime of the benchmark.
    n_jobs_supported : bool
        If set to True, generate a n_jobs member in the argparse Namespace
        corresponding to the optimal n_jobs parameter for scikit-learn.
        Otherwise, n_jobs will be set to None.
    prefix : str, optional, default 'sklearn'
        The default prefix to report

    Returns
    -------
    params : argparse.Namespace
        Parsed arguments with threading, n_jobs and size members set up.
        (Note: the parsed namespace is returned, not the parser.)
    '''
    parser.add_argument('-n', '--num-threads', '--core-number', default=-1,
                        dest='threads', type=int,
                        help='Number of threads to use')
    parser.add_argument('-a', '--arch', default='?',
                        help='Machine architecture, for bookkeeping')
    parser.add_argument('-b', '--batch', '--batchID', default='?',
                        help='Batch ID, for bookkeeping')
    parser.add_argument('-p', '--prefix', default=prefix,
                        help='Prefix string, for bookkeeping')
    parser.add_argument('-v', '--verbose', default=False, action='store_true',
                        help='Output extra debug messages')
    parser.add_argument('--data-format', type=str, default='numpy',
                        choices=('numpy', 'pandas', 'cudf'),
                        help='Data format: numpy (default), pandas, cudf')
    parser.add_argument('--data-order', type=str, default='C',
                        choices=('C', 'F'),
                        help='Data order: C (row-major, default) or '
                             'F (column-major)')
    parser.add_argument('-d', '--dtype', type=np.dtype, default=np.float64,
                        choices=(np.float32, np.float64),
                        help='Data type: float64 (default) or float32')
    parser.add_argument('--check-finiteness', default=False,
                        action='store_true',
                        help='Check finiteness in sklearn input check '
                             '(disabled by default)')
    # BUG FIX: choices must be a container of options. The original
    # `choices=('json')` is just the string 'json', which argparse would
    # treat as the four valid choices 'j', 's', 'o', 'n'.
    parser.add_argument('--output-format', type=str, default='json',
                        choices=('json',),
                        help='Output format: json')
    parser.add_argument('--time-method', type=str, default='box_filter',
                        choices=('box_filter',),
                        help='Method used for time measurements')
    parser.add_argument('--box-filter-measurements', type=int, default=100,
                        help='Maximum number of measurements in box filter')
    parser.add_argument('--inner-loops', default=100, type=int,
                        help='Maximum inner loop iterations '
                             '(we take the mean over inner iterations)')
    parser.add_argument('--outer-loops', default=100, type=int,
                        help='Maximum outer loop iterations '
                             '(we take the min over outer iterations)')
    parser.add_argument('--time-limit', default=10., type=float,
                        help='Target time to spend to benchmark')
    parser.add_argument('--goal-outer-loops', default=10, type=int,
                        dest='goal',
                        help='Number of outer loops to aim '
                             'while automatically picking number of '
                             'inner loops. If zero, do not automatically '
                             'decide number of inner loops.')
    parser.add_argument('--seed', type=int, default=12345,
                        help='Seed to pass as random_state')
    parser.add_argument('--dataset-name', type=str, default=None,
                        help='Dataset name')
    parser.add_argument('--no-intel-optimized', default=False,
                        action='store_true',
                        help='Use no intel optimized version. '
                             'Now available for scikit-learn benchmarks')
    parser.add_argument('--device', default='None', type=str,
                        choices=('host', 'cpu', 'gpu', 'None'),
                        help='Execution context device')

    for data in ['X', 'y']:
        for stage in ['train', 'test']:
            parser.add_argument(f'--file-{data}-{stage}',
                                type=argparse.FileType('r'),
                                help=f'Input file with {data}_{stage}, '
                                     'in NPY format')

    if size is not None:
        parser.add_argument('-s', '--size', default=size, type=_parse_size,
                            dest='shape',
                            help='Problem size, delimited by "x" or ","')

    params = parser.parse_args()

    if not params.no_intel_optimized:
        try:
            from daal4py.sklearn import patch_sklearn
            patch_sklearn()
        except ImportError:
            print('Failed to import daal4py.sklearn.patch_sklearn. '
                  'Use stock version scikit-learn', file=sys.stderr)
            params.device = 'None'
    else:
        if params.device != 'None':
            print(
                'Device context is not supported for stock scikit-learn. '
                'Please use --no-intel-optimized=False with '
                f'--device={params.device} parameter. '
                'Fallback to --device=None.',
                file=sys.stderr)
            params.device = 'None'

    # disable finiteness check (default)
    if not params.check_finiteness:
        sklearn_disable_finiteness_check()

    # Ask DAAL what it thinks about this number of threads
    num_threads = prepare_daal_threads(num_threads=params.threads)
    if params.verbose:
        print(f'@ DAAL gave us {num_threads} threads')

    n_jobs = None
    if n_jobs_supported:
        # n_jobs mirrors the requested thread count for sklearn estimators.
        n_jobs = num_threads = params.threads

    # Set threading and DAAL related params here
    setattr(params, 'threads', num_threads)
    setattr(params, 'n_jobs', n_jobs)

    # Set size string parameter for easy printing
    if size is not None:
        setattr(params, 'size', size_str(params.shape))

    # Very verbose output
    if params.verbose:
        print(f'@ params = {params.__dict__}')

    return params
import numpy as np from daal4py.sklearn import patch_sklearn patch_sklearn() from sklearn.linear_model import LinearRegression from timeit import default_timer as timer from sklearn.metrics import mean_squared_error import pandas as pd import common NUM_LOOPS = 100 print("Computing for Linear Regression with Daal Patch") def run_inference(num_observations: int = 1000): """Run xgboost for specified number of observations""" # Load data train_x_df = common.get_test_data_df(X=common.X_df, size=num_observations) train_y = common.get_test_data_y(size=num_observations) num_rows = len(train_x_df) ###################### print("_______________________________________") print("Total Number of Rows", num_rows) run_times = [] inference_times = [] for _ in range(NUM_LOOPS): start_time = timer() reg = LinearRegression().fit(train_x_df, train_y) #predictor.compute(data, MODEL) end_time = timer()
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software distributed under # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. import os import time import modin.pandas as pd from modin.experimental.engines.omnisci_on_ray.frame.omnisci_worker import OmnisciServer from sklearn import config_context import daal4py.sklearn as sklearn sklearn.patch_sklearn() from sklearn.model_selection import train_test_split import sklearn.linear_model as lm import numpy as np def read(): columns_names = [ "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ", "QGQ",