def test_explicit_init():
    import nums
    import nums.core.application_manager as am

    nums.init()
    assert am.is_initialized()
    am.destroy()
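# Hypothetical companion sketch (not part of the original suite): it exercises the
# implicit-initialization path, assuming nums lazily initializes the application
# manager on first array creation. The test name and the lazy-init assumption are
# illustrative, not confirmed by this file.
def test_implicit_init_sketch():
    import nums.numpy as nps
    import nums.core.application_manager as am

    _ = nps.arange(10)  # assumed to trigger initialization on first use
    assert am.is_initialized()
    am.destroy()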
def nps_app_inst(request):
    # This triggers initialization; it's not to be mixed with the app_inst fixture.
    # Observed (core dumped) after updating this fixture to run functions with the "serial" backend.
    # Last time this happened, it was due to poor control over the
    # scope and duration of ray resources.
    # pylint: disable = import-outside-toplevel
    from nums.core import settings
    from nums.core import application_manager

    settings.system_name = request.param
    yield application_manager.instance()
    application_manager.destroy()
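# Minimal sketch (an assumption, not taken from this repo's conftest) of how a
# fixture like the one above is declared and consumed under pytest. The fixture
# name, parameter values, and consuming test are illustrative only; "serial" is
# the only backend name confirmed elsewhere in these tests.
import pytest

@pytest.fixture(params=["serial"])
def backend_app_sketch(request):
    from nums.core import settings
    from nums.core import application_manager

    settings.system_name = request.param
    yield application_manager.instance()
    application_manager.destroy()

def test_backend_app_sketch(backend_app_sketch):
    # Each parametrized backend yields an initialized ArrayApplication.
    assert backend_app_sketch is not None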
def test_app_manager():
    for compute_name in ["numpy"]:
        for system_name in ["serial", "ray-cyclic", "ray-task"]:
            settings.compute_name = compute_name
            settings.system_name = system_name
            app: ArrayApplication = application_manager.instance()
            assert np.allclose(
                np.arange(10), app.arange(0, shape=(10,), block_shape=(10,)).get()
            )
            application_manager.destroy()
            assert not application_manager.is_initialized()
            time.sleep(1)
def nps_app_inst(request):
    # This triggers initialization; it's not to be mixed with the app_inst fixture.
    # Observed (core dumped) after updating this fixture to run functions with the "serial" backend.
    # Last time this happened, it was due to poor control over the
    # scope and duration of ray resources.
    # pylint: disable = import-outside-toplevel
    from nums.core import settings
    from nums.core import application_manager
    import nums.numpy as nps

    settings.system_name, settings.device_grid_name = request.param
    # Need to reset the numpy random state.
    # It's the only stateful numpy API object.
    nps.random.reset()
    yield application_manager.instance()
    application_manager.destroy()
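# Sketch of the tuple parametrization the fixture above expects: request.param
# unpacks into (system_name, device_grid_name). Only "serial", "ray", and
# "cyclic" appear elsewhere in these tests; the pairing below is an illustrative
# assumption, and the fixture is renamed to avoid shadowing the real one.
import pytest

@pytest.fixture(
    params=[("serial", "cyclic"), ("ray", "cyclic")],
    ids=["serial-cyclic", "ray-cyclic"],
)
def grid_app_sketch(request):
    from nums.core import settings
    from nums.core import application_manager

    settings.system_name, settings.device_grid_name = request.param
    yield application_manager.instance()
    application_manager.destroy()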
def test_app_manager(compute_name, system_name, device_grid_name):
    settings.use_head = True
    settings.compute_name = compute_name
    settings.system_name = system_name
    settings.device_grid_name = device_grid_name
    app: ArrayApplication = application_manager.instance()
    app_arange = app.arange(0, shape=(10,), block_shape=(10,))
    assert np.allclose(np.arange(10), app_arange.get())
    application_manager.destroy()
    assert not application_manager.is_initialized()
    time.sleep(1)
    # Revert for other tests.
    settings.compute_name = "numpy"
    settings.system_name = "ray"
    settings.device_grid_name = "cyclic"
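# The test above takes its configuration as arguments, which suggests it is
# parametrized by the test runner. A minimal sketch of one way to drive it with
# pytest.mark.parametrize; the exact parameter matrix used by the repo is an
# assumption, and only values seen elsewhere in these tests are listed.
import pytest

@pytest.mark.parametrize(
    "compute_name, system_name, device_grid_name",
    [
        ("numpy", "serial", "cyclic"),
        ("numpy", "ray", "cyclic"),
    ],
)
def test_app_manager_sketch(compute_name, system_name, device_grid_name):
    test_app_manager(compute_name, system_name, device_grid_name)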
def test_app_manager(compute_name, system_name, device_grid_name, num_cpus):
    settings.use_head = True
    settings.compute_name = compute_name
    settings.system_name = system_name
    settings.device_grid_name = device_grid_name
    settings.num_cpus = num_cpus
    app: ArrayApplication = application_manager.instance()
    print(settings.num_cpus, num_cpus, app.cm.num_cores_total())
    app_arange = app.arange(0, shape=(10,), block_shape=(10,))
    assert np.allclose(np.arange(10), app_arange.get())
    if num_cpus is None:
        assert app.cm.num_cores_total() == get_num_cores()
    else:
        assert app.cm.num_cores_total() == num_cpus
    application_manager.destroy()
    assert not application_manager.is_initialized()
    time.sleep(1)
    # Revert for other tests.
    settings.compute_name = "numpy"
    settings.system_name = "ray"
    settings.device_grid_name = "cyclic"
    settings.num_cpus = None
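# Design note: the test above reverts module-level settings manually at the end,
# so a failing assertion would skip the revert and leak configuration into later
# tests. A sketch of a try/finally guard that restores the defaults regardless of
# the outcome; the helper name is hypothetical and the default values are copied
# from the revert block above.
def run_with_settings_restored(test_fn, *args):
    try:
        test_fn(*args)
    finally:
        settings.compute_name = "numpy"
        settings.system_name = "ray"
        settings.device_grid_name = "cyclic"
        settings.num_cpus = None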
def benchmark_mlp(num_gpus, N_list, system_class_list, d=1000, optimizer=True, dtype=np.float32):
    format_string = "%20s,%10s,%10s,%10s,%10s,%10s"
    print(format_string % ("Library", "N", "Cost", "CostOpt", "CostInit", "CV"))
    global app
    for N in N_list:
        N = int(N)
        for system_class in system_class_list:
            # try:
            if True:
                if system_class in ["Cupy", "Numpy"]:
                    name = system_class
                    import cupy as cp

                    arr_lib = cp if system_class == "Cupy" else np
                    app = arr_lib
                    X, y = np_sample(np, sample_size=N, feature=d, dtype=dtype)
                    W_in_1, W_1_2, W_2_out = np_init_weights(np, X, y, dtype=dtype)
                    X = cp.asarray(X)
                    y = cp.asarray(y)
                    W_in_1 = cp.asarray(W_in_1)
                    W_1_2 = cp.asarray(W_1_2)
                    W_2_out = cp.asarray(W_2_out)
                    cp.cuda.Device(0).synchronize()

                    # Benchmark one step MLP
                    def func():
                        tic = time.time()
                        toc_end = one_step_fit_np(arr_lib, X, y, W_in_1, W_1_2, W_2_out)
                        cp.cuda.Device(0).synchronize()
                        toc = time.time()
                        return toc - tic, toc_end - tic, 0, None

                    costs, costs_opt, costs_init = benchmark_func(func)
                    del (X, y, W_in_1, W_1_2, W_2_out)
                else:
                    # Init system
                    name = system_class.__name__
                    app = am.instance(num_gpus, optimizer)

                    # Make dataset
                    nps.random.seed(0)
                    X, y = sample(app, sample_size=N, feature=d, num_gpus=num_gpus, dtype=dtype)
                    W_in_1, W_1_2, W_2_out = data_init_weights(app, X, y, verbose=False)

                    # Benchmark one step MLP
                    def func():
                        tic = time.time()
                        if optimizer:
                            toc_init, toc_opt = one_step_fit_opt(
                                app, X, y, W_in_1, W_1_2, W_2_out, num_gpus, verbose=False
                            )
                        else:
                            toc_init = tic
                            toc_opt = one_step_fit(app, X, y, W_in_1, W_1_2, W_2_out)
                        toc = time.time()
                        return toc - tic, toc_opt - tic, toc_init - tic, None

                    costs, costs_opt, costs_init = benchmark_func(func)
                    del (X, y, W_in_1, W_1_2, W_2_out)
                    am.destroy()
            # except Exception:
            else:
                costs = [-1]
                costs_opt = [-1]
                costs_init = [-1]

            log_str = format_string % (
                name,
                "%d" % N,
                "%.4f" % np.mean(costs),
                "%.4f" % np.mean(costs_opt),
                "%.4f" % np.mean(costs_init),
                "%.2f" % (np.std(costs) / np.mean(costs)),
            )
            print(log_str)
            with open("result_mlp_data.csv", "a") as f:
                f.write(log_str + "\n")
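# Illustrative driver for benchmark_mlp; the argument values are assumptions and
# the repo's actual entry point may differ. "Cupy" and "Numpy" exercise the
# single-device branch above; any other entry is expected to be a system class
# whose __name__ labels the output row.
if __name__ == "__main__":
    benchmark_mlp(
        num_gpus=1,
        N_list=[1e4, 1e5],
        system_class_list=["Numpy", "Cupy"],
        d=1000,
        optimizer=True,
    )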
ind = random.sample(range(w * h), sparsity)
ind = [(i % w, i // w) for i in ind]
for i in ind:
    arr[i] = np.random.randint(0, 100)
dtype = np.__getattribute__(str(arr.dtype))
shape = arr.shape
app = _instance()
block_shape = app.compute_block_shape(shape, dtype)
sparse_result = SparseBlockArray.from_np(
    arr, block_shape=block_shape, copy=False, system=app.system
)
dense_result = BlockArray.from_np(
    arr, block_shape=block_shape, copy=False, system=app.system
)
funcs = [
    lambda x: x @ x,
    lambda x: x + x,
    lambda x: x - x,
    # lambda x: x ** x,
]
for f in funcs:
    assert (f(sparse_result).get() == f(dense_result).get()).all()
destroy()
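# Self-contained sketch of the random sparse-array setup used above, with plain
# NumPy only. The dimensions and sparsity level are illustrative assumptions; the
# snippet above presumably receives w, h, sparsity, and arr from its enclosing
# test.
import random
import numpy as np

w, h, sparsity = 100, 50, 200
arr = np.zeros((w, h), dtype=int)
for i in random.sample(range(w * h), sparsity):
    arr[i % w, i // w] = np.random.randint(0, 100)
assert (arr != 0).sum() <= sparsity  # some sampled cells may draw 0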
def benchmark_bop(num_gpus, N_list, system_class_list, d=400000, optimizer=True, dtype=np.float32):
    format_string = "%20s,%10s,%10s,%10s,%10s,%10s"
    print(format_string % ("Library", "N", "Cost", "CostOpt", "CostInit", "CV"))
    # global app
    for N in N_list:
        N = int(N)
        d1 = N
        d2 = d
        for system_class in system_class_list:
            # try:
            if True:
                if system_class in ["Cupy", "Numpy"]:
                    name = system_class
                    import cupy as cp

                    arr_lib = cp if system_class == "Cupy" else np
                    arr_lib.inv = arr_lib.linalg.inv
                    app = arr_lib
                    # X = arr_lib.ones((N, d), dtype=dtype)
                    W = arr_lib.ones(shape=(d1, d2), dtype=dtype)
                    D = arr_lib.ones(shape=(d2, N), dtype=dtype)
                    # Prevent the singular matrix error in np.linalg.inv
                    # arange = arr_lib.arange(N)
                    # X[arange, arange % d] = 1
                    cp.cuda.Device(0).synchronize()

                    # Benchmark bop
                    def func():
                        tic = time.time()
                        Z = W @ D
                        # Z = X.T @ X
                        cp.cuda.Device(0).synchronize()
                        toc = time.time()
                        return toc - tic, 0, 0, None

                    costs, costs_opt, costs_init = benchmark_func(func)
                    # del (X, app)
                    del (W, D, app)
                else:
                    # Init system
                    name = system_class.__name__
                    app = am.instance(num_gpus, optimizer)
                    W = app.ones(shape=(d1, d2), block_shape=(d1, d2 // num_gpus), dtype=dtype)
                    D = app.ones(shape=(d2, N), block_shape=(d2 // num_gpus, N), dtype=dtype)
                    # X = app.ones((N, d), block_shape=(N // num_gpus, d), dtype=dtype)

                    # Benchmark bop
                    def func():
                        tic = time.time()
                        if optimizer:
                            toc_init, toc_opt = matmul_opt(app, W, D, num_gpus)
                            # toc_init, toc_opt = matmul_opt(app, X, num_gpus)
                        else:
                            Z = (W @ D).touch()
                            # Z = (X.T @ X).touch()
                        toc = time.time()
                        return toc - tic, 0, 0, None

                    costs, costs_opt, costs_init = benchmark_func(func)
                    del (W, D)
                    am.destroy()
            # except Exception:
            else:
                costs = [-1]
                costs_opt = [-1]
                costs_init = [-1]

            log_str = format_string % (
                name,
                "%d" % N,
                "%.4f" % np.mean(costs),
                "%.4f" % np.mean(costs_opt),
                "%.4f" % np.mean(costs_init),
                "%.2f" % (np.std(costs) / np.mean(costs)),
            )
            print(log_str)
            with open("result_bop.csv", "a") as f:
                f.write(log_str + "\n")
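# Illustrative driver for benchmark_bop, mirroring the benchmark_mlp sketch; the
# argument values are assumptions. N sets the first matrix dimension (d1) and d
# the shared inner dimension (d2); d is reduced here from its default purely to
# keep the example quick.
if __name__ == "__main__":
    benchmark_bop(
        num_gpus=1,
        N_list=[1e3, 1e4],
        system_class_list=["Numpy", "Cupy"],
        d=4000,
        optimizer=True,
    )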