def single_gpu_benchmark(n: int, batches: int, repetitions: int = 1) -> float:
    with Timer() as timer:
        for _ in range(repetitions):
            estimate_pi(n, batches=batches, gpu=True)
            sync()

    return timer.elapsed / repetitions
Example #2
def multi_gpu_benchmark(n: int,
                        batches: int,
                        compute_device_pool: ComputeDevicePool,
                        repetitions: int = 1,
                        verbose: bool = False) -> tp.Dict[int, float]:
    if verbose:
        print('multi gpu benchmark - begin')

    number_of_devices_to_runtime_map = {}

    for number_of_devices_to_use in range(1, compute_device_pool.number_of_devices + 1):
        with Timer() as timer:
            for _ in range(repetitions):
                pi = compute_device_pool.map_reduce(
                    lambda: estimate_pi(n=math.ceil(n / number_of_devices_to_use),
                                        batches=batches,
                                        gpu=True),
                    reduction=lambda x, y: x + y / number_of_devices_to_use,
                    initial_value=0.0,
                    number_of_batches=number_of_devices_to_use)

                sync()
                if verbose:
                    print(pi)

        gpu_time = timer.elapsed / repetitions
        number_of_devices_to_runtime_map[number_of_devices_to_use] = gpu_time

    if verbose:
        print('multi gpu benchmark - end')

    return number_of_devices_to_runtime_map
Example #3
def single_gpu_benchmark(n: int, batches: int, repetitions: int = 1) -> float:
    a_complete, b_complete, c_complete = generate_data(n, gpu=True)

    with Timer() as timer:
        for _ in range(repetitions):
            process_data(a_complete, b_complete, c_complete, gpu=True)
            sync()

    return timer.elapsed / repetitions
Example #4
def synced_f(*args, **kwargs) -> ResultType:
    # Wraps 'f' from the enclosing scope: optionally moves the arguments to the
    # device, synchronizes, evaluates 'f', optionally moves the result back to
    # the host, and synchronizes again before returning.
    if host_to_device_transfer_function is not None:
        args, kwargs = host_to_device_transfer_function(*args, **kwargs)
    sync()
    result = f(*args, **kwargs)
    if device_to_host_transfer_function is not None:
        result = device_to_host_transfer_function(result)
    sync()
    return result
Example #5
def gaussian_kernel_estimate_vectorized_whitened(whitening: NumericArray,
                                                 whitened_points: NumericArray,
                                                 values: NumericArray,
                                                 xi: NumericArray, norm: float,
                                                 dtype: np.generic,
                                                 gpu: bool) -> NumericArray:
    # print(f'whitened_points.shape = {whitened_points.shape}')
    # print(f'values.shape = {values.shape}')
    # print(f'xi.shape = {xi.shape}')

    n, m, d = \
        _verify_and_get_shape_of_datapoints_datavalues_and_evaluation_points(points=whitened_points,
                                                                             values=values,
                                                                             xi=xi)
    whitened_points, values, xi, whitening = \
        ensure_consistent_numeric_arrays((whitened_points, values, xi, whitening), gpu)

    num_pack = select_num_pack(gpu)

    whitened_points = whitened_points.astype(dtype, copy=False)
    whitened_xi = num_pack.dot(xi, whitening).astype(dtype, copy=False)
    values = values.astype(dtype, copy=False)

    # Create the result array and evaluate the weighted sum
    whitened_points = whitened_points.reshape((n, 1, d))
    whitened_xi = whitened_xi.reshape((1, m, d))
    residual = whitened_points - whitened_xi
    arg = residual * residual
    del residual
    if d > 1:
        assert arg.shape == (n, m, d)
        arg = num_pack.sum(arg, axis=2)
    else:
        arg = arg.reshape((n, m))
    # print(arg.shape)
    if not gpu:
        assert arg.shape == (n, m)
    arg = num_pack.exp(-0.5 * arg) * norm
    if not gpu:
        assert arg.shape == (n, m)

    # estimate = num_pack.dot(arg.T, values)
    estimate = (values * arg).sum(axis=0)
    if estimate.ndim > 1:
        estimate = estimate.squeeze()

    if gpu:
        cd.sync()

    return estimate
Example #6
def single_gpu_benchmark(n: int,
                         batches: int,
                         repetitions: int = 1,
                         verbose: bool = False) -> float:
    if verbose:
        print('single gpu benchmark - begin')

    with Timer() as timer:
        for _ in range(repetitions):
            pi = estimate_pi(n, batches=batches, gpu=True)
            sync()
            if verbose:
                print(pi)

    if verbose:
        print('single gpu benchmark - end')

    return timer.elapsed / repetitions
Example #7
def map_reduce_single_gpu(
        f: tp.Callable[..., ResultType],
        reduction: tp.Callable[[ResultType, ResultType], ResultType],
        initial_value: tp.Optional[ResultType] = None,
        host_to_device_transfer_function:
        tp.Optional[ParameterTransferFunction] = None,
        device_to_host_transfer_function:
        tp.Optional[tp.Callable[[ResultType], ResultType]] = None,
        args_list: tp.Optional[tp.Sequence[tp.Sequence]] = None,
        kwargs_list: tp.Optional[tp.Sequence[tp.Dict[str, tp.Any]]] = None,
        number_of_batches: tp.Optional[int] = None) \
        -> ResultType:

    if number_of_batches is None:
        if args_list is not None:
            number_of_batches = len(args_list)
        elif kwargs_list is not None:
            number_of_batches = len(kwargs_list)
        else:
            raise ValueError('number_of_batches must be specified if both '
                             'args_list and kwargs_list are None')

    if args_list is None:
        args_list = number_of_batches * [list()]
    if kwargs_list is None:
        kwargs_list = number_of_batches * [dict()]

    result = initial_value

    for args, kwargs in zip(args_list, kwargs_list):
        if host_to_device_transfer_function is not None:
            args, kwargs = host_to_device_transfer_function(*args, **kwargs)
        sync()
        new_part = f(*args, **kwargs)
        if device_to_host_transfer_function is not None:
            new_part = device_to_host_transfer_function(new_part)
        sync()

        result = reduce_with_none(result, new_part, reduction)

    return result
Example #8
def batched_single_gpu_benchmark(n: int,
                                 batches: int,
                                 repetitions: int = 1) -> float:
    a_complete, b_complete, c_complete = generate_data(n, gpu=False)

    with Timer() as timer:
        for _ in range(repetitions):
            kwargs_list = split_arrays(a=a_complete,
                                       b=b_complete,
                                       c=c_complete,
                                       number_of_batches=batches)

            result = \
                map_reduce_single_gpu(f=lambda a, b, c: process_data(a, b, c, gpu=True),
                                      reduction=lambda x, y: numpy.hstack((x, y)),
                                      host_to_device_transfer_function=host_to_device_transfer_function,
                                      device_to_host_transfer_function=device_to_host_transfer_function,
                                      kwargs_list=kwargs_list)

            sync()

    return timer.elapsed / repetitions
Example #9
def multi_gpu_benchmark(n: int,
                        batches: int,
                        gpu_pool: ComputeDevicePool,
                        repetitions: int = 1) -> tp.Dict[int, float]:
    number_of_devices_to_runtime_map = {}

    for number_of_devices_to_use in range(1, gpu_pool.number_of_devices + 1):
        with Timer() as timer:
            for _ in range(repetitions):
                pi = gpu_pool.map_reduce(
                    lambda: estimate_pi(n=math.ceil(n /
                                                    number_of_devices_to_use),
                                        batches=batches,
                                        gpu=True),
                    reduction=lambda x, y: x + y / number_of_devices_to_use,
                    initial_value=0.0,
                    number_of_batches=number_of_devices_to_use)

                sync()

        gpu_time = timer.elapsed / repetitions
        number_of_devices_to_runtime_map[number_of_devices_to_use] = gpu_time

    return number_of_devices_to_runtime_map
Example #10
tic = time.time()
# jacobian_f_numeric_gpu = jacobian_f_lambdified.evaluate(X_gpu, t=1.0)
# jacobian_f_numeric_gpu = \
#     (jacobian_f_lambdified
#      .evaluate_with_dictionary(
#         symbolic_to_numeric_parameter_map={x1: X_gpu[:, 0],
#                                            x2: X_gpu[:, 1],
#                                            x3: X_gpu[:, 2]},
#         t=1.0))
jacobian_f_numeric_gpu = \
    (jacobian_f_lambdified
     .evaluate_with_kwargs(x1=X_gpu[:, 0],
                           x2=X_gpu[:, 1],
                           x3=X_gpu[:, 2],
                           t=1.0))
cd.sync()
time_gpu = time.time() - tic
print(f'time on gpu: {time_gpu}')

tic = time.time()
jacobian_f_numeric_gpu_direct = \
    jacobian_direct(t=1.0,
                    x1=X_gpu[:, 0],
                    x2=X_gpu[:, 1],
                    x3=X_gpu[:, 2])
cd.sync()
time_gpu_direct = time.time() - tic
print(f'time for direct computation on gpu: {time_gpu_direct}')

print(f'numerical results from gpu match results from direct computation: '
      f'{np.allclose(jacobian_f_numeric_gpu_direct, jacobian_f_numeric_gpu)}')
Example #11
def synchronize(cls):
    from cocos.device import sync
    sync()
Example #12
def map_combine_single_gpu(
        f: tp.Callable[..., ResultType],
        combination: tp.Callable[[tp.Iterable[ResultType]], ResultType],
        host_to_device_transfer_function:
        tp.Optional[ParameterTransferFunction] = None,
        device_to_host_transfer_function:
        tp.Optional[tp.Callable[[ResultType], ResultType]] = None,
        args_list: tp.Optional[tp.Sequence[tp.Sequence]] = None,
        kwargs_list: tp.Optional[tp.Sequence[tp.Dict[str, tp.Any]]] = None,
        number_of_batches: tp.Optional[int] = None) \
        -> ResultType:
    """
    This function evaluates the function `f` on elements of `args_list` and 
    `kwargs_list` sequentially on a single device and aggregates results 
    in a single step by calling the function `combination` with a list of all 
    results. Results provided to `combination` are in the same order as 
    they appear in `args_list` and `kwargs_list`. 
    
    Input data to the function f must initially reside in host memory and 
    the user must provide functions 'host_to_device_transfer_function' and 
    'device_to_host_transfer_function' to transfer the data to and results 
    from device memory respectively.
    
    If the arguments for each run of 'f' are identical and have already been
    bound to the function that is passed, then 'args_list' and 'kwargs_list'
    may both be None, but the argument 'number_of_batches' must be specified
    so the method knows how many times to run the function 'f'.
    
    Args:
        f: The map function to be evaluated over elements of 'args_list' and 
           'kwargs_list'.
           
        combination: 
            A function that aggregates a list of all results in a single step
            
        host_to_device_transfer_function: 
            A function that transfers elements of args_list and kwargs_list 
            from host memory to device memory.
            
        device_to_host_transfer_function:
             A function that transfers results from device to host memory.
             
        args_list: A sequence of sequences of positional arguments.
        kwargs_list: A sequence of dictionaries of keyword arguments.
        number_of_batches:
            The number of times to evaluate 'f'; required if 'args_list'
            and 'kwargs_list' are both None.

    A minimal usage sketch follows this function definition.
    """
    args_list, kwargs_list, number_of_batches = \
        _extract_arguments_and_number_of_batches(
            args_list=args_list,
            kwargs_list=kwargs_list,
            number_of_batches=number_of_batches)

    results = []

    for args, kwargs in zip(args_list, kwargs_list):
        if host_to_device_transfer_function is not None:
            args, kwargs = host_to_device_transfer_function(*args, **kwargs)
        sync()
        result = f(*args, **kwargs)
        if device_to_host_transfer_function is not None:
            result = device_to_host_transfer_function(result)
        sync()
        results.append(result)

    return combination(results)
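The docstring above describes the calling convention of map_combine_single_gpu, but the listing shows no call site. The following is a minimal usage sketch, not taken from the original source: it assumes the cocos package is installed and that cocos.numerics offers a NumPy-like array constructor; the transfer helpers, batch data, and per-batch computation are hypothetical placeholders.

import numpy as np
import cocos.numerics as cn  # assumed NumPy-like GPU array module from the cocos package


def host_to_device(x):
    # Hypothetical transfer: push one host batch to the device as a positional argument.
    return (cn.array(x),), {}


def device_to_host(result):
    # Hypothetical transfer: copy the per-batch device result back to host memory.
    return np.array(result)


batches = [np.random.rand(10_000) for _ in range(4)]

combined = map_combine_single_gpu(
    f=lambda x: cn.sqrt(x),                          # per-batch computation on the device
    combination=lambda results: np.hstack(results),  # single aggregation step on the host
    host_to_device_transfer_function=host_to_device,
    device_to_host_transfer_function=device_to_host,
    args_list=[(batch,) for batch in batches])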
Example #13
def map_reduce_single_gpu(
        f: tp.Callable[..., ResultType],
        reduction: tp.Callable[[ResultType, ResultType], ResultType],
        initial_value: ResultType,
        host_to_device_transfer_function:
        tp.Optional[ParameterTransferFunction] = None,
        device_to_host_transfer_function:
        tp.Optional[tp.Callable[[ResultType], ResultType]] = None,
        args_list: tp.Optional[tp.Sequence[tp.Sequence]] = None,
        kwargs_list: tp.Optional[tp.Sequence[tp.Dict[str, tp.Any]]] = None,
        number_of_batches: tp.Optional[int] = None) \
        -> ResultType:
    """
    This function evaluates the function 'f' on elements of 'args_list' and
    'kwargs_list' sequentially on a single device and reduces the results by
    calling the function 'reduction' on each new result and the reduction of
    the results so far, eventually producing one final result of type
    'ResultType'. The reduce step is performed from the left and results are
    processed in the same order as they appear in 'args_list' and
    'kwargs_list'.

    Input data to the function f must initially reside in host memory and 
    the user must provide functions 'host_to_device_transfer_function' and 
    'device_to_host_transfer_function' to transfer the data to and results 
    from device memory respectively.

    If the arguments for each run of 'f' are identical and have already been
    bound to the function that is passed, then 'args_list' and 'kwargs_list'
    may both be None, but the argument 'number_of_batches' must be specified
    so the method knows how many times to run the function 'f'.
    
    Args:
        f: The map function to be evaluated over elements of 'args_list' and 
           'kwargs_list'.
           
        reduction: The reduction to be performed on the results of 'f'. 
                   This is done on the host (not the device).
                   
        initial_value: The initial value of the reduction 
                       (i.e. the neutral element).
                       
        host_to_device_transfer_function: 
            A function that transfers elements of args_list and kwargs_list 
            from host memory to device memory.
            
        device_to_host_transfer_function: 
            A function that transfers results from device to host memory.
            
        args_list: A sequence of sequences of positional arguments.
        kwargs_list: A sequence of dictionaries of keyword arguments.
        number_of_batches:
            The number of times to evaluate 'f'; required if 'args_list'
            and 'kwargs_list' are both None.

    See the usage sketch after this function definition.
    """
    args_list, kwargs_list, number_of_batches = \
        _extract_arguments_and_number_of_batches(
            args_list=args_list,
            kwargs_list=kwargs_list,
            number_of_batches=number_of_batches)

    result = initial_value

    for args, kwargs in zip(args_list, kwargs_list):
        if host_to_device_transfer_function is not None:
            args, kwargs = host_to_device_transfer_function(*args, **kwargs)
        sync()
        new_part = f(*args, **kwargs)
        if device_to_host_transfer_function is not None:
            new_part = device_to_host_transfer_function(new_part)
        sync()

        result = reduction(result, new_part)

    return result
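As with the previous example, the docstring explains the contract but no call site appears in the listing. Below is a minimal usage sketch, not from the original source: it assumes the cocos package is installed and that cocos.numerics provides a NumPy-like array constructor; the transfer helpers and the sum-of-squares map step are hypothetical placeholders.

import numpy as np
import cocos.numerics as cn  # assumed NumPy-like GPU array module from the cocos package


def host_to_device(x):
    # Hypothetical transfer: move one host batch onto the device.
    return (cn.array(x),), {}


def device_to_host(result):
    # Hypothetical transfer: bring the scalar per-batch result back to the host.
    return float(np.array(result))


batches = [np.random.rand(100_000) for _ in range(4)]

total_sum_of_squares = map_reduce_single_gpu(
    f=lambda x: (x * x).sum(),               # evaluated on the device for each batch
    reduction=lambda acc, part: acc + part,  # left fold performed on the host
    initial_value=0.0,
    host_to_device_transfer_function=host_to_device,
    device_to_host_transfer_function=device_to_host,
    args_list=[(batch,) for batch in batches])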
Example #14
             v_bar=v_bar,
             T=T,
             K=K,
             nT=nT,
             R=R)

    number_of_devices_to_runtime_map = {}

    for i in range(1, gpu_pool.number_of_devices + 1):
        print(f'computing on {i} GPUs')
        tic = time.time()
        option_price = \
            simulate_and_compute_option_price_gpu(gpu_pool=gpu_pool,
                                                  number_of_batches=i,
                                                  **kwargs)
        sync()
        gpu_time = time.time() - tic
        print(f'option price = {option_price} computed on {i} GPUs in '
              f'{gpu_time} seconds')

        number_of_devices_to_runtime_map[i] = gpu_time

    if gpu_pool.number_of_devices > 1:
        for i in range(2, gpu_pool.number_of_devices + 1):
            print(
                f'Performance on {i} GPUs increased by a factor of'
                f' {number_of_devices_to_runtime_map[1] / number_of_devices_to_runtime_map[i]} '
                f'over a single GPU.')

    result_table = create_result_table(number_of_devices_to_runtime_map)
    print(result_table)