def single_gpu_benchmark(n: int, batches: int, repetitions: int = 1) -> float:
    with Timer() as timer:
        for _ in range(repetitions):
            estimate_pi(n, batches=batches, gpu=True)
            sync()

    return timer.elapsed / repetitions
Example #2
def multi_gpu_benchmark(n: int,
                        batches: int,
                        compute_device_pool: ComputeDevicePool,
                        repetitions: int = 1,
                        verbose: bool = False) -> tp.Dict[int, float]:
    if verbose:
        print('multi gpu benchmark - begin')

    number_of_devices_to_runtime_map = {}

    for number_of_devices_to_use in range(1, compute_device_pool.number_of_devices + 1):
        with Timer() as timer:
            for _ in range(repetitions):
                pi = compute_device_pool.map_reduce(
                    lambda: estimate_pi(n=math.ceil(n / number_of_devices_to_use),
                                        batches=batches,
                                        gpu=True),
                    reduction=lambda x, y: x + y / number_of_devices_to_use,
                    initial_value=0.0,
                    number_of_batches=number_of_devices_to_use)

                sync()
                if verbose:
                    print(pi)

        gpu_time = timer.elapsed / repetitions
        number_of_devices_to_runtime_map[number_of_devices_to_use] = gpu_time

    if verbose:
        print('multi gpu benchmark - end')

    return number_of_devices_to_runtime_map
Example #3
def single_gpu_benchmark(n: int, batches: int, repetitions: int = 1) -> float:
    a_complete, b_complete, c_complete = generate_data(n, gpu=True)

    with Timer() as timer:
        for _ in range(repetitions):
            process_data(a_complete, b_complete, c_complete, gpu=True)
            sync()

    return timer.elapsed / repetitions
Example #4
def synced_f(*args, **kwargs) -> ResultType:
    # Wraps 'f' from the enclosing scope: optionally moves the arguments to the
    # device, synchronizes, evaluates 'f', optionally moves the result back to
    # the host, and synchronizes again before returning.
    if host_to_device_transfer_function is not None:
        args, kwargs = host_to_device_transfer_function(*args, **kwargs)
    sync()
    result = f(*args, **kwargs)
    if device_to_host_transfer_function is not None:
        result = device_to_host_transfer_function(result)
    sync()
    return result
Example #5
def gaussian_kernel_estimate_vectorized_whitened(whitening: NumericArray,
                                                 whitened_points: NumericArray,
                                                 values: NumericArray,
                                                 xi: NumericArray, norm: float,
                                                 dtype: np.generic,
                                                 gpu: bool) -> NumericArray:
    # print(f'whitened_points.shape = {whitened_points.shape}')
    # print(f'values.shape = {values.shape}')
    # print(f'xi.shape = {xi.shape}')

    n, m, d = \
        _verify_and_get_shape_of_datapoints_datavalues_and_evaluation_points(points=whitened_points,
                                                                             values=values,
                                                                             xi=xi)
    whitened_points, values, xi, whitening = \
        ensure_consistent_numeric_arrays((whitened_points, values, xi, whitening), gpu)

    num_pack = select_num_pack(gpu)

    whitened_points = whitened_points.astype(dtype, copy=False)
    whitened_xi = num_pack.dot(xi, whitening).astype(dtype, copy=False)
    values = values.astype(dtype, copy=False)

    # Create the result array and evaluate the weighted sum
    whitened_points = whitened_points.reshape((n, 1, d))
    whitened_xi = whitened_xi.reshape((1, m, d))
    residual = whitened_points - whitened_xi
    arg = residual * residual
    del residual
    if d > 1:
        assert arg.shape == (n, m, d)
        arg = num_pack.sum(arg, axis=2)
    else:
        arg = arg.reshape((n, m))
    # print(arg.shape)
    if not gpu:
        assert arg.shape == (n, m)
    arg = num_pack.exp(-0.5 * arg) * norm
    if not gpu:
        assert arg.shape == (n, m)

    # estimate = num_pack.dot(arg.T, values)
    estimate = (values * arg).sum(axis=0)
    if estimate.ndim > 1:
        estimate = estimate.squeeze()

    if gpu:
        cd.sync()

    return estimate
Example #6
def single_gpu_benchmark(n: int,
                         batches: int,
                         repetitions: int = 1,
                         verbose: bool = False) -> float:
    if verbose:
        print('single gpu benchmark - begin')

    with Timer() as timer:
        for _ in range(repetitions):
            pi = estimate_pi(n, batches=batches, gpu=True)
            sync()
            if verbose:
                print(pi)

    if verbose:
        print('single gpu benchmark - end')

    return timer.elapsed / repetitions
Example #7
def map_reduce_single_gpu(
        f: tp.Callable[..., ResultType],
        reduction: tp.Callable[[ResultType, ResultType], ResultType],
        initial_value: tp.Optional[ResultType] = None,
        host_to_device_transfer_function:
        tp.Optional[ParameterTransferFunction] = None,
        device_to_host_transfer_function:
        tp.Optional[tp.Callable[[ResultType], ResultType]] = None,
        args_list: tp.Optional[tp.Sequence[tp.Sequence]] = None,
        kwargs_list: tp.Optional[tp.Sequence[tp.Dict[str, tp.Any]]] = None,
        number_of_batches: tp.Optional[int] = None) \
        -> ResultType:

    if number_of_batches is None:
        if args_list is not None:
            number_of_batches = len(args_list)
        elif kwargs_list is not None:
            number_of_batches = len(kwargs_list)
        else:
            raise ValueError('number_of_batches must be specified if both '
                             'args_list and kwargs_list are None')

    if args_list is None:
        args_list = number_of_batches * [list()]
    if kwargs_list is None:
        kwargs_list = number_of_batches * [dict()]

    result = initial_value

    for args, kwargs in zip(args_list, kwargs_list):
        if host_to_device_transfer_function is not None:
            args, kwargs = host_to_device_transfer_function(*args, **kwargs)
        sync()
        new_part = f(*args, **kwargs)
        if device_to_host_transfer_function is not None:
            new_part = device_to_host_transfer_function(new_part)
        sync()

        result = reduce_with_none(result, new_part, reduction)

    return result
Example #8
def batched_single_gpu_benchmark(n: int,
                                 batches: int,
                                 repetitions: int = 1) -> float:
    a_complete, b_complete, c_complete = generate_data(n, gpu=False)

    with Timer() as timer:
        for _ in range(repetitions):
            kwargs_list = split_arrays(a=a_complete,
                                       b=b_complete,
                                       c=c_complete,
                                       number_of_batches=batches)

            result = \
                map_reduce_single_gpu(f=lambda a, b, c: process_data(a, b, c, gpu=True),
                                      reduction=lambda x, y: numpy.hstack((x, y)),
                                      host_to_device_transfer_function=host_to_device_transfer_function,
                                      device_to_host_transfer_function=device_to_host_transfer_function,
                                      kwargs_list=kwargs_list)

            sync()

    return timer.elapsed / repetitions
Example #9
def multi_gpu_benchmark(n: int,
                        batches: int,
                        gpu_pool: ComputeDevicePool,
                        repetitions: int = 1) -> tp.Dict[int, float]:
    number_of_devices_to_runtime_map = {}

    for number_of_devices_to_use in range(1, gpu_pool.number_of_devices + 1):
        with Timer() as timer:
            for _ in range(repetitions):
                pi = gpu_pool.map_reduce(
                    lambda: estimate_pi(n=math.ceil(n /
                                                    number_of_devices_to_use),
                                        batches=batches,
                                        gpu=True),
                    reduction=lambda x, y: x + y / number_of_devices_to_use,
                    initial_value=0.0,
                    number_of_batches=number_of_devices_to_use)

                sync()

        gpu_time = timer.elapsed / repetitions
        number_of_devices_to_runtime_map[number_of_devices_to_use] = gpu_time

    return number_of_devices_to_runtime_map
Example #10
tic = time.time()
# jacobian_f_numeric_gpu = jacobian_f_lambdified.evaluate(X_gpu, t=1.0)
# jacobian_f_numeric_gpu = \
#     (jacobian_f_lambdified
#      .evaluate_with_dictionary(
#         symbolic_to_numeric_parameter_map={x1: X_gpu[:, 0],
#                                            x2: X_gpu[:, 1],
#                                            x3: X_gpu[:, 2]},
#         t=1.0))
jacobian_f_numeric_gpu = \
    (jacobian_f_lambdified
     .evaluate_with_kwargs(x1=X_gpu[:, 0],
                           x2=X_gpu[:, 1],
                           x3=X_gpu[:, 2],
                           t=1.0))
cd.sync()
time_gpu = time.time() - tic
print(f'time on gpu: {time_gpu}')

tic = time.time()
jacobian_f_numeric_gpu_direct = \
    jacobian_direct(t=1.0,
                    x1=X_gpu[:, 0],
                    x2=X_gpu[:, 1],
                    x3=X_gpu[:, 2])
cd.sync()
time_gpu_direct = time.time() - tic
print(f'time for direct computation on gpu: {time_gpu_direct}')

print(f'numerical results from gpu match results from direct computation: '
      f'{np.allclose(jacobian_f_numeric_gpu_direct, jacobian_f_numeric_gpu)}')
Example #11
def synchronize(cls):
    from cocos.device import sync
    sync()
Example #12
def map_combine_single_gpu(
        f: tp.Callable[..., ResultType],
        combination: tp.Callable[[tp.Iterable[ResultType]], ResultType],
        host_to_device_transfer_function:
        tp.Optional[ParameterTransferFunction] = None,
        device_to_host_transfer_function:
        tp.Optional[tp.Callable[[ResultType], ResultType]] = None,
        args_list: tp.Optional[tp.Sequence[tp.Sequence]] = None,
        kwargs_list: tp.Optional[tp.Sequence[tp.Dict[str, tp.Any]]] = None,
        number_of_batches: tp.Optional[int] = None) \
        -> ResultType:
    """
    This function evaluates the function `f` on elements of `args_list` and 
    `kwargs_list` sequentially on a single device and aggregates results 
    in a single step by calling the function `combination` with a list of all 
    results. Results provided to `combination` are in the same order as 
    they appear in `args_list` and `kwargs_list`. 
    
    Input data to the function f must initially reside in host memory and 
    the user must provide functions 'host_to_device_transfer_function' and 
    'device_to_host_transfer_function' to transfer the data to and results 
    from device memory respectively.
    
    If the arguments for each run of 'f' are identical and have already been
    bound to the function that is passed, then 'args_list' and 'kwargs_list'
    may both be None, but the argument 'number_of_batches' must be specified
    so the method knows how many times to run the function 'f'.
    
    Args:
        f: The map function to be evaluated over elements of 'args_list' and 
           'kwargs_list'.
           
        combination: 
            A function that aggregates a list of all results in a single step
            
        host_to_device_transfer_function: 
            A function that transfers elements of args_list and kwargs_list 
            from host memory to device memory.
            
        device_to_host_transfer_function:
             A function that transfers results from device to host memory.
             
        args_list: A sequence of sequences of positional arguments.
        kwargs_list: A sequence of dictionaries of keyword arguments.
        number_of_batches:
            The number of times to evaluate 'f'; required if 'args_list'
            and 'kwargs_list' are both None.

    A minimal usage sketch follows this function definition.
    """
    args_list, kwargs_list, number_of_batches = \
        _extract_arguments_and_number_of_batches(
            args_list=args_list,
            kwargs_list=kwargs_list,
            number_of_batches=number_of_batches)

    results = []

    for args, kwargs in zip(args_list, kwargs_list):
        if host_to_device_transfer_function is not None:
            args, kwargs = host_to_device_transfer_function(*args, **kwargs)
        sync()
        result = f(*args, **kwargs)
        if device_to_host_transfer_function is not None:
            result = device_to_host_transfer_function(result)
        sync()
        results.append(result)

    return combination(results)
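The docstring above describes the calling convention of map_combine_single_gpu, but the listing shows no call site. The following is a minimal usage sketch, not taken from the original source: it assumes the cocos package is installed and that cocos.numerics offers a NumPy-like array constructor; the transfer helpers, batch data, and per-batch computation are hypothetical placeholders.

import numpy as np
import cocos.numerics as cn  # assumed NumPy-like GPU array module from the cocos package


def host_to_device(x):
    # Hypothetical transfer: push one host batch to the device as a positional argument.
    return (cn.array(x),), {}


def device_to_host(result):
    # Hypothetical transfer: copy the per-batch device result back to host memory.
    return np.array(result)


batches = [np.random.rand(10_000) for _ in range(4)]

combined = map_combine_single_gpu(
    f=lambda x: cn.sqrt(x),                          # per-batch computation on the device
    combination=lambda results: np.hstack(results),  # single aggregation step on the host
    host_to_device_transfer_function=host_to_device,
    device_to_host_transfer_function=device_to_host,
    args_list=[(batch,) for batch in batches])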
Example #13
def map_reduce_single_gpu(
        f: tp.Callable[..., ResultType],
        reduction: tp.Callable[[ResultType, ResultType], ResultType],
        initial_value: ResultType,
        host_to_device_transfer_function:
        tp.Optional[ParameterTransferFunction] = None,
        device_to_host_transfer_function:
        tp.Optional[tp.Callable[[ResultType], ResultType]] = None,
        args_list: tp.Optional[tp.Sequence[tp.Sequence]] = None,
        kwargs_list: tp.Optional[tp.Sequence[tp.Dict[str, tp.Any]]] = None,
        number_of_batches: tp.Optional[int] = None) \
        -> ResultType:
    """
    This function evaluates the function 'f' on elements of 'args_list' and
    'kwargs_list' sequentially on a single device and reduces the results by
    calling the function 'reduction' on each new result and the reduction of
    the results so far, eventually producing one final result of type
    'ResultType'. The reduce step is performed from the left and results are
    processed in the same order as they appear in 'args_list' and
    'kwargs_list'.

    Input data to the function f must initially reside in host memory and 
    the user must provide functions 'host_to_device_transfer_function' and 
    'device_to_host_transfer_function' to transfer the data to and results 
    from device memory respectively.

    If the arguments for each run of 'f' are identical and have already been
    bound to the function that is passed, then 'args_list' and 'kwargs_list'
    may both be None, but the argument 'number_of_batches' must be specified
    so the method knows how many times to run the function 'f'.
    
    Args:
        f: The map function to be evaluated over elements of 'args_list' and 
           'kwargs_list'.
           
        reduction: The reduction to be performed on the results of 'f'. 
                   This is done on the host (not the device).
                   
        initial_value: The initial value of the reduction 
                       (i.e. the neutral element).
                       
        host_to_device_transfer_function: 
            A function that transfers elements of args_list and kwargs_list 
            from host memory to device memory.
            
        device_to_host_transfer_function: 
            A function that transfers results from device to host memory.
            
        args_list: A sequence of sequences of positional arguments.
        kwargs_list: A sequence of dictionaries of keyword arguments.
        number_of_batches:
            The number of times to evaluate 'f'; required if 'args_list'
            and 'kwargs_list' are both None.

    See the usage sketch after this function definition.
    """
    args_list, kwargs_list, number_of_batches = \
        _extract_arguments_and_number_of_batches(
            args_list=args_list,
            kwargs_list=kwargs_list,
            number_of_batches=number_of_batches)

    result = initial_value

    for args, kwargs in zip(args_list, kwargs_list):
        if host_to_device_transfer_function is not None:
            args, kwargs = host_to_device_transfer_function(*args, **kwargs)
        sync()
        new_part = f(*args, **kwargs)
        if device_to_host_transfer_function is not None:
            new_part = device_to_host_transfer_function(new_part)
        sync()

        result = reduction(result, new_part)

    return result
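As with the previous example, the docstring explains the contract but no call site appears in the listing. Below is a minimal usage sketch, not from the original source: it assumes the cocos package is installed and that cocos.numerics provides a NumPy-like array constructor; the transfer helpers and the sum-of-squares map step are hypothetical placeholders.

import numpy as np
import cocos.numerics as cn  # assumed NumPy-like GPU array module from the cocos package


def host_to_device(x):
    # Hypothetical transfer: move one host batch onto the device.
    return (cn.array(x),), {}


def device_to_host(result):
    # Hypothetical transfer: bring the scalar per-batch result back to the host.
    return float(np.array(result))


batches = [np.random.rand(100_000) for _ in range(4)]

total_sum_of_squares = map_reduce_single_gpu(
    f=lambda x: (x * x).sum(),               # evaluated on the device for each batch
    reduction=lambda acc, part: acc + part,  # left fold performed on the host
    initial_value=0.0,
    host_to_device_transfer_function=host_to_device,
    device_to_host_transfer_function=device_to_host,
    args_list=[(batch,) for batch in batches])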
Example #14
             v_bar=v_bar,
             T=T,
             K=K,
             nT=nT,
             R=R)

    number_of_devices_to_runtime_map = {}

    for i in range(1, gpu_pool.number_of_devices + 1):
        print(f'computing on {i} GPUs')
        tic = time.time()
        option_price = \
            simulate_and_compute_option_price_gpu(gpu_pool=gpu_pool,
                                                  number_of_batches=i,
                                                  **kwargs)
        sync()
        gpu_time = time.time() - tic
        print(f'option price = {option_price} computed on {i} GPUs in '
              f'{gpu_time} seconds')

        number_of_devices_to_runtime_map[i] = gpu_time

    if gpu_pool.number_of_devices > 1:
        for i in range(2, gpu_pool.number_of_devices + 1):
            print(
                f'Performance on {i} GPUs increased by a factor of'
                f' {number_of_devices_to_runtime_map[1] / number_of_devices_to_runtime_map[i]} '
                f'over a single GPU.')

    result_table = create_result_table(number_of_devices_to_runtime_map)
    print(result_table)