Example #1
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Allocate a device-side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths

    stream.synchronize()
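# The snippet above assumes `step`, `MM`, and the usual imports defined
# elsewhere in the source file. A minimal sketch of what they might look
# like, assuming the NumbaPro-era module layout, the standard geometric
# Brownian motion update for `step`, and an event-guarded freelist for
# `MM` (hypothetical reconstructions, not the original definitions).
# Note that `blksz`/`gridsz` are computed but unused above, consistent
# with `step` being a CUDA ufunc rather than a hand-configured kernel.
import math
from collections import deque

import numpy as np
from numbapro import cuda, vectorize
from numbapro.cudalib import curand

@vectorize(['f8(f8, f8, f8, f8, f8)'], target='gpu')
def step(last, dt, c0, c1, noise):
    # Geometric Brownian motion update for one path and one time step.
    return last * math.exp(c0 * dt + c1 * noise)

class MM(object):
    """Freelist of reusable device buffers, guarded by CUDA events."""
    def __init__(self, shape, dtype, prealloc):
        self.freelist = deque()
        self.events = {}
        for _ in range(prealloc):
            gpumem = cuda.device_array(shape=shape, dtype=dtype)
            self.freelist.append(gpumem)
            self.events[gpumem] = cuda.event(timing=False)

    def get(self, stream=0):
        assert self.freelist
        gpumem = self.freelist.popleft()
        evnt = self.events[gpumem]
        if not evnt.query():          # buffer may still be in flight
            evnt.wait(stream=stream)  # make the stream wait, not the host
        return gpumem

    def free(self, gpumem, stream=0):
        evnt = self.events[gpumem]
        evnt.record(stream=stream)    # mark when the buffer becomes reusable
        self.freelist.append(gpumem)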
Example #2
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    blksz = 512
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    qrng = curand.QRNG(curand.QRNG.SOBOL32, stream=stream)

    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    d_seed = cuda.device_array(n, dtype=np.uint32, stream=stream)

    prng.normal(d_normdist, 0, 1)
    qrng.generate(d_seed)

    d_paths = cuda.to_device(paths, stream=stream)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    griddim = gridsz, 1
    blockdim = blksz, 1, 1
    cu_monte_carlo_pricer[griddim, blockdim, stream](d_paths, dt, c0, c1,
                                                     d_normdist, d_seed)

    d_paths.to_host(stream)

    stream.synchronize()
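# `cu_monte_carlo_pricer` is not shown. Its arguments suggest the whole
# time loop runs on the device, with the SOBOL32 `d_seed` giving each
# thread its own offset into the pregenerated pool of normal draws. A
# rough sketch under those assumptions (illustrative, not the original;
# a production kernel would draw fresh normals at every step):
import math
from numba import cuda

@cuda.jit
def cu_monte_carlo_pricer(paths, dt, c0, c1, normdist, seed):
    i = cuda.grid(1)
    n = paths.shape[0]
    if i >= n:
        return
    idx = seed[i] % n           # per-thread starting offset into the pool
    last = paths[i, 0]
    for j in range(1, paths.shape[1]):
        last = last * math.exp(c0 * dt + c1 * normdist[idx])
        paths[i, j] = last
        idx = (idx + 7919) % n  # arbitrary prime stride; illustrative only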
Example #3
def get_cuda_randoms(x, y):
    rand = np.empty((x * y), np.float64)
    # rand serves as a container for the randoms
    # CUDA only fills 1-dimensional arrays
    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)
    # the argument sets the random number algorithm
    prng.normal(rand, 0, 1)  # filling the container
    rand = rand.reshape(x, y)
    # to be "fair", we reshape rand to 2 dimensions
    return rand
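# A quick sanity check of the helper above (hypothetical usage):
rand = get_cuda_randoms(1000, 50)
print(rand.shape)                # (1000, 50)
print(rand.mean(), rand.std())   # roughly 0 and 1 for standard normals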
Example #4
def spca_simpler(Vd, epsilon=0.1, d=3, k=10):
    p = Vd.shape[0]
    numSamples = int(math.ceil((4. / epsilon)**d))
    print(numSamples)
    ##actual algorithm
    opt_x = np.zeros((p, 1))
    opt_v = -np.inf

    # Prepare CUDA
    prng = curand.PRNG()
    custr = cuda.stream()

    #GENERATE ALL RANDOM SAMPLES BEFORE
    # C = np.random.randn(d, numSamples).astype(float_dtype)
    C = np.empty((d, numSamples), dtype=float_dtype)
    prng.normal(C.ravel(), mean=0, sigma=1)

    sorter = RadixSort(maxcount=Vd.shape[0],
                       dtype=Vd.dtype,
                       stream=custr,
                       descending=True)

    for i in range(1, numSamples + 1):

        #c = np.random.randn(d,1)
        #c = C[:,i-1]
        c = C[:, i - 1:i]
        c = c / np.linalg.norm(c)
        a = Vd.dot(c)

        #partial argsort in numpy?
        #if partial, kth largest is p-k th smallest
        #but need indices more than partial

        # I = np.argsort(a, axis=0)
        # val = np.linalg.norm(a[I[-k:]]) #index backwards to get k largest

        # I = sorter.argselect(a[:, 0], k=k, reverse=True)
        I = sorter.argselect(k, a[:, 0])

        # argselect is expected to sort `a` in place (descending), so its
        # first k entries are the k largest values
        val = np.linalg.norm(a[:k])

        if val > opt_v:
            opt_v = val
            opt_x = np.zeros((p, 1), dtype=float_dtype)
            opt_x[I] = a[:k] / val

    return opt_x
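# Besides the imports sketched after Example #1, this snippet needs
# `float_dtype` and `RadixSort`. Plausible definitions; the module path
# for RadixSort is a guess from the NumbaPro era:
import numpy as np
from numbapro.cudalib.sorting import RadixSort  # assumed module path

float_dtype = np.float32  # module-level dtype switch assumed by the snippet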
Example #5
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # Instantiate cuRAND PRNG
    prng = curand.PRNG(curand.PRNG.MRG32K3A)

    # Allocate a device-side array
    d_normdist = cuda.device_array(n, dtype=np.double)

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_last = cuda.to_device(paths[:, 0])
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j])
        step(d_last, dt, c0, c1, d_normdist, out=d_paths)
        d_paths.copy_to_host(paths[:, j])
        d_last = d_paths
Example #6
    def __get_cuda_randoms(self):
        # Fill the preallocated flat container with standard normal draws.
        prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)
        prng.normal(self.container, 0, 1)
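# The method above clearly lives in a class that preallocates
# `self.container`. A hypothetical enclosing class for context (the
# class name and public method are invented, not from the source):
import numpy as np
from numbapro.cudalib import curand

class CudaRandoms(object):
    def __init__(self, x, y):
        self.x, self.y = x, y
        # Flat buffer: cuRAND fills 1-dimensional arrays (cf. Example #3).
        self.container = np.empty(x * y, np.float64)

    def randoms(self):
        self.__get_cuda_randoms()  # name-mangled call resolves inside the class
        return self.container.reshape(self.x, self.y)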
Example #7
def main(*args):
    OPT_N = 4000000
    iterations = 10

    if len(args) >= 2:
        iterations = int(args[0])

    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1

    # Use cuRAND to generate random numbers directly on the GPU
    # to avoid memory transfers.
    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)

    time0 = time.time()

    # malloc
    d_stockPrice = cuda.device_array(shape=(OPT_N), dtype=np.float32)
    d_optionStrike = cuda.device_array(shape=(OPT_N), dtype=np.float32)
    d_optionYears = cuda.device_array(shape=(OPT_N), dtype=np.float32)

    # Base distribution
    prng.uniform(d_stockPrice)
    prng.uniform(d_optionStrike)
    prng.uniform(d_optionYears)

    stream = cuda.stream()

    cfg_distribute = c_distribute[griddim, blockdim, stream]

    cfg_distribute(d_stockPrice, 5.0, 30.0)
    cfg_distribute(d_optionStrike, 1.0, 100.0)
    cfg_distribute(d_optionYears, 0.25, 10.)

    stream.synchronize()

    callResultNumbapro = np.zeros(OPT_N)
    putResultNumbapro = -np.ones(OPT_N)

    d_callResult = cuda.to_device(callResultNumbapro, stream)
    d_putResult = cuda.to_device(putResultNumbapro, stream)

    time1 = time.time()

    # Preconfigure the kernel as it's called multiple times in a loop.
    cfg_black_scholes_cuda = black_scholes_cuda[griddim, blockdim, stream]

    for i in range(iterations):
        cfg_black_scholes_cuda(d_callResult, d_putResult, d_stockPrice,
                               d_optionStrike, d_optionYears, RISKFREE,
                               VOLATILITY)

        d_callResult.to_host(stream)
        d_putResult.to_host(stream)

        stream.synchronize()

    time2 = time.time()
    dt = (time1 - time0) * 10 + (time2 - time1)

    print("numbapro.cuda time: %f msec" % ((1000 * dt) / iterations))
Example #8
def get_cuda_randoms(x, y):
    rand = np.empty((x * y), np.float64)
    prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)
    prng.normal(rand, 0, 1) # filling the container
    rand = rand.reshape((x, y))
    return rand
Example #9
def spca_full(Vd, epsilon=0.1, d=3, k=10):
    p = Vd.shape[0]
    initNumSamples = int(math.ceil((4. / epsilon)**d))
    print(initNumSamples)
    maxSize = 6400

    ##actual algorithm
    opt_x = np.zeros((p, 1), dtype=float_dtype)
    opt_v = -np.inf

    # Send Vd to GPU
    dVd = cuda.to_device(Vd)

    remaining = initNumSamples

    custr = cuda.stream()

    # sorter = RadixSort(maxcount=Vd.shape[0], dtype=Vd.dtype, stream=custr,
    #                    descending=True)

    prng = curand.PRNG(stream=custr)
    while remaining:
        numSamples = min(remaining, maxSize)
        remaining -= numSamples

        # Prepare storage for vector A
        # print(Vd.dtype)
        # print('dA', (Vd.shape[0], numSamples))
        # print('dI', (k, numSamples))

        dA = cuda.device_array(shape=(Vd.shape[0], numSamples),
                               order='F',
                               dtype=Vd.dtype)
        dI = cuda.device_array(shape=(Vd.shape[0], numSamples),
                               dtype=np.uint32,
                               order='F')
        daInorm = cuda.device_array(shape=numSamples, dtype=Vd.dtype)
        dC = cuda.device_array(shape=(d, numSamples),
                               order='F',
                               dtype=Vd.dtype)

        #GENERATE ALL RANDOM SAMPLES BEFORE
        # Also do normalization on the device
        prng.normal(dC.reshape(dC.size), mean=0, sigma=1)

        norm_random_nums[calc_ncta1d(dC.shape[1], 512), 512, custr](dC, d)
        #C = dC.copy_to_host()

        # Replaces: a = Vd.dot(c)
        # XXX: Vd.shape[0] must be within compute capability requirement
        # Note: this kernel scales easily because the number of samples is
        #       used as the number of thread blocks (ncta)
        batch_matmul[numSamples, 512, custr](dVd, dC, dA)

        # Replaces: I = np.argsort(a, axis=0)
        # Note: the k-selection dominates the running time
        nn = Vd.shape[0]
        segments = (np.arange(numSamples - 1, dtype=np.int32) + 1) * nn
        blksz = 32
        init_indices[(divup(dI.shape[0], blksz), divup(dI.shape[1], blksz)),
                     (blksz, blksz), custr](dI)
        segmented_sort(dA, dI, segments, stream=custr)

        # async_dA = dA.bind(custr)
        # async_dI = dI.bind(custr)

        # selnext = sorter.batch_argselect(dtype=dA.dtype,
        #                                  count=dA.shape[0],
        #                                  k=k,
        #                                  reverse=True)
        # for i in range(numSamples):
        #     dIi = selnext(async_dA[:, i])
        #     async_dI[:, i].copy_to_device(dIi, stream=custr)

        # for i in range(numSamples):
        #     # radix_argselect(async_dA[:, i], k=k, stream=custr,
        #     #                 storeidx=async_dI[:, i])
        #     dIi = sorter.argselect(k, async_dA[:, i])
        #     async_dI[:, i].copy_to_device(dIi, stream=custr)

        # Replaces: val = np.linalg.norm(a[I[-k:]])
        # batch_scatter_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, dI,
        #                                                              daInorm)

        dA = dA.bind(custr)[-k:]
        dI = dI.bind(custr)[-k:]
        batch_norm[calc_ncta1d(numSamples, 512), 512, custr](dA, daInorm, k)

        aInorm = daInorm.copy_to_host(stream=custr)

        custr.synchronize()

        for i in range(numSamples):
            val = aInorm[i]
            if val > opt_v:
                opt_v = val
                opt_x.fill(0)

                # Only copy what we need
                Ik = dI[:, i].copy_to_host()
                aIk = dA[:, i].copy_to_host().reshape(k, 1)
                opt_x[Ik] = (aIk / val)

        # Free allocations
        del dA, dI, daInorm, dC

    return opt_x
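# This version leans on several helpers whose definitions are not shown.
# The launch-geometry ones are easy to reconstruct, and `norm_random_nums`
# presumably rescales each sample column of dC to unit L2 norm; sketches
# under those assumptions (batch_matmul, init_indices, segmented_sort and
# batch_norm are omitted):
import math
from numba import cuda

def divup(a, b):
    return (a + b - 1) // b

def calc_ncta1d(size, blksz):
    # Number of 1-D thread blocks needed to cover `size` elements.
    return divup(size, blksz)

@cuda.jit
def norm_random_nums(C, d):
    # One thread per sample column: scale the column to unit L2 norm.
    i = cuda.grid(1)
    if i < C.shape[1]:
        s = 0.0
        for r in range(d):
            s += C[r, i] * C[r, i]
        s = math.sqrt(s)
        for r in range(d):
            C[r, i] /= s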
Example #10
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2

    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [
        int(math.ceil(float(partlen) / blksz)) for partlen in partlens
    ]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [
        curand.PRNG(curand.PRNG.MRG32K3A, stream=strm) for strm in strmlist
    ]

    # Allocate device-side arrays, one per stream
    d_normlist = [
        cuda.device_array(partlen, dtype=np.double, stream=strm)
        for partlen, strm in zip(partlens, strmlist)
    ]

    c0 = interest - 0.5 * volatility**2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [
        cu_step[gridsz, blksz, strm]
        for gridsz, strm in zip(gridszlist, strmlist)
    ]

    d_lastlist = [
        cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
        for (s, e), strm in zip(partitions, strmlist)
    ]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [
            cuda.to_device(paths[s:e, j], stream=strm, to=mm.get(stream=strm))
            for (s, e), strm in zip(partitions, strmlist)
        ]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist,
                                            d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
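# The per-partition kernel only advances one time step. Assuming the same
# geometric Brownian motion update as the other examples, `cu_step` might
# look like this (a sketch, not the original definition):
import math
from numba import cuda

@cuda.jit
def cu_step(last, paths, dt, c0, c1, normdist):
    i = cuda.grid(1)
    if i < paths.shape[0]:
        paths[i] = last[i] * math.exp(c0 * dt + c1 * normdist[i])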