def make_normal_particle_array(queue, nparticles, dims, dtype, seed=15):
    """Return an object array holding one normally distributed array per axis.

    A ``RanluxGenerator`` seeded with *seed* is created on *queue*; for each
    of the *dims* axes, ``normal(queue, nparticles, dtype=dtype)`` is drawn
    from it. The draws are wrapped with ``make_obj_array`` in axis order.
    """
    from pyopencl.clrandom import RanluxGenerator

    generator = RanluxGenerator(queue, seed=seed)

    coordinates = []
    for _axis in range(dims):
        coordinates.append(generator.normal(queue, nparticles, dtype=dtype))

    return make_obj_array(coordinates)
def test_random(ctx_factory):
    """Check that ``RanluxGenerator`` samples stay inside the requested ranges.

    For several array sizes and every supported float dtype (``float64`` only
    when ``has_double_support`` reports it for the first device), the test

    * fills an array with default uniform samples and checks ``0 < x < 1``,
    * fills an array with ``a=4, b=7`` and checks ``4 < x < 7``,
    * draws a normal sample (``mu=4, sigma=3``) without checking its values.

    Finally an ``int32`` uniform draw of 10000007 values with ``a=200, b=300``
    is checked to satisfy ``200 <= x < 300``.

    :arg ctx_factory: callable returning an OpenCL context.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import RanluxGenerator

    if has_double_support(context.devices[0]):
        dtypes = [np.float32, np.float64]
    else:
        dtypes = [np.float32]

    gen = RanluxGenerator(queue, 5120)

    for ary_size in [300, 301, 302, 303, 10007]:
        for dtype in dtypes:
            ran = cl_array.zeros(queue, ary_size, dtype)
            gen.fill_uniform(ran)
            # Copy to the host once; both bound checks look at the same data.
            ran_host = ran.get()
            assert (0 < ran_host).all()
            assert (ran_host < 1).all()

            gen.synchronize(queue)

            ran = cl_array.zeros(queue, ary_size, dtype)
            gen.fill_uniform(ran, a=4, b=7)
            ran_host = ran.get()
            assert (4 < ran_host).all()
            assert (ran_host < 7).all()

            # Only exercised for "does not raise"; the values are not checked.
            ran = gen.normal(queue, (10007,), dtype, mu=4, sigma=3)

    dtypes = [np.int32]
    for dtype in dtypes:
        ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300)
        # This array is large, so a second device-to-host copy is worth
        # avoiding here in particular.
        ran_host = ran.get()
        assert (200 <= ran_host).all()
        assert (ran_host < 300).all()
def valueMonteCarloGPU(ctx, queue, S_init, nPaths, Exp_Time, dtMonte, Strike,
                       Int_Rate, Vol, PTYPE, nMonteLoops=1):
    """Estimate a discounted ``max(S - Strike, 0)`` payoff by Monte Carlo.

    Every path starts at ``S_init`` and is multiplied at each time step by
    ``exp((Int_Rate - 0.5*Vol^2)*dt + Vol*sqrt(dt)*z)``, where ``z`` comes from
    ``RanluxGenerator.fill_normal``. After the last step each path value is
    replaced by ``exp(-Int_Rate*Exp_Time) * max(S - Strike, 0)`` and the values
    are summed on the device.

    Paths are processed in batches of at most 512 work items. When ``nPaths``
    exceeds that, it is rounded up to the next multiple of 512 and the rounded
    value is used as the divisor of the mean.

    :arg ctx: OpenCL context used to build the kernels.
    :arg queue: command queue used for the arrays and the generator.
    :arg PTYPE: accepted for interface compatibility; not read by this body.
    :arg nMonteLoops: number of independent repetitions of the estimate.
    :returns: ``(mean of the per-loop estimates, dtMonte, standard deviation
        of the per-loop estimates)``.
    """
    nextStepPathKernel = ElementwiseKernel(
        ctx,
        "float *latestStep, float *ran, float Strike, float Int_Rate, float Exp_Time, float dt, float Vol",
        "float rval = exp((Int_Rate - 0.5f * Vol*Vol)*dt + Vol * sqrt(dt) * ran[i]); latestStep[i] *= rval;",
        "nextStepPathKernel")
    excersisePriceKernel = ElementwiseKernel(
        ctx,
        "float *latestStep, float Strike, float Int_Rate, float Exp_Time",
        "float rval = (latestStep[i]-Strike); latestStep[i] = exp(-Int_Rate*Exp_Time) * max(rval,0.0f);",
        "excersisePriceKernel")
    sumKernel = ReductionKernel(ctx, numpy.float32, neutral="0",
                                reduce_expr="a+b", map_expr="x[i]",
                                arguments="__global float *x")

    maxWorkItems = 1*2**9
    multiplier = 1
    if nPaths > maxWorkItems:
        # Force true division and an int result so that range() below gets an
        # integer count on every Python version.
        multiplier = int(math.ceil(nPaths / float(maxWorkItems)))
        nPaths = multiplier * maxWorkItems
    else:
        maxWorkItems = nPaths

    nTimeStepsMonte = int(math.ceil(Exp_Time / float(dtMonte)))

    # Seeded from the wall clock, so results differ from run to run.
    gen = RanluxGenerator(queue, maxWorkItems, luxury=4, seed=time.time())

    ran = cl.array.zeros(queue, maxWorkItems, numpy.float32)
    latestStep = cl.array.zeros_like(ran)

    means = numpy.zeros(nMonteLoops)

    for loop in range(nMonteLoops):
        theSum = 0
        for mult in range(multiplier):
            latestStep.fill(S_init)
            for t in range(nTimeStepsMonte):
                gen.fill_normal(ran)
                gen.synchronize(queue)
                nextStepPathKernel(latestStep, ran, Strike, Int_Rate,
                                   Exp_Time, dtMonte, Vol)
            excersisePriceKernel(latestStep, Strike, Int_Rate, Exp_Time)
            # Accumulate this batch's discounted payoff total.
            theSum += sumKernel(latestStep, queue).get()
        means[loop] = theSum / nPaths

    monteAverage = numpy.mean(means)
    monteStdDeviation = numpy.std(means)
    return monteAverage, dtMonte, monteStdDeviation
def test_random_int_in_range(ctx_factory, dtype):
    """Check that integer uniform samples fall in the half-open range [200, 300).

    A ``RanluxGenerator`` is asked for 10000007 values of *dtype* with
    ``a=200, b=300``. The test is marked as an expected failure for
    ``np.int64`` on platforms whose vendor string starts with
    "Advanced Micro".

    :arg ctx_factory: callable returning an OpenCL context.
    :arg dtype: integer dtype of the generated samples.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import RanluxGenerator
    gen = RanluxGenerator(queue, 5120)

    if (dtype == np.int64
            and context.devices[0].platform.vendor.startswith("Advanced Micro")):
        pytest.xfail("AMD miscompiles 64-bit RNG math")

    ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300)

    # The array is large: copy it to the host once and reuse the copy for
    # both bound checks instead of transferring it twice.
    ran_host = ran.get()
    assert (200 <= ran_host).all()
    assert (ran_host < 300).all()
def test_sort(ctx_factory, scan_kernel):
    """Compare ``RadixSort`` with ``np.sort`` on random 16-bit keys.

    For every size in ``scan_test_counts`` except the last, random ``int32``
    keys in ``[0, 2**16)`` are generated on the device, sorted there and on
    the host, timed, and compared element-wise. Sizes of 2000 and above are
    skipped when *scan_kernel* is a ``GenericDebugScanKernel``.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtype = np.int32

    from pyopencl.algorithm import RadixSort
    radix_sort = RadixSort(
        context, "int *ary", key_expr="ary[i]",
        sort_arg_names=["ary"], scan_kernel=scan_kernel)

    from pyopencl.clrandom import RanluxGenerator
    generator = RanluxGenerator(queue, seed=15)

    from time import time

    # intermediate arrays for largest size cause out-of-memory on low-end GPUs
    for n in scan_test_counts[:-1]:
        too_big_for_debug_scan = (
            n >= 2000 and isinstance(scan_kernel, GenericDebugScanKernel))
        if too_big_for_debug_scan:
            continue

        print(n)

        print(" rng")
        keys_dev = generator.uniform(queue, (n, ), dtype=dtype, a=0, b=2**16)
        keys_host = keys_dev.get()

        t_start = time()
        print(" device")
        sorted_arrays, evt = radix_sort(keys_dev, key_bits=16)
        (keys_dev_sorted, ) = sorted_arrays
        queue.finish()
        t_device_done = time()

        print(" numpy")
        keys_host_sorted = np.sort(keys_host)
        t_numpy_done = time()

        numpy_elapsed = t_numpy_done - t_device_done
        dev_elapsed = t_device_done - t_start
        report = (
            1e-6 * n / dev_elapsed,
            1e-6 * n / numpy_elapsed,
            numpy_elapsed / dev_elapsed)
        print(" dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" % report)

        assert (keys_dev_sorted.get() == keys_host_sorted).all()
def test_sort(ctx_factory):
    """Compare ``RadixSort`` with ``np.sort`` on random 16-bit keys.

    For every size in ``scan_test_counts`` except the last, random ``int32``
    keys in ``[0, 2**16)`` are generated on the device, sorted on device and
    host, timed, and compared element-wise.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtype = np.int32

    from pyopencl.algorithm import RadixSort
    device_sorter = RadixSort(
        context, "int *ary", key_expr="ary[i]", sort_arg_names=["ary"])

    from pyopencl.clrandom import RanluxGenerator
    ranlux = RanluxGenerator(queue, seed=15)

    from time import time

    # intermediate arrays for largest size cause out-of-memory on low-end GPUs
    sizes = scan_test_counts[:-1]
    for n in sizes:
        print(n)

        print(" rng")
        unsorted_dev = ranlux.uniform(queue, (n,), dtype=dtype, a=0, b=2 ** 16)
        unsorted_host = unsorted_dev.get()

        device_begin = time()
        print(" device")
        outputs, evt = device_sorter(unsorted_dev, key_bits=16)
        (sorted_dev,) = outputs
        queue.finish()
        device_finish = time()

        print(" numpy")
        sorted_host = np.sort(unsorted_host)
        host_finish = time()

        numpy_elapsed = host_finish - device_finish
        dev_elapsed = device_finish - device_begin

        dev_rate = 1e-6 * n / dev_elapsed
        numpy_rate = 1e-6 * n / numpy_elapsed
        speed_ratio = numpy_elapsed / dev_elapsed
        print(
            " dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx"
            % (dev_rate, numpy_rate, speed_ratio)
        )

        assert (sorted_dev.get() == sorted_host).all()
def test_sort(ctx_factory):
    """Compare ``RadixSort`` with ``np.sort`` on random 16-bit keys.

    For every size in ``scan_test_counts``, random ``int32`` keys in
    ``[0, 2**16)`` are generated on the device, sorted on device and host,
    timed, and compared element-wise.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtype = np.int32

    from pyopencl.algorithm import RadixSort
    sorter = RadixSort(
        context, "int *ary", key_expr="ary[i]", sort_arg_names=["ary"])

    from pyopencl.clrandom import RanluxGenerator
    key_source = RanluxGenerator(queue, seed=15)

    from time import time

    for n in scan_test_counts:
        print(n)

        print(" rng")
        keys_on_device = key_source.uniform(
            queue, (n,), dtype=dtype, a=0, b=2**16)
        keys_on_host = keys_on_device.get()

        started = time()
        print(" device")
        # The sorter call is expected to yield exactly one array here.
        (device_result,) = sorter(keys_on_device, key_bits=16)
        queue.finish()
        device_done = time()

        print(" numpy")
        host_result = np.sort(keys_on_host)
        host_done = time()

        numpy_elapsed = host_done - device_done
        dev_elapsed = device_done - started
        figures = (
            1e-6*n/dev_elapsed,
            1e-6*n/numpy_elapsed,
            numpy_elapsed/dev_elapsed)
        print(" dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" % figures)

        assert (device_result.get() == host_result).all()
def make_normal_particle_array(queue, nparticles, dims, dtype, seed=15):
    """Draw *dims* normally distributed coordinate arrays of length *nparticles*.

    All draws come from a single ``RanluxGenerator`` created on *queue* with
    the given *seed*; the per-axis arrays are combined with
    ``make_obj_array``.
    """
    from pyopencl.clrandom import RanluxGenerator

    ranlux = RanluxGenerator(queue, seed=seed)

    def draw_axis():
        # One coordinate array for a single axis.
        return ranlux.normal(queue, nparticles, dtype=dtype)

    return make_obj_array([draw_axis() for _ in range(dims)])
# STARTEXAMPLE import pyopencl as cl import numpy as np from six.moves import range ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) dims = 2 nparticles = 10**4 # ----------------------------------------------------------------------------- # generate some random particle positions # ----------------------------------------------------------------------------- from pyopencl.clrandom import RanluxGenerator rng = RanluxGenerator(queue, seed=15) from pytools.obj_array import make_obj_array particles = make_obj_array( [rng.normal(queue, nparticles, dtype=np.float64) for i in range(dims)]) # ----------------------------------------------------------------------------- # build tree and traversals (lists) # ----------------------------------------------------------------------------- from boxtree import TreeBuilder tb = TreeBuilder(ctx) tree, _ = tb(queue, particles, max_particles_in_box=30) from boxtree.traversal import FMMTraversalBuilder tg = FMMTraversalBuilder(ctx) trav, _ = tg(queue, tree)
def test_pyfmmlib_fmm(ctx_getter):
    """Compare an FMM evaluation through the pyfmmlib wrangler with a direct sum.

    3000 sources and 1000 targets (shifted by ``[2, 0]``) are generated in 2D,
    a tree and traversal are built, and ``drive_fmm`` is run with a
    ``Helmholtz2DExpansionWrangler`` (``helmholtz_k = 2``, ``nterms=10``).
    The result must agree with ``hpotgrad2dall_vec`` to a relative l2 error
    below ``1e-5``. The test is skipped when ``pyfmmlib`` cannot be imported.
    """
    logging.basicConfig(level=logging.INFO)

    from pytest import importorskip
    importorskip("pyfmmlib")

    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    nsources = 3000
    ntargets = 1000
    dims = 2
    dtype = np.float64

    helmholtz_k = 2

    sources = p_normal(queue, nsources, dims, dtype, seed=15)
    unshifted_targets = p_normal(queue, ntargets, dims, dtype, seed=18)
    targets = unshifted_targets + np.array([2, 0])

    sources_host = particle_array_to_host(sources)
    targets_host = particle_array_to_host(targets)

    from boxtree import TreeBuilder
    build_tree = TreeBuilder(ctx)
    tree, _ = build_tree(
        queue, sources, targets=targets,
        max_particles_in_box=30, debug=True)

    from boxtree.traversal import FMMTraversalBuilder
    build_traversal = FMMTraversalBuilder(ctx)
    device_trav, _ = build_traversal(queue, tree, debug=True)
    host_trav = device_trav.get(queue=queue)

    from pyopencl.clrandom import RanluxGenerator
    weight_rng = RanluxGenerator(queue, seed=20)
    weights_dev = weight_rng.uniform(queue, nsources, dtype=np.float64)
    weights = weights_dev.get()

    logger.info("computing direct (reference) result")

    from pyfmmlib import hpotgrad2dall_vec
    direct_result = hpotgrad2dall_vec(
        ifgrad=False, ifhess=False,
        sources=sources_host.T, charge=weights,
        targets=targets_host.T, zk=helmholtz_k)
    # Only the first of the three returned values is compared.
    ref_pot, _, _ = direct_result

    from boxtree.pyfmmlib_integration import Helmholtz2DExpansionWrangler
    wrangler = Helmholtz2DExpansionWrangler(
        host_trav.tree, helmholtz_k, nterms=10)

    from boxtree.fmm import drive_fmm
    pot = drive_fmm(host_trav, wrangler, weights)

    error_norm = la.norm(pot - ref_pot)
    rel_err = error_norm / la.norm(ref_pot)
    logger.info("relative l2 error: %g" % rel_err)
    assert rel_err < 1e-5
def test_fmm_completeness(
    ctx_getter, dims, nsources_req, ntargets_req, who_has_extent, source_gen, target_gen, filter_kind
):
    """Tests whether the built FMM traversal structures and driver completely
    capture all interactions.

    The FMM is driven with one of the ``ConstantOneExpansionWrangler*``
    classes and random source weights. The test passes when every retained
    potential equals the sum of all weights, measured as
    ``norm((pot - weights_sum) / nsources) < 1e-8``.

    :arg ctx_getter: callable returning an OpenCL context.
    :arg dims: passed through to *source_gen* / *target_gen*.
    :arg nsources_req: requested source count; the actual count is taken from
        the generated array.
    :arg ntargets_req: requested target count, or *None* to hand
        ``targets=None`` to the tree builder.
    :arg who_has_extent: string; containing ``"s"`` / ``"t"`` gives random
        radii to sources / targets.
    :arg source_gen: called as ``gen(queue, n, dims, dtype, seed=...)``.
    :arg target_gen: same calling convention as *source_gen*.
    :arg filter_kind: falsy for no target filtering, else ``"user"`` or
        ``"tree"``; any other truthy value raises :exc:`ValueError`.
    """
    sources_have_extent = "s" in who_has_extent
    targets_have_extent = "t" in who_has_extent

    logging.basicConfig(level=logging.INFO)

    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    dtype = np.float64

    # Particle generation may need an optional dependency; skip if missing.
    try:
        sources = source_gen(queue, nsources_req, dims, dtype, seed=15)
        nsources = len(sources[0])

        if ntargets_req is None:
            # This says "same as sources" to the tree builder.
            targets = None
            ntargets = ntargets_req
        else:
            targets = target_gen(queue, ntargets_req, dims, dtype, seed=16)
            ntargets = len(targets[0])
    except ImportError:
        pytest.skip("loo.py not available, but needed for particle array " "generation")

    from pyopencl.clrandom import RanluxGenerator

    rng = RanluxGenerator(queue, seed=13)
    # Radii are 2**u with u drawn from rng.uniform(a=-10, b=0).
    if sources_have_extent:
        source_radii = 2 ** rng.uniform(queue, nsources, dtype=dtype, a=-10, b=0)
    else:
        source_radii = None

    # NOTE(review): ntargets is None when ntargets_req is None; confirm that
    # targets-with-extent is never combined with ntargets_req=None.
    if targets_have_extent:
        target_radii = 2 ** rng.uniform(queue, ntargets, dtype=dtype, a=-10, b=0)
    else:
        target_radii = None

    from boxtree import TreeBuilder

    tb = TreeBuilder(ctx)

    tree, _ = tb(
        queue,
        sources,
        targets=targets,
        max_particles_in_box=30,
        source_radii=source_radii,
        target_radii=target_radii,
        debug=True,
    )
    # Disabled debugging aid: plot the tree.
    if 0:
        tree.get().plot()
        import matplotlib.pyplot as pt

        pt.show()

    from boxtree.traversal import FMMTraversalBuilder

    tbuild = FMMTraversalBuilder(ctx)
    trav, _ = tbuild(queue, tree, debug=True)
    # Merge the separate close lists when the traversal has them.
    if trav.sep_close_smaller_starts is not None:
        trav = trav.merge_close_lists(queue)

    # Host-side random weights (uses numpy's global RNG state, unseeded here).
    weights = np.random.randn(nsources)
    # weights = np.ones(nsources)
    weights_sum = np.sum(weights)

    host_trav = trav.get(queue=queue)
    host_tree = host_trav.tree

    # Pick the wrangler; with filtering, random 0/1 flags select the targets.
    if filter_kind:
        flags = rng.uniform(queue, ntargets or nsources, np.int32, a=0, b=2).astype(np.int8)
        if filter_kind == "user":
            from boxtree.tree import filter_target_lists_in_user_order

            filtered_targets = filter_target_lists_in_user_order(queue, tree, flags)
            wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInUserOrder(
                host_tree, filtered_targets.get(queue=queue)
            )
        elif filter_kind == "tree":
            from boxtree.tree import filter_target_lists_in_tree_order

            filtered_targets = filter_target_lists_in_tree_order(queue, tree, flags)
            wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInTreeOrder(
                host_tree, filtered_targets.get(queue=queue)
            )
        else:
            raise ValueError("unsupported value of 'filter_kind'")
    else:
        wrangler = ConstantOneExpansionWrangler(host_tree)

    if ntargets is None and not filter_kind:
        # This check only works for targets == sources.
        assert (wrangler.reorder_potentials(wrangler.reorder_sources(weights)) == weights).all()

    from boxtree.fmm import drive_fmm

    pot = drive_fmm(host_trav, wrangler, weights)

    # {{{ build, evaluate matrix (and identify missing interactions)

    # Disabled debugging aid: assemble the full interaction matrix one unit
    # vector at a time and plot the source/target pairs whose entry is zero.
    if 0:
        mat = np.zeros((ntargets, nsources), dtype)
        from pytools import ProgressBar

        # Quieten logging while the FMM is driven once per source.
        logging.getLogger().setLevel(logging.WARNING)

        pb = ProgressBar("matrix", nsources)
        for i in range(nsources):
            unit_vec = np.zeros(nsources, dtype=dtype)
            unit_vec[i] = 1
            mat[:, i] = drive_fmm(host_trav, wrangler, unit_vec)
            pb.progress()
        pb.finished()

        logging.getLogger().setLevel(logging.INFO)

        import matplotlib.pyplot as pt

        if 1:
            pt.spy(mat)
            pt.show()

        missing_tgts, missing_srcs = np.where(mat == 0)

        if 1 and len(missing_tgts):
            from boxtree.visualization import TreePlotter

            plotter = TreePlotter(host_tree)
            plotter.draw_tree(fill=False, edgecolor="black")
            plotter.draw_box_numbers()
            plotter.set_bounding_box()

            tree_order_missing_tgts = host_tree.indices_to_tree_target_order(missing_tgts)
            tree_order_missing_srcs = host_tree.indices_to_tree_source_order(missing_srcs)

            src_boxes = [host_tree.find_box_nr_for_source(i) for i in tree_order_missing_srcs]
            tgt_boxes = [host_tree.find_box_nr_for_target(i) for i in tree_order_missing_tgts]
            print(src_boxes)
            print(tgt_boxes)

            pt.plot(host_tree.targets[0][tree_order_missing_tgts], host_tree.targets[1][tree_order_missing_tgts], "rv")
            pt.plot(host_tree.sources[0][tree_order_missing_srcs], host_tree.sources[1][tree_order_missing_srcs], "go")
            pt.gca().set_aspect("equal")

            pt.show()

    # }}}

    # With filtering, only the flagged targets are compared.
    if filter_kind:
        pot = pot[flags.get() > 0]

    rel_err = la.norm((pot - weights_sum) / nsources)
    good = rel_err < 1e-8
    # Disabled debugging aid: plot the per-target error.
    if 0 and not good:
        import matplotlib.pyplot as pt

        pt.plot(pot - weights_sum)
        pt.show()

    # Disabled debugging aid: plot which filtered targets are off.
    if 0 and not good:
        import matplotlib.pyplot as pt

        filt_targets = [host_tree.targets[0][flags.get() > 0], host_tree.targets[1][flags.get() > 0]]
        host_tree.plot()
        bad = np.abs(pot - weights_sum) >= 1e-3
        bad_targets = [filt_targets[0][bad], filt_targets[1][bad]]
        print(bad_targets[0].shape)
        pt.plot(filt_targets[0], filt_targets[1], "x")
        pt.plot(bad_targets[0], bad_targets[1], "v")
        pt.show()

    assert good
def setup_rng(self):
    """Create a ``RanluxGenerator`` on ``self.cl_queue`` and keep it as ``self.rng``."""
    generator = RanluxGenerator(self.cl_queue)
    self.rng = generator
# STARTEXAMPLE import pyopencl as cl import numpy as np from six.moves import range ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) dims = 2 nparticles = 500 # ----------------------------------------------------------------------------- # generate some random particle positions # ----------------------------------------------------------------------------- from pyopencl.clrandom import RanluxGenerator rng = RanluxGenerator(queue, seed=15) from pytools.obj_array import make_obj_array particles = make_obj_array([ rng.normal(queue, nparticles, dtype=np.float64) for i in range(dims)]) # ----------------------------------------------------------------------------- # build tree and traversals (lists) # ----------------------------------------------------------------------------- from boxtree import TreeBuilder tb = TreeBuilder(ctx) tree, _ = tb(queue, particles, max_particles_in_box=5) from boxtree.traversal import FMMTraversalBuilder tg = FMMTraversalBuilder(ctx)
if ((local_index & mask) == 0) { float other = scratch[local_index + offset]; float mine = scratch[local_index]; scratch[local_index] = (mine < other) ? mine : other; } barrier(CLK_LOCAL_MEM_FENCE); } if (local_index == 0) { result[get_group_id(0)] = scratch[0]; } } """ gen = RanluxGenerator(queue, nPaths, luxury=4, seed=time.time()) #gen = RanluxGenerator(queue, nPaths, luxury=4) ran = cl.array.zeros(queue, nPaths, numpy.float32) latestStep = cl.array.empty_like(ran) averages = numpy.zeros(nLoops) #averages = cl.array.zeros(queue, nLoops, numpy.float32) tStartMonte = time.time() theSum = 0 for loop in range(0, nLoops): latestStep.fill(S_init)
def test_pyfmmlib_fmm(ctx_getter):
    """Check the pyfmmlib-backed FMM against a direct Helmholtz evaluation.

    Sources (3000) and targets (1000, shifted by ``[2, 0]``) are drawn in 2D.
    The potential from ``drive_fmm`` with a ``Helmholtz2DExpansionWrangler``
    (``nterms=10``, wave number 2) is compared with the potential returned by
    ``hpotgrad2dall_vec``; the relative l2 error must be below ``1e-5``.
    Skipped when ``pyfmmlib`` is not importable.
    """
    logging.basicConfig(level=logging.INFO)

    from pytest import importorskip
    importorskip("pyfmmlib")

    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    nsources, ntargets = 3000, 1000
    dims = 2
    dtype = np.float64
    helmholtz_k = 2

    sources = p_normal(queue, nsources, dims, dtype, seed=15)
    targets = p_normal(queue, ntargets, dims, dtype, seed=18)
    targets = targets + np.array([2, 0])

    sources_host = particle_array_to_host(sources)
    targets_host = particle_array_to_host(targets)

    from boxtree import TreeBuilder
    tree_builder = TreeBuilder(ctx)
    tree_and_extra = tree_builder(
        queue, sources, targets=targets,
        max_particles_in_box=30, debug=True)
    tree, _ = tree_and_extra

    from boxtree.traversal import FMMTraversalBuilder
    trav_builder = FMMTraversalBuilder(ctx)
    trav_and_extra = trav_builder(queue, tree, debug=True)
    trav, _ = trav_and_extra
    # Everything downstream works on the host copy of the traversal.
    trav = trav.get(queue=queue)

    from pyopencl.clrandom import RanluxGenerator
    charge_rng = RanluxGenerator(queue, seed=20)
    weights = charge_rng.uniform(queue, nsources, dtype=np.float64).get()

    logger.info("computing direct (reference) result")

    from pyfmmlib import hpotgrad2dall_vec
    direct_kwargs = dict(
        ifgrad=False, ifhess=False,
        sources=sources_host.T, charge=weights,
        targets=targets_host.T, zk=helmholtz_k)
    ref_pot, _, _ = hpotgrad2dall_vec(**direct_kwargs)

    from boxtree.pyfmmlib_integration import Helmholtz2DExpansionWrangler
    wrangler = Helmholtz2DExpansionWrangler(trav.tree, helmholtz_k, nterms=10)

    from boxtree.fmm import drive_fmm
    pot = drive_fmm(trav, wrangler, weights)

    difference = pot - ref_pot
    rel_err = la.norm(difference) / la.norm(ref_pot)
    logger.info("relative l2 error: %g" % rel_err)
    assert rel_err < 1e-5
def test_fmm_completeness(ctx_getter, dims, nsources_req, ntargets_req,
         who_has_extent, source_gen, target_gen, filter_kind):
    """Tests whether the built FMM traversal structures and driver completely
    capture all interactions.

    The FMM is driven with one of the ``ConstantOneExpansionWrangler*``
    classes and random source weights. The test passes when every retained
    potential equals the sum of all weights, measured as
    ``norm((pot - weights_sum) / nsources) < 1e-8``.

    :arg ctx_getter: callable returning an OpenCL context.
    :arg dims: passed through to *source_gen* / *target_gen*.
    :arg nsources_req: requested source count; the actual count is taken from
        the generated array.
    :arg ntargets_req: requested target count, or *None* to hand
        ``targets=None`` to the tree builder.
    :arg who_has_extent: string; containing ``"s"`` / ``"t"`` gives random
        radii to sources / targets.
    :arg source_gen: called as ``gen(queue, n, dims, dtype, seed=...)``.
    :arg target_gen: same calling convention as *source_gen*.
    :arg filter_kind: falsy for no target filtering, else ``"user"`` or
        ``"tree"``; any other truthy value raises :exc:`ValueError`.
    """

    sources_have_extent = "s" in who_has_extent
    targets_have_extent = "t" in who_has_extent

    logging.basicConfig(level=logging.INFO)

    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    dtype = np.float64

    # Particle generation may need an optional dependency; skip if missing.
    try:
        sources = source_gen(queue, nsources_req, dims, dtype, seed=15)
        nsources = len(sources[0])

        if ntargets_req is None:
            # This says "same as sources" to the tree builder.
            targets = None
            ntargets = ntargets_req
        else:
            targets = target_gen(queue, ntargets_req, dims, dtype, seed=16)
            ntargets = len(targets[0])
    except ImportError:
        pytest.skip("loo.py not available, but needed for particle array "
                "generation")

    from pyopencl.clrandom import RanluxGenerator
    rng = RanluxGenerator(queue, seed=13)
    # Radii are 2**u with u drawn from rng.uniform(a=-10, b=0).
    if sources_have_extent:
        source_radii = 2**rng.uniform(queue, nsources, dtype=dtype,
                a=-10, b=0)
    else:
        source_radii = None

    # NOTE(review): ntargets is None when ntargets_req is None; confirm that
    # targets-with-extent is never combined with ntargets_req=None.
    if targets_have_extent:
        target_radii = 2**rng.uniform(queue, ntargets, dtype=dtype,
                a=-10, b=0)
    else:
        target_radii = None

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    tree, _ = tb(queue, sources, targets=targets,
            max_particles_in_box=30,
            source_radii=source_radii, target_radii=target_radii,
            debug=True)
    # Disabled debugging aid: plot the tree.
    if 0:
        tree.get().plot()
        import matplotlib.pyplot as pt
        pt.show()

    from boxtree.traversal import FMMTraversalBuilder
    tbuild = FMMTraversalBuilder(ctx)
    trav, _ = tbuild(queue, tree, debug=True)
    # Merge the separate close lists when the traversal has them.
    if trav.sep_close_smaller_starts is not None:
        trav = trav.merge_close_lists(queue)

    # Host-side random weights (uses numpy's global RNG state, unseeded here).
    weights = np.random.randn(nsources)
    #weights = np.ones(nsources)
    weights_sum = np.sum(weights)

    host_trav = trav.get(queue=queue)
    host_tree = host_trav.tree

    # Pick the wrangler; with filtering, random 0/1 flags select the targets.
    if filter_kind:
        flags = rng.uniform(queue, ntargets or nsources, np.int32, a=0, b=2) \
                .astype(np.int8)
        if filter_kind == "user":
            from boxtree.tree import filter_target_lists_in_user_order
            filtered_targets = filter_target_lists_in_user_order(
                    queue, tree, flags)
            wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInUserOrder(
                    host_tree, filtered_targets.get(queue=queue))
        elif filter_kind == "tree":
            from boxtree.tree import filter_target_lists_in_tree_order
            filtered_targets = filter_target_lists_in_tree_order(
                    queue, tree, flags)
            wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInTreeOrder(
                    host_tree, filtered_targets.get(queue=queue))
        else:
            raise ValueError("unsupported value of 'filter_kind'")
    else:
        wrangler = ConstantOneExpansionWrangler(host_tree)

    if ntargets is None and not filter_kind:
        # This check only works for targets == sources.
        assert (wrangler.reorder_potentials(
                wrangler.reorder_sources(weights)) == weights).all()

    from boxtree.fmm import drive_fmm
    pot = drive_fmm(host_trav, wrangler, weights)

    # {{{ build, evaluate matrix (and identify missing interactions)

    # Disabled debugging aid: assemble the full interaction matrix one unit
    # vector at a time and plot the source/target pairs whose entry is zero.
    if 0:
        mat = np.zeros((ntargets, nsources), dtype)
        from pytools import ProgressBar

        # Quieten logging while the FMM is driven once per source.
        logging.getLogger().setLevel(logging.WARNING)

        pb = ProgressBar("matrix", nsources)
        for i in range(nsources):
            unit_vec = np.zeros(nsources, dtype=dtype)
            unit_vec[i] = 1
            mat[:, i] = drive_fmm(host_trav, wrangler, unit_vec)
            pb.progress()
        pb.finished()

        logging.getLogger().setLevel(logging.INFO)

        import matplotlib.pyplot as pt

        if 1:
            pt.spy(mat)
            pt.show()

        missing_tgts, missing_srcs = np.where(mat == 0)

        if 1 and len(missing_tgts):
            from boxtree.visualization import TreePlotter
            plotter = TreePlotter(host_tree)
            plotter.draw_tree(fill=False, edgecolor="black")
            plotter.draw_box_numbers()
            plotter.set_bounding_box()

            tree_order_missing_tgts = \
                    host_tree.indices_to_tree_target_order(missing_tgts)
            tree_order_missing_srcs = \
                    host_tree.indices_to_tree_source_order(missing_srcs)

            src_boxes = [
                    host_tree.find_box_nr_for_source(i)
                    for i in tree_order_missing_srcs
                    ]
            tgt_boxes = [
                    host_tree.find_box_nr_for_target(i)
                    for i in tree_order_missing_tgts
                    ]
            print(src_boxes)
            print(tgt_boxes)

            pt.plot(host_tree.targets[0][tree_order_missing_tgts],
                    host_tree.targets[1][tree_order_missing_tgts],
                    "rv")
            pt.plot(host_tree.sources[0][tree_order_missing_srcs],
                    host_tree.sources[1][tree_order_missing_srcs],
                    "go")
            pt.gca().set_aspect("equal")

            pt.show()

    # }}}

    # With filtering, only the flagged targets are compared.
    if filter_kind:
        pot = pot[flags.get() > 0]

    rel_err = la.norm((pot - weights_sum) / nsources)
    good = rel_err < 1e-8
    # Disabled debugging aid: plot the per-target error.
    if 0 and not good:
        import matplotlib.pyplot as pt
        pt.plot(pot - weights_sum)
        pt.show()

    # Disabled debugging aid: plot which filtered targets are off.
    if 0 and not good:
        import matplotlib.pyplot as pt
        filt_targets = [
                host_tree.targets[0][flags.get() > 0],
                host_tree.targets[1][flags.get() > 0],
                ]
        host_tree.plot()
        bad = np.abs(pot - weights_sum) >= 1e-3
        bad_targets = [
                filt_targets[0][bad],
                filt_targets[1][bad],
                ]
        print(bad_targets[0].shape)
        pt.plot(filt_targets[0], filt_targets[1], "x")
        pt.plot(bad_targets[0], bad_targets[1], "v")
        pt.show()

    assert good
def plot_traversal(ctx_getter, do_plot=False):
    """Build a tree and FMM traversal for random 2D particles and plot one list.

    The tree is drawn with ``TreePlotter``; then five randomly chosen boxes
    (Python ``random`` seeded with 7) are drawn in red together with the
    boxes on one of their traversal lists in yellow. Which list is shown is
    picked by editing the ``if 0 / elif 1`` chain below; as written, the
    "separated smaller" lists are drawn.

    :arg ctx_getter: callable returning an OpenCL context.
    :arg do_plot: not read by the active code (only by a commented-out line).
    """
    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    #for dims in [2, 3]:
    for dims in [2]:
        nparticles = 10**4
        dtype = np.float64

        from pyopencl.clrandom import RanluxGenerator
        rng = RanluxGenerator(queue, seed=15)

        from pytools.obj_array import make_obj_array
        # One normally distributed coordinate array per axis.
        particles = make_obj_array([
            rng.normal(queue, nparticles, dtype=dtype)
            for i in range(dims)])

        #if do_plot:
            #pt.plot(particles[0].get(), particles[1].get(), "x")

        from boxtree import TreeBuilder
        tb = TreeBuilder(ctx)

        queue.finish()
        # NOTE(review): the builder result is used directly as the tree here,
        # without unpacking a tuple; confirm this matches the TreeBuilder
        # return type of the boxtree version in use.
        tree = tb(queue, particles, max_particles_in_box=30, debug=True)

        from boxtree.traversal import FMMTraversalBuilder
        tg = FMMTraversalBuilder(ctx)
        # NOTE(review): same question for the traversal builder's return value.
        trav = tg(queue, tree).get()

        from boxtree.visualization import TreePlotter
        plotter = TreePlotter(tree)
        plotter.draw_tree(fill=False, edgecolor="black")
        #plotter.draw_box_numbers()
        plotter.set_bounding_box()

        from random import randrange, seed
        seed(7)  # fixed seed so the same boxes are picked on every run

        # {{{ generic box drawing helper

        def draw_some_box_lists(starts, lists, key_to_box=None,
                count=5):
            # Draw *count* randomly chosen boxes (red) with their non-empty
            # list entries (yellow). *starts*/*lists* form a CSR-style pair:
            # entries for key k are lists[starts[k]:starts[k+1]]. When
            # *key_to_box* is given, keys index into it to obtain the box.
            # NOTE(review): loops forever if every candidate list is empty.
            actual_count = 0
            while actual_count < count:
                if key_to_box is not None:
                    key = randrange(len(key_to_box))
                    ibox = key_to_box[key]
                else:
                    key = ibox = randrange(tree.nboxes)

                start, end = starts[key:key+2]
                if start == end:
                    # Empty list for this key: pick another one.
                    continue

                #print ibox, start, end, lists[start:end]
                for jbox in lists[start:end]:
                    plotter.draw_box(jbox, facecolor='yellow')

                plotter.draw_box(ibox, facecolor='red')

                actual_count += 1

        # }}}

        # Manual switch: exactly one branch runs (the first with a truthy
        # condition); edit the 0/1 constants to select another list.
        if 0:
            # colleagues
            draw_some_box_lists(
                    trav.colleagues_starts,
                    trav.colleagues_lists)
        elif 0:
            # near neighbors ("list 1")
            draw_some_box_lists(
                    trav.neighbor_leaves_starts,
                    trav.neighbor_leaves_lists,
                    key_to_box=trav.source_boxes)
        elif 0:
            # well-separated siblings (list 2)
            draw_some_box_lists(
                    trav.sep_siblings_starts,
                    trav.sep_siblings_lists)
        elif 1:
            # separated smaller (list 3)
            draw_some_box_lists(
                    trav.sep_smaller_starts,
                    trav.sep_smaller_lists,
                    key_to_box=trav.source_boxes)
        elif 1:
            # separated bigger (list 4)
            draw_some_box_lists(
                    trav.sep_bigger_starts,
                    trav.sep_bigger_lists)

        import matplotlib.pyplot as pt
        pt.show()
def test_extent_tree(ctx_getter, dims, do_plot=False):
    """Build a tree for sources/targets with radii and check its invariants.

    Checks performed on the host copy of the tree:

    * sorted sources/targets and their radii are permutations of the inputs
      as described by ``user_source_ids`` / ``sorted_target_ids``;
    * every box extent lies inside the bounding box (with a small tolerance);
    * cumulative particle counts equal the non-child count plus the children's
      cumulative counts;
    * every particle, enlarged by its radius, stays within its box extent
      enlarged by ``stick_out_factor * box_radius``.

    Finally, random point sources are generated around each source and linked
    into the device tree with ``link_point_sources``.

    :arg ctx_getter: callable returning an OpenCL context.
    :arg dims: number of coordinate axes of the generated particles.
    :arg do_plot: not read by this body.
    """
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    nsources = 100000
    ntargets = 200000
    dtype = np.float64
    npoint_sources_per_source = 16

    sources = make_normal_particle_array(queue, nsources, dims, dtype,
            seed=12)
    targets = make_normal_particle_array(queue, ntargets, dims, dtype,
            seed=19)

    from pyopencl.clrandom import RanluxGenerator
    rng = RanluxGenerator(queue, seed=13)
    # Radii are 2**u with u drawn from rng.uniform(a=-10, b=0).
    source_radii = 2**rng.uniform(queue, nsources, dtype=dtype,
            a=-10, b=0)
    target_radii = 2**rng.uniform(queue, ntargets, dtype=dtype,
            a=-10, b=0)

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    queue.finish()
    dev_tree, _ = tb(queue, sources, targets=targets,
            source_radii=source_radii, target_radii=target_radii,
            max_particles_in_box=10, debug=True)

    logger.info("transfer tree, check orderings")

    tree = dev_tree.get(queue=queue)

    # Tree-ordered data as stored on the (host) tree ...
    sorted_sources = np.array(list(tree.sources))
    sorted_targets = np.array(list(tree.targets))
    sorted_source_radii = tree.source_radii
    sorted_target_radii = tree.target_radii

    # ... and the original, user-ordered inputs copied to the host.
    unsorted_sources = np.array([pi.get() for pi in sources])
    unsorted_targets = np.array([pi.get() for pi in targets])
    unsorted_source_radii = source_radii.get()
    unsorted_target_radii = target_radii.get()

    assert (sorted_sources
            == unsorted_sources[:, tree.user_source_ids]).all()
    assert (sorted_source_radii
            == unsorted_source_radii[tree.user_source_ids]).all()

    # {{{ test box structure, stick-out criterion

    logger.info("test box structure, stick-out criterion")

    # Invert the sorted_target_ids permutation to map tree order -> user order.
    user_target_ids = np.empty(tree.ntargets, dtype=np.intp)
    user_target_ids[tree.sorted_target_ids] = np.arange(tree.ntargets, dtype=np.intp)
    if ntargets:
        assert (sorted_targets
                == unsorted_targets[:, user_target_ids]).all()
        assert (sorted_target_radii
                == unsorted_target_radii[user_target_ids]).all()

    all_good_so_far = True

    # {{{ check sources, targets

    for ibox in range(tree.nboxes):
        extent_low, extent_high = tree.get_box_extent(ibox)

        box_radius = np.max(extent_high-extent_low) * 0.5
        stick_out_dist = tree.stick_out_factor * box_radius

        # Box must lie within the bounding box, up to a relative tolerance.
        assert (extent_low >=
                tree.bounding_box[0] - 1e-12*tree.root_extent).all(), ibox
        assert (extent_high <=
                tree.bounding_box[1] + 1e-12*tree.root_extent).all(), ibox

        # Child id 0 is treated as "no child" here.
        box_children = tree.box_child_ids[:, ibox]
        existing_children = box_children[box_children != 0]

        assert (tree.box_source_counts_nonchild[ibox]
                + np.sum(tree.box_source_counts_cumul[existing_children])
                == tree.box_source_counts_cumul[ibox])
        assert (tree.box_target_counts_nonchild[ibox]
                + np.sum(tree.box_target_counts_cumul[existing_children])
                == tree.box_target_counts_cumul[ibox])

        for what, starts, counts, points, radii in [
                ("source", tree.box_source_starts, tree.box_source_counts_cumul,
                    sorted_sources, sorted_source_radii),
                ("target", tree.box_target_starts, tree.box_target_counts_cumul,
                    sorted_targets, sorted_target_radii),
                ]:
            bstart = starts[ibox]
            bslice = slice(bstart, bstart+counts[ibox])
            check_particles = points[:, bslice]
            check_radii = radii[bslice]

            # Particle +/- radius must stay within the box extent enlarged
            # by the stick-out distance, along every axis.
            good = (
                    (check_particles + check_radii
                        < extent_high[:, np.newaxis] + stick_out_dist)
                    &
                    (extent_low[:, np.newaxis] - stick_out_dist
                        <= check_particles - check_radii)
                    ).all(axis=0)

            all_good_here = good.all()

            if not all_good_here:
                print("BAD BOX %s %d level %d"
                        % (what, ibox, tree.box_levels[ibox]))

            all_good_so_far = all_good_so_far and all_good_here
            assert all_good_here

    # }}}

    assert all_good_so_far

    # }}}

    # {{{ create, link point sources

    logger.info("creating point sources")

    np.random.seed(20)

    from pytools.obj_array import make_obj_array
    # Per axis: an (nsources, npoint_sources_per_source) array of points
    # placed within +/- radius of each source coordinate.
    point_sources = make_obj_array([
            cl.array.to_device(queue,
                unsorted_sources[i][:, np.newaxis]
                + unsorted_source_radii[:, np.newaxis]
                * np.random.uniform(
                    -1, 1, size=(nsources, npoint_sources_per_source))
                )
            for i in range(dims)])

    # Start offsets 0, k, 2k, ..., nsources*k with
    # k = npoint_sources_per_source (nsources+1 entries).
    point_source_starts = cl.array.arange(queue,
            0, (nsources+1)*npoint_sources_per_source, npoint_sources_per_source,
            dtype=tree.particle_id_dtype)

    from boxtree.tree import link_point_sources
    dev_tree = link_point_sources(queue, dev_tree,
            point_source_starts, point_sources,
            debug=True)
def valueMonteCarloGPU(ctx,
                       queue,
                       S_init,
                       nPaths,
                       Exp_Time,
                       dtMonte,
                       Strike,
                       Int_Rate,
                       Vol,
                       PTYPE,
                       nMonteLoops=1):
    """Monte Carlo estimate of a discounted ``max(S - Strike, 0)`` payoff.

    Each path starts at ``S_init``. On every time step it is scaled by
    ``exp((Int_Rate - 0.5*Vol^2)*dt + Vol*sqrt(dt)*z)`` with ``z`` filled by
    ``RanluxGenerator.fill_normal``. At the end each path is turned into
    ``exp(-Int_Rate*Exp_Time) * max(S - Strike, 0)`` and the batch is summed
    with a reduction kernel.

    Work is done in batches of at most 512 paths. If ``nPaths`` is larger, it
    is rounded up to a whole number of batches and that rounded count is the
    divisor of each per-loop mean.

    :arg ctx: OpenCL context the kernels are built for.
    :arg queue: command queue for the arrays and the random generator.
    :arg PTYPE: kept in the signature for callers; this body does not read it.
    :arg nMonteLoops: how many times the whole estimate is repeated.
    :returns: tuple ``(average of the per-loop means, dtMonte, standard
        deviation of the per-loop means)``.
    """
    nextStepPathKernel = ElementwiseKernel(
        ctx,
        "float *latestStep, float *ran, float Strike, float Int_Rate, float Exp_Time, float dt, float Vol",
        "float rval = exp((Int_Rate - 0.5f * Vol*Vol)*dt + Vol * sqrt(dt) * ran[i]); latestStep[i] *= rval;",
        "nextStepPathKernel")
    excersisePriceKernel = ElementwiseKernel(
        ctx, "float *latestStep, float Strike, float Int_Rate, float Exp_Time",
        "float rval = (latestStep[i]-Strike); latestStep[i] = exp(-Int_Rate*Exp_Time) * max(rval,0.0f);",
        "excersisePriceKernel")
    sumKernel = ReductionKernel(ctx,
                                numpy.float32,
                                neutral="0",
                                reduce_expr="a+b",
                                map_expr="x[i]",
                                arguments="__global float *x")

    maxWorkItems = 1 * 2**9
    multiplier = 1
    if nPaths > maxWorkItems:
        # True division plus int() keeps the batch count an integer that
        # range() accepts regardless of the Python version.
        multiplier = int(math.ceil(nPaths / float(maxWorkItems)))
        nPaths = multiplier * maxWorkItems
    else:
        maxWorkItems = nPaths

    # Same treatment for the number of time steps.
    nTimeStepsMonte = int(math.ceil(Exp_Time / float(dtMonte)))

    # The generator is seeded from the current time: runs are not repeatable.
    gen = RanluxGenerator(queue, maxWorkItems, luxury=4, seed=time.time())

    ran = cl.array.zeros(queue, maxWorkItems, numpy.float32)
    latestStep = cl.array.zeros_like(ran)

    means = numpy.zeros(nMonteLoops)

    for loop in range(nMonteLoops):
        theSum = 0
        for mult in range(multiplier):
            # Restart every path of this batch at the initial value.
            latestStep.fill(S_init)
            for t in range(nTimeStepsMonte):
                gen.fill_normal(ran)
                gen.synchronize(queue)
                nextStepPathKernel(latestStep, ran, Strike, Int_Rate, Exp_Time,
                                   dtMonte, Vol)
            excersisePriceKernel(latestStep, Strike, Int_Rate, Exp_Time)
            theSum += sumKernel(latestStep, queue).get()
        means[loop] = theSum / nPaths

    monteAverage = numpy.mean(means)
    monteStdDeviation = numpy.std(means)
    return monteAverage, dtMonte, monteStdDeviation
def plot_traversal(ctx_getter, do_plot=False):
    """Build a tree and FMM traversal for random 2D particles and plot box lists.

    A context is obtained from ``ctx_getter``, normally distributed particles
    are generated with a fixed seed, a tree and its traversal are built, and a
    handful of randomly chosen boxes (red) are drawn together with the boxes
    in one of their interaction lists (yellow).  Which list is drawn is chosen
    by the hand-edited ``if 0 / elif 1`` switches near the end.

    ``do_plot`` is accepted but not read by the body.
    """
    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    #for dims in [2, 3]:
    for dims in [2]:
        dtype = np.float64
        nparticles = 10**4

        from pyopencl.clrandom import RanluxGenerator
        rng = RanluxGenerator(queue, seed=15)

        from pytools.obj_array import make_obj_array
        coordinates = []
        for _ in range(dims):
            coordinates.append(rng.normal(queue, nparticles, dtype=dtype))
        particles = make_obj_array(coordinates)

        #if do_plot:
            #pt.plot(particles[0].get(), particles[1].get(), "x")

        from boxtree import TreeBuilder
        builder = TreeBuilder(ctx)

        queue.finish()
        tree = builder(queue, particles, max_particles_in_box=30, debug=True)

        from boxtree.traversal import FMMTraversalBuilder
        traversal_builder = FMMTraversalBuilder(ctx)
        trav = traversal_builder(queue, tree).get()

        from boxtree.visualization import TreePlotter
        plotter = TreePlotter(tree)
        plotter.draw_tree(fill=False, edgecolor="black")
        #plotter.draw_box_numbers()
        plotter.set_bounding_box()

        from random import randrange, seed
        seed(7)

        # {{{ generic box drawing helper

        def draw_some_box_lists(starts, lists, key_to_box=None, count=5):
            # Keep sampling until `count` boxes with a non-empty list
            # have been drawn; boxes with an empty list are re-drawn.
            drawn = 0
            while drawn < count:
                if key_to_box is None:
                    key = randrange(tree.nboxes)
                    ibox = key
                else:
                    key = randrange(len(key_to_box))
                    ibox = key_to_box[key]

                first, last = starts[key:key + 2]
                if first != last:
                    #print ibox, first, last, lists[first:last]
                    for jbox in lists[first:last]:
                        plotter.draw_box(jbox, facecolor='yellow')

                    plotter.draw_box(ibox, facecolor='red')
                    drawn += 1

        # }}}

        if 0:
            # colleagues
            draw_some_box_lists(trav.colleagues_starts, trav.colleagues_lists)
        elif 0:
            # near neighbors ("list 1")
            draw_some_box_lists(
                trav.neighbor_leaves_starts,
                trav.neighbor_leaves_lists,
                key_to_box=trav.source_boxes)
        elif 0:
            # well-separated siblings (list 2)
            draw_some_box_lists(
                trav.sep_siblings_starts,
                trav.sep_siblings_lists)
        elif 1:
            # separated smaller (list 3)
            draw_some_box_lists(
                trav.sep_smaller_starts,
                trav.sep_smaller_lists,
                key_to_box=trav.source_boxes)
        elif 1:
            # separated bigger (list 4)
            draw_some_box_lists(
                trav.sep_bigger_starts,
                trav.sep_bigger_lists)

        import matplotlib.pyplot as pt
        pt.show()
if ((local_index & mask) == 0) { float other = scratch[local_index + offset]; float mine = scratch[local_index]; scratch[local_index] = (mine < other) ? mine : other; } barrier(CLK_LOCAL_MEM_FENCE); } if (local_index == 0) { result[get_group_id(0)] = scratch[0]; } } """ gen = RanluxGenerator(queue, nPaths, luxury=4, seed=time.time()) #gen = RanluxGenerator(queue, nPaths, luxury=4) ran = cl.array.zeros(queue, nPaths, numpy.float32) latestStep = cl.array.empty_like(ran) averages = numpy.zeros(nLoops) #averages = cl.array.zeros(queue, nLoops, numpy.float32) tStartMonte = time.time() theSum = 0 for loop in range(0,nLoops): latestStep.fill(S_init)
class GeneticAlgorithmOpenCL(GeneticAlgorithm):
    """Genetic algorithm whose populations are stored in OpenCL device buffers.

    Builds on ``GeneticAlgorithm``: scoring is delegated to the docking object
    (``self.dock``), while individual construction, selection chances and
    reproduction are executed by kernels compiled from
    ``./OpenCL/GeneticAlgorithm.cl``.
    """

    class Population:
        """A set of individuals held as a (size x dna_size) device array."""

        # Crossover modes
        CM_SEPARATE = 0  # Separate probabilities for translation/rotation genes
        CM_COMBINE = 1   # Combined probability for translation/rotation genes

        def __init__(self, size = 0, dna_size = 0,
                     cl_ctx = None, cl_queue = None, rng = None, cl_prg = None):
            """Record sizes and OpenCL handles; no device memory is allocated here.

            size: number of individuals (rows).
            dna_size: number of genes per individual (columns).
            cl_ctx, cl_queue: OpenCL context and command queue.
            rng: generator providing ``fill_uniform`` for device arrays.
            cl_prg: built OpenCL program providing the kernels.
            """
            self.size = size
            self.dna_size = dna_size
            # OpenCL
            self.cl_ctx = cl_ctx
            self.cl_queue = cl_queue
            self.rng = rng
            self.cl_prg = cl_prg
            # Matrix of i by j for individuals and genes (DNA) respectively
            self.individuals_np = None
            self.individuals_buf = None
            self.new_individuals_np = None
            self.new_individuals_buf = None
            # Host copies (``*_np``) and device buffers (``*_buf``) of the
            # per-population reproduction parameters.
            self.crossover_translation_mode_np = np.array([], dtype = int)
            self.crossover_translation_mode_buf = None
            self.crossover_rotation_mode_np = np.array([], dtype = int)
            self.crossover_rotation_mode_buf = None
            self.crossover_probability_np = np.array([], dtype = float)
            self.crossover_probability_buf = None
            self.mutation_probability_np = np.array([], dtype = float)
            self.mutation_probability_buf = None

        def setup_opencl(self):
            """Allocate the individual arrays and upload the parameters.

            Reads ``crossover_translation_mode``, ``crossover_rotation_mode``,
            ``crossover_probability`` and ``mutation_probability``.  These are
            not set by ``Population.__init__``; the ``Settler`` and ``Nomad``
            subclasses assign them before calling this method.
            """
            mf = cl.mem_flags

            # Setup device buffers
            self.individuals_buf = cl.array.zeros(self.cl_queue,
                                                  (self.size, self.dna_size),
                                                  dtype = float)
            self.new_individuals_buf = cl.array.zeros(self.cl_queue,
                                                      (self.size, self.dna_size),
                                                      dtype = float)

            # Each scalar parameter is wrapped in a one-element array and
            # copied into a read-only device buffer.
            self.crossover_translation_mode_np = np.array(
                [self.crossover_translation_mode], dtype = int)
            self.crossover_translation_mode_buf = cl.Buffer(
                self.cl_ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                hostbuf = self.crossover_translation_mode_np)

            self.crossover_rotation_mode_np = np.array(
                [self.crossover_rotation_mode], dtype = int)
            self.crossover_rotation_mode_buf = cl.Buffer(
                self.cl_ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                hostbuf = self.crossover_rotation_mode_np)

            self.crossover_probability_np = np.array(
                [self.crossover_probability], dtype = float)
            self.crossover_probability_buf = cl.Buffer(
                self.cl_ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                hostbuf = self.crossover_probability_np)

            self.mutation_probability_np = np.array(
                [self.mutation_probability], dtype = float)
            self.mutation_probability_buf = cl.Buffer(
                self.cl_ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                hostbuf = self.mutation_probability_np)

        def __repr__(self):
            """Return a table of all individuals, one line each.

            Downloads the device array into ``individuals_np`` first.  Each
            line shows genes 0-2, then genes 3-6 in ``a bi cj dk`` form, then
            all remaining genes (named torsions in the code).
            """
            self.individuals_np = self.individuals_buf.get()
            ret = "Individuals:\n"
            for idx, individual in enumerate(self.individuals_np):
                ret += "[%3d] " % (idx + 1)
                ret += "%6.2f %6.2f %6.2f | " % (individual[0],
                                                 individual[1],
                                                 individual[2])
                ret += "%6.2f %6.2fi %6.2fj %6.2fk | " % (individual[3],
                                                          individual[4],
                                                          individual[5],
                                                          individual[6])
                for torsion in individual[7:]:
                    ret += " %5.2f" % torsion
                ret += "\n"
            return ret

        def get_individual(self, idx = 0):
            """Download all individuals and return the row at ``idx``."""
            self.individuals_np = self.individuals_buf.get()
            return self.individuals_np[idx]

        def create(self, dna_size_buf = None, dock = None):
            """Randomize the population and let the device construct it.

            Fills ``individuals_buf`` with uniform random numbers, then runs
            the ``construct_individuals`` kernel with one work item per
            individual, passing the dock's grid buffers and ``dna_size_buf``.
            """
            self.rng.fill_uniform(self.individuals_buf)
            # Construct individuals
            self.cl_prg.construct_individuals(self.cl_queue,
                                              (self.size,), None,
                                              dock.lo_grid_buf,
                                              dock.dist_grid_buf,
                                              dna_size_buf,
                                              self.individuals_buf.data)

        def scoring(self, dock = None,
                    cl_ctx = None, cl_queue = None):
            """Load the individuals into ``dock`` as poses and compute energies."""
            dock.reset_poses(self.size, self.individuals_buf,
                             cl_ctx, cl_queue)
            dock.calc_energy()

        def min_score(self, dock = None):
            """Return the smallest value in ``dock.e_totals_buf``."""
            scores = dock.e_totals_buf.get()
            return scores.min()

        def crossover(self, parents_idx, ttl_torsions, rng):
            """Placeholder; always returns None."""
            return None

        def mutate(self, individual, mutation_chance,
                   lo_grid, hi_grid, ttl_torsions, rng):
            """Placeholder; always returns None."""
            return None

    class Settler(Population):
        """Population using combined crossover modes and mutation probability 0.25."""

        def __init__(self, size = 0, dna_size = 0,
                     cl_ctx = None, cl_queue = None, rng = None, cl_prg = None):
            """Set the Settler parameters, then allocate the device buffers."""
            GeneticAlgorithmOpenCL.Population.__init__(self, size, dna_size,
                                                       cl_ctx, cl_queue,
                                                       rng, cl_prg)
            self.crossover_translation_mode = self.CM_COMBINE
            self.crossover_rotation_mode = self.CM_COMBINE
            self.crossover_probability = 0.5
            self.mutation_probability = 0.25
            # OpenCL
            self.setup_opencl()

    class Nomad(Population):
        """Population using separate crossover modes and mutation probability 0.75."""

        def __init__(self, size = 0, dna_size = 0,
                     cl_ctx = None, cl_queue = None, rng = None, cl_prg = None):
            """Set the Nomad parameters, then allocate the device buffers."""
            GeneticAlgorithmOpenCL.Population.__init__(self, size, dna_size,
                                                       cl_ctx, cl_queue,
                                                       rng, cl_prg)
            self.crossover_translation_mode = self.CM_SEPARATE
            self.crossover_rotation_mode = self.CM_SEPARATE
            self.crossover_probability = 0.5
            self.mutation_probability = 0.75
            # OpenCL
            self.setup_opencl()

    def __init__(self, dock = None, cl_device_type = None):
        """Create the OpenCL context and queue and build the kernel program.

        dock: docking object forwarded to ``GeneticAlgorithm.__init__``.
        cl_device_type: ``"gpu"`` or ``"cpu"`` to request that device type;
            any other value falls back to ``cl.create_some_context()``.
        """
        GeneticAlgorithm.__init__(self, dock)
        # OpenCL
        self.cl_device_type = cl_device_type
        if self.cl_device_type == "gpu":
            self.cl_ctx = cl.Context(dev_type = cl.device_type.GPU)
        elif self.cl_device_type == "cpu":
            self.cl_ctx = cl.Context(dev_type = cl.device_type.CPU)
        else:
            # manual selection
            self.cl_ctx = cl.create_some_context()
        self.cl_queue = cl.CommandQueue(self.cl_ctx)
        self.cl_filename = "./OpenCL/GeneticAlgorithm.cl"
        # Read the kernel source inside a with-block so the file handle is
        # closed even if reading fails.
        with open(self.cl_filename, 'r') as fh:
            cl_code = fh.read()
        self.cl_prg = cl.Program(self.cl_ctx, cl_code).build()
        self.rng = None
        # OpenCL buffer
        self.population_size_np = np.array([], dtype = int)
        self.population_size_buf = None
        self.dna_size_np = np.array([], dtype = int)
        self.dna_size_buf = None
        self.max_inherited_prob_np = np.array([], dtype = int)
        self.max_inherited_prob_buf = None
        self.normalizer_np = np.array([], dtype = int)
        self.normalizer_buf = None
        self.chances_np = None
        self.chances_buf = None
        self.chances_sum_buf = None
        self.dna1_buf = None
        self.dna2_buf = None
        self.ttl_reproduction_rns_np = np.array([], dtype = int)
        self.ttl_reproduction_rns_buf = None
        self.reproduction_rns_buf = None
        self.mutation_chance_np = np.array([], dtype = float)
        self.mutation_chance_buf = None

    def setup_opencl(self):
        """Allocate and upload every device buffer the kernels need.

        Sets ``self.dna_size`` to ``3 + 4 + self.dock.get_total_torsions()``
        and asks the docking object to set up its own OpenCL state before and
        after the algorithm's buffers are created.
        """
        # OpenCL setup
        self.dock.setup_opencl(self.cl_ctx, self.cl_queue)

        # Setup OpenCL device buffer
        mf = cl.mem_flags

        self.population_size_np = np.array([self.population_size], dtype = int)
        self.population_size_buf = cl.Buffer(self.cl_ctx,
                                             mf.READ_ONLY | mf.COPY_HOST_PTR,
                                             hostbuf = self.population_size_np)

        self.dna_size = 3 + 4 + self.dock.get_total_torsions()
        self.dna_size_np = np.array([self.dna_size], dtype = int)
        self.dna_size_buf = cl.Buffer(self.cl_ctx,
                                      mf.READ_ONLY | mf.COPY_HOST_PTR,
                                      hostbuf = self.dna_size_np)

        self.max_inherited_prob_np = np.array([self.max_inherited_prob],
                                              dtype = int)
        self.max_inherited_prob_buf = cl.Buffer(self.cl_ctx,
                                                mf.READ_ONLY | mf.COPY_HOST_PTR,
                                                hostbuf = self.max_inherited_prob_np)

        self.normalizer_np = np.array([self.ttl_ligand_atoms], dtype = int)
        self.normalizer_buf = cl.Buffer(self.cl_ctx,
                                        mf.READ_ONLY | mf.COPY_HOST_PTR,
                                        hostbuf = self.normalizer_np)

        self.chances_buf = cl.array.zeros(self.cl_queue, (self.population_size),
                                          dtype = int)
        self.chances_sum_buf = cl.array.zeros(self.cl_queue, (self.population_size),
                                              dtype = int)

        self.dna1_buf = cl.array.zeros(self.cl_queue,
                                       (self.population_size, self.dna_size),
                                       dtype = float)
        self.dna2_buf = cl.array.zeros(self.cl_queue,
                                       (self.population_size, self.dna_size),
                                       dtype = float)

        # Reproduction random numbers needed per individual:
        # - Selecting parents: 2
        # - Crossing over: Use new_individuals_buf
        # - Mutation: 1 + 2 + total torsions
        # - Mutation pose: 3 + 4 + total torsions
        ttl_reproduction_rns = 2 + 1 + 2 + self.ttl_torsions + \
                               3 + 4 + self.ttl_torsions
        self.ttl_reproduction_rns_np = np.array([ttl_reproduction_rns],
                                                dtype = int)
        self.ttl_reproduction_rns_buf = cl.Buffer(
            self.cl_ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
            hostbuf = self.ttl_reproduction_rns_np)
        self.reproduction_rns_buf = cl.array.zeros(
            self.cl_queue,
            (self.population_size, ttl_reproduction_rns),
            dtype = float)

        self.mutation_chance_np = np.array([self.mutation_chance], dtype = float)
        self.mutation_chance_buf = cl.Buffer(self.cl_ctx,
                                             mf.READ_ONLY | mf.COPY_HOST_PTR,
                                             hostbuf = self.mutation_chance_np)

        # Setup OpenCL buffer for docking object
        self.dock.setup_opencl_buffer(self.population_size,
                                      self.cl_ctx, self.cl_queue)

    def setup_rng(self):
        """Create the device random number generator on this object's queue."""
        self.rng = RanluxGenerator(self.cl_queue)

    def setup(self):
        """Run the parent class setup, then the OpenCL buffer setup."""
        # Call parent setup
        GeneticAlgorithm.setup(self)
        # OpenCL
        self.setup_opencl()

    def select(self, population):
        """Score ``population`` and compute selection chances on the device.

        The ``calc_chances`` kernel runs with one work item per individual and
        writes into ``self.chances_buf``.
        """
        # Get individual scores
        population.scoring(self.dock, self.cl_ctx, self.cl_queue)
        self.cl_prg.calc_chances(self.cl_queue, (self.population_size,), None,
                                 self.dock.e_totals_buf.data,
                                 self.normalizer_buf,
                                 self.max_inherited_prob_buf,
                                 self.chances_buf.data)

    def reproduce(self, population):
        """Produce the next generation of ``population`` in place.

        Fills ``population.new_individuals_buf`` and
        ``self.reproduction_rns_buf`` with uniform random numbers, runs the
        ``reproduce`` kernel with one work item per individual, and finally
        copies ``new_individuals_buf`` over ``individuals_buf``.
        """
        self.rng.fill_uniform(population.new_individuals_buf)
        self.rng.fill_uniform(self.reproduction_rns_buf)

        self.cl_prg.reproduce(self.cl_queue, (self.population_size,), None,
                              self.population_size_buf,
                              self.chances_buf.data,
                              self.ttl_reproduction_rns_buf,
                              self.reproduction_rns_buf.data,
                              self.dna_size_buf,
                              population.individuals_buf.data,
                              population.crossover_translation_mode_buf,
                              population.crossover_rotation_mode_buf,
                              population.crossover_probability_buf,
                              self.mutation_chance_buf,
                              population.mutation_probability_buf,
                              self.dock.ttl_torsions_buf,
                              self.dock.lo_grid_buf,
                              self.dock.dist_grid_buf,
                              self.chances_sum_buf.data,
                              self.dna1_buf.data,
                              self.dna2_buf.data,
                              population.new_individuals_buf.data)

        cl.enqueue_copy(self.cl_queue, population.individuals_buf.data,
                        population.new_individuals_buf.data)

    def run(self):
        """Run every community: a Nomad phase followed by a Settler phase.

        For each community the Nomad population is created and evolved for
        ``num_gen`` generations, its individuals are copied into the Settler
        population, and the Settler population is evolved for another
        ``num_gen`` generations.  The minimum scores of both phases and the
        elapsed time are printed per community; the list of all minimum
        scores is printed at the end.
        """
        self.setup()
        # Define multiple population
        self.nomad = self.Nomad(self.population_size, self.dna_size,
                                self.cl_ctx, self.cl_queue,
                                self.rng, self.cl_prg)
        self.settler = self.Settler(self.population_size, self.dna_size,
                                    self.cl_ctx, self.cl_queue,
                                    self.rng, self.cl_prg)

        population_min_scores = []
        for community_idx in xrange(self.community_size):
            tic = time()

            # Nomad portion
            # Stays at infinity when num_gen is zero.
            nomad_min_score = float("inf")
            self.nomad.create(self.dna_size_buf, self.dock)
            if VERBOSE:
                print(self.nomad)
            for gen_idx in xrange(self.num_gen):
                self.select(self.nomad)
                self.reproduce(self.nomad)
                nomad_min_score = self.nomad.min_score(self.dock)

            # Settler portion
            settler_min_score = float("inf")
            # Settlers start from the final Nomad individuals.
            cl.enqueue_copy(self.cl_queue, self.settler.individuals_buf.data,
                            self.nomad.individuals_buf.data)
            if VERBOSE:
                print(self.settler)
            for gen_idx in xrange(self.num_gen):
                self.select(self.settler)
                self.reproduce(self.settler)
                if VERBOSE:
                    print(self.settler)
                settler_min_score = self.settler.min_score(self.dock)

            population_min_scores.append([nomad_min_score, settler_min_score])
            toc = time()
            print("Elapsed time community %4d: %10.2f - Minimum Scores: %12.3f, %12.3f"
                  % (community_idx + 1, toc - tic,
                     nomad_min_score, settler_min_score))

        print("Community Minimum Scores: %s" % population_min_scores)