def __init__(self, rng_seed, stochastic_round=False, device_id=0):
    """Set up the single-GPU NervanaGPU backend.

    Arguments:
        rng_seed: seed passed on to rng_init() for the RNG.
        stochastic_round: forwarded to NervanaGPU; enables stochastic
            rounding in the kernels when True.
        device_id: GPU ordinal to record; None is normalized to 0.
    """
    self.ng = NervanaGPU(stochastic_round=stochastic_round)
    logger.info("Initialized NervanaGPU with stochastic_round=%s",
                stochastic_round)
    self.rng_seed = rng_seed
    self.rng_init()
    # Normalize an explicit None back to the default device 0.
    if device_id is None:
        self.device_id = 0
    else:
        self.device_id = device_id
def __init__(self, rng_seed, stochastic_round=False, device_id=0):
    """Set up the NervanaGPU backend, creating the CUDA context itself.

    Unlike the pycuda.autoinit path, this variant initializes the CUDA
    driver and makes a context on the requested device explicitly, so
    the caller controls which GPU is used.

    :param rng_seed: seed passed on to rng_init() for the RNG
    :param stochastic_round: forwarded to NervanaGPU kernels
    :param device_id: GPU ordinal to create the context on; None is
        normalized to 0 for the stored attribute
    """
    # Import locally so merely importing this module does not require
    # (or touch) the CUDA driver.
    import pycuda.driver as drv
    drv.init()
    # The context is published as a module-level global so other code
    # in this module can reach it; it stays current for this thread.
    global ctx
    ctx = drv.Device(device_id).make_context()
    # Pop (release) the context at interpreter exit so CUDA shuts
    # down cleanly.
    import atexit
    atexit.register(ctx.pop)
    self.ng = NervanaGPU(stochastic_round=stochastic_round)
    logger.info("Initialized NervanaGPU with stochastic_round=%s",
                stochastic_round)
    self.rng_seed = rng_seed
    self.rng_init()
    # Normalize an explicit None back to the default device 0.
    self.device_id = device_id if device_id is not None else 0
cublas.cublasSgemm(handle, opB, opA, n, m, k, alpha, B.gpudata, ldb, A.gpudata, lda, beta, C.gpudata, ldc) end.record() end.synchronize() msecs = end.time_since(start) / repeat gflops = (m * n * k * 2.0) / (msecs * 1000000.0) print "%7.3f msecs %4.0f gflops (%s_%s : %d,%d,%d)" % ( msecs, gflops, "cublas", op, m, n, k) np.set_printoptions(threshold=8193, linewidth=600, formatter={'float': lambda x: "% .0f" % x}) ng = NervanaGPU(stochastic_round=False, bench=True) repeat = 1 for dtype in ( np.float16, np.float32, ): for K, C, N in ((32, 4096, 1512), ): for alpha, beta in ((1.0, 0.0), (0.5, 0.5)): for op, dimA, dimB, dimC in ( ("nn", (K, C), (C, N), (K, N)), # fprop ("tn", (K, C), (K, N), (C, N)), # bprop
# Benchmark configuration for convnet layer timing (Python 2 script:
# note the print statement below).  Results are meant to be compared
# against https://github.com/soumith/convnet-benchmarks
from pycuda.autoinit import context
from nervanagpu import NervanaGPU
from nervanagpu.layers import DataLayer, ConvLayer, PoolLayer, FullLayer

# Show which GPU the autoinit context landed on.
print context.get_device().name()

# Compare results here:
# https://github.com/soumith/convnet-benchmarks

# number of full iterations
loops = 10
# show benchmark details for each layer
layer_bench = 0
# show layer stats after each operation
print_stats = 0

ng = NervanaGPU(bench=layer_bench)

# don't learn, just benchmark (zero momentum / learning rate keeps the
# weights fixed while still exercising the update kernels)
momentum = 0.0
learning_rate = 0.0

# common convolutional layer settings (R, S = filter height/width)
conv3 = {"R": 3, "S": 3, "pad_h": 1, "pad_w": 1}
conv1 = {"R": 1, "S": 1, "pad_h": 0, "pad_w": 0}

# traditional pooling (str_h/str_w default to the window size when
# not given — TODO confirm against PoolLayer)
pool2 = {"op": "max", "R": 2, "S": 2}
pool3 = {"op": "max", "R": 3, "S": 3, "str_h": 2, "str_w": 2}

# maxout pooling (J = number of feature maps folded together)
pool1j2 = {"op": "max", "J": 2}

# maxout in the fc layers
#!/usr/bin/python import numpy as np import pycuda.driver as drv from nervanagpu import NervanaGPU from pycuda.autoinit import context from ipdb import set_trace np.set_printoptions(threshold=8192 * 4, linewidth=600, formatter={ 'int': lambda x: "%2d" % x, 'float': lambda x: "%2.0f" % x }) ng = NervanaGPU(stochastic_round=0, bench=1) dtype = np.float32 # np.float16 or np.float32 repeat = 50 # repeat count for benchmarking ones = 0 # simpler data for debugging cpu = 0 # valdiate against numpy size = 32 # 32, 64, 128, None=auto X = 100 # Batch Size N = 32 # Minibatch Size C = 3072 # Input Features K = 3072 # Output Features Nin = True dimW = (K, C) if Nin:
def __init__(self, rng_seed, stochastic_round=False, device_id=0, num_dev=2): drv.init() self.num_dev = num_dev if device_id == 0: self.dev_list = range(num_dev) else: self.dev_list = device_id assert len(self.dev_list) == self.num_dev assert self.num_dev <= drv.Device.count() self.ctxs = [] self.devs = [] self._strms = [] self._redstrms = [] self._events = [] self._redevents = [] self. async = True self._nostrms = [None for i in self.dev_list] for i in self.dev_list: self.devs.append(drv.Device(i)) for dev in self.devs: self.ctxs.append( dev.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC)) self._strms.append(drv.Stream()) self._redstrms.append(drv.Stream()) self._events.append(drv.Event()) self._redevents.append(drv.Event()) drv.Context.pop() self.ctxs[0].push() atexit.register(drv.Context.pop) MGPUTensor.ctxs = self.ctxs MGPUTensor.num_dev = num_dev self.ng = NervanaGPU(stochastic_round=stochastic_round) logger.info("Initialized %d device NervanaGPU, stochastic_round=%s", num_dev, stochastic_round) self.ng.block = None self.rng_seed = rng_seed self.rng_init() # Setup the pairwise contexts # TODO clean up this code to avoid indexing for dev1, ctx1 in zip(self.devs, self.ctxs): ctx1.push() for dev2, ctx2 in zip(self.devs, self.ctxs): if dev1 == dev2: continue if dev1.can_access_peer(dev2): ctx1.enable_peer_access(ctx2) else: print('Cannot enable peer access between ' '{:d} and {:d}'.format(dev1, dev2)) ctx1.pop()