Example #1
File: gpu.py Project: xiaoyunwu/neon
 def __init__(self, rng_seed, stochastic_round=False, device_id=0):
     self.ng = NervanaGPU(stochastic_round=stochastic_round)
     logger.info("Initialized NervanaGPU with stochastic_round=%s",
                 stochastic_round)
     self.rng_seed = rng_seed
     self.rng_init()
     self.device_id = device_id if device_id is not None else 0
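
For orientation, a hedged usage sketch follows; the import path and class name GPU are assumptions inferred from the file name gpu.py, not shown in the snippet:

# Hypothetical usage; the module path and class name `GPU` are assumptions:
from neon.backends.gpu import GPU

backend = GPU(rng_seed=0, stochastic_round=False, device_id=0)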
Example #2
 def __init__(self, rng_seed, stochastic_round=False, device_id=0):
     import pycuda.driver as drv
     drv.init()
     global ctx
     ctx = drv.Device(device_id).make_context()
     import atexit
     atexit.register(ctx.pop)
     self.ng = NervanaGPU(stochastic_round=stochastic_round)
     logger.info("Initialized NervanaGPU with stochastic_round=%s",
                 stochastic_round)
     self.rng_seed = rng_seed
     self.rng_init()
     self.device_id = device_id if device_id is not None else 0
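
For reference, a minimal self-contained sketch of the same context-setup pattern using only pycuda (no NervanaGPU required):

import atexit
import pycuda.driver as drv

drv.init()                          # initialize the CUDA driver API
ctx = drv.Device(0).make_context()  # create and push a context on GPU 0
atexit.register(ctx.pop)            # pop the context at interpreter exit
print(ctx.get_device().name())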
Example #3
File: cublas.py Project: zky001/nervanagpu
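        # cuBLAS is column-major, so A and B (and m, n) are swapped to compute the row-major product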
        cublas.cublasSgemm(handle, opB, opA, n, m, k, alpha, B.gpudata, ldb,
                           A.gpudata, lda, beta, C.gpudata, ldc)

    end.record()
    end.synchronize()
    msecs = end.time_since(start) / repeat
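    # a GEMM performs 2*m*n*k floating point ops; dividing by msecs * 1e6 yields GFLOPS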
    gflops = (m * n * k * 2.0) / (msecs * 1000000.0)
    print "%7.3f msecs %4.0f gflops (%s_%s   : %d,%d,%d)" % (
        msecs, gflops, "cublas", op, m, n, k)


np.set_printoptions(threshold=8193,
                    linewidth=600,
                    formatter={'float': lambda x: "% .0f" % x})

ng = NervanaGPU(stochastic_round=False, bench=True)

repeat = 1

for dtype in (
        np.float16,
        np.float32,
):

    for K, C, N in ((32, 4096, 1512), ):

        for alpha, beta in ((1.0, 0.0), (0.5, 0.5)):

            for op, dimA, dimB, dimC in (
                ("nn", (K, C), (C, N), (K, N)),  # fprop
                ("tn", (K, C), (K, N), (C, N)),  # bprop
Example #4
from pycuda.autoinit import context
from nervanagpu import NervanaGPU
from nervanagpu.layers import DataLayer, ConvLayer, PoolLayer, FullLayer
print context.get_device().name()

# Compare results here:
# https://github.com/soumith/convnet-benchmarks

# number of full iterations
loops = 10
# show benchmark details for each layer
layer_bench = 0
# show layer stats after each operation
print_stats = 0

ng = NervanaGPU(bench=layer_bench)

# don't learn, just benchmark
momentum = 0.0
learning_rate = 0.0

# common convolutional layer settings
conv3 = {"R": 3, "S": 3, "pad_h": 1, "pad_w": 1}
conv1 = {"R": 1, "S": 1, "pad_h": 0, "pad_w": 0}

# traditional pooling
pool2 = {"op": "max", "R": 2, "S": 2}
pool3 = {"op": "max", "R": 3, "S": 3, "str_h": 2, "str_w": 2}

# maxout pooling
pool1j2 = {"op": "max", "J": 2}  # maxout in the fc layers
Example #5
#!/usr/bin/python

import numpy as np
import pycuda.driver as drv
from nervanagpu import NervanaGPU
from pycuda.autoinit import context
from ipdb import set_trace

np.set_printoptions(threshold=8192 * 4,
                    linewidth=600,
                    formatter={
                        'int': lambda x: "%2d" % x,
                        'float': lambda x: "%2.0f" % x
                    })

ng = NervanaGPU(stochastic_round=0, bench=1)

dtype = np.float32  # np.float16 or np.float32
repeat = 50  # repeat count for benchmarking
ones = 0  # simpler data for debugging
cpu = 0  # validate against numpy
size = 32  # 32, 64, 128, None=auto

X = 100  # Batch Size
N = 32  # Minibatch Size
C = 3072  # Input  Features
K = 3072  # Output Features
Nin = True

dimW = (K, C)
if Nin:
Example #6
    def __init__(self,
                 rng_seed,
                 stochastic_round=False,
                 device_id=0,
                 num_dev=2):
        drv.init()
        self.num_dev = num_dev

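        # device_id may be a list of device ordinals; 0 selects the first num_dev devices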
        if device_id == 0:
            self.dev_list = range(num_dev)
        else:
            self.dev_list = device_id

        assert len(self.dev_list) == self.num_dev
        assert self.num_dev <= drv.Device.count()

        self.ctxs = []
        self.devs = []
        self._strms = []
        self._redstrms = []

        self._events = []
        self._redevents = []

        self.async = True
        self._nostrms = [None for i in self.dev_list]

        for i in self.dev_list:
            self.devs.append(drv.Device(i))

        for dev in self.devs:
            self.ctxs.append(
                dev.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC))
            self._strms.append(drv.Stream())
            self._redstrms.append(drv.Stream())
            self._events.append(drv.Event())
            self._redevents.append(drv.Event())
            drv.Context.pop()

        self.ctxs[0].push()
        atexit.register(drv.Context.pop)
        MGPUTensor.ctxs = self.ctxs
        MGPUTensor.num_dev = num_dev

        self.ng = NervanaGPU(stochastic_round=stochastic_round)
        logger.info("Initialized %d device NervanaGPU, stochastic_round=%s",
                    num_dev, stochastic_round)
        self.ng.block = None
        self.rng_seed = rng_seed
        self.rng_init()

        # Setup the pairwise contexts
        # TODO clean up this code to avoid indexing
        for dev1, ctx1 in zip(self.devs, self.ctxs):
            ctx1.push()
            for dev2, ctx2 in zip(self.devs, self.ctxs):
                if dev1 == dev2:
                    continue
                if dev1.can_access_peer(dev2):
                    ctx1.enable_peer_access(ctx2)
                else:
                    print('Cannot enable peer access between '
                          '{} and {}'.format(dev1, dev2))
            ctx1.pop()
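
The make_context/pop round-robin above keeps the context stack clean while one context per device is created; a minimal standalone sketch of that pattern with plain pycuda:

import pycuda.driver as drv

drv.init()
ctxs = []
for i in range(drv.Device.count()):
    ctxs.append(drv.Device(i).make_context())  # make_context also pushes the new context
    drv.Context.pop()                          # pop so the stack stays empty

ctxs[0].push()            # make device 0's context current
# ... launch work here ...
drv.Context.pop()
for ctx in ctxs:
    ctx.detach()          # release each context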