# Word length passed to fp.array for the B operand.
iwlB = 15

# Choose the shapes of the two operands from the op code.
# NOTE(review): any op other than "nt", "nn" or "tn" leaves dim1/dim2
# unbound, so the randint calls below would raise NameError -- confirm that
# op is validated before this point.
if op == "nt":
    dim1 = (k, m)
    dim2 = (k, n)
elif op == "nn":
    dim1 = (m, k)
    dim2 = (k, n)
elif op == "tn":
    dim1 = (m, k)
    dim2 = (n, k)

# Random integers drawn from [0, 0x7fff) (randint's upper bound is
# exclusive), stored as int64.
A1 = np.random.randint(0x0, 0x7fff, size=dim1).astype(np.int64)
B1 = np.random.randint(0x0, 0x7fff, size=dim2).astype(np.int64)

# Narrow each operand to int16 and wrap it with fp.array together with its
# word length.
# NOTE(review): iwlA is not defined in this excerpt -- presumably set earlier.
A2 = fp.array(A1.astype(np.int16), iwlA)
B2 = fp.array(B1.astype(np.int16), iwlB)

# pick a reasonable output integer word length:
# pack 0x7fff * 0x7fff * k / 2 as a single-precision float, reinterpret the
# bytes as an unsigned int, isolate the exponent field (mask 0x7f800000,
# shift right by 23) and subtract 126.
iwlC = ((struct.unpack('I', struct.pack('f', float(
    0x7fff * 0x7fff * k / 2)))[0] & 0x7f800000) >> 23) - 126

# Output buffer of shape (m, n) created with the word length chosen above.
C2 = fp.empty((m, n), iwlC)

# Create two drv.Event objects and record the first one.
# NOTE(review): presumably these bracket the repeat loop for timing; the
# matching end.record() is not visible in this excerpt.
start = drv.Event()
end = drv.Event()

start.record()

for r in range(repeat):
    if op == 'nt':
# Example #2
0
# Extent Q derived from W, upscale_x, S, padding_x and strides_x using floor
# division.
# NOTE(review): presumably the output width of the convolution -- confirm
# against the lines that precede this excerpt.
Q = (W*upscale_x - S + 1 + 2*padding_x) // strides_x

# Bundle the per-axis settings as (z, y, x) tuples.
padding = (padding_z, padding_y, padding_x)
strides = (strides_z, strides_y, strides_x)
upscale = (upscale_z, upscale_y, upscale_x)

# Shape used for the random array E here and for the devO allocation later.
dimO = (K,M,P,Q,N)

# create random input into kernels and allocate output and bit widths
# NOTE: higher entropy will force chip to lower clocks.
# Values are drawn from [0, 0x7fff) (randint's upper bound is exclusive) and
# stored as int64.
# NOTE(review): dimI and dimF are not defined in this excerpt -- presumably
# set earlier.
I = np.random.randint(0x0, 0x7fff, size=dimI).astype(np.int64)
F = np.random.randint(0x0, 0x7fff, size=dimF).astype(np.int64)
E = np.random.randint(0x0, 0x7fff, size=dimO).astype(np.int64)

# copy to device
# NOTE(review): the arrays are handed to fp.array as int64 with no narrowing
# cast, and iwl is not defined in this excerpt -- confirm that fp.array
# accepts this dtype.
devI = fp.array(I, iwl)
devF = fp.array(F, iwl)
devE = fp.array(E, iwl)

# set output bit widths at approximately mean scaling
def scale(n, q):
    """Return the integer word length for the magnitude of ``0x7fff**2 * n / q``.

    The value is rounded to IEEE-754 single precision and its biased
    exponent field is read back from the raw bits.  Subtracting 126 from
    that field yields the binary exponent plus one, so a value in
    ``[2**e, 2**(e + 1))`` gives ``e + 1``.  A value of zero gives ``-126``.

    Args:
        n: Multiplier applied to ``0x7fff**2`` (e.g. a count of accumulated
            products).
        q: Divisor applied to the product.

    Returns:
        The exponent-derived word length as an ``int``.

    Raises:
        ZeroDivisionError: If ``q`` is zero.
        OverflowError: If the value does not fit in a single-precision float.
    """
    value = float(0x7fff ** 2 * n) / q
    bits = struct.unpack('I', struct.pack('f', value))[0]
    # Keep only the 8 exponent bits. Without the mask the sign bit of a
    # negative value would survive the shift and inflate the result by 256.
    return ((bits & 0x7f800000) >> 23) - 126

# Word lengths for the three outputs: each call passes a product of
# dimension sizes as the multiplier, with a divisor of 2 or 4.
iwlO = scale(C*T*R*S, 2)
iwlB = scale(K*T*R*S, 4)
iwlU = scale(N*M*P*Q, 4)

# allocate outputs: devO is created with fp.empty, while devB and devU are
# created with fp.zeros.
# NOTE(review): dimI and dimF are not defined in this excerpt -- presumably
# set earlier.
devO = fp.empty(dimO, iwlO)
devB = fp.zeros(dimI, iwlB)
devU = fp.zeros(dimF, iwlU)
# Word length passed to fp.array for the B operand.
# NOTE(review): this rebinds iwlB immediately after the lines above assigned
# it from scale(...) and used it; it looks like a seam between two pasted
# snippets -- confirm.
iwlB   = 15

# Choose the shapes of the two operands from the op code.
# NOTE(review): any op other than "nt", "nn" or "tn" leaves dim1/dim2
# unbound, so the randint calls below would raise NameError -- confirm that
# op is validated before this point.
if op == "nt":
    dim1 = (k,m)
    dim2 = (k,n)
elif op == "nn":
    dim1 = (m,k)
    dim2 = (k,n)
elif op == "tn":
    dim1 = (m,k)
    dim2 = (n,k)

# Random integers drawn from [0, 0x7fff) (randint's upper bound is
# exclusive), stored as int64.
A1 = np.random.randint(0x0, 0x7fff, size=dim1).astype(np.int64)
B1 = np.random.randint(0x0, 0x7fff, size=dim2).astype(np.int64)

# Narrow each operand to int16 and wrap it with fp.array together with its
# word length.
# NOTE(review): iwlA is not defined in this excerpt -- presumably set earlier.
A2 = fp.array(A1.astype(np.int16), iwlA)
B2 = fp.array(B1.astype(np.int16), iwlB)

# pick a reasonable output integer word length:
# pack 0x7fff * 0x7fff * k / 2 as a single-precision float, reinterpret the
# bytes as an unsigned int, isolate the exponent field (mask 0x7f800000,
# shift right by 23) and subtract 126.
iwlC = ((struct.unpack('I',struct.pack('f',float(0x7fff * 0x7fff * k / 2)))[0] & 0x7f800000) >> 23)-126

# Output buffer of shape (m, n) created with the word length chosen above.
C2 = fp.empty((m,n), iwlC)

# Create two drv.Event objects and record the first one before the repeat
# loop that follows.
# NOTE(review): presumably these bracket the loop for timing; the matching
# end.record() is not visible in this excerpt.
start = drv.Event()
end   = drv.Event()

start.record()

for r in range(repeat):
    if   op == 'nt':
        fp.dot(A2.T, B2, C2)