iwlB = 15 if op == "nt": dim1 = (k, m) dim2 = (k, n) elif op == "nn": dim1 = (m, k) dim2 = (k, n) elif op == "tn": dim1 = (m, k) dim2 = (n, k) A1 = np.random.randint(0x0, 0x7fff, size=dim1).astype(np.int64) B1 = np.random.randint(0x0, 0x7fff, size=dim2).astype(np.int64) A2 = fp.array(A1.astype(np.int16), iwlA) B2 = fp.array(B1.astype(np.int16), iwlB) # pick a reasonable output integer word length iwlC = ((struct.unpack('I', struct.pack('f', float( 0x7fff * 0x7fff * k / 2)))[0] & 0x7f800000) >> 23) - 126 C2 = fp.empty((m, n), iwlC) start = drv.Event() end = drv.Event() start.record() for r in range(repeat): if op == 'nt':
# Convolution benchmark setup: derive the last output dimension, bundle the
# per-axis parameters, create random host data, copy it to the device and
# allocate the outputs with computed word lengths.
Q = (W * upscale_x - S + 1 + 2 * padding_x) // strides_x

padding = (padding_z, padding_y, padding_x)
strides = (strides_z, strides_y, strides_x)
upscale = (upscale_z, upscale_y, upscale_x)

dimO = (K, M, P, Q, N)

# Create random input into kernels and allocate output and bit widths.
# NOTE: higher entropy will force chip to lower clocks.
I = np.random.randint(0, 0x7fff, size=dimI).astype(np.int64)
F = np.random.randint(0, 0x7fff, size=dimF).astype(np.int64)
E = np.random.randint(0, 0x7fff, size=dimO).astype(np.int64)

# Copy to device.
devI = fp.array(I, iwl)
devF = fp.array(F, iwl)
devE = fp.array(E, iwl)


# Set output bit widths at approximately mean scaling.
def scale(n, q):
    """Return a word length derived from the float encoding of 0x7fff**2 * n / q.

    The value is packed as a C float, reread as an unsigned int, shifted
    right by 23 bits and reduced by 126. No mask is applied before the
    shift, so the sign bit is kept; presumably every caller passes positive
    ``n`` and ``q`` -- confirm if that changes.
    """
    value = float(0x7fff**2 * n) / q
    (float_bits,) = struct.unpack('I', struct.pack('f', value))
    return (float_bits >> 23) - 126


iwlO = scale(C * T * R * S, 2)
iwlB = scale(K * T * R * S, 4)
iwlU = scale(N * M * P * Q, 4)

# Allocate output.
devO = fp.empty(dimO, iwlO)
devB = fp.zeros(dimI, iwlB)
devU = fp.zeros(dimF, iwlU)
# Fixed-point GEMM benchmark setup: build random operands for the requested
# layout, pick an output integer word length, record a start event and run
# the repeat loop.
iwlB = 15

# Operand shapes keyed on the layout code. There is no fallback branch, so any
# other value of ``op`` leaves dim1/dim2 unassigned at this point.
if op == "nt":
    dim1, dim2 = (k, m), (k, n)
elif op == "nn":
    dim1, dim2 = (m, k), (k, n)
elif op == "tn":
    dim1, dim2 = (m, k), (n, k)

# Random integers in [0, 0x7fff), held as int64 first.
A1 = np.random.randint(0, 0x7fff, size=dim1).astype(np.int64)
B1 = np.random.randint(0, 0x7fff, size=dim2).astype(np.int64)

# Narrow to int16 before wrapping the operands with their word lengths.
A2 = fp.array(A1.astype(np.int16), iwlA)
B2 = fp.array(B1.astype(np.int16), iwlB)

# Pick a reasonable output integer word length: pack the value
# 0x7fff * 0x7fff * k / 2 as a C float, reread those bytes as an unsigned
# int, keep bits 23..30 (mask 0x7f800000) and subtract 126.
magnitude = float(0x7fff * 0x7fff * k / 2)
(float_bits,) = struct.unpack('I', struct.pack('f', magnitude))
iwlC = ((float_bits & 0x7f800000) >> 23) - 126
C2 = fp.empty((m, n), iwlC)

start = drv.Event()
end = drv.Event()
start.record()

for r in range(repeat):
    if op == 'nt':
        # The first operand is passed transposed; C2 is handed in as the
        # third argument.
        fp.dot(A2.T, B2, C2)