def sparse_matmul(self, x, feature_axis, output_dim):
    """
    :param tf.Tensor x:
    :param int feature_axis:
    :param int output_dim:
    :return: y, weights, bsmm
    :rtype: (tf.Tensor, tf.Variable, object)
    """
    block_size = self.block_size
    input_dim = x.get_shape().dims[feature_axis].value
    assert input_dim is not None, "%r shape unknown" % (x,)
    assert input_dim % block_size == 0 and output_dim % block_size == 0
    from blocksparse.matmul import BlocksparseMatMul
    seed = self.random.randint(2 ** 31)
    sparsity_pattern = sparsity_pattern_barabasi_albert(
        n1=input_dim // block_size, n2=output_dim // block_size,
        m=self.connectivity, dense=self.connectivity_dense, seed=seed)
    bsmm = BlocksparseMatMul(sparsity_pattern, block_size=block_size, feature_axis=feature_axis)
    if self.weights_identity_init:
        weights_init = bsmm.identity_init()
    else:
        weights_init = None
    weights = tf.get_variable("W", shape=bsmm.w_shape, initializer=weights_init)
    y = bsmm(x, weights)
    return y, weights, bsmm
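# `sparsity_pattern_barabasi_albert` is referenced above but not defined in this
# section. A minimal sketch of what it could look like, assuming it builds a
# rectangular 0/1 block mask from a Barabasi-Albert preferential-attachment
# graph, as the square-layout tests further down do by hand with networkx.
# The handling of the `dense` parameter (a dense corner band, mirroring the
# `layout[0:m, 0:m] = 1` idiom in the tests) is a guess, not the original code.
import numpy
import networkx


def sparsity_pattern_barabasi_albert(n1, n2, m, dense=1, seed=None):
    """Hypothetical sketch; the real implementation may differ."""
    n = max(n1, n2)
    graph = networkx.generators.barabasi_albert_graph(n, m, seed=seed)
    layout = networkx.adjacency_matrix(graph).toarray().astype(numpy.int32)
    layout += numpy.eye(n, dtype=numpy.int32)  # always connect the diagonal
    layout[0:dense, 0:dense] = 1  # dense corner band (assumption)
    return numpy.minimum(layout[:n1, :n2], 1)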
def test_blocksparse_simple_feature_axis1():
    init_blocksparse()
    from blocksparse.matmul import BlocksparseMatMul
    import tensorflow as tf
    import numpy
    from numpy.testing import assert_allclose

    n_in = 64
    n_out = 32 * 32
    block_size = 32
    n_batch = 4

    # Create a dense sparsity pattern.
    mask = numpy.ones((n_in // block_size, n_out // block_size), dtype=numpy.int32)
    # MatMul object
    bsmm = BlocksparseMatMul(mask, block_size=block_size, feature_axis=1, name="bsmm")
    # Input
    x_np = numpy.arange(n_in * n_batch, dtype=numpy.float32).reshape((n_batch, n_in)) + 1.0
    x = tf.constant(x_np, name='x')
    # Block-sparse weights
    w_np = bsmm.identity_init()()
    w = tf.constant(w_np, name="w")
    # Block-sparse matrix multiplication
    y = bsmm(x, w)
    y.set_shape((n_batch, n_out))
    # Run
    result = session.run(y)
    print(result)
    print('L2:', numpy.sum(result ** 2))
    y_test = bsmm.fprop_test(x_np, w_np)
    print(y_test)
    assert_allclose(result, y_test)
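# The tests in this section use a module-level `session` and an
# `init_blocksparse()` helper that are defined elsewhere in the surrounding
# test module. A plausible minimal stand-in, purely an assumption so the
# snippets can be read on their own:
import sys
import tensorflow as tf

session = tf.InteractiveSession()


def init_blocksparse():
    # Make the compiled blocksparse package importable; adjust the path as needed.
    sys.path.insert(0, "..")
    import blocksparse  # noqa: F401  (fails early if the native ops are not built)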
def atestBlocksparseMatMulCPU(self):
    # n, m = 64*8, 64
    # #layout = networkx.generators.barabasi_albert_graph(n, m)
    # layout = networkx.generators.random_graphs.watts_strogatz_graph(n, m*2, .2)
    # layout = networkx.adjacency_matrix(layout).toarray().astype(np.int32) + np.eye(n, dtype=np.int32)
    # layout[0:m,0:m] = 1
    # blocks = layout.sum()
    # print(100 * blocks / n**2)
    # print(layout.sum(axis=0).max())
    with self.test_session(config=conf) as sess, tf.device("/cpu:0"):
        for bsize, axis in ((32, 0), (16, 0), (8, 0)):  # (32,0), (16,0), (8,0)
            layout = np.ones((4 * 1024 // bsize, 4 * 1024 // bsize), dtype=np.int32)
            bsmm = BlocksparseMatMul(layout, block_size=bsize, feature_axis=axis, name="test")

            if one:
                W = np.ones(bsmm.w_shape, dtype=np.float32)
                X = np.ones(bsmm.i_shape(1), dtype=np.float32)
            else:
                W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float32)
                X = np.random.uniform(-1.0, 1.0, bsmm.i_shape(1)).astype(np.float32)

            w = tf.constant(W)
            x = tf.constant(X)
            y = sess.run(bsmm(x, w, bench=bench))

            # start = time()
            Y = bsmm.fprop_test(X, W)
            # print("np time:", round(time() - start, 2))

            difY = abs(Y - y)
            avgval = np.average(abs(Y))
            maxdif = difY.max()
            max_err = maxdif if avgval == 0 else maxdif / avgval
            l2_err = np.sqrt(np.square(difY).sum()) / np.sqrt(np.square(Y).sum())
            print("cpu max_err%%: %11.8f L2_err: %12.10f" % (100 * max_err, l2_err))
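# The test methods here (`atestBlocksparseMatMulCPU`, `testBlocksparseMatMul`,
# `atestBlocksparseMatMulGated`) reference several module-level switches that
# live elsewhere in the test file. A plausible set of defaults, stated as an
# assumption so the methods can be read stand-alone:
one = False        # use all-ones inputs/weights for easier manual checking
bench = 0          # >0 benchmarks the kernel with that many repeats
out = False        # dump mismatching tensors to out*.txt files
depth = 1          # number of chained matmuls in testBlocksparseMatMul
l2norm = False     # exercise the l2-normalized weight path
am = None          # aggregation_method passed to tf.gradients
dtypes = [(tf.float32, tf.float32)]  # (forward, backward) compute dtypes
conf = tf.ConfigProto()              # session config used by test_session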
def test_blocksparse_simple():
    init_blocksparse()
    from blocksparse.matmul import BlocksparseMatMul
    import tensorflow as tf
    import numpy as np
    from numpy.testing import assert_allclose

    hidden_size = 4096
    block_size = 32
    minibatch_size = 64

    # Create a (random) sparsity pattern.
    sparsity = np.random.randint(2, size=(hidden_size // block_size, hidden_size // block_size))
    # Initialize the sparse matrix multiplication object.
    bsmm = BlocksparseMatMul(sparsity, block_size=block_size, feature_axis=0)
    # Input to graph
    x = tf.placeholder(tf.float32, shape=[hidden_size, None])
    x_np = np.ones((hidden_size, minibatch_size), dtype='float32')
    # Initialize block-sparse weights.
    w = tf.get_variable(
        "w", bsmm.w_shape, dtype=tf.float32,
        initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=3))
    # Block-sparse matrix multiplication
    y = bsmm(x, w)

    # Run
    print('init vars')
    session.run(tf.global_variables_initializer())
    print('blocksparse matmul')
    result = session.run(y, feed_dict={x: x_np})
    print(result)
    print('test')
    w_np = session.run(w)
    y_test = bsmm.fprop_test(x_np, w_np)
    print(y_test)
    i = np.argmax((y_test - result) ** 2)
    print('biggest diff at %i: %r vs %r' % (i, y_test.flatten()[i], result.flatten()[i]))
    assert_allclose(result, y_test, rtol=1e-2)  # rtol=1e-3 still fails
def test_blocksparse_simple():
    init_blocksparse()
    from blocksparse.matmul import BlocksparseMatMul
    import tensorflow as tf
    import numpy as np

    hidden_size = 4096
    block_size = 32
    minibatch_size = 64

    # Create a (random) sparsity pattern.
    sparsity = np.random.randint(2, size=(hidden_size // block_size, hidden_size // block_size))
    # Initialize the sparse matrix multiplication object.
    bsmm = BlocksparseMatMul(sparsity, block_size=block_size, feature_axis=0)
    # Input to graph
    x = tf.placeholder(tf.float32, shape=[None, hidden_size])
    # Initialize block-sparse weights.
    w = tf.get_variable("w", bsmm.w_shape, dtype=tf.float32)
    # Block-sparse matrix multiplication
    y = bsmm(x, w)

    # Run
    session.run(tf.global_variables_initializer())
    result = session.run(
        [y], feed_dict={x: np.ones((minibatch_size, hidden_size), dtype='float32')})
    print(result)
def test_blocksparse_simple_identity():
    init_blocksparse()
    from blocksparse.matmul import BlocksparseMatMul
    import tensorflow as tf
    import numpy
    from numpy.testing import assert_allclose

    n_in = 64
    n_out = 32 * 32
    block_size = 32
    # Note: It seems everything less than 4 fails, as well as non-power-of-2.
    n_batch = 4

    # Create a dense sparsity pattern.
    mask = numpy.ones((n_in // block_size, n_out // block_size), dtype=numpy.int32)
    # MatMul object
    bsmm = BlocksparseMatMul(mask, block_size=block_size, feature_axis=0, name="bsmm")
    # Input
    x_np = numpy.arange(n_in * n_batch, dtype=numpy.float32).reshape((n_in, n_batch)) + 1.0
    x = tf.constant(x_np, name='x')
    # Block-sparse weights
    w_np = bsmm.identity_init()()
    w = tf.constant(w_np, name="w")
    # for b in range(bsmm.blocks):
    #     cb, kb = bsmm.updat_list[b]
    #     print("block %i/%i, cb %i/%i, kb %i/%i" % (b, bsmm.blocks, cb, bsmm.KB, kb, bsmm.CB))
    # Block-sparse matrix multiplication
    y = bsmm(x, w)
    y.set_shape((n_out, n_batch))
    # Run
    result = session.run(y)
    print(result)
    print('L2:', numpy.sum(result ** 2))
    y_test = bsmm.fprop_test(x_np, w_np)
    print(y_test)
    i = numpy.argmax((y_test - result) ** 2)
    print('biggest diff at %i: %r vs %r' % (i, y_test.flatten()[i], result.flatten()[i]))
    assert_allclose(result, y_test, rtol=1e-2)
def test_sparse_dense_bsr_gray(minibatch_size, N, K, BS_R, BS_C, density):
    """Run and profile a BSR-style block-sparse dense matmul with TensorFlow."""
    print("testing param", minibatch_size, N, K, BS_R, BS_C, density)
    feature_axis = 0 if BS_R in [8, 16] else 1
    # Create a (random) sparsity pattern.
    sparsity = random_bsr_matrix_helper(K, N, BS_R, BS_C, density, 'float32')
    # Initialize the sparse matrix multiplication object.
    bsmm = BlocksparseMatMul(sparsity, block_size=BS_R, feature_axis=feature_axis)
    # Initialize block-sparse weights.
    w = tf.get_variable("w", bsmm.w_shape, dtype=tf.float32)
    # Input to graph.
    if feature_axis == 0:
        x = tf.get_variable("x", [K, minibatch_size], dtype=tf.float32)
    else:
        x = tf.get_variable("x", [minibatch_size, K], dtype=tf.float32)
    # Block-sparse matrix multiplication
    y = bsmm(x, w)

    # Run
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    run_metadata = tf.RunMetadata()
    sess.run([y], run_metadata=run_metadata,
             options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE))
    # Print to stdout an analysis of the memory usage and the timing information,
    # broken down by operation types.
    tf.profiler.profile(
        tf.get_default_graph(),
        run_meta=run_metadata,
        cmd='op',
        options=tf.profiler.ProfileOptionBuilder.time_and_memory())
    tf.reset_default_graph()
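# `random_bsr_matrix_helper` is not shown in this section. Since
# BlocksparseMatMul only consumes a 0/1 block layout, a minimal sketch could
# look like the following. The name and exact behavior are assumptions; BS_C
# and dtype are accepted for signature compatibility, but the matmul above
# only uses the square BS_R blocking.
import numpy as np


def random_bsr_matrix_helper(K, N, BS_R, BS_C, density, dtype):
    # Hypothetical sketch: a (K//BS_R, N//BS_C) 0/1 block mask where each
    # block is kept with probability `density`.
    shape = (K // BS_R, N // BS_C)
    return (np.random.rand(*shape) < density).astype(np.int32)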
def bench_mm_openai(x, w, mode, trans_a, trans_b, layout, block, num_repeat):
    # Import and disable all logging.
    import os
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    import warnings
    warnings.filterwarnings('ignore', category=FutureWarning)
    from blocksparse.matmul import BlocksparseMatMul
    from blocksparse.transformer import BlocksparseTransformer
    import tensorflow as tf
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    import numpy as np
    import torch  # x, w, and layout come in as torch tensors

    sparsity = layout.cpu().numpy()
    # Create the operator.
    transformer = BlocksparseTransformer(sparsity, heads=layout.shape[0], block_size=block)
    dot_sdd_nt = transformer.nt_op
    dot_dsd_tn = transformer.tn_op
    dot_dsd_nn = transformer.nn_op
    dot_dds_nn = None if mode != 'dds' else BlocksparseMatMul(sparsity[0, :, :], block_size=block)
    key = (mode, trans_a, trans_b)
    ops = {('sdd', False, True): dot_sdd_nt,
           ('dsd', True, False): dot_dsd_tn,
           ('dsd', False, False): dot_dsd_nn,
           ('dds', False, False): dot_dds_nn}
    if x.dtype == torch.float32 and (mode == 'dsd' or block != 32):
        return None
    if key not in ops:
        return None
    if mode == 'dds' and x.shape[0] * x.shape[1] != 1:
        return None
    op = ops[key]

    # Placeholders
    x = x.view(x.shape[0] * x.shape[1], x.shape[2], x.shape[3])
    w = w.view(w.shape[0] * w.shape[1], w.shape[2], w.shape[3])
    sparse_shape = [x.shape[0], layout.shape[0], layout[0].sum(), block, block]
    vx = tf.placeholder(tf.float32, shape=sparse_shape if mode == 'dsd' else x.shape)
    vw = tf.placeholder(tf.float32, shape=sparse_shape if mode == 'dds' else w.shape)
    x = np.random.rand(*sparse_shape) if mode == 'dsd' else x.cpu().detach().numpy()
    w = np.random.rand(*sparse_shape) if mode == 'dds' else w.cpu().detach().numpy()
    # Block-sparse matrix multiplication
    y = op(vx, vw, bench=num_repeat)
    # Run
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    result = sess.run([y], feed_dict={vx: x, vw: w})
    sess.close()
    return result  # the early exits above return None for unsupported configs
def profile(batch_size, input_size, output_size, block_size, sparsity):
    num_input_blocks = input_size // block_size
    num_output_blocks = output_size // block_size
    num_blocks = num_input_blocks * num_output_blocks
    num_pruned_blocks = int(num_blocks * sparsity)
    num_remain_blocks = num_blocks - num_pruned_blocks
    actual_sparsity = num_pruned_blocks / float(num_blocks)

    # Generate the layout.
    layout = np.array([0] * num_pruned_blocks + [1] * num_remain_blocks)
    np.random.shuffle(layout)
    layout = layout.reshape((num_input_blocks, num_output_blocks))

    # Generate the shuffle order.
    indices = list(range(output_size))
    random.shuffle(indices)

    tf.reset_default_graph()
    with tf.Session() as sess:
        bsmm = BlocksparseMatMul(layout, block_size=block_size)
        i = tf.constant(indices)
        x = tf.placeholder(tf.float32, shape=(batch_size, input_size))
        w = tf.get_variable('w', bsmm.w_shape, dtype=tf.float32)
        y = bsmm(x, w)
        y = tf.gather(y, i, axis=1)

        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        sess.run(tf.global_variables_initializer())
        sess.run(y,
                 feed_dict={x: np.ones((batch_size, input_size), dtype='float32')},
                 options=options, run_metadata=run_metadata)

        fetched_timeline = timeline.Timeline(run_metadata.step_stats)
        chrome_trace = fetched_timeline.generate_chrome_trace_format()
        with open('timeline.json', 'w') as f:
            f.write(chrome_trace)
        with open('timeline.json', 'r') as f:
            o = json.load(f)['traceEvents']
        mm_time = int(next(item for item in o if item['name'] == u'BlocksparseMatmul')['dur'])
        gather_time = int(next(item for item in o if item['name'].startswith(u'Gather'))['dur'])
        os.remove('timeline.json')
        return actual_sparsity, mm_time + gather_time
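# A hypothetical driver for `profile`, sweeping pruning levels and printing
# the combined matmul+gather kernel time extracted from the chrome trace.
# The imports mirror exactly what the function body above uses.
import os
import json
import random
import numpy as np
import tensorflow as tf
from tensorflow.python.client import timeline
from blocksparse.matmul import BlocksparseMatMul

if __name__ == '__main__':
    for target_sparsity in (0.0, 0.5, 0.9):
        actual, usec = profile(batch_size=64, input_size=4096,
                               output_size=4096, block_size=32,
                               sparsity=target_sparsity)
        print('sparsity %.2f -> %d us' % (actual, usec))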
def gen_masks(self):
    hps = self.hps
    hps.bsmm = bsmm = dict()

    assert hps.nhidden % hps.block_size == 0
    assert hps.nembd % 32 == 0

    # Create block-sparse matmul ops (to be shared by all instances of the model).
    # We only need 1 instance of the lut constants.
    with tf.name_scope("BlocksparseMatMul"):
        if hps.nproj_in != hps.nhidden:
            # Assume small projection values are actually strides.
            if hps.nproj_in <= hps.block_size * 4:
                hps.sproj_mul = SparseProj(hps.nhidden, proj_stride=hps.nproj_in)
                hps.sproj_add = SparseProj(hps.nhidden, proj_stride=hps.nproj_in)
                hps.nproj_in = hps.sproj_mul.nproj
            else:
                hps.sproj_mul = SparseProj(hps.nhidden, nproj=hps.nproj_in)
                hps.sproj_add = SparseProj(hps.nhidden, nproj=hps.nproj_in)
        else:
            hps.sproj_mul = None
            hps.sproj_add = None

        if hps.nproj_out != hps.nhidden:
            # Assume small projection values are actually strides.
            if hps.nproj_out <= hps.block_size * 4:
                hps.sproj_out = SparseProj(hps.nhidden, proj_stride=hps.nproj_out, block_size=32)
                hps.nproj_out = hps.sproj_out.nproj
            else:
                hps.sproj_out = SparseProj(hps.nhidden, nproj=hps.nproj_out)
        else:
            hps.sproj_out = None

        # For the input and output projections, use the largest block size that fits.
        blk_in, nproj_in = largest_block(hps.nproj_in)
        blk_out, nproj_out = largest_block(hps.nproj_out)

        nhidden = hps.nhidden // hps.block_size
        nembd = hps.nembd // blk_in
        nvocab = ceil_div(hps.nvocab, blk_out)

        # The dense input mask.
        mask = np.ones((nembd, nproj_in), dtype=np.int32)
        bsmm["x"] = BlocksparseMatMul(mask, block_size=blk_in, feature_axis=hps.axis, name="lstm_x")

        istep_masks = []
        if hps.share_masks:
            # All gates and internal steps get the same mask.
            mask = masks.make_mask(n=nhidden, kind=hps.sparsity)
            bsmm_p = BlocksparseMatMul(mask, block_size=hps.block_size, feature_axis=hps.axis, name="lstm_h")
            for p in list("ifou") + ["h%d" % i for i in range(hps.isteps)]:
                bsmm[p] = bsmm_p
            istep_masks = [mask for i in range(hps.isteps + 1)]
        else:
            # Internal steps get different masks.
            for p in ["h%d" % i for i in range(hps.isteps)]:
                mask = masks.make_mask(n=nhidden, kind=hps.sparsity)
                bsmm[p] = BlocksparseMatMul(mask, block_size=hps.block_size, feature_axis=hps.axis, name="lstm_%s" % p)
                istep_masks.append(mask)

            # Gates get the same mask (TODO: experiment here with different masks).
            mask = masks.make_mask(n=nhidden, kind=hps.sparsity)
            bsmm_p = BlocksparseMatMul(mask, block_size=hps.block_size, feature_axis=hps.axis, name="lstm_g")
            for p in list("ifou"):
                bsmm[p] = bsmm_p
            istep_masks.append(mask)

        # The output mask.
        mask = np.ones((nproj_out, nvocab), dtype=np.int32)
        bsmm["y"] = BlocksparseMatMul(mask, block_size=blk_out, feature_axis=hps.axis, name="lstm_o")

        hps.mix_factor = masks.mix_factor(istep_masks)
        hps.sparsity += " (%.4f%%)" % (100.0 * bsmm["u"].sparsity)
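# Sketches of the small helpers `gen_masks` relies on. These are assumptions
# for readability; the originals are defined elsewhere in that code base and
# may differ in detail.
def ceil_div(x, y):
    # Integer ceiling division.
    return -(-x // y)


def largest_block(n):
    # Pick the largest kernel-supported block size (32, 16, or 8) that
    # divides n; return (block_size, number_of_blocks).
    for blk in (32, 16, 8):
        if n % blk == 0:
            return blk, n // blk
    raise ValueError("no supported block size divides n=%d" % n)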
def testBlocksparseMatMul(self):
    # layout = np.zeros((2,2), dtype=np.int32)
    # layout[0,0] = 1
    n, m = 56 * 8, 8
    layout = networkx.generators.barabasi_albert_graph(n, m)
    # layout = networkx.generators.random_graphs.watts_strogatz_graph(n, m*2, .5)
    layout = networkx.adjacency_matrix(layout).toarray().astype(np.int32) + np.eye(n, dtype=np.int32)
    layout[0:m, 0:m] = 1
    # layout[0:60,0:60] = 1
    # layout = np.zeros((4,4), dtype=np.int32)
    # layout = np.ones((28*12,28*12), dtype=np.int32)
    # layout[0,0] = 1
    blocks = layout.sum()
    n = layout.shape[0]
    print(100 * blocks / n**2)
    print(layout.sum(axis=0).max())
    # exit()

    with self.test_session(config=conf) as sess, tf.device("/gpu:0"):
        for bsize, axis in ((32, 1), (32, 0), (16, 0), (8, 0)):  # (32,1), (32,0), (16,0), (8,0)
            bsmm = BlocksparseMatMul(layout, block_size=bsize, feature_axis=axis, name="test")

            if one:
                W = np.ones(bsmm.w_shape, dtype=np.float32)
                # W[:] += np.arange(8, dtype=np.float32).reshape(1,8)
            else:
                W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float32)

            # WW = np.zeros((bsmm.C, bsmm.K), dtype=np.float32)
            # for w, (c, k) in enumerate(bsmm.updat_list):
            #     WW[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize] = W[w,:,:]

            w = tf.constant(W)

            # s1 = sess.run( bsmm.identity_init(gpu=True)(bsmm.w_shape) )
            # s2 = bsmm.identity_init(gpu=False)(bsmm.w_shape)
            # print("identity_init: ", (s1 - s2).max())

            for N in (64,):  # 128,64,32,16,1,
                if one:
                    X = np.ones(bsmm.i_shape(N), dtype=np.float32)
                    E = np.ones(bsmm.o_shape(N), dtype=np.float32)
                    # X[:] += np.arange(8, dtype=np.float32).reshape(8,1)
                else:
                    X = np.random.uniform(-1.0, 1.0, bsmm.i_shape(N)).astype(np.float32)
                    E = np.random.uniform(-1.0, 1.0, bsmm.o_shape(N)).astype(np.float32)

                x = tf.constant(X)
                e = tf.constant(E)

                for dtF, dtB in dtypes:
                    print("Axis:%d Bsize:%2d N:%d F:%s B:%s Params:%d"
                          % (axis, bsize, N, dtF.name, dtB.name, bsize * bsize * blocks))

                    # Compute in tensorflow.
                    if l2norm:
                        w2 = bsmm.l2_normalize(w, dtype=dtF)
                    else:
                        w2 = ew.float_cast(w, dtype=dtF)

                    y = ew.float_cast(x, dtype=dtF)

                    for j in range(depth):
                        repeat = bench if bench and j == depth - 1 else 0
                        y = bsmm(y, w2, dw_dtype=dtF, bench=repeat)  # (bench and j==depth-1) (bench and j==0)

                    y = ew.float_cast(y, dtype=tf.float32, dx_dtype=dtB)

                    if bench:
                        sess.run(y)
                        # y = sess.run( y )

                    d = tf.gradients(y, [x, w], e, aggregation_method=am)
                    if depth > 1:
                        d[1] = group_param_grads(d[1], 8)

                    y, (dx, dw) = sess.run([y, d])

                    if not bench:
                        # Compute in numpy.
                        if l2norm:
                            W2 = bsmm.l2_normalize_test(W)
                        else:
                            W2 = W

                        # YY = np.dot(WW.T, X)
                        # ZZ = np.dot(WW , E)
                        # uu = np.dot( X , E.T)
                        # UU = np.zeros(bsmm.w_shape, dtype=np.float32)
                        # for w, (c, k) in enumerate(bsmm.updat_list):
                        #     UU[w,:,:] = uu[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize]

                        Ys = [X]
                        for j in range(depth):
                            Ys.append(bsmm.fprop_test(Ys[-1], W2))
                        Y = Ys.pop()

                        DW = np.zeros(bsmm.w_shape, dtype=np.float32)
                        DX = E
                        for j in range(depth):
                            DW += bsmm.updat_test(Ys.pop(), DX)
                            DX = bsmm.bprop_test(DX, W2)

                        if l2norm:
                            DW = bsmm.l2_normalize_grad_test(W, DW)

                        for op, cpuA, devA in (
                                # ("YY:", YY, y),
                                # ("ZZ:", ZZ, dx),
                                # ("UU:", UU, dw),
                                (" y:", Y, y),
                                ("dx:", DX, dx),
                                ("dw:", DW, dw),
                        ):
                            difA = abs(cpuA - devA)
                            avgval = np.average(abs(cpuA))
                            maxdif = difA.max()
                            max_err = maxdif if avgval == 0 else maxdif / avgval
                            l2_err = np.sqrt(np.square(difA).sum()) / np.sqrt(np.square(cpuA).sum())
                            # print("max_err: %5.3f, max_val: %7.3f, l1_err: %7.5f, l2_err: %7.5f" % (difO.max(), cpuO.max(), l1_err, l2_err))
                            print("%s max_err%%:%11.8f L2_err: %12.10f" % (op, 100 * max_err, l2_err))

                            # rtol = 1e-4 if dtF is tf.float32 else 1e-1
                            # self.assertAllClose(devA, cpuA, rtol=rtol, atol=rtol)

                            if out:
                                dim = bsmm.K if op == "dw:" else N
                                np.savetxt("out.txt", difA.reshape((-1, dim)), fmt='%5.1f')
                                np.savetxt("outC.txt", cpuA.reshape((-1, dim)), fmt='%5.1f')
                                np.savetxt("outD.txt", devA.reshape((-1, dim)), fmt='%5.1f')
                                exit()
                print("")
def atestBlocksparseMatMulGated(self):
    with self.test_session(config=conf) as sess, tf.device("/gpu:0"):
        N = 128
        K = 8 * 56 * 2 * 4
        n = K // 8
        m = 30
        dtype = tf.bfloat16
        repeat = 10000

        layout = networkx.generators.barabasi_albert_graph(n, m)
        layout = networkx.adjacency_matrix(layout).toarray().astype(np.int32) + np.eye(n, dtype=np.int32)
        layout[0:m, 0:m] = 1
        blocks = layout.sum()
        n = layout.shape[0]
        print(100 * blocks / n**2)
        print(layout.sum(axis=0).max())

        # layout = np.ones((112,32), dtype=np.int32)
        bsmm = BlocksparseMatMul(layout, block_size=8, feature_axis=0, name="test")

        if one:
            X = np.ones(bsmm.i_shape(N), dtype=np.float32)
            E = np.ones(bsmm.o_shape(N), dtype=np.float32)
            W = np.ones(bsmm.w_shape, dtype=np.float32)
            G = np.ones(bsmm.blocks, dtype=np.float32)
        else:
            X = np.random.uniform(-1.0, 1.0, bsmm.i_shape(N)).astype(np.float32)
            E = np.random.uniform(-1.0, 1.0, bsmm.o_shape(N)).astype(np.float32)
            W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float32)
            G = np.random.uniform(0.0, 1.0, bsmm.blocks).astype(np.float32)

        G = np.ones(bsmm.blocks, dtype=np.float32)
        # for w, (c, k) in enumerate(bsmm.updat_list):
        #     G[w] = (c & 1) ^ (k & 1) ^ 1
        # G[::2] = 0.0

        # block = dict()
        # for w, (c, k) in enumerate(bsmm.updat_list):
        #     block[(c,k)] = w
        # grid = []
        # for c in range(bsmm.CB):
        #     row = []
        #     for k in range(bsmm.KB):
        #         row.append(G[block[(c,k)]])
        #     grid.append(row)
        # for row in grid:
        #     print(row)
        # exit()

        x = tf.constant(X)
        e = tf.constant(E)
        w = tf.constant(W)
        g = tf.constant(G)

        w2 = ew.float_cast(w, dtype=dtype)
        y = ew.float_cast(x, dtype=dtype)
        y = bsmm(y, w2, gate=g, bench=repeat)
        y = ew.float_cast(y, dtype=tf.float32, dx_dtype=dtype)

        d = tf.gradients(y, [x, w], e)
        y, (dx, dw) = sess.run([y, d])

        # The gpu kernel doesn't touch zero gate blocks.
        # for b in range(bsmm.blocks):
        #     if G[b] == 0.0:
        #         dw[b,:,:] = 0.0

        Y = bsmm.fprop_test(X, W, gate=G)
        DX = bsmm.bprop_test(E, W, gate=G)
        DW = bsmm.updat_test(X, E, gate=G)
        # print(Y.shape, dtype)

        for op, cpuA, devA in ((" y:", Y, y), ("dx:", DX, dx), ("dw:", DW, dw)):
            difA = abs(cpuA - devA)
            avgval = np.average(abs(cpuA))
            maxdif = difA.max()
            max_err = maxdif if avgval == 0 else maxdif / avgval
            l2_err = np.sqrt(np.square(difA).sum()) / np.sqrt(np.square(cpuA).sum() + 1e-12)
            print("%s max_err%%:%11.8f L2_err: %12.10f" % (op, 100 * max_err, l2_err))

            if out:
                dim = K if op == "dw:" else N
                np.savetxt("out.txt", difA.reshape((-1, dim)), fmt='%5.1f')
                np.savetxt("outC.txt", cpuA.reshape((-1, dim)), fmt='%5.1f')
                np.savetxt("outD.txt", devA.reshape((-1, dim)), fmt='%5.1f')
                exit()
import sys
sys.path.insert(0, "..")  # blocksparse base
sys.path.insert(0, "../../..")  # Returnn base

from blocksparse.matmul import BlocksparseMatMul
import tensorflow as tf
import numpy as np

hidden_size = 4096
block_size = 32
minibatch_size = 64

# Create a (random) sparsity pattern.
sparsity = np.random.randint(2, size=(hidden_size // block_size, hidden_size // block_size))

# Initialize the sparse matrix multiplication object.
bsmm = BlocksparseMatMul(sparsity, block_size=block_size)

# Input to graph
x = tf.placeholder(tf.float32, shape=[None, hidden_size])

# Initialize block-sparse weights.
w = tf.get_variable("w", bsmm.w_shape, dtype=tf.float32)

# Block-sparse matrix multiplication
y = bsmm(x, w)

# Run
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
result = sess.run([y], feed_dict={x: np.ones((minibatch_size, hidden_size), dtype='float32')})
print(result)
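# A small hedged extension of the standalone example above (reusing its sess,
# x, w, y, and sizes): the bsmm op registers gradients, so backprop works the
# same way it would for a dense tf.matmul layer.
loss = tf.reduce_sum(tf.square(y))
dw, = tf.gradients(loss, [w])
grad_val = sess.run(dw, feed_dict={x: np.ones((minibatch_size, hidden_size), dtype='float32')})
print(grad_val.shape)  # matches bsmm.w_shape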
if mask == "ws":
    layout = networkx.generators.random_graphs.watts_strogatz_graph(n, m * 2, .2)
    layout = networkx.adjacency_matrix(layout).toarray().astype(np.int32) + np.eye(n, dtype=np.int32)
else:
    layout = networkx.generators.barabasi_albert_graph(n, m)
    layout = networkx.adjacency_matrix(layout).toarray().astype(np.int32) + np.eye(n, dtype=np.int32)
    layout[0:m, 0:m] = 1

# print("axis:%d bsize:%2d hsize:%d params:%d sparsity:%.2f m:%d" % (axis, bsize, hsize, bsize*bsize*blks, spar, m))
# continue

bsmm = BlocksparseMatMul(layout, block_size=bsize, feature_axis=axis, name="test")

W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float32)
w = tf.constant(W)

for N in (64,):  # 128,64,32,16,1,
    X = np.random.uniform(-1.0, 1.0, bsmm.i_shape(N)).astype(np.float32)
    E = np.random.uniform(-1.0, 1.0, bsmm.o_shape(N)).astype(np.float32)
    x = tf.constant(X)
    e = tf.constant(E)

    for dtype in (tf.bfloat16,):  # tf.bfloat16, tf.bfloat32,