Exemplo n.º 1
0
def test_blocksparse_simple_feature_axis1():
  """
  Block-sparse matmul with an all-ones mask and ``feature_axis=1``.

  The weights come from ``bsmm.identity_init()``; the result evaluated through
  ``session`` is compared against the reference ``bsmm.fprop_test``.
  """
  init_blocksparse()

  from blocksparse.matmul import BlocksparseMatMul
  import tensorflow as tf
  import numpy

  block_size = 32
  n_in, n_out, n_batch = 64, 32 * 32, 4

  # All-ones mask: every (in-block, out-block) pair is present.
  mask_shape = (n_in // block_size, n_out // block_size)
  mask = numpy.ones(mask_shape, dtype=numpy.int32)
  bsmm = BlocksparseMatMul(mask, block_size=block_size, feature_axis=1, name="bsmm")

  # Input holds 1..n_in*n_batch, shaped (n_batch, n_in).
  x_np = numpy.arange(n_in * n_batch, dtype=numpy.float32)
  x_np = x_np.reshape((n_batch, n_in)) + 1.0
  x = tf.constant(x_np, name='x')

  # identity_init() returns a callable which yields the weight values.
  make_weights = bsmm.identity_init()
  w_np = make_weights()
  w = tf.constant(w_np, name="w")

  y = bsmm(x, w)
  y.set_shape((n_batch, n_out))

  result = session.run(y)
  print(result)
  print('L2:', numpy.sum(result ** 2))

  # Reference result computed by the matmul object itself.
  y_test = bsmm.fprop_test(x_np, w_np)
  print(y_test)
  assert_allclose(result, y_test)
    def atestBlocksparseMatMulCPU(self):
        """Print the error of the "/cpu:0" forward pass against ``fprop_test``.

        For every (block size, feature axis) pair an all-ones layout of
        ``4 * 1024 // bsize`` blocks per side is built, the matmul is evaluated
        in the test session, and the relative max error and relative L2 error
        versus the numpy reference are printed. Nothing is asserted.

        Reads the names ``conf``, ``one`` and ``bench`` from outside this method.
        """
        with self.test_session(config=conf) as sess, tf.device("/cpu:0"):
            for bsize, axis in ((32, 0), (16, 0), (8, 0)):
                blocks_per_side = 4 * 1024 // bsize
                layout = np.ones((blocks_per_side, blocks_per_side), dtype=np.int32)

                bsmm = BlocksparseMatMul(
                    layout, block_size=bsize, feature_axis=axis, name="test")

                # Either random data in [-1, 1) or all-ones data, selected by `one`.
                if not one:
                    W = np.random.uniform(-1.0, 1.0, bsmm.w_shape).astype(np.float32)
                    X = np.random.uniform(-1.0, 1.0, bsmm.i_shape(1)).astype(np.float32)
                else:
                    W = np.ones(bsmm.w_shape, dtype=np.float32)
                    X = np.ones(bsmm.i_shape(1), dtype=np.float32)

                w = tf.constant(W)
                x = tf.constant(X)
                y_op = bsmm(x, w, bench=bench)
                y = sess.run(y_op)

                # numpy reference
                Y = bsmm.fprop_test(X, W)

                abs_diff = abs(Y - y)
                mean_abs = np.average(abs(Y))
                worst = abs_diff.max()
                # Avoid dividing by zero when the reference is all zeros.
                if mean_abs == 0:
                    max_err = worst
                else:
                    max_err = worst / mean_abs

                diff_norm = np.sqrt(np.square(abs_diff).sum())
                ref_norm = np.sqrt(np.square(Y).sum())
                l2_err = diff_norm / ref_norm

                print("cpu max_err%%: %11.8f L2_err: %12.10f" %
                      (100 * max_err, l2_err))
Exemplo n.º 3
0
def test_blocksparse_simple():
    """
    Block-sparse matmul with a random block mask and ``feature_axis=0``.

    A random 0/1 mask over ``hidden_size // block_size`` square blocks is used,
    the weights are a uniformly initialised TF variable, and the result evaluated
    through ``session`` is compared against ``bsmm.fprop_test`` with ``rtol=1e-2``.
    """
    init_blocksparse()

    from blocksparse.matmul import BlocksparseMatMul
    import tensorflow as tf
    import numpy as np

    hidden_size = 4096
    block_size = 32
    minibatch_size = 64

    # Create a (random) sparsity pattern
    sparsity = np.random.randint(2,
                                 size=(hidden_size // block_size,
                                       hidden_size // block_size))

    # Initialize the sparse matrix multiplication object
    bsmm = BlocksparseMatMul(sparsity, block_size=block_size, feature_axis=0)

    # Input to graph
    x = tf.placeholder(tf.float32, shape=[hidden_size, None])
    x_np = np.ones((hidden_size, minibatch_size), dtype='float32')

    # Initialize block-sparse weights
    w = tf.get_variable("w",
                        bsmm.w_shape,
                        dtype=tf.float32,
                        initializer=tf.random_uniform_initializer(-0.1,
                                                                  0.1,
                                                                  seed=3))

    # Block-sparse matrix multiplication
    y = bsmm(x, w)

    # Run
    print('init vars')
    session.run(tf.global_variables_initializer())
    print('blocksparse matmul')
    result = session.run(y, feed_dict={x: x_np})
    print(result)
    print('test')
    w_np = session.run(w)
    y_test = bsmm.fprop_test(x_np, w_np)
    print(y_test)
    # numpy is imported as `np` in this function; the bare name `numpy` is not
    # bound here, so the flat index of the largest deviation must go through `np`.
    i = np.argmax((y_test - result)**2)
    print('biggest diff at %i: %r vs %r' %
          (i, y_test.flatten()[i], result.flatten()[i]))
    assert_allclose(result, y_test, rtol=1e-2)  # rtol=1e-03 still fails
Exemplo n.º 4
0
def test_blocksparse_simple_identity():
    """
    Block-sparse matmul with an all-ones mask and ``feature_axis=0``.

    The weights come from ``bsmm.identity_init()``; the result evaluated through
    ``session`` is compared against ``bsmm.fprop_test`` with ``rtol=1e-2``, and
    the position of the largest squared deviation is printed.
    """
    init_blocksparse()

    from blocksparse.matmul import BlocksparseMatMul
    import tensorflow as tf
    import numpy

    block_size = 32
    n_in = 64
    n_out = 32 * 32
    # Note: It seems everything less than 4 fails, as well as non-power-of-2.
    n_batch = 4

    # All-ones mask: every block is present.
    mask_shape = (n_in // block_size, n_out // block_size)
    mask = numpy.ones(mask_shape, dtype=numpy.int32)
    bsmm = BlocksparseMatMul(
        mask, block_size=block_size, feature_axis=0, name="bsmm")

    # Input holds 1..n_in*n_batch, shaped (n_in, n_batch).
    x_np = numpy.arange(n_in * n_batch, dtype=numpy.float32)
    x_np = x_np.reshape((n_in, n_batch)) + 1.0
    x = tf.constant(x_np, name='x')

    # identity_init() returns a callable which yields the weight values.
    make_weights = bsmm.identity_init()
    w_np = make_weights()
    w = tf.constant(w_np, name="w")

    y = bsmm(x, w)
    y.set_shape((n_out, n_batch))

    result = session.run(y)
    print(result)
    print('L2:', numpy.sum(result**2))

    # Reference result computed by the matmul object itself.
    y_test = bsmm.fprop_test(x_np, w_np)
    print(y_test)

    # Flat index of the largest squared deviation, for diagnostics.
    squared_error = (y_test - result)**2
    i = numpy.argmax(squared_error)
    print('biggest diff at %i: %r vs %r' %
          (i, y_test.flatten()[i], result.flatten()[i]))
    assert_allclose(result, y_test, rtol=1e-2)
    def testBlocksparseMatMul(self):
        """Print forward/backward errors of BlocksparseMatMul against its numpy reference methods.

        The layout is the adjacency matrix of a Barabasi-Albert graph plus the
        identity. For each (block size, feature axis) pair and each
        (forward dtype, backward dtype) pair in ``dtypes``, ``depth`` chained
        matmuls are built under "/gpu:0" and evaluated together with the
        gradients w.r.t. ``x`` and ``w``. When ``bench`` is falsy the same
        quantities are recomputed via ``fprop_test`` / ``bprop_test`` /
        ``updat_test`` and the relative max error and relative L2 error of
        y, dx and dw are printed. Nothing is asserted (the ``assertAllClose``
        call is commented out).

        Reads the names ``conf``, ``one``, ``dtypes``, ``l2norm``, ``depth``,
        ``bench``, ``am`` and ``out`` from outside this method.
        """

        # layout = np.zeros((2,2), dtype=np.int32)
        # layout[0,0] = 1

        n, m = 56 * 8, 8
        layout = networkx.generators.barabasi_albert_graph(n, m)
        #layout = networkx.generators.random_graphs.watts_strogatz_graph(n, m*2, .5)
        layout = networkx.adjacency_matrix(layout).toarray().astype(
            np.int32) + np.eye(n, dtype=np.int32)
        # Make the top-left m x m corner fully dense.
        layout[0:m, 0:m] = 1

        #layout[0:60,0:60] = 1
        #layout = np.zeros((4,4), dtype=np.int32)
        #layout = np.ones((28*12,28*12), dtype=np.int32)
        #layout[0,0] = 1

        blocks = layout.sum()
        n = layout.shape[0]
        # Percentage of non-zero blocks, then the largest column sum of the layout.
        print(100 * blocks / n**2)
        print(layout.sum(axis=0).max())
        #exit()

        with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

            for bsize, axis in (
                (32, 1),
                (32, 0),
                (16, 0),
                (8, 0),
            ):  # (32,1), (32,0), (16,0), (8,0)

                bsmm = BlocksparseMatMul(layout,
                                         block_size=bsize,
                                         feature_axis=axis,
                                         name="test")

                if one:
                    W = np.ones(bsmm.w_shape, dtype=np.float32)
                    #W[:] += np.arange(8, dtype=np.float32).reshape(1,8)
                else:
                    W = np.random.uniform(-1.0, 1.0,
                                          bsmm.w_shape).astype(np.float32)

                # WW = np.zeros((bsmm.C, bsmm.K), dtype=np.float32)
                # for w, (c, k) in enumerate(bsmm.updat_list):
                #     WW[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize] = W[w,:,:]

                w = tf.constant(W)

                # s1 = sess.run( bsmm.identity_init(gpu=True)(bsmm.w_shape) )
                # s2 = bsmm.identity_init(gpu=False)(bsmm.w_shape)
                # print("identity_init: ", (s1 - s2).max())

                for N in (64, ):  # 128,64,32,16,1,

                    # X is the input, E the incoming gradient passed to tf.gradients.
                    if one:
                        X = np.ones(bsmm.i_shape(N), dtype=np.float32)
                        E = np.ones(bsmm.o_shape(N), dtype=np.float32)
                        #X[:] += np.arange(8, dtype=np.float32).reshape(8,1)
                    else:
                        X = np.random.uniform(
                            -1.0, 1.0, bsmm.i_shape(N)).astype(np.float32)
                        E = np.random.uniform(
                            -1.0, 1.0, bsmm.o_shape(N)).astype(np.float32)

                    x = tf.constant(X)
                    e = tf.constant(E)

                    for dtF, dtB in dtypes:

                        print("Axis:%d Bsize:%2d N:%d F:%s B:%s Params:%d" %
                              (axis, bsize, N, dtF.name, dtB.name,
                               bsize * bsize * blocks))

                        # compute in tensorflow
                        if l2norm:
                            w2 = bsmm.l2_normalize(w, dtype=dtF)
                        else:
                            w2 = ew.float_cast(w, dtype=dtF)

                        y = ew.float_cast(x, dtype=dtF)

                        # Chain `depth` matmuls with the same weights; `bench` is
                        # only forwarded to the last one.
                        for j in range(depth):
                            repeat = bench if bench and j == depth - 1 else 0
                            y = bsmm(
                                y, w2, dw_dtype=dtF, bench=repeat
                            )  # (bench and j==depth-1) (bench and j==0)

                        y = ew.float_cast(y, dtype=tf.float32, dx_dtype=dtB)
                        if bench: sess.run(y)
                        #y = sess.run( y )

                        d = tf.gradients(y, [x, w], e, aggregation_method=am)
                        if depth > 1:
                            d[1] = group_param_grads(d[1], 8)

                        # From here on y, dx and dw hold the evaluated results;
                        # y no longer refers to the tensor.
                        y, (dx, dw) = sess.run([y, d])

                        if not bench:
                            # compute in numpy
                            if l2norm:
                                W2 = bsmm.l2_normalize_test(W)
                            else:
                                W2 = W

                            # YY = np.dot(WW.T, X)
                            # ZZ = np.dot(WW  , E)
                            # uu = np.dot( X  , E.T)
                            # UU = np.zeros(bsmm.w_shape, dtype=np.float32)
                            # for w, (c, k) in enumerate(bsmm.updat_list):
                            #     UU[w,:,:] = uu[c*bsize:(c+1)*bsize, k*bsize:(k+1)*bsize]

                            # Forward reference: Ys collects the input of every layer;
                            # the last appended entry is the final output.
                            Ys = [X]
                            for j in range(depth):
                                Ys.append(bsmm.fprop_test(Ys[-1], W2))
                            Y = Ys.pop()

                            # Backward reference: walk the layers from last to first,
                            # summing the weight gradient over all layers.
                            DW = np.zeros(bsmm.w_shape, dtype=np.float32)
                            DX = E
                            for j in range(depth):
                                DW += bsmm.updat_test(Ys.pop(), DX)
                                DX = bsmm.bprop_test(DX, W2)
                            if l2norm:
                                DW = bsmm.l2_normalize_grad_test(W, DW)

                            for op, cpuA, devA in (
                                    # ("YY:", YY,  y),
                                    # ("ZZ:", ZZ, dx),
                                    # ("UU:", UU, dw),
                                (" y:", Y, y),
                                ("dx:", DX, dx),
                                ("dw:", DW, dw),
                            ):

                                difA = abs(cpuA - devA)

                                # Max error is relative to the mean magnitude of the
                                # reference, unless that mean is exactly zero.
                                avgval = np.average(abs(cpuA))
                                maxdif = difA.max()
                                max_err = maxdif if avgval == 0 else maxdif / avgval

                                l2_err = np.sqrt(
                                    np.square(difA).sum()) / np.sqrt(
                                        np.square(cpuA).sum())

                                #print("max_err: %5.3f, max_val: %7.3f, l1_err: %7.5f, l2_err: %7.5f" % (difO.max(), cpuO.max(), l1_err, l2_err))

                                print("%s max_err%%:%11.8f L2_err: %12.10f" %
                                      (op, 100 * max_err, l2_err))

                                # rtol = 1e-4 if dtF is tf.float32 else 1e-1
                                # self.assertAllClose(devA, cpuA, rtol=rtol, atol=rtol)
                                # When `out` is set, dump the first compared tensor to
                                # text files and terminate via exit().
                                if out:
                                    dim = bsmm.K if op == "dw:" else N
                                    np.savetxt("out.txt",
                                               difA.reshape((-1, dim)),
                                               fmt='%5.1f')
                                    np.savetxt("outC.txt",
                                               cpuA.reshape((-1, dim)),
                                               fmt='%5.1f')
                                    np.savetxt("outD.txt",
                                               devA.reshape((-1, dim)),
                                               fmt='%5.1f')
                                    exit()
                            print("")
    def atestBlocksparseMatMulGated(self):
        """Print forward/backward errors of a gated BlocksparseMatMul against its numpy reference.

        The layout is the adjacency matrix of a Barabasi-Albert graph plus the
        identity, used with block size 8 and ``feature_axis=0``. The matmul is
        built under "/gpu:0" with bfloat16 casts, a per-block ``gate`` and
        ``bench=repeat``. The y, dx and dw results are compared with
        ``fprop_test`` / ``bprop_test`` / ``updat_test`` by printing the relative
        max error and relative L2 error. Nothing is asserted.

        Reads the names ``conf``, ``one`` and ``out`` from outside this method.
        """
        with self.test_session(config=conf) as sess, tf.device("/gpu:0"):

            N = 128
            K = 8 * 56 * 2 * 4
            n = K // 8
            m = 30
            dtype = tf.bfloat16
            repeat = 10000

            graph = networkx.generators.barabasi_albert_graph(n, m)
            adjacency = networkx.adjacency_matrix(graph).toarray().astype(np.int32)
            layout = adjacency + np.eye(n, dtype=np.int32)
            # Make the top-left m x m corner fully dense.
            layout[0:m, 0:m] = 1

            blocks = layout.sum()
            n = layout.shape[0]
            # Percentage of non-zero blocks, then the largest column sum of the layout.
            print(100 * blocks / n**2)
            print(layout.sum(axis=0).max())

            bsmm = BlocksparseMatMul(
                layout, block_size=8, feature_axis=0, name="test")

            # Shapes of input, incoming gradient, weights and gate, in that order.
            shapes = (bsmm.i_shape(N), bsmm.o_shape(N), bsmm.w_shape, bsmm.blocks)
            if one:
                X, E, W, G = (
                    np.ones(shape, dtype=np.float32) for shape in shapes)
            else:
                X, E, W = (
                    np.random.uniform(-1.0, 1.0, shape).astype(np.float32)
                    for shape in shapes[:3])
                G = np.random.uniform(0.0, 1.0, shapes[3]).astype(np.float32)

            # The gate is reset to all ones regardless of the branch taken above.
            G = np.ones(bsmm.blocks, dtype=np.float32)

            x = tf.constant(X)
            e = tf.constant(E)
            w = tf.constant(W)
            g = tf.constant(G)

            w_cast = ew.float_cast(w, dtype=dtype)
            x_cast = ew.float_cast(x, dtype=dtype)

            y_gated = bsmm(x_cast, w_cast, gate=g, bench=repeat)
            y_op = ew.float_cast(y_gated, dtype=tf.float32, dx_dtype=dtype)

            grads = tf.gradients(y_op, [x, w], e)

            y, (dx, dw) = sess.run([y_op, grads])

            # numpy reference for output, input gradient and weight gradient
            Y = bsmm.fprop_test(X, W, gate=G)
            DX = bsmm.bprop_test(E, W, gate=G)
            DW = bsmm.updat_test(X, E, gate=G)

            comparisons = ((" y:", Y, y), ("dx:", DX, dx), ("dw:", DW, dw))
            for op, cpuA, devA in comparisons:

                difA = abs(cpuA - devA)

                # Max error is relative to the mean magnitude of the reference,
                # unless that mean is exactly zero.
                avgval = np.average(abs(cpuA))
                maxdif = difA.max()
                max_err = maxdif / avgval if avgval != 0 else maxdif

                diff_norm = np.sqrt(np.square(difA).sum())
                ref_norm = np.sqrt(np.square(cpuA).sum() + 1e-12)
                l2_err = diff_norm / ref_norm

                print("%s max_err%%:%11.8f L2_err: %12.10f" %
                      (op, 100 * max_err, l2_err))

                # When `out` is set, dump the first compared tensor to text files
                # and terminate via exit().
                if out:
                    dim = K if op == "dw:" else N
                    for path, arr in (("out.txt", difA),
                                      ("outC.txt", cpuA),
                                      ("outD.txt", devA)):
                        np.savetxt(path, arr.reshape((-1, dim)), fmt='%5.1f')
                    exit()