Example #1
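# Distributed GraphSage training with a parameter-server (PS) executor: each worker samples subgraphs and keeps its tensors on its own GPU via ndarray.gpu(rank).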
def train_hetu(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    mask_ = ad.Variable(name="mask_")
    gcn1 = GraphSage(meta["feature"], hidden_layer_size, activation="relu", dropout=0.1)
    gcn2 = GraphSage(2*hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)

    x = gcn1(x_)
    x = gcn2(x)
    W = initializers.xavier_uniform(shape=(2*hidden_layer_size, meta["class"]))
    B = initializers.zeros(shape=(meta["class"],))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.mul_op(loss, mask_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')
    distributed.ps_init(rank, nrank)

    batch_size = 4000
    with DistributedGraphSageSampler(args.path, batch_size, 2, 2, rank=rank, nrank=nrank) as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ndarray.gpu(rank))
            feed_dict = {
                gcn1.mp : mp_val,
                gcn2.mp : mp_val,
                mask_ : ndarray.array(mask, ctx=ctx),
                x_ : ndarray.array(g_sample.x, ctx=ctx),
                y_ : ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = ((y_predicted == g_sample.y) * mask).sum()
            distributed.ps_get_worker_communicator().BarrierWorker()
            nnodes += batch_size
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/mask.sum())
                start = time.time()
                if epoch >= num_epoch:
                    break
Example #2
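    # Communicator setup: initialize MPI, query rank/size/local rank, pin this process to the GPU matching its local rank, and create a stream on that device if none is supplied.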
    def __init__(self, stream=None, mpi_init=True):
        '''
            mpicomm: the MPI communicator, used in MPI_Bcast, MPI_Reduce, MPI_Scatter, etc.
            ncclcomm: the NCCL communicator, used in ncclAllReduce, etc.
            nRanks: the total number of MPI processes
            myRank: the rank of this process among all MPI processes
            localRank: the rank among the MPI processes on this node
            ncclId: ncclGetUniqueId should be called once when creating a communicator,
                    and the id should be distributed to all ranks in the communicator before calling ncclCommInitRank
            stream: the stream used for NCCL communication
        '''
        self.mpicomm = c_int64(0)
        self.ncclcomm = c_int64(0)
        self.nRanks = c_int32(0)
        self.myRank = c_int32(0)
        self.localRank = c_int32(-1)
        self.ncclId = ncclUniqueId()
        self.device_id = c_int(0)

        if mpi_init:
            self.MPI_Init()
        self.groupComm_flag = False
        self.MPIGetComm()
        self.MPI_Comm_rank()
        self.MPI_Comm_size()
        self.getLocalRank()

        self.device_id.value = self.localRank.value

        if stream is None:
            self.stream = create_stream_handle(
                ndarray.gpu(self.device_id.value))
        else:
            self.stream = stream
Example #3
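# Benchmarks gpu_op.uniform_init on the GPU against np.random.uniform and overlays the two histograms in a saved PNG.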
def test_uniform(size, lb=-1, ub=1):
    ctx = ndarray.gpu(0)
    cuda_x = ndarray.empty(size, ctx=ctx)
    stre = stream.create_stream_handle(ctx)
    np_st = time()
    for i in range(10):
        x = np.random.uniform(low=lb, high=ub, size=size).astype(np.float32)
        cuda_x[:] = x
    np_en = time()
    print('numpy time: ', np_en - np_st)
    cu_st = time()
    for i in range(10):
        gpu_op.uniform_init(cuda_x, lb, ub, 123, stre)
    stre.sync()
    cu_en = time()
    print('cuda time: ', cu_en - cu_st)
    fig, ax = plt.subplots(1, 1)
    cuda_x = cuda_x.asnumpy()
    assert (cuda_x.shape == x.shape)
    ax.hist(x.flatten(),
            histtype='stepfilled',
            alpha=0.2,
            bins=50,
            label='numpy')
    ax.hist(cuda_x.flatten(),
            histtype='step',
            alpha=0.2,
            bins=50,
            label='cuda')
    ax.legend(loc='best', frameon=False)
    # ax2.legend(loc='best', frameon=False)
    file_name = 'uniform_%f_%f.png' % (lb, ub)
    plt.savefig(file_name)
    plt.close()
Example #4
def test_truncated_normal(size, mean=0, std=1):
    ctx = ndarray.gpu(0)
    cuda_x = ndarray.empty(size, ctx=ctx)
    stre = stream.create_stream_handle(ctx)
    np_st = time()
    for i in range(10):
        x = truncnorm.rvs(-2.0, 2.0, loc=mean, scale=std,
                          size=size).astype(np.float32)
        cuda_x[:] = x
    np_en = time()
    print('numpy time: ', np_en - np_st)
    cu_st = time()
    for i in range(10):
        gpu_op.truncated_normal_init(cuda_x, mean, std, 123, stre)
    stre.sync()
    cu_en = time()
    print('cuda time: ', cu_en - cu_st)
    fig, ax = plt.subplots(1, 1)
    cuda_x = cuda_x.asnumpy()
    assert (cuda_x.shape == x.shape)
    ax.hist(x.flatten(),
            histtype='stepfilled',
            alpha=0.2,
            bins=50,
            label='numpy')
    ax.hist(cuda_x.flatten(),
            histtype='step',
            alpha=0.2,
            bins=50,
            label='cuda')
    ax.legend(loc='best', frameon=False)
    # ax2.legend(loc='best', frameon=False)
    file_name = 'truncated_normal_%f_%f.png' % (mean, std)
    plt.savefig(file_name)
    plt.close()
Example #5
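# Checks the GPU layer-normalization forward kernel (eps=0.01) against a NumPy reference for the per-row mean, variance, and output.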
def test_layernorm_forward(shape=(5, 3)):
    ctx = ndarray.gpu(1)
    # shape = (5, 3)
    last_dim = shape[-1]
    x = np.random.random(shape).astype(np.float32)
    scale = np.random.random((last_dim,)).astype(np.float32)
    bias = np.random.random((last_dim,)).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_scale = ndarray.array(scale, ctx=ctx)
    arr_bias = ndarray.array(bias, ctx=ctx)
    arr_mean = ndarray.empty(list(shape[:-1]) + [1], ctx=ctx)
    arr_var = ndarray.empty(list(shape[:-1]) + [1], ctx=ctx)
    arr_y = ndarray.empty((shape), ctx=ctx)
    gpu_op.layer_normalization(arr_x, arr_scale, arr_bias, arr_mean, arr_var, arr_y, 0.01)

    y = arr_y.asnumpy()

    np_means = x.mean(axis=-1, dtype=np.float32, keepdims=True)
    np_vars = x.var(axis=-1, dtype=np.float32, keepdims=True)
    std = np.sqrt(np_vars + 0.01, dtype=np.float32)
    centered_input = x - np_means
    normed_input = centered_input / std

    bc_shape = [1] * len(x.shape)
    bc_shape[-1] = x.shape[-1]

    y_ = scale.reshape(bc_shape) * normed_input + \
        bias.reshape(bc_shape)
    
    np.testing.assert_allclose(np_means, arr_mean.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(np_vars, arr_var.asnumpy(), atol=1e-6)
    np.testing.assert_allclose(y_, y, atol=1e-6)
    print('Pass forward test with shape ', shape)
Example #6
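# Two-layer GCN trained with Adam on a single GPU; a separate inference Executor then evaluates test accuracy on the full graph.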
def train_hetu(num_epoch):
    ctx = ndarray.gpu(0)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")

    if use_same_init:
        gcn1 = GCN(num_features, hidden_layer_size, custom_init=(init_w1, init_b1))
        gcn2 = GCN(hidden_layer_size, num_classes, custom_init=(init_w2, init_b2))
    else:
        gcn1 = GCN(num_features, hidden_layer_size)
        gcn2 = GCN(hidden_layer_size, num_classes)

    mp_val = mp_matrix(graph, ctx, use_original_gcn_norm=True)
    feed_dict = {
        gcn1.mp : mp_val,
        gcn2.mp : mp_val,
        x_ : ndarray.array(graph.x, ctx=ctx),
        y_ : ndarray.array(convert_to_one_hot(graph.y, max_val=num_classes), ctx=ctx)
    }

    x = gcn1(x_)
    x = ad.relu_op(x)
    y = gcn2(x)

    loss = ad.softmaxcrossentropy_op(y, y_)

    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)
    start_time = time.time()
    losses = []
    for i in range(num_epoch):
        loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)

        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = (y_predicted == graph.y).sum()
        losses.append(loss_val.asnumpy().mean())
        if i == 0:
            start_time = time.time()
        print("Train loss :", loss_val.asnumpy().mean())
        print("Train accuracy:", acc/len(y_predicted))
        print("Hetu time:",i, time.time()-start_time)
    print("Hetu time:", time.time()-start_time)

    mp_val = mp_matrix(graph_full, ctx)

    feed_dict = {
        gcn1.mp : mp_val,
        gcn2.mp : mp_val,
        x_ : ndarray.array(graph_full.x, ctx=ctx),
    }
    executor_eval = ad.Executor([y], ctx=ctx)
    y_predicted, = executor_eval.run(feed_dict=feed_dict)
    y_predicted = y_predicted.asnumpy().argmax(axis=1)
    acc = (y_predicted == graph_full.y)[train_split:].sum()
    print("Test accuracy:", acc/len(y_predicted[train_split:]))
    return losses
Example #7
def transform(result):
    [graph, sample_mask] = result
    train_mask = np.zeros(node_upper_bound)
    train_mask[0:graph.num_nodes] = sample_mask * graph.x[:, -1]
    test_mask = np.zeros(node_upper_bound)
    test_mask[0:graph.num_nodes] = (sample_mask -
                                    graph.x[:, -1]) * sample_mask
    graph = padding(graph, node_upper_bound)
    mp_val = mp_matrix(graph, ndarray.gpu(rank % args.num_local_worker))
    return graph, mp_val, train_mask, test_mask
Example #8
def test_sparse_matrix_multiply():
    density = 1e-3
    ctx = ndarray.gpu(0)
    x = scipy.sparse.rand(500, 7000, density=density, format='coo', dtype=np.float32)
    y = np.random.uniform(0, 10, size=(7000, 100)).astype(np.float32)
    mat_x = ndarray.sparse_array(x.data, (x.row, x.col), shape=[500, 7000], ctx=ctx)
    mat_y = ndarray.array(y, ctx=ctx)
    mat_z = ndarray.empty((500, 100), ctx=ctx)
    gpu_op.CuSparse_Csrmm(mat_x, False, mat_y, False, mat_z)
    z = mat_z.asnumpy()
    np.testing.assert_allclose(x.dot(y), z, rtol=1e-5)
Example #9
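# Distributed GCN training with gRPC-based subgraph sampling; the transform callback builds the message-passing matrix directly on the worker's GPU.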
def train_hetu(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    hosts, ports = load_ip_config(args.ip_config)
    ctx = ndarray.gpu(rank)
    distributed.grpc_init(hosts=hosts, ports=ports, rank=rank, nrank=nrank)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    gcn1 = GCN(meta["feature"], hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    x = gcn1(x_)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')

    def transform(graph):
        mp_val = mp_matrix(graph, ndarray.gpu(rank))
        return graph, mp_val
    with DistributedSubgraphSampler(args.path, 4000, 2, rank=rank, nrank=nrank, transformer=transform, backend="grpc") as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mp_val = sampler.sample()
            feed_dict = {
                gcn1.mp : mp_val,
                gcn2.mp : mp_val,
                x_ : ndarray.array(g_sample.x, ctx=ctx),
                y_ : ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = (y_predicted == g_sample.y).sum()
            distributed.ps_get_worker_communicator().BarrierWorker()
            nnodes += g_sample.num_nodes
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/len(y_predicted))
                start = time.time()
                if epoch >= num_epoch:
                    break
Example #10
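# Times a single NCCL all-reduce on a float32 array placed on the local-rank GPU; returns the message size in bytes and the elapsed seconds.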
def test_allreduce(comm=None):
    shape = (24, 24)
    size = 4
    for val in shape:
        size *= val
    input_arr = np.ones(shape) * comm.localRank.value
    input_arr = ndarray.array(input_arr, ctx=ndarray.gpu(comm.localRank.value))
    # input_arr = ndarray.array(input_arr, ctx = ndarray.cpu())

    start = time.time()
    comm.dlarrayNcclAllReduce(input_arr, ncclDataType_t.ncclFloat32,
                              ncclRedOp_t.ncclSum)
    comm.stream.sync()
    end = time.time()

    secs = end - start

    return size, secs
Example #11
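# Checks the GPU layer-normalization backward kernel against a NumPy derivation of the input, scale, and bias gradients.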
def test_layernorm_backward(shape=(5, 3)):
    ctx = ndarray.gpu(1)
    # shape = (5, 3)
    last_dim = shape[-1]
    grads = np.random.random(shape).astype(np.float32)
    x = np.random.random(shape).astype(np.float32)
    scale = np.random.random((last_dim,)).astype(np.float32)
    mean = np.random.random(list(shape[:-1])+[1]).astype(np.float32)
    var = np.random.random(list(shape[:-1])+[1]).astype(np.float32)

    arr_grads = ndarray.array(grads, ctx=ctx)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_scale = ndarray.array(scale, ctx=ctx)
    arr_mean = ndarray.array(mean, ctx=ctx)
    arr_var = ndarray.array(var, ctx=ctx)

    grad_inarr = ndarray.empty(shape, ctx=ctx)
    grad_scale = ndarray.empty((last_dim,), ctx=ctx)
    grad_bias = ndarray.empty((last_dim,), ctx=ctx)
    gpu_op.layer_normalization_gradient(arr_grads, arr_x, arr_scale,
        grad_inarr, grad_scale, grad_bias, arr_mean, arr_var, 0.01)

    # numpy calculate phase
    red_axis = tuple(range(grads.ndim-1))
    np_grad_bias = grads.sum(red_axis) # (X,)
    
    std = np.sqrt(var + 0.01) # (N, 1)
    x_centered = x - mean # (N, X)
    x_norm = x_centered / std # (N, X)
    np_grad_scale = (grads * x_norm).sum(red_axis) # (X,)

    last_dim = x.shape[-1]
    dx_norm = grads * scale.reshape([1] * (grads.ndim - 1) + [-1]) # (N, X)
    dvar = (dx_norm * x_centered).sum(axis=-1, keepdims=True) * -0.5 / (var + 0.01) / std # (N, 1)
    dx_mu_1 = dx_norm / std # (N, X)
    dx_mu_2 = dvar * 2 * x_centered / last_dim # (N, X)
    dx_1 = dx_mu_1 + dx_mu_2 # (N, X)
    dx_2 = -1 * dx_1.sum(axis=-1, keepdims=True) / last_dim # (N, 1)
    np_grad_inarr = dx_1 + dx_2 # (N, X)
    
    np.testing.assert_allclose(np_grad_bias, grad_bias.asnumpy(), rtol=1e-4, atol=1e-4)
    np.testing.assert_allclose(np_grad_scale, grad_scale.asnumpy(), rtol=1e-4, atol=1e-4)
    np.testing.assert_allclose(np_grad_inarr, grad_inarr.asnumpy(), rtol=1e-4, atol=1e-4)
    print('Pass backward test with shape ', shape)
Example #12
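# Dense matmul with the weight variable kept on CPU while the graph executes on GPU; the forward output and the SGD weight update are both checked against NumPy.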
def test_dense():
    npw = np.random.random((5, 10)).astype(np.float32)
    npx = np.random.random((7, 5)).astype(np.float32)
    cpuctx = ndarray.cpu(0)
    gpuctx = ndarray.gpu(0)

    X = ad.Variable(name="x")
    mid = X + 3
    W = ad.Variable(name='w', value=npw, ctx=cpuctx)
    y = ad.matmul_op(mid, W)
    opt = optimizer.SGDOptimizer(learning_rate=0.1)
    train_op = opt.minimize(y)
    executor = ad.Executor([y, train_op], ctx=gpuctx)
    pred_y, _ = executor.run(feed_dict={X: npx}, convert_to_numpy_ret_vals=True)

    nppred_y = np.matmul((npx + 3), npw)
    np.testing.assert_allclose(pred_y, nppred_y, rtol=1e-6)
    new_npw = npw - 0.1 * np.matmul((npx+3).T, np.ones(nppred_y.shape).astype(np.float32))
    np.testing.assert_allclose(W.tensor_value.asnumpy(), new_npw, rtol=1e-10)
Example #13
def test_add_lazy(shape1=(1, 4, 1), shape2=(2, 3, 4, 5), ctx=ndarray.gpu(1)):
    x = np.random.random(shape1).astype(np.float32)
    z = np.random.random(shape2).astype(np.float32)
    ath_x = ad.Variable(name='x', value=x)
    ath_z = ad.Variable(name='z', value=z)
    ath_y = ad.add_op(ad.broadcast_shape_op(ath_x, shape2), ath_z)
    executor = ad.Executor([ath_y], ctx=ctx)
    ath_results = [var.asnumpy() for var in executor.run()]

    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)
    tf_z = tf.convert_to_tensor(z)
    tf_y = tf_x + tf_z
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_y])
    
    np.testing.assert_allclose(ath_results[0], tf_results[0])
    print('Passed add op test with shape ', shape1, shape2)
Example #14
def test_broadcast(shape1=(3, 1), shape2=(2, 3, 4)):
    ctx = ndarray.gpu(1)
    x = np.random.random(shape1).astype(np.float32)
    ath_x = ad.Variable(name='x', value=x)
    ath_y = ad.broadcast_shape_op(ath_x, shape2)
    ath_grad = ad.gradients(ath_y, [ath_x])[0]
    executor = ad.Executor([ath_y, ath_grad], ctx=ctx)
    ath_results = [var.asnumpy() for var in executor.run()]

    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)
    tf_y = tf.broadcast_to(tf_x, shape2)
    tf_grad = tf.gradients(tf_y, tf_x)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_y, tf_grad])
    
    np.testing.assert_allclose(ath_results[0], tf_results[0])
    np.testing.assert_allclose(ath_results[1], np.reshape(tf_results[1], ath_results[1].shape))
    print('Passed broadcast shape op test with shape ', shape1, shape2)
Example #15
def test_transpose(shape=(2, 3, 4, 5), perm=None):
    ctx = ndarray.gpu(1)
    x = np.random.random(shape).astype(np.float32)
    ath_x = ad.Variable(name='x', value=x)
    ath_y = ad.transpose_op(ath_x, perm)
    ath_grad = ad.gradients(ath_y, [ath_x])[0]
    executor = ad.Executor([ath_y, ath_grad], ctx=ctx, enable_lazy=False)
    ath_results = [var.asnumpy() for var in executor.run()]

    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)
    tf_y = tf.transpose(tf_x, perm)
    tf_grad = tf.gradients(tf_y, tf_x)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_y, tf_grad])
    
    np.testing.assert_allclose(ath_results[0], tf_results[0])
    np.testing.assert_allclose(ath_results[1], np.reshape(tf_results[1], ath_results[1].shape))
    print('Passed transpose shape op test with shape ', shape, ' and perm ', perm)
Example #16
def test_slice(shape1=(7, 11, 13), shape2=(2, 3, 4), begin_pos=(0, 0, 0)):
    ctx = ndarray.gpu(1)
    x = np.random.random(shape1).astype(np.float32)
    ath_x = ad.Variable(name='x', value=x)
    ath_y = ad.slice_op(ath_x, begin_pos, shape2)
    ath_grad = ad.gradients(ath_y, [ath_x])[0]
    executor = ad.Executor([ath_y, ath_grad], ctx=ctx, enable_lazy=False)
    ath_results = [var.asnumpy() for var in executor.run()]

    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)
    tf_y = tf.slice(tf_x, begin_pos, shape2)
    tf_grad = tf.gradients(tf_y, tf_x)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_y, tf_grad])
    
    np.testing.assert_allclose(ath_results[0], tf_results[0])
    np.testing.assert_allclose(ath_results[1], np.reshape(tf_results[1], ath_results[1].shape))
    print('Passed slice op test with shape ', shape1, shape2, ' and begin pos ', begin_pos)
Example #17
def test_reduce_sum(shape=(2, 3, 4), axes=[2]):
    ctx = ndarray.gpu(1)
    x = np.random.random(shape).astype(np.float32)
    ath_x = ad.Variable(name='x', value=x)
    ath_y = ad.reduce_sum_op(ath_x, axes, keepdims=False)
    ath_grad = ad.gradients(ath_y, [ath_x])[0]
    executor = ad.Executor([ath_y, ath_grad], ctx=ctx)
    ath_results = [var.asnumpy() for var in executor.run()]

    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)
    tf_y = tf.reduce_sum(tf_x, axes)
    tf_grad = tf.gradients(tf_y, tf_x)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_y, tf_grad])
    
    np.testing.assert_allclose(ath_results[0], np.reshape(tf_results[0], ath_results[0].shape), rtol=1e-6)
    np.testing.assert_allclose(ath_results[1], np.reshape(tf_results[1], ath_results[1].shape), rtol=1e-6)
    print('Passed reduce sum op test with shape and axes ', shape, axes)
Example #18
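# Times a point-to-point NCCL transfer in which rank 0 sends the array and the other rank receives it from rank 0.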
def test_p2p(comm=None, src=0, target=1):
    shape = (1000, 30, 224, 224)
    size = 4
    for val in shape:
        size *= val
    print("MyRank: ", comm.myRank.value)
    arr = np.ones(shape) * comm.localRank.value
    arr = ndarray.array(arr, ctx=ndarray.gpu(comm.localRank.value))
    # arr = ndarray.array(arr, ctx = ndarray.cpu())
    start = time.time()
    if comm.myRank.value == 0:
        comm.dlarraySend(arr, ncclDataType_t.ncclFloat32, 1)
    else:
        comm.dlarrayRecv(arr, ncclDataType_t.ncclFloat32, 0)
    comm.stream.sync()
    end = time.time()

    secs = end - start
    # size: message size in bytes
    # secs: elapsed time in seconds
    return size, secs
Example #19
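# Batch matmul (optionally with transposed inputs) and its gradients, validated against TensorFlow 1.x.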
def test_batch_matmul(shape1=(7, 4, 6), shape2=(7, 6, 5), transA=False, transB=False):
    executor_ctx = ndarray.gpu(1)

    if transA:
        shape1 = tuple(list(shape1)[:-2] + [shape1[-1], shape1[-2]])
    if transB:
        shape2 = tuple(list(shape2)[:-2] + [shape2[-1], shape2[-2]])

    data = np.random.normal(0.0, 0.2, shape1).astype(np.float32)
    weights = np.random.normal(0.0, 0.1, shape2).astype(np.float32)

    ath_data = ad.Variable(name='data')
    ath_weights = ad.Variable(name='weights')
    ath_output = ad.batch_matmul_op(ath_data, ath_weights, trans_A=transA, trans_B=transB)

    ath_grads = ad.gradients(ath_output, [ath_data, ath_weights])

    executor = ad.Executor(
        [ath_output] + ath_grads,
        ctx=executor_ctx
    )

    ath_results = executor.run(feed_dict={ath_data: data, ath_weights: weights})
    ath_results = [res.asnumpy() for res in ath_results]
    
    import tensorflow as tf
    tf_data = tf.placeholder(name='data', dtype=tf.float32)
    tf_weights = tf.placeholder(name='weights', dtype=tf.float32)
    tf_output = tf.matmul(tf_data, tf_weights, transpose_a=transA, transpose_b=transB)
    tf_grads = tf.gradients(tf_output, [tf_data, tf_weights])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_output] + tf_grads, feed_dict={tf_data: data, tf_weights: weights})
    
    np.testing.assert_allclose(ath_results[0], tf_results[0], atol=1e-6)
    np.testing.assert_allclose(ath_results[1], tf_results[1], atol=1e-6)
    np.testing.assert_allclose(ath_results[2], tf_results[2], atol=1e-6)
    print('Pass batch matmul op test with shape ', shape1, shape2)
Example #20
def test_sparse_array_dense_vector_multiply():
    density = 1e-3
    ctx = ndarray.gpu(0)
    x = scipy.sparse.rand(500, 70000, density=density, format='coo', dtype=np.float32)
    y = np.random.uniform(0, 10, size=(70000, 1)).astype(np.float32)
    mat_x = ndarray.sparse_array(x.data, (x.row, x.col), shape=[500, 70000], ctx=ctx)
    arr_y = ndarray.array(y, ctx=ctx)
    arr_z = ndarray.empty((500, 1), ctx=ctx)
    trans = False
    gpu_op.CuSparse_Csrmv(mat_x, trans, arr_y, arr_z)
    z = arr_z.asnumpy()
    np.testing.assert_allclose(x.dot(y), z, rtol=1e-5)

    x = scipy.sparse.rand(70000, 500, density=density, format='coo', dtype=np.float32)
    y = np.random.uniform(0, 10, size=(70000, 1)).astype(np.float32)
    mat_x = ndarray.sparse_array(x.data, (x.row, x.col), shape=[70000, 500], ctx=ctx)
    arr_y = ndarray.array(y, ctx=ctx)
    arr_z = ndarray.empty((500, 1), ctx=ctx)
    trans = True
    gpu_op.CuSparse_Csrmv(mat_x, trans, arr_y, arr_z)
    z = arr_z.asnumpy()
    np.testing.assert_allclose(x.transpose().dot(y), z, rtol=1e-5)
Example #21
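# Embedding lookup with the table stored on CPU and computation on GPU; the sparse SGD update of the looked-up rows is verified against NumPy.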
def test_sparse():
    npemb = np.random.random((100, 20)).astype(np.float32)
    npind = np.array(np.random.randint(100, size=(10,)))
    npw = np.random.random((20, 30)).astype(np.float32)
    cpuctx = ndarray.cpu(0)
    gpuctx = ndarray.gpu(0)

    embedding = ad.Variable('embeddingtable', value=npemb, ctx=cpuctx)
    index = ad.Variable(name="index", ctx=cpuctx)
    W = ad.Variable(name="w", value=npw)
    y = ad.embedding_lookup_op(embedding, index) # (10, 20)
    y = ad.matmul_op(y, W)
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(y)
    executor = ad.Executor([y, train_op],ctx=gpuctx)

    out, _ = executor.run(feed_dict={index: npind.astype(np.float32)}, convert_to_numpy_ret_vals=True)

    np_out = np.matmul(npemb[npind], npw)
    np.testing.assert_allclose(out, np_out, rtol=1e-6)
    tmp_grad = np.matmul(np.ones(np_out.shape).astype(np.float32), npw.T)
    for i, localid in enumerate(npind):
        npemb[localid] -= 0.1 * tmp_grad[i]
    np.testing.assert_allclose(embedding.tensor_value.asnumpy(), npemb, rtol=1e-6)
Example #22
import logging

from hetu import ndarray
import numpy as np
# import time

logging.basicConfig(level=logging.INFO)

logging.info("# hparams")
hparams = Hparams()
parser = hparams.parser
hp = parser.parse_args()
print(hp)

logging.info("# Prepare train/eval batches")
dataloader = DataLoader(hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab)

ctx = ndarray.gpu(1)
xs = ad.Variable(name='xs')
ys1 = ad.Variable(name='ys1')
ys2 = ad.Variable(name='ys2')
nonpadding = ad.Variable(name='nonpadding')

logging.info("# Load model")
m = Transformer(hp)
loss = m.train(xs, (ys1, ys2))
loss = ad.div_op(ad.reduce_sum_op(loss * nonpadding, axes=[0, 1]),
                 ad.reduce_sum_op(nonpadding, axes=[0, 1]) + 1e-7)
opt = optimizer.SGDOptimizer(hp.lr)
train_op = opt.minimize(loss)
executor = ad.Executor([loss, train_op], ctx=ctx)

logging.info("# Session")
Example #23
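# Criteo CTR training with comm_mode='Hybrid': a train Executor and an optional validation Executor both run on ndarray.gpu(rank), logging per-epoch loss, accuracy, and AUC.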
def worker(model, rank, args):
    def train(iterations):
        train_loss, train_acc, train_auc = [], [], []
        for it in tqdm(range(iterations)):
            loss_val, predict_y, y_val, _ = executor.run(
                convert_to_numpy_ret_vals=True)
            if y_val.shape[1] == 1:  # for criteo case
                acc_val = np.equal(y_val, predict_y > 0.5).astype(float)
            else:
                acc_val = np.equal(np.argmax(y_val, 1),
                                   np.argmax(predict_y, 1)).astype(float)
            train_loss.append(loss_val[0])
            train_acc.append(acc_val)
            train_auc.append(metrics.roc_auc_score(y_val, predict_y))
        return np.mean(train_loss), np.mean(train_acc), np.mean(train_auc)

    def validate(iterations):
        test_loss, test_acc, test_auc = [], [], []
        for it in range(iterations):
            loss_val, test_y_predicted, y_test_val = val_executor.run(
                convert_to_numpy_ret_vals=True)
            if y_test_val.shape[1] == 1:  # for criteo case
                correct_prediction = np.equal(
                    y_test_val, test_y_predicted > 0.5).astype(float)
            else:
                correct_prediction = np.equal(np.argmax(y_test_val, 1),
                                              np.argmax(test_y_predicted,
                                                        1)).astype(float)
            test_loss.append(loss_val[0])
            test_acc.append(correct_prediction)
            test_auc.append(metrics.roc_auc_score(y_test_val,
                                                  test_y_predicted))
        return np.mean(test_loss), np.mean(test_acc), np.mean(test_auc)

    from models.load_data import process_all_criteo_data
    dense, sparse, labels = process_all_criteo_data(return_val=args.val)
    loss, prediction, y_, train_op = model(dense, sparse, labels)

    executor = ad.Executor([loss, prediction, y_, train_op], ctx=ndarray.gpu(rank),\
        dataloader_name='train', stream_mode='AllStreams', comm_mode='Hybrid', use_sparse_pull=True, cstable_policy=args.cache, bsp=args.bsp, seed=123, cache_bound=args.bound)
    if args.val:
        val_executor = ad.Executor([loss, prediction, y_], ctx=ndarray.gpu(rank),\
            dataloader_name='validate', stream_mode='AllStreams', comm_mode='Hybrid', use_sparse_pull=True, inference=True, bsp=args.bsp)

    executor.recordLoads()

    raw_log_file = './logs/localhybrid_%s' % (args.model)
    if args.bsp:
        raw_log_file += '_bsp'
    else:
        raw_log_file += '_asp'
    if args.cache:
        raw_log_file += '_%s' % (args.cache)
    raw_log_file += '_%d.log' % (rank)
    print('Processing all data, log to', raw_log_file)
    log_file = open(raw_log_file, 'w')
    total_epoch = 400
    for ep in range(total_epoch):
        # print("iters: %d" % (lp * 1000))
        print("epoch %d" % ep)
        st_time = time.time()
        train_loss, train_acc, train_auc = train(executor.batch_num // 10 +
                                                 (ep % 10 == 9) *
                                                 (executor.batch_num % 10))
        en_time = time.time()
        train_time = en_time - st_time
        executor.recordLoads()
        if args.val:
            executor.ps_comm.BarrierWorker()
            val_loss, val_acc, val_auc = validate(val_executor.batch_num)
            executor.ps_comm.BarrierWorker()
            printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, test_loss: %.4f, test_acc: %.4f, test_auc: %.4f, train_time: %.4f"\
                    % (train_loss, train_acc, train_auc, val_loss, val_acc, val_auc, train_time)
            executor.recordLoads()
        else:
            printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
                    % (train_loss, train_acc, train_auc, train_time)
        print(printstr)
        log_file.write(printstr + '\n')
        log_file.flush()
Example #24
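# Distributed GCN with learnable node embeddings: GNNDataLoaderOp feeds labels and embedding indices, and the next sampled graph is stepped in while the current one is being trained.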
def train_main(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank % args.num_local_worker)
    embedding_width = args.hidden_size
    extract_width = embedding_width * (meta["feature"] - 1)

    y_ = dl.GNNDataLoaderOp(lambda g: ndarray.array(
        convert_to_one_hot(g.y, max_val=g.num_classes), ctx=ndarray.cpu()))
    mask_ = ad.Variable(name="mask_")
    gcn1 = GCN(extract_width, hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    index = dl.GNNDataLoaderOp(
        lambda g: ndarray.array(g.x[:, 0:-1], ctx=ndarray.cpu()),
        ctx=ndarray.cpu())
    embedding = initializers.random_normal([meta["idx_max"], embedding_width],
                                           stddev=0.1)
    embed = ad.embedding_lookup_op(embedding, index)
    embed = ad.array_reshape_op(embed, (-1, extract_width))
    # embed = ad.reduce_mean_op(embed, axes=1)
    # x = ad.concat_op(x_, embed, axis=1)
    x = gcn1(embed)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    train_loss = loss * mask_
    train_loss = ad.reduce_mean_op(train_loss, [0])
    opt = optimizer.SGDOptimizer(args.learning_rate)
    train_op = opt.minimize(train_loss)
    ad.worker_init()
    distributed.ps_init(rank, nrank)

    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx = 0
    g_sample, mp_val, mask, mask_eval = graphs[idx]
    idx = (idx + 1) % ngraph
    dl.GNNDataLoaderOp.step(g_sample)
    dl.GNNDataLoaderOp.step(g_sample)
    epoch = 0
    nnodes = 0
    executor = ad.Executor([loss, y, train_op],
                           ctx=ctx,
                           comm_mode='PS',
                           use_sparse_pull=False,
                           cstable_policy=args.cache)
    while True:
        g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt = graphs[idx]
        idx = (idx + 1) % ngraph
        dl.GNNDataLoaderOp.step(g_sample_nxt)
        feed_dict = {gcn1.mp: mp_val, gcn2.mp: mp_val, mask_: mask}
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)

        acc = np.sum((y_predicted == g_sample.y) * mask_eval)
        train_acc = np.sum((y_predicted == g_sample.y) * mask)
        stat.update(acc, mask_eval.sum(),
                    np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())
        stat.update_train(train_acc, mask.sum(),
                          np.sum(loss_val.asnumpy() * mask) / mask.sum())

        # distributed.ps_get_worker_communicator().BarrierWorker()
        nnodes += mask.sum() + mask_eval.sum()
        if nnodes > meta["partition"]["nodes"][rank]:
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        g_sample, mp_val, mask, mask_eval = g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt
Example #25
def transform(graph):
    mp_val = mp_matrix(graph, ndarray.gpu(0))
    return graph, mp_val
Example #26
    group_col = col_procs[rank_col]
    comm_row = row_groups[rank_row]
    comm_col = col_groups[rank_col]
    
    a = ndarray.array(np.array([rank, rank, rank, rank, rank]), ctx=ctx)
    comm_row.dlarrayBroadcast(a, a, ncclDataType_t.ncclFloat32, root=group_row[1])
    print("Broadcast device=%d, a:" % device_id, a.asnumpy())

    b = ndarray.array(np.array([rank, rank, rank, rank, rank]), ctx=ctx)
    comm_col.dlarrayBroadcast(b, b, ncclDataType_t.ncclFloat32, root=group_col[1])
    print("Broadcast device=%d, b:" % device_id, b.asnumpy())

comm, device_id = ad.mpi_nccl_init()
device = comm.device_id.value
rank = comm.localRank.value
size = comm.nRanks.value
ctx = ndarray.gpu(rank)
a = ndarray.array(np.array([1,2,3,4,5]),ctx=ctx)

test_default()

test_broadcast(group = [0,2,4,5,6], root=4)
test_broadcast(group = [1,4,2,7],root=4)
test_allreduce(group = [1,4,2,5])
test_allreduce(group = [0,7,6,2,4])
test_allgather(group = [2,5,3,7])
test_allgather(group = [2,6,1,7,4])

test_group_broadcast()

Example #27
        W_val = W_val.asnumpy()
    loss_val = [val.asnumpy() for val in loss_val]
    
    y_groundtruth = X_val.dot(W_val)
    loss_groundtruth = np.mean(
                -np.sum(Y_val * np.log(softmax_func(y_groundtruth)), axis=1), keepdims=True)
    Y_grad_groundtruth = (softmax_func(y_groundtruth) + -1 * Y_val) * np.ones(loss_groundtruth.shape) / 500
    W_grad_groundtruth = X_val.T.dot(Y_grad_groundtruth)

    np.testing.assert_allclose(loss_val[0], loss_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[1], W_grad_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[2], Y_grad_groundtruth, rtol=1e-4)
    
    
test_csrmm_op(ndarray.cpu(0))
test_csrmm_op(ndarray.gpu(1))


def test_csrmv_op(executor_ctx):
    X = ad.Variable(name="X")
    W = ad.Variable(name="W")
    Y = ad.csrmv_op(X, W)
    Y_ = ad.Variable(name="Y_")
    temp = Y + (-1) * Y_
    loss = temp * temp

    grads = ad.gradients(loss, [W, Y])
    
    executor = ad.Executor(
        [loss, grads[0], grads[1]], ctx=executor_ctx)
    
Example #28
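# Neural matrix factorization (neural_mf) on MovieLens with dataloader ops; the GPU is chosen as rank % 8 under PS or Hybrid communication, and HR/NDCG are computed during validation.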
def worker(args):
    def validate():
        hits, ndcgs = [], []
        for idx in range(testData.shape[0]):
            start_index = idx * 100
            predictions = val_executor.run(convert_to_numpy_ret_vals=True)
            map_item_score = {testItemInput[start_index + i]: predictions[0][i] for i in range(100)}
            gtItem = testItemInput[start_index]
            # Evaluate top rank list
            ranklist = heapq.nlargest(topK, map_item_score, key=map_item_score.get)
            hr = getHitRatio(ranklist, gtItem)
            ndcg = getNDCG(ranklist, gtItem)
            hits.append(hr)
            ndcgs.append(ndcg)
        hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hr, ndcg
    def get_current_shard(data):
        if args.comm is not None:
            part_size = data.shape[0] // nrank
            start = part_size * rank
            end = start + part_size if rank != nrank - 1 else data.shape[0]
            return data[start:end]
        else:
            return data

    device_id = 0
    if args.comm == 'PS':
        rank = ad.get_worker_communicate().rank()
        nrank = int(os.environ['DMLC_NUM_WORKER'])
        device_id = rank % 8
    elif args.comm == 'Hybrid':
        comm, rank = ad.mpi_nccl_init()
        nrank = int(os.environ['DMLC_NUM_WORKER'])
        device_id = rank % 8

    from movielens import getdata
    if args.all:
        trainData, testData = getdata('ml-25m', 'datasets')
        trainUsers = get_current_shard(trainData['user_input'])
        trainItems = get_current_shard(trainData['item_input'])
        trainLabels = get_current_shard(trainData['labels'])
        testData = get_current_shard(testData)
        testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32), 100)
        testItemInput = testData.reshape((-1,))
    else:
        trainData, testData = getdata('ml-25m', 'datasets')
        trainUsers = get_current_shard(trainData['user_input'][:1024000])
        trainItems = get_current_shard(trainData['item_input'][:1024000])
        trainLabels = get_current_shard(trainData['labels'][:1024000])
        testData = get_current_shard(testData[:1470])
        testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32), 100)
        testItemInput = testData.reshape((-1,))

    num_users, num_items = {
        'ml-1m': (6040, 3706),
        'ml-20m': (138493, 26744),
        'ml-25m': (162541, 59047),
    }['ml-25m']
    # assert not args.all or num_users == testData.shape[0]
    batch_size = 1024
    num_negatives = 4
    topK = 10
    user_input = dl.dataloader_op([
        dl.Dataloader(trainUsers, batch_size, 'train'),
        dl.Dataloader(testUserInput, 100, 'validate'),
    ])
    item_input = dl.dataloader_op([
        dl.Dataloader(trainItems, batch_size, 'train'),
        dl.Dataloader(testItemInput, 100, 'validate'),
    ])
    y_ = dl.dataloader_op([
        dl.Dataloader(trainLabels.reshape((-1, 1)), batch_size, 'train'),
    ])

    loss, y, train_op = neural_mf(user_input, item_input, y_, num_users, num_items)

    executor = ad.Executor([loss, train_op], ctx=ndarray.gpu(device_id), dataloader_name='train', \
        comm_mode=args.comm, cstable_policy=args.cache, bsp=args.bsp, cache_bound=args.bound, seed=123)
    val_executor = ad.Executor([y], ctx=ndarray.gpu(device_id), inference=True, dataloader_name='validate', comm_mode=args.comm, bsp=args.bsp)

    path = 'logs/hetulog_%s' % ({None: 'local', 'PS': 'ps', 'Hybrid': 'hybrid'}[args.comm])
    path += '_%d.txt' % rank if args.comm else '.txt'
    log = Logging(path=path)
    epoch = 7
    start = time.time()
    for ep in range(epoch):
        ep_st = time.time()
        log.write('epoch %d' % ep)
        train_loss = []
        for idx in tqdm(range(executor.batch_num)):
            loss_val = executor.run(convert_to_numpy_ret_vals=True)
            train_loss.append(loss_val[0])

            # if idx % 10000 == 0:
            #     hr, ndcg = validate()
            #     printstr = "HR: %.4f, NDCF: %.4f" % (hr, ndcg)
            #     log.write(printstr)

        tra_loss = np.mean(train_loss)
        ep_en = time.time()

        # validate phase
        if args.val:
            hr, ndcg = validate()
            printstr = "train_loss: %.4f, HR: %.4f, NDCF: %.4f, train_time: %.4f" % (tra_loss, hr, ndcg, ep_en - ep_st)
        else:
            printstr = "train_loss: %.4f, train_time: %.4f" % (tra_loss, ep_en - ep_st)
        log.write(printstr)
    log.write('all time: %f' % (time.time() - start))
Example #29
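# Single-GPU GraphSage with neighbor sampling; eval() switches DropoutOp to eval mode and runs a separate inference Executor on the full graph.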
def train_hetu(num_epoch):
    ctx = ndarray.gpu(0)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    mask_ = ad.Variable(name="mask_")

    gcn1 = GraphSage(graph.num_features, hidden_layer_size, activation="relu", dropout=0.1)
    gcn2 = GraphSage(2*hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)

    x = gcn1(x_)
    x = gcn2(x)
    W = initializers.xavier_uniform(shape=(2*hidden_layer_size, graph.num_classes))
    B = initializers.zeros(shape=(graph.num_classes,))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)

    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.mul_op(loss, mask_)
    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)

    def eval():
        start = time.time()
        ad.Dropout.DropoutOp.phase = "eval"
        mp_val = mp_matrix(graph_full, ctx)

        feed_dict = {
            gcn1.mp : mp_val,
            gcn2.mp : mp_val,
            x_ : ndarray.array(graph_full.x, ctx=ctx),
        }
        executor_eval = ad.Executor([y], ctx=ctx)
        y_predicted, = executor_eval.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        acc = (y_predicted == graph_full.y)[train_split:].sum()
        print("Test accuracy:", acc/len(y_predicted[train_split:]))
        ad.Dropout.DropoutOp.phase = "training"
    epoch = 0
    nnodes = 0
    batch_size = 1000
    with GraphSageSampler(graph, batch_size, depth=2, num_sample_thread=4) as sampler:
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ctx)
            #print(time.time() - start)
            feed_dict = {
                gcn1.mp : mp_val,
                gcn2.mp : mp_val,
                mask_ : ndarray.array(mask,ctx=ctx),
                x_ : ndarray.array(g_sample.x, ctx=ctx),
                y_ : ndarray.array(convert_to_one_hot(g_sample.y, max_val=graph.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            acc = ((y_predicted == g_sample.y) * mask).sum()
            # print(i, "Train loss :", loss_val.asnumpy().mean())
            # print(i, "Train accuracy:", acc/len(y_predicted))
            nnodes += batch_size
            if nnodes > graph_full.num_nodes:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/mask.sum())
                eval()
                start = time.time()
                if epoch >= num_epoch:
                    break
Example #30
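# Distributed GCN built on distgcn_15d_op with optional replication; per-epoch train/test metrics are aggregated with NCCL all-reduce over the row/column process groups.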
def test(args):
    comm, device_id = ad.mpi_nccl_init()
    rank = comm.localRank.value
    size = comm.nRanks.value

    dataset_info = {
        'Reddit': [232965, 602, 41],
        'Proteins': [132534, 602, 8],
        'Arch': [1644228, 602, 10],
        'Products': [2449029, 100, 47]
    }

    node_count, num_features, num_classes = dataset_info[args.dataset]

    hidden_layer_size = 128
    if num_features < 128:
        hidden_layer_size = 64

    replication = args.replication

    node_Count_Self = row_num(node_count, rank // replication,
                              size // replication)
    node_Count_All = node_count

    _, _, row_groups, col_groups = get_proc_groups(size, replication)

    executor_ctx = ndarray.gpu(device_id)

    if size > 1:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data(
            args, size, replication, rank)
    else:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data_whole(
            args)

    adj_matrix = ndarray.sparse_array(data_part, (row_part, col_part),
                                      shape=adj_part.shape,
                                      ctx=executor_ctx)

    # train:val:test = 6:2:2
    # Our optimization of the distributed GNN algorithm does NOT affect correctness!
    # Due to the limitation of the current slice_op, data is split contiguously here.
    # A contiguous split is unfriendly to reordered graph data where nodes are already clustered:
    # training on some node clusters while testing on others may give poor test accuracy.
    # Splitting the data randomly would be the better approach!
    train_split, test_split = 0.6, 0.8
    train_node = int(train_split * node_Count_Self)
    test_node = int(test_split * node_Count_Self)

    A = ad.Variable(name="A", trainable=False)
    H = ad.Variable(name="H")
    np.random.seed(123)
    bounds = np.sqrt(6.0 / (num_features + hidden_layer_size))
    W1_val = np.random.uniform(low=-bounds,
                               high=bounds,
                               size=[num_features,
                                     hidden_layer_size]).astype(np.float32)
    W1 = ad.Variable(name="W1", value=W1_val)
    bounds = np.sqrt(6.0 / (num_classes + hidden_layer_size))
    np.random.seed(123)
    W2_val = np.random.uniform(low=-bounds,
                               high=bounds,
                               size=[hidden_layer_size,
                                     num_classes]).astype(np.float32)

    W2 = ad.Variable(name="W2", value=W2_val)
    y_ = ad.Variable(name="y_")

    z = ad.distgcn_15d_op(A, H, W1, node_Count_Self, node_Count_All, size,
                          replication, device_id, comm,
                          [row_groups, col_groups], True)
    H1 = ad.relu_op(z)
    y = ad.distgcn_15d_op(A, H1, W2, node_Count_Self, node_Count_All, size,
                          replication, device_id, comm,
                          [row_groups, col_groups], True)

    y_train = ad.slice_op(y, (0, 0), (train_node, num_classes))
    label_train = ad.slice_op(y_, (0, 0), (train_node, num_classes))

    y_test = ad.slice_op(y, (test_node, 0),
                         (node_Count_Self - test_node, num_classes))
    label_test = ad.slice_op(y_, (test_node, 0),
                             (node_Count_Self - test_node, num_classes))

    loss = ad.softmaxcrossentropy_op(y_train, label_train)
    loss_test = ad.softmaxcrossentropy_op(y_test, label_test)
    opt = optimizer.AdamOptimizer()
    train_op = opt.minimize(loss)

    executor = ad.Executor([loss, y, loss_test, train_op], ctx=executor_ctx)

    feed_dict = {
        A:
        adj_matrix,
        H:
        ndarray.array(input_part, ctx=executor_ctx),
        y_:
        ndarray.array(convert_to_one_hot(label_part, max_val=num_classes),
                      ctx=executor_ctx),
    }

    epoch_num = 100
    epoch_all, epoch_0 = 0, 0

    for i in range(epoch_num):
        epoch_start_time = time.time()
        results = executor.run(feed_dict=feed_dict)
        loss = results[0].asnumpy().sum()
        y_out = results[1]
        loss_test = results[2].asnumpy().sum()
        epoch_end_time = time.time()
        epoch_time = epoch_end_time - epoch_start_time
        epoch_all += epoch_time
        if i == 0:
            epoch_0 = epoch_time

        print("[Epoch: %d, Rank: %d] Epoch time: %.3f, Total time: %.3f" %
              (i, rank, epoch_time, epoch_all))

        y_out_train, y_predict = y_out.asnumpy().argmax(
            axis=1)[:train_node], y_out.asnumpy().argmax(axis=1)[test_node:]
        label_train, label_test = label_part[:train_node], label_part[
            test_node:]
        train_acc = ndarray.array(np.array([(y_out_train == label_train).sum()
                                            ]),
                                  ctx=executor_ctx)
        test_acc = ndarray.array(np.array([(y_predict == label_test).sum()]),
                                 ctx=executor_ctx)
        train_loss = ndarray.array(np.array([loss]), ctx=executor_ctx)
        test_loss = ndarray.array(np.array([loss_test]), ctx=executor_ctx)

        if replication > 1:
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_acc, test_acc, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_loss, test_loss, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_acc, train_acc, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_loss, train_loss, ncclDataType_t.ncclFloat32,
                ncclRedOp_t.ncclSum)
        else:
            comm.dlarrayNcclAllReduce(test_acc, test_acc,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(test_loss, test_loss,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_acc, train_acc,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_loss, train_loss,
                                      ncclDataType_t.ncclFloat32,
                                      ncclRedOp_t.ncclSum)

        test_acc = float(
            test_acc.asnumpy()[0]) / (node_count - test_split * node_count)
        test_loss = test_loss.asnumpy()[0] / (node_count -
                                              test_split * node_count)
        train_acc = float(train_acc.asnumpy()[0]) / (train_split * node_count)
        train_loss = train_loss.asnumpy()[0] / (train_split * node_count)

        if rank == 0:
            print("[Epoch: %d] Train Loss: %.3f, Train Accuracy: %.3f, Test Loss: %.3f, Test Accuracy: %.3f"\
            %(i,train_loss, train_acc, test_loss, test_acc))

    avg_epoch_time = (epoch_all - epoch_0) / (epoch_num - 1)
    results = ndarray.array(np.array([epoch_all, avg_epoch_time]),
                            ctx=executor_ctx)
    comm.dlarrayNcclAllReduce(results,
                              results,
                              ncclDataType_t.ncclFloat32,
                              reduceop=ncclRedOp_t.ncclSum)
    results = results.asnumpy() / size

    if rank == 0:
        print("\nAverage Total Time: %.3f, Average Epoch Time: %.3f" %
              (results[0], results[1]))