Example #1
def dfm_criteo(dense_input, sparse_input, y_):
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.01

    # FM
    Embedding1 = init.random_normal([feature_dimension, 1],
                                    stddev=0.01,
                                    name="fst_order_embedding",
                                    ctx=ndarray.cpu(0))
    FM_W = init.random_normal([13, 1], stddev=0.01, name="dense_parameter")
    sparse_1dim_input = ad.embedding_lookup_op(Embedding1,
                                               sparse_input,
                                               ctx=ndarray.cpu(0))
    fm_dense_part = ad.matmul_op(dense_input, FM_W)
    fm_sparse_part = ad.reduce_sum_op(sparse_1dim_input, axes=1)
    """ fst order output"""
    y1 = fm_dense_part + fm_sparse_part

    Embedding2 = init.random_normal([feature_dimension, embedding_size],
                                    stddev=0.01,
                                    name="snd_order_embedding",
                                    ctx=ndarray.cpu(0))
    sparse_2dim_input = ad.embedding_lookup_op(Embedding2,
                                               sparse_input,
                                               ctx=ndarray.cpu(0))
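    # Second-order FM term via the identity
    #   sum_{i<j} <v_i, v_j> = 0.5 * ((sum_i v_i)^2 - sum_i v_i^2),
    # computed element-wise over the embedding dimension.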
    sparse_2dim_sum = ad.reduce_sum_op(sparse_2dim_input, axes=1)
    sparse_2dim_sum_square = ad.mul_op(sparse_2dim_sum, sparse_2dim_sum)

    sparse_2dim_square = ad.mul_op(sparse_2dim_input, sparse_2dim_input)
    sparse_2dim_square_sum = ad.reduce_sum_op(sparse_2dim_square, axes=1)
    sparse_2dim = sparse_2dim_sum_square + -1 * sparse_2dim_square_sum
    sparse_2dim_half = sparse_2dim * 0.5
    """snd order output"""
    y2 = ad.reduce_sum_op(sparse_2dim_half, axes=1, keepdims=True)

    # DNN
    flatten = ad.array_reshape_op(sparse_2dim_input, (-1, 26 * embedding_size))
    W1 = init.random_normal([26 * embedding_size, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 1], stddev=0.01, name="W3")

    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    y4 = y1 + y2
    y = y4 + y3
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
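A minimal driver sketch (not part of the original file), assuming dl.dataloader_op and ad.Executor behave as shown in Examples #3, #9 and #17; dense_data, sparse_data, label_data and num_steps are placeholder names.

# Hypothetical usage of dfm_criteo with dataloader inputs.
dense_col = dl.dataloader_op([[dense_data, 128, 'train']])
sparse_col = dl.dataloader_op([[sparse_data, 128, 'train']])
label_col = dl.dataloader_op([[label_data, 128, 'train']])
loss, y, y_, train_op = dfm_criteo(dense_col, sparse_col, label_col)
executor = ad.Executor([loss, y, train_op], ctx=ndarray.gpu(0))
for step in range(num_steps):
    loss_val, y_val, _ = executor.run(convert_to_numpy_ret_vals=True)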
Example #2
File: hetu_ncf.py  Project: sj1104/Het
def neural_mf(user_input, item_input, y_, num_users, num_items):
    batch_size = 256
    embed_dim = 8
    layers = [64, 32, 16, 8]
    learning_rate = 0.01

    User_Embedding = init.random_normal(
        (num_users, embed_dim + layers[0] // 2),
        stddev=0.01,
        name="user_embed",
        ctx=ndarray.cpu(0))
    Item_Embedding = init.random_normal(
        (num_items, embed_dim + layers[0] // 2),
        stddev=0.01,
        name="item_embed",
        ctx=ndarray.cpu(0))
    # MLP_User_Embedding = init.random_normal((num_users, layers[0] // 2), stddev=0.01, name="mlp_user_embed", ctx=ndarray.cpu(0))
    # MLP_Item_Embedding = init.random_normal((num_items, layers[0] // 2), stddev=0.01, name="mlp_item_embed", ctx=ndarray.cpu(0))

    user_latent = ad.embedding_lookup_op(User_Embedding,
                                         user_input,
                                         ctx=ndarray.cpu(0))
    item_latent = ad.embedding_lookup_op(Item_Embedding,
                                         item_input,
                                         ctx=ndarray.cpu(0))
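    # Each lookup concatenates two embeddings: the first embed_dim columns feed
    # the GMF branch, the remaining layers[0] // 2 columns feed the MLP branch.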

    mf_user_latent = ad.slice_op(user_latent, (0, 0), (-1, embed_dim))
    mlp_user_latent = ad.slice_op(user_latent, (0, embed_dim), (-1, -1))
    mf_item_latent = ad.slice_op(item_latent, (0, 0), (-1, embed_dim))
    mlp_item_latent = ad.slice_op(item_latent, (0, embed_dim), (-1, -1))

    # mf_user_latent = ad.embedding_lookup_op(MF_User_Embedding, user_input, ctx=ndarray.cpu(0))
    # mf_item_latent = ad.embedding_lookup_op(MF_Item_Embedding, item_input, ctx=ndarray.cpu(0))
    # mlp_user_latent = ad.embedding_lookup_op(MLP_User_Embedding, user_input, ctx=ndarray.cpu(0))
    # mlp_item_latent = ad.embedding_lookup_op(MLP_Item_Embedding, item_input, ctx=ndarray.cpu(0))

    W1 = init.random_normal((layers[0], layers[1]), stddev=0.1, name='W1')
    W2 = init.random_normal((layers[1], layers[2]), stddev=0.1, name='W2')
    W3 = init.random_normal((layers[2], layers[3]), stddev=0.1, name='W3')
    W4 = init.random_normal((embed_dim + layers[3], 1), stddev=0.1, name='W4')

    mf_vector = ad.mul_op(mf_user_latent, mf_item_latent)
    mlp_vector = ad.concat_op(mlp_user_latent, mlp_item_latent, axis=1)
    fc1 = ad.matmul_op(mlp_vector, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    fc3 = ad.matmul_op(relu2, W3)
    relu3 = ad.relu_op(fc3)
    concat_vector = ad.concat_op(mf_vector, relu3, axis=1)
    y = ad.matmul_op(concat_vector, W4)
    y = ad.sigmoid_op(y)
    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    # opt = optimizer.AdamOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)
    return loss, y, train_op
Example #3
def wdl_criteo(dense, sparse, labels):
    batch_size = 128
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.01
    if isinstance(dense, tuple):
        dense_input = dl.dataloader_op([[dense[0], batch_size, 'train'],
                                        [dense[1], batch_size, 'validate']])
        sparse_input = dl.dataloader_op([[sparse[0], batch_size, 'train'],
                                         [sparse[1], batch_size, 'validate']])
        y_ = dl.dataloader_op([[labels[0], batch_size, 'train'],
                               [labels[1], batch_size, 'validate']])
    else:
        dense_input = dl.dataloader_op([[dense, batch_size, 'train']])
        sparse_input = dl.dataloader_op([[sparse, batch_size, 'train']])
        y_ = dl.dataloader_op([[labels, batch_size, 'train']])
    print("Data loaded.")
    Embedding = init.random_normal([feature_dimension, embedding_size],
                                   stddev=0.01,
                                   name="snd_order_embedding",
                                   ctx=ndarray.cpu(0))
    sparse_input = ad.embedding_lookup_op(Embedding,
                                          sparse_input,
                                          ctx=ndarray.cpu(0))
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))
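    # Wide & Deep: the flattened sparse embeddings form the wide input, while the
    # MLP below (W1-W3) processes the 13 dense features; both are concatenated
    # before the final projection W4.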

    # DNN
    flatten = dense_input
    W1 = init.random_normal([13, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 256], stddev=0.01, name="W3")

    W4 = init.random_normal([256 + 26 * embedding_size, 1],
                            stddev=0.01,
                            name="W4")

    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    y4 = ad.concat_op(sparse_input, y3, axis=1)
    y = ad.matmul_op(y4, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
Example #4
def test_cpu_truncated_normal(size, mean=0, std=1):
    cpu_x = ndarray.empty(size, ctx=ndarray.cpu(0))
    np_st = time()
    for i in range(10):
        x = truncnorm.rvs(-2.0, 2.0, loc=mean, scale=std,
                          size=size).astype(np.float32)
        cpu_x[:] = x
    np_en = time()
    print('numpy time: ', np_en - np_st)
    cpu_st = time()
    for i in range(10):
        cpu_op.truncated_normal_init(cpu_x, mean, std, 123)
    cpu_en = time()
    print('cpu time: ', cpu_en - cpu_st)
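    # The CPU kernel and SciPy draw from different random streams, so correctness
    # is judged by comparing the two histograms rather than individual values.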
    fig, ax = plt.subplots(1, 1)
    cpu_x = cpu_x.asnumpy()
    assert cpu_x.shape == x.shape
    ax.hist(x.flatten(),
            histtype='stepfilled',
            alpha=0.2,
            bins=50,
            label='numpy')
    ax.hist(cpu_x.flatten(), histtype='step', alpha=0.2, bins=50, label='cpu')
    ax.legend(loc='best', frameon=False)
    # ax2.legend(loc='best', frameon=False)
    file_name = 'truncated_normal_%f_%f.png' % (mean, std)
    plt.savefig(file_name)
    plt.close()
Example #5
def test_cpu_uniform(size, lb=-1, ub=1):
    cpu_x = ndarray.empty(size, ctx=ndarray.cpu(0))
    np_st = time()
    for i in range(10):
        x = np.random.uniform(low=lb, high=ub, size=size).astype(np.float32)
        cpu_x[:] = x
    np_en = time()
    print('numpy time: ', np_en - np_st)
    cpu_st = time()
    for i in range(10):
        cpu_op.uniform_init(cpu_x, lb, ub, 123)
    cpu_en = time()
    print('cpu time: ', cpu_en - cpu_st)
    fig, ax = plt.subplots(1, 1)
    cpu_x = cpu_x.asnumpy()
    assert cpu_x.shape == x.shape
    ax.hist(x.flatten(),
            histtype='stepfilled',
            alpha=0.2,
            bins=50,
            label='numpy')
    ax.hist(cpu_x.flatten(), histtype='step', alpha=0.2, bins=50, label='cpu')
    ax.legend(loc='best', frameon=False)
    # ax2.legend(loc='best', frameon=False)
    file_name = 'uniform_%f_%f_cpu.png' % (lb, ub)
    plt.savefig(file_name)
    plt.close()
Example #6
def test():
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    nitem = 2000
    item_len = 1000
    arr = ndarray.array(np.random.rand(nitem, item_len),
                        ctx=ctx)  # generate a long buffer

    push_indices = np.arange(nitem) * nrank + rank
    print(push_indices)
    push_length = np.repeat(item_len, repeats=nitem)
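    # Each worker pushes a disjoint set of rows (strided by rank), waits for the
    # push to finish, then pulls them back into a fresh buffer and checks equality.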
    worker_communicate = ad.get_worker_communicate()
    worker_communicate.PushData(pointer(push_indices), nitem, arr.handle,
                                pointer(push_length))
    print("Waiting")
    worker_communicate.WaitPushData(pointer(push_indices), nitem)
    worker_communicate.BarrierWorker()
    print("OK")
    arr2 = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
    worker_communicate.PullData(pointer(push_indices), nitem, arr2.handle,
                                pointer(push_length))
    worker_communicate.WaitPullData(pointer(push_indices), nitem)
    assert np.all(arr.asnumpy() == arr2.asnumpy())
    print("Check Complete")
Example #7
def dcn_criteo(dense_input, sparse_input, y_):
    feature_dimension = 33762577
    embedding_size = 128
    learning_rate = 0.003

    Embedding = init.random_normal([feature_dimension, embedding_size],
                                   stddev=0.01,
                                   name="snd_order_embedding",
                                   ctx=ndarray.cpu(0))
    sparse_input = ad.embedding_lookup_op(Embedding,
                                          sparse_input,
                                          ctx=ndarray.cpu(0))
    sparse_input = ad.array_reshape_op(sparse_input, (-1, 26 * embedding_size))
    x = ad.concat_op(sparse_input, dense_input, axis=1)
    # Cross Network
    cross_output = build_cross_layer(x, num_layers=3)

    # DNN
    flatten = x
    W1 = init.random_normal([26 * embedding_size + 13, 256],
                            stddev=0.01,
                            name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 256], stddev=0.01, name="W3")

    W4 = init.random_normal([256 + 26 * embedding_size + 13, 1],
                            stddev=0.01,
                            name="W4")

    fc1 = ad.matmul_op(flatten, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    y3 = ad.matmul_op(relu2, W3)

    y4 = ad.concat_op(cross_output, y3, axis=1)
    y = ad.matmul_op(y4, W4)
    y = ad.sigmoid_op(y)

    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)

    return loss, y, y_, train_op
Example #8
    def sync_and_clear(self):
        self.count += 1
        train_stat = ndarray.array(self.train_stat, ndarray.cpu())
        test_stat = ndarray.array(self.test_stat, ndarray.cpu())
        comm.dlarrayNcclAllReduce(train_stat, train_stat,
                                  ncclDataType_t.ncclFloat32,
                                  ncclRedOp_t.ncclSum, comm.stream)
        comm.dlarrayNcclAllReduce(test_stat, test_stat,
                                  ncclDataType_t.ncclFloat32,
                                  ncclRedOp_t.ncclSum, comm.stream)
        comm.stream.sync()
        train_stat, test_stat = train_stat.asnumpy(), test_stat.asnumpy()
        printstr = "epoch {}: test loss: {:.3f} test acc: {:.3f} train loss: {:.3f} train acc: {:.3f}".format(
            self.count,
            test_stat[3] / test_stat[0],
            test_stat[1] / test_stat[2],
            train_stat[3] / train_stat[0],
            train_stat[1] / train_stat[2],
        )
        logstr = "{} {} {} {}".format(
            test_stat[3] / test_stat[0],
            test_stat[1] / test_stat[2],
            train_stat[3] / train_stat[0],
            train_stat[1] / train_stat[2],
        )
        self.time.append(time.time())
        if comm.device_id.value == 0:
            print(printstr, flush=True)
            print(logstr, file=self.file, flush=True)
            if len(self.time) > 3:
                epoch_time = np.array(self.time[1:]) - np.array(self.time[:-1])
                print("epoch time: {:.3f}+-{:.3f}".format(
                    np.mean(epoch_time), np.var(epoch_time)))

        self.train_stat[:] = 0
        self.test_stat[:] = 0
Example #9
def test_dense():
    npw = np.random.random((5, 10)).astype(np.float32)
    npx = np.random.random((7, 5)).astype(np.float32)
    cpuctx = ndarray.cpu(0)
    gpuctx = ndarray.gpu(0)

    X = ad.Variable(name="x")
    mid = X + 3
    W = ad.Variable(name='w', value=npw, ctx=cpuctx)
    y = ad.matmul_op(mid, W)
    opt = optimizer.SGDOptimizer(learning_rate=0.1)
    train_op = opt.minimize(y)
    executor = ad.Executor([y, train_op], ctx=gpuctx)
    pred_y, _ = executor.run(feed_dict={X: npx}, convert_to_numpy_ret_vals=True)

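    # Reference values in NumPy: the forward product and one SGD step on W,
    # where minimize(y) uses an all-ones upstream gradient for y.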
    nppred_y = np.matmul((npx + 3), npw)
    np.testing.assert_allclose(pred_y, nppred_y, rtol=1e-6)
    new_npw = npw - 0.1 * np.matmul((npx+3).T, np.ones(nppred_y.shape).astype(np.float32))
    np.testing.assert_allclose(W.tensor_value.asnumpy(), new_npw, rtol=1e-10)
Example #10
def test():
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    if rank > 0:
        return
    arr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)  # generate a long buffer

    push_indices = np.arange(nitem)
    print(push_indices)
    push_length = np.repeat(item_len, repeats=nitem)
    worker_communicate = ad.get_worker_communicate()
    query = worker_communicate.PushData(pointer(push_indices), nitem, arr.handle, pointer(push_length))
    worker_communicate.WaitData(query)
    print("data_pushed")
    t = ThreadPoolExecutor(max_workers=max_thread)
    byte_count = 0
    arr2 = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
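    # pull_data repeatedly pulls the whole buffer and accumulates byte_count,
    # which the watcher thread below turns into an approximate MB/s figure.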
    def pull_data():
        query = worker_communicate.PullData(pointer(push_indices), nitem, arr2.handle, pointer(push_length))
        worker_communicate.WaitData(query)
        # print( np.all(arr.asnumpy() == arr2.asnumpy()) )
        nonlocal byte_count
        byte_count += nitem * item_len * 4
    def watch():
        nonlocal byte_count
        start = time.time()
        while True:
            time.sleep(1)
            speed = byte_count / (time.time() - start)
            print("speed : {} MB/s".format(speed / 2**20))
    task_list = [None for i in range(max_thread)]
    threading.Thread(target=watch).start()
    while True:
        for i in range(max_thread):
            if task_list[i] is None or task_list[i].done():
                task_list[i] = t.submit(pull_data)
Example #11
def test_sparse():
    npemb = np.random.random((100, 20)).astype(np.float32)
    npind = np.array(np.random.randint(100, size=(10,)))
    npw = np.random.random((20, 30)).astype(np.float32)
    cpuctx = ndarray.cpu(0)
    gpuctx = ndarray.gpu(0)

    embedding = ad.Variable('embeddingtable', value=npemb, ctx=cpuctx)
    index = ad.Variable(name="index", ctx=cpuctx)
    W = ad.Variable(name="w", value=npw)
    y = ad.embedding_lookup_op(embedding, index) # (10, 20)
    y = ad.matmul_op(y, W)
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(y)
    executor = ad.Executor([y, train_op], ctx=gpuctx)

    out, _ = executor.run(feed_dict={index: npind.astype(np.float32)}, convert_to_numpy_ret_vals=True)

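    # Reference update in NumPy: the embedding rows selected by npind receive an
    # SGD step with an all-ones upstream gradient on the matmul output.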
    np_out = np.matmul(npemb[npind], npw)
    np.testing.assert_allclose(out, np_out, rtol=1e-6)
    tmp_grad = np.matmul(np.ones(np_out.shape).astype(np.float32), npw.T)
    for i, localid in enumerate(npind):
        npemb[localid] -= 0.1 * tmp_grad[i]
    np.testing.assert_allclose(embedding.tensor_value.asnumpy(), npemb, rtol=1e-6)
Example #12
def test():
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    arr = ndarray.array(np.random.rand(2, rank + 100), ctx=ctx)
    print(arr.asnumpy())

    push_indices = np.array([2 * rank + 1, 2 * rank + 2])

    if rank == 0:
        pull_indices = np.array([3])
    elif rank == 1:
        pull_indices = np.array([1])

    push_length = np.array([rank + 100, rank + 100])


    if rank == 0:
        pull_length = np.array([101])
        out_arr = ndarray.array(np.zeros(101), ctx=ctx)
    elif rank == 1:
        pull_length = np.array([100])
        out_arr = ndarray.array(np.zeros(100), ctx=ctx)

    print(out_arr.asnumpy())

    worker_communicate = ad.get_worker_communicate()
    query = worker_communicate.PushData(pointer(push_indices), 2, arr.handle, pointer(push_length))

    worker_communicate.WaitData(query)

    worker_communicate.BarrierWorker()
    query = worker_communicate.PullData(pointer(pull_indices), 1, out_arr.handle, pointer(pull_length))
    worker_communicate.WaitData(query)

    print(out_arr.asnumpy())
Example #13
def test(func_name,
         nitem=2000,
         item_len=10000,
         ind_len=500,
         max_thread=10,
         ret_ans=False):
    func_name = func_name.lower()
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])

    comm = ad.get_worker_communicate()
    byte_count = 0
    if func_name == 'pushnpull':
        inarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            comm.Push(name, inarr.handle, None)
            comm.Pull(name, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += nitem * item_len * 4 * 2
    elif func_name == 'pushpull':
        inarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            comm.DDPushPull(name, inarr.handle, outarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += nitem * item_len * 4 * 2
    elif func_name == 'sparsepushnpull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            np_ind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_ind.astype(np.float32), ctx=ctx)
            uni_ind_len = np.unique(np_ind).size
            comm.SparsePush(name, inind.handle, inarr.handle, None)
            comm.Pull(name, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += (nitem + uni_ind_len) * item_len * 4
    elif func_name == 'sparsepushnsparsepull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            np_outind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            uni_outind_len = np.unique(np_outind).size
            comm.SparsePush(name, inind.handle, inarr.handle, None)
            comm.SparsePull(name, outind.handle, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += (uni_inind_len + uni_outind_len) * item_len * 4
    elif func_name == 'push':
        inarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            comm.Push(name, inarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += nitem * item_len * 4
    elif func_name == 'pull':
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            comm.Pull(name, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += nitem * item_len * 4
    elif func_name == 'sparsepush':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            comm.SparsePush(name, inind.handle, inarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += uni_inind_len * item_len * 4
    elif func_name == 'sparsepull':
        outarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            np_outind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
            uni_outind_len = np.unique(np_outind).size
            comm.SparsePull(name, outind.handle, outarr.handle)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += uni_outind_len * item_len * 4
    elif func_name == 'sdpushpull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(nitem, item_len), ctx=ctx)

        def func(name):
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            comm.SDPushPull(name, inind.handle, inarr.handle, outarr.handle,
                            None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += (uni_inind_len + nitem) * item_len * 4
    elif func_name == 'sspushpull':
        inarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)
        outarr = ndarray.array(np.random.rand(ind_len, item_len), ctx=ctx)

        def func(name):
            np_inind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            np_outind = np.random.randint(low=0, high=nitem, size=(ind_len, ))
            inind = ndarray.array(np_inind.astype(np.float32), ctx=ctx)
            uni_inind_len = np.unique(np_inind).size
            outind = ndarray.array(np_outind.astype(np.float32), ctx=ctx)
            uni_outind_len = np.unique(np_outind).size
            comm.SSPushPull(name, inind.handle, inarr.handle, outind.handle,
                            outarr.handle, None)
            comm.Wait(name)
            nonlocal byte_count
            byte_count += (uni_inind_len + uni_outind_len) * item_len * 4
    else:
        assert False
    if 'sparse' in func_name or func_name in ('sdpushpull', 'sspushpull'):
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
        sparse_init = ctypes.c_int(1)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
        sparse_init = ctypes.c_int(0)
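    # Register one server-side tensor per worker thread; the InitTensor arguments
    # (init type and parameters, random seed, then optimizer settings) follow the
    # same pattern used in the other parameter-server examples.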
    for i in range(max_thread):
        comm.InitTensor(i, sparse_init, arr_len, arr_wid, ctypes.c_int(0),
                        ctypes.c_double(0), ctypes.c_double(1),
                        ctypes.c_ulonglong(123), ctypes.c_int(0),
                        (ctypes.c_float * 1)(0.1), ctypes.c_int(1))
    # print("data init")
    t = ThreadPoolExecutor(max_workers=max_thread)
    if ret_ans:
        task_list = [None for i in range(max_thread)]
        for i in range(max_thread):
            task_list[i] = t.submit(func, i)
        curByte = byte_count
        start = time.time()
        cnt = 0
        while cnt < 30:
            for i in range(max_thread):
                if task_list[i].done():
                    cnt += 1
                    task_list[i] = t.submit(func, i)
        speed = (byte_count - curByte) / (time.time() - start) / 2**20
        t.shutdown()
        for i in range(max_thread):
            comm.ClearOnServer(i)
            comm.Clear(i)
        return speed
    else:

        def watch():
            start = time.time()
            while True:
                time.sleep(1)
                speed = byte_count / (time.time() - start)
                print("speed : {} MB/s".format(speed / 2**20))

        task_list = [None for i in range(max_thread)]
        threading.Thread(target=watch).start()
        while True:
            for i in range(max_thread):
                if task_list[i] is None or task_list[i].done():
                    task_list[i] = t.submit(func, i)
Example #14
import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()

import tf2onnx

import argparse
import six.moves.cPickle as pickle
import gzip
import os
import pdb
import ctypes
import time
batch_size = 128

# ctx=ndarray.gpu(0)
ctx = ndarray.cpu(0)


def load_mnist_data(dataset):
    """ Load the dataset
    Code adapted from http://deeplearning.net/tutorial/code/logistic_sgd.py

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    """
    # Download the MNIST dataset if it is not present
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(os.path.split(__file__)[0], dataset)
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
Example #15
def test_init_ps(rarr, init_type, init_a, init_b=1.0, sparse=False):
    assert init_type in ('constant', 'uniform', 'normal', 'truncated_normal')
    init_type_map = {
        'constant': 0,
        'uniform': 1,
        'normal': 2,
        'truncated_normal': 3
    }
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    local_arr = np.frombuffer(rarr, dtype=np.float32).reshape(nitem, item_len)
    if rank == 0:
        arr = ndarray.array(local_arr, ctx=ctx)
    else:
        arr = ndarray.empty((nitem, item_len), ctx=ctx)
    comm = ad.get_worker_communicate()
    if sparse:
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
    itype = ctypes.c_int(init_type_map[init_type])
    comm.InitTensor(ctypes.c_int(0), ctypes.c_int(sparse), arr_len,
                    arr_wid, itype, ctypes.c_double(init_a),
                    ctypes.c_double(init_b), ctypes.c_ulonglong(123),
                    ctypes.c_int(0), (ctypes.c_float * 1)(0.1),
                    ctypes.c_int(1))

    comm.Pull(ctypes.c_int(0), arr.handle)
    comm.Wait(ctypes.c_int(0))
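    # Every worker pulls tensor 0; non-zero ranks check they received the same
    # values as rank 0 (shared through rarr), while rank 0 validates the values
    # themselves: exactly for 'constant', via histograms for the random inits.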
    if rank == 0:
        local_arr[:] = arr.asnumpy()
    comm.BarrierWorker()
    if rank != 0:
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
    else:
        if init_type == 'constant':
            np.testing.assert_allclose(np.full((nitem, item_len), init_a),
                                       arr.asnumpy(),
                                       rtol=5e-7)
        else:
            if init_type == 'uniform':
                numpy_samples = np.random.uniform(
                    low=init_a, high=init_b,
                    size=(nitem, item_len)).astype(np.float32)
            elif init_type == 'normal':
                numpy_samples = np.random.normal(
                    loc=init_a, scale=init_b,
                    size=(nitem, item_len)).astype(np.float32)
            else:
                numpy_samples = truncnorm.rvs(-2.0,
                                              2.0,
                                              loc=init_a,
                                              scale=init_b,
                                              size=(nitem, item_len)).astype(
                                                  np.float32)
            fig, ax = plt.subplots(1, 1)
            ax.hist(numpy_samples.flatten(),
                    histtype='stepfilled',
                    alpha=0.2,
                    bins=50,
                    label='numpy')
            ax.hist(local_arr.flatten(),
                    histtype='step',
                    alpha=0.2,
                    bins=50,
                    label='ps')
            ax.legend(loc='best', frameon=False)
            # ax2.legend(loc='best', frameon=False)
            file_name = '%s_%.1f_%.1f_%d.png' % (init_type, init_a, init_b,
                                                 int(sparse))
            plt.savefig(file_name)
            print('Check file %s.' % file_name)
    print('Init parameters %d/%d passed.' % (rank, nrank))
    if rank == 0:
        comm.ClearOnServer(0)
    comm.Clear(0)
    comm.BarrierWorker()
Example #16
def test_api(rarr, rpush, rpull, sparse=False, lr=0.5):
    ctx = ndarray.cpu(0)
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    local_arr = np.frombuffer(rarr, dtype=np.float32).reshape(nitem,
                                                              item_len).copy()
    local_push = np.frombuffer(rpush, dtype=np.float32).copy()
    local_pull = np.frombuffer(rpull, dtype=np.float32).copy()
    if rank == 0:
        arr = ndarray.array(local_arr, ctx=ctx)
    else:
        arr = ndarray.empty((nitem, item_len), ctx=ctx)
    comm = ad.get_worker_communicate()
    if sparse:
        arr_len = ctypes.c_int(nitem)
        arr_wid = ctypes.c_int(item_len)
    else:
        arr_len = ctypes.c_int(nitem * item_len)
        arr_wid = ctypes.c_int(1)
    comm.InitTensor(ctypes.c_int(0), ctypes.c_int(sparse), arr_len, arr_wid,
                    ctypes.c_int(0), ctypes.c_double(0.0), ctypes.c_double(1.0),
                    ctypes.c_ulonglong(123), ctypes.c_int(0),
                    (ctypes.c_float * 1)(lr), ctypes.c_int(1))
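    # Only rank 0 issues pushes; every rank then pulls and compares against
    # local_arr, which tracks the expected server-side state.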
    if sparse:
        local_arr[:] = 0
        for j in local_push:
            local_arr[int(j)] += 1
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(np.ones(
                (indx1, indx2, item_len)).astype(np.float32),
                                     ctx=ctx)
            comm.SparsePush(0, push_ind.handle, push_val.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        comm.Pull(0, arr.handle)
        comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('SparsePush DensePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

        for j in local_push:
            local_arr[int(j)] += 1
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(np.ones(
                (indx1, indx2, item_len)).astype(np.float32),
                                     ctx=ctx)
            comm.SDPushPull(0, push_ind.handle, push_val.handle, arr.handle,
                            None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.Pull(0, arr.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('SDPushPull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

        for j in local_push:
            local_arr[int(j)] += 1
        pull_ind = ndarray.array(local_pull.reshape(indx1, indx2), ctx=ctx)
        pull_val = ndarray.empty((indx1, indx2, item_len), ctx=ctx)
        if rank == 0:
            push_ind = ndarray.array(local_push.reshape(indx1, indx2), ctx=ctx)
            push_val = ndarray.array(np.ones(
                (indx1, indx2, item_len)).astype(np.float32),
                                     ctx=ctx)
            comm.SSPushPull(0, push_ind.handle, push_val.handle,
                            pull_ind.handle, pull_val.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.SparsePull(0, pull_ind.handle, pull_val.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr[local_pull.astype(int)].reshape(
            indx1, indx2, item_len),
                                   pull_val.asnumpy(),
                                   rtol=5e-7)
        print('SSPushPull and SparsePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()

    else:
        if rank == 0:
            comm.Push(0, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        comm.Pull(0, arr.handle)
        comm.Wait(0)
        np.testing.assert_allclose(local_arr, arr.asnumpy(), rtol=5e-7)
        print('DensePush DensePull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
        if rank == 0:
            temp_push_val = ndarray.array(np.ones(
                (nitem, item_len)).astype(np.float32),
                                          ctx=ctx)
            comm.DDPushPull(0, temp_push_val.handle, arr.handle, None)
            comm.Wait(0)
        comm.BarrierWorker()
        if rank != 0:
            comm.Pull(0, arr.handle)
            comm.Wait(0)
        np.testing.assert_allclose(local_arr + 1, arr.asnumpy())
        print('DenseDensePushPull %d/%d passed.' % (rank, nrank))
        comm.BarrierWorker()
    if rank == 0:
        comm.ClearOnServer(0)
    comm.Clear(0)
    comm.BarrierWorker()
Example #17
def train_main(args):
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank % args.num_local_worker)
    embedding_width = args.hidden_size
    extract_width = embedding_width * (meta["feature"] - 1)

    y_ = dl.GNNDataLoaderOp(lambda g: ndarray.array(
        convert_to_one_hot(g.y, max_val=g.num_classes), ctx=ndarray.cpu()))
    mask_ = ad.Variable(name="mask_")
    gcn1 = GCN(extract_width, hidden_layer_size, activation="relu")
    gcn2 = GCN(hidden_layer_size, meta["class"])
    index = dl.GNNDataLoaderOp(
        lambda g: ndarray.array(g.x[:, 0:-1], ctx=ndarray.cpu()),
        ctx=ndarray.cpu())
    embedding = initializers.random_normal([meta["idx_max"], embedding_width],
                                           stddev=0.1)
    embed = ad.embedding_lookup_op(embedding, index)
    embed = ad.array_reshape_op(embed, (-1, extract_width))
    # embed = ad.reduce_mean_op(embed, axes=1)
    # x = ad.concat_op(x_, embed, axis=1)
    x = gcn1(embed)
    y = gcn2(x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    train_loss = loss * mask_
    train_loss = ad.reduce_mean_op(train_loss, [0])
    opt = optimizer.SGDOptimizer(args.learning_rate)
    train_op = opt.minimize(train_loss)
    ad.worker_init()
    distributed.ps_init(rank, nrank)

    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx = 0
    g_sample, mp_val, mask, mask_eval = graphs[idx]
    idx = (idx + 1) % ngraph
    dl.GNNDataLoaderOp.step(g_sample)
    dl.GNNDataLoaderOp.step(g_sample)
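    # step() is called twice up front, presumably to prime both slots of the
    # double-buffered GNN dataloader before the first executor.run call.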
    epoch = 0
    nnodes = 0
    executor = ad.Executor([loss, y, train_op],
                           ctx=ctx,
                           comm_mode='PS',
                           use_sparse_pull=False,
                           cstable_policy=args.cache)
    while True:
        g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt = graphs[idx]
        idx = (idx + 1) % ngraph
        dl.GNNDataLoaderOp.step(g_sample_nxt)
        feed_dict = {gcn1.mp: mp_val, gcn2.mp: mp_val, mask_: mask}
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)

        acc = np.sum((y_predicted == g_sample.y) * mask_eval)
        train_acc = np.sum((y_predicted == g_sample.y) * mask)
        stat.update(acc, mask_eval.sum(),
                    np.sum(loss_val.asnumpy() * mask_eval) / mask_eval.sum())
        stat.update_train(train_acc, mask.sum(),
                          np.sum(loss_val.asnumpy() * mask) / mask.sum())

        # distributed.ps_get_worker_communicator().BarrierWorker()
        nnodes += mask.sum() + mask_eval.sum()
        if nnodes > meta["partition"]["nodes"][rank]:
            nnodes = 0
            epoch += 1
            if rank == 0:
                stat.print(epoch)
            if epoch >= num_epoch:
                break
        g_sample, mp_val, mask, mask_eval = g_sample_nxt, mp_val_nxt, mask_nxt, mask_eval_nxt
Example #18
    if ndarray.is_gpu_ctx(executor_ctx):
        W_val = W_val.asnumpy()
    loss_val = [val.asnumpy() for val in loss_val]
    
    y_groundtruth = X_val.dot(W_val)
    loss_groundtruth = np.mean(
        -np.sum(Y_val * np.log(softmax_func(y_groundtruth)), axis=1), keepdims=True)
    Y_grad_groundtruth = (softmax_func(y_groundtruth) + -1 * Y_val) * np.ones(loss_groundtruth.shape) / 500
    W_grad_groundtruth = X_val.T.dot(Y_grad_groundtruth)

    np.testing.assert_allclose(loss_val[0], loss_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[1], W_grad_groundtruth, rtol=1e-4)
    np.testing.assert_allclose(loss_val[2], Y_grad_groundtruth, rtol=1e-4)
    
    
test_csrmm_op(ndarray.cpu(0))
test_csrmm_op(ndarray.gpu(1))


def test_csrmv_op(executor_ctx):
    X = ad.Variable(name="X")
    W = ad.Variable(name="W")
    Y = ad.csrmv_op(X, W)
    Y_ = ad.Variable(name="Y_")
    temp = Y + (-1) * Y_
    loss = temp * temp

    grads = ad.gradients(loss, [W, Y])
    
    executor = ad.Executor(
        [loss, grads[0], grads[1]], ctx=executor_ctx)
Example #19
    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)
    tf_z = tf.convert_to_tensor(z)
    tf_y = tf_x + tf_z
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        tf_results = sess.run([tf_y])
    
    np.testing.assert_allclose(ath_results[0], tf_results[0])
    print('Passed add op test with shape ', shape)

test_add()
test_add((7, 9))
test_add((4, 5, 6, 7, 8))
test_add(ctx=ndarray.cpu(0))
test_add((7, 9), ctx=ndarray.cpu(0))
test_add((4, 5, 6, 7, 8), ctx=ndarray.cpu(0))


def test_add_broadcast(shape1=(2, 3, 4, 5), shape2=(1, 4, 1), ctx=ndarray.gpu(1)):
    x = np.random.random(shape1).astype(np.float32)
    z = np.random.random(shape2).astype(np.float32)
    ath_x = ad.Variable(name='x', value=x)
    ath_z = ad.Variable(name='z', value=z)
    ath_y = ad.add_op(ath_x, ath_z)
    executor = ad.Executor([ath_y], ctx=ctx, enable_lazy=False)
    ath_results = [var.asnumpy() for var in executor.run()]

    import tensorflow as tf
    tf_x = tf.convert_to_tensor(x)