# Shared imports for the snippets below. The Hetu-specific modules (the
# autodiff package `ad`, `gpu_op`, `optimizer`, the NCCL enums, and the
# graph/data helpers such as convert_to_one_hot, load_data, PCGCN, ...) are
# imported in the original source files and are assumed to be available here.
import time

import numpy as np
import scipy.sparse
from scipy import sparse


def mp_matrix(graph, device, system="Hetu", use_original_gcn_norm=False):
    # Build the normalized message-passing (adjacency) matrix of `graph` in
    # the sparse format expected by the chosen backend. Indices are stored as
    # (dst, src) pairs, so multiplying by a feature matrix aggregates
    # source-node features into each destination row.
    norm = graph.gcn_norm(use_original_gcn_norm)
    if system == "Hetu":
        from hetu import ndarray
        mp_mat = ndarray.sparse_array(
            values=norm,
            indices=(graph.edge_index[1], graph.edge_index[0]),
            shape=(graph.num_nodes, graph.num_nodes),
            ctx=device)
        return mp_mat
    elif system == "Pytorch":
        import torch
        indices = np.vstack((graph.edge_index[1], graph.edge_index[0]))
        mp_mat = torch.sparse.FloatTensor(
            indices=torch.LongTensor(indices),
            values=torch.FloatTensor(norm),
            size=(graph.num_nodes, graph.num_nodes))
        return mp_mat.to(device)
    elif system == "tensorflow":
        import tensorflow as tf
        # tf.compat.v1.SparseTensorValue expects (nnz, 2) indices, hence the
        # transpose; the value is later fed into a sparse placeholder.
        indices = np.vstack((graph.edge_index[1], graph.edge_index[0])).T
        shape = np.array([graph.num_nodes, graph.num_nodes], dtype=np.int64)
        mp_val = tf.compat.v1.SparseTensorValue(indices, norm, shape)
        return mp_val
    else:
        raise NotImplementedError
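# --- Usage sketch (not part of the original file) ----------------------------
# Illustrates how mp_matrix might be called for each backend. This is a
# hedged sketch: `load_my_graph` is a hypothetical loader, and the graph
# object only needs the attributes mp_matrix itself uses
# (edge_index, num_nodes, gcn_norm).
def _example_mp_matrix_usage():
    from hetu import ndarray
    graph = load_my_graph()                           # hypothetical loader
    mp_hetu = mp_matrix(graph, ndarray.gpu(0))        # Hetu sparse ndarray

    import torch
    mp_torch = mp_matrix(graph, torch.device("cuda:0"), system="Pytorch")

    # The TensorFlow branch ignores `device` and returns a SparseTensorValue
    # intended for feeding a tf.compat.v1 sparse placeholder.
    mp_tf = mp_matrix(graph, None, system="tensorflow")
    return mp_hetu, mp_torch, mp_tf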
def test_sparse_matrix_multiply():
    density = 1e-3
    ctx = ndarray.gpu(0)
    x = scipy.sparse.rand(500, 7000, density=density, format='coo',
                          dtype=np.float32)
    y = np.random.uniform(0, 10, size=(7000, 100)).astype(np.float32)
    mat_x = ndarray.sparse_array(x.data, (x.row, x.col), shape=[500, 7000],
                                 ctx=ctx)
    mat_y = ndarray.array(y, ctx=ctx)
    mat_z = ndarray.empty((500, 100), ctx=ctx)
    gpu_op.CuSparse_Csrmm(mat_x, False, mat_y, False, mat_z)
    z = mat_z.asnumpy()
    np.testing.assert_allclose(x.dot(y), z, rtol=1e-5)
def test_sparse_array_dense_vector_multiply():
    density = 1e-3
    ctx = ndarray.gpu(0)
    x = scipy.sparse.rand(500, 70000, density=density, format='coo',
                          dtype=np.float32)
    y = np.random.uniform(0, 10, size=(70000, 1)).astype(np.float32)
    mat_x = ndarray.sparse_array(x.data, (x.row, x.col), shape=[500, 70000],
                                 ctx=ctx)
    arr_y = ndarray.array(y, ctx=ctx)
    arr_z = ndarray.empty((500, 1), ctx=ctx)
    trans = False
    gpu_op.CuSparse_Csrmv(mat_x, trans, arr_y, arr_z)
    z = arr_z.asnumpy()
    np.testing.assert_allclose(x.dot(y), z, rtol=1e-5)

    x = scipy.sparse.rand(70000, 500, density=density, format='coo',
                          dtype=np.float32)
    y = np.random.uniform(0, 10, size=(70000, 1)).astype(np.float32)
    mat_x = ndarray.sparse_array(x.data, (x.row, x.col), shape=[70000, 500],
                                 ctx=ctx)
    arr_y = ndarray.array(y, ctx=ctx)
    arr_z = ndarray.empty((500, 1), ctx=ctx)
    trans = True
    gpu_op.CuSparse_Csrmv(mat_x, trans, arr_y, arr_z)
    z = arr_z.asnumpy()
    np.testing.assert_allclose(x.transpose().dot(y), z, rtol=1e-5)
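# --- Runner sketch (not part of the original tests) ---------------------------
# The two cuSPARSE checks above need a CUDA-capable GPU plus Hetu's `ndarray`
# and `gpu_op` modules; this minimal driver simply invokes them in order.
if __name__ == "__main__":
    test_sparse_matrix_multiply()
    test_sparse_array_dense_vector_multiply()
    print("CuSparse_Csrmm / CuSparse_Csrmv checks passed")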
def test(args):
    comm, device_id = ad.mpi_nccl_init()
    rank = comm.localRank.value
    size = comm.nRanks.value

    # dataset -> [num_nodes, num_features, num_classes]
    dataset_info = {
        'Reddit': [232965, 602, 41],
        'Proteins': [132534, 602, 8],
        'Arch': [1644228, 602, 10],
        'Products': [2449029, 100, 47]
    }
    node_count, num_features, num_classes = dataset_info[args.dataset]

    hidden_layer_size = 128
    if num_features < 128:
        hidden_layer_size = 64

    replication = args.replication
    node_Count_Self = row_num(node_count, rank // replication,
                              size // replication)
    node_Count_All = node_count

    _, _, row_groups, col_groups = get_proc_groups(size, replication)
    executor_ctx = ndarray.gpu(device_id)

    if size > 1:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data(
            args, size, replication, rank)
    else:
        adj_part, data_part, row_part, col_part, input_part, label_part = load_data_whole(
            args)

    adj_matrix = ndarray.sparse_array(data_part, (row_part, col_part),
                                      shape=adj_part.shape, ctx=executor_ctx)

    # train:val:test = 6:2:2
    # Our optimization of the distributed GNN algorithm does NOT affect correctness.
    # Due to the limitation of the current slice_op, the data is split contiguously.
    # A contiguous split is unfriendly to reordered graph data in which nodes are
    # already clustered: training on some clusters and testing on others may give
    # poor test accuracy. Splitting the data randomly would be the better approach.
    train_split, test_split = 0.6, 0.8
    train_node = int(train_split * node_Count_Self)
    test_node = int(test_split * node_Count_Self)

    A = ad.Variable(name="A", trainable=False)
    H = ad.Variable(name="H")

    # Glorot/Xavier uniform initialization for both weight matrices.
    np.random.seed(123)
    bounds = np.sqrt(6.0 / (num_features + hidden_layer_size))
    W1_val = np.random.uniform(low=-bounds, high=bounds,
                               size=[num_features, hidden_layer_size]).astype(np.float32)
    W1 = ad.Variable(name="W1", value=W1_val)

    bounds = np.sqrt(6.0 / (num_classes + hidden_layer_size))
    np.random.seed(123)
    W2_val = np.random.uniform(low=-bounds, high=bounds,
                               size=[hidden_layer_size, num_classes]).astype(np.float32)
    W2 = ad.Variable(name="W2", value=W2_val)

    y_ = ad.Variable(name="y_")

    # Two-layer GCN built from the 1.5-D distributed GCN operator.
    z = ad.distgcn_15d_op(A, H, W1, node_Count_Self, node_Count_All,
                          size, replication, device_id, comm,
                          [row_groups, col_groups], True)
    H1 = ad.relu_op(z)
    y = ad.distgcn_15d_op(A, H1, W2, node_Count_Self, node_Count_All,
                          size, replication, device_id, comm,
                          [row_groups, col_groups], True)

    y_train = ad.slice_op(y, (0, 0), (train_node, num_classes))
    label_train = ad.slice_op(y_, (0, 0), (train_node, num_classes))
    y_test = ad.slice_op(y, (test_node, 0),
                         (node_Count_Self - test_node, num_classes))
    label_test = ad.slice_op(y_, (test_node, 0),
                             (node_Count_Self - test_node, num_classes))

    loss = ad.softmaxcrossentropy_op(y_train, label_train)
    loss_test = ad.softmaxcrossentropy_op(y_test, label_test)

    opt = optimizer.AdamOptimizer()
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, loss_test, train_op], ctx=executor_ctx)

    feed_dict = {
        A: adj_matrix,
        H: ndarray.array(input_part, ctx=executor_ctx),
        y_: ndarray.array(convert_to_one_hot(label_part, max_val=num_classes),
                          ctx=executor_ctx),
    }

    epoch_num = 100
    epoch_all, epoch_0 = 0, 0

    for i in range(epoch_num):
        epoch_start_time = time.time()
        results = executor.run(feed_dict=feed_dict)
        loss = results[0].asnumpy().sum()
        y_out = results[1]
        loss_test = results[2].asnumpy().sum()
        epoch_end_time = time.time()
        epoch_time = epoch_end_time - epoch_start_time
        epoch_all += epoch_time
        if i == 0:
            epoch_0 = epoch_time

        print("[Epoch: %d, Rank: %d] Epoch time: %.3f, Total time: %.3f"
              % (i, rank, epoch_time, epoch_all))

        # Per-rank metrics, then summed across ranks via NCCL all-reduce.
        y_out_train, y_predict = y_out.asnumpy().argmax(axis=1)[:train_node], \
            y_out.asnumpy().argmax(axis=1)[test_node:]
        label_train, label_test = label_part[:train_node], label_part[test_node:]
        train_acc = ndarray.array(
            np.array([(y_out_train == label_train).sum()]), ctx=executor_ctx)
        test_acc = ndarray.array(
            np.array([(y_predict == label_test).sum()]), ctx=executor_ctx)
        train_loss = ndarray.array(np.array([loss]), ctx=executor_ctx)
        test_loss = ndarray.array(np.array([loss_test]), ctx=executor_ctx)

        if replication > 1:
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_acc, test_acc, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                test_loss, test_loss, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_acc, train_acc, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            col_groups[rank % replication].dlarrayNcclAllReduce(
                train_loss, train_loss, ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
        else:
            comm.dlarrayNcclAllReduce(test_acc, test_acc,
                                      ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(test_loss, test_loss,
                                      ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_acc, train_acc,
                                      ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)
            comm.dlarrayNcclAllReduce(train_loss, train_loss,
                                      ncclDataType_t.ncclFloat32, ncclRedOp_t.ncclSum)

        test_acc = float(test_acc.asnumpy()[0]) / (node_count - test_split * node_count)
        test_loss = test_loss.asnumpy()[0] / (node_count - test_split * node_count)
        train_acc = float(train_acc.asnumpy()[0]) / (train_split * node_count)
        train_loss = train_loss.asnumpy()[0] / (train_split * node_count)

        if rank == 0:
            print("[Epoch: %d] Train Loss: %.3f, Train Accuracy: %.3f, "
                  "Test Loss: %.3f, Test Accuracy: %.3f"
                  % (i, train_loss, train_acc, test_loss, test_acc))

    # Average over ranks; the first (warm-up) epoch is excluded.
    avg_epoch_time = (epoch_all - epoch_0) / (epoch_num - 1)
    results = ndarray.array(np.array([epoch_all, avg_epoch_time]),
                            ctx=executor_ctx)
    comm.dlarrayNcclAllReduce(results, results, ncclDataType_t.ncclFloat32,
                              reduceop=ncclRedOp_t.ncclSum)
    results = results.asnumpy() / size
    if rank == 0:
        print("\nAverage Total Time: %.3f, Average Epoch Time: %.3f"
              % (results[0], results[1]))
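# --- Driver sketch (not part of the original file) ----------------------------
# test(args) only reads args.dataset and args.replication; the argparse flags
# below mirror those two fields and are otherwise assumptions. Launch one MPI
# process per GPU (e.g. with mpirun) so that ad.mpi_nccl_init() can set up the
# NCCL communicator.
def _example_dist_gcn_driver():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", default="Reddit",
                        choices=["Reddit", "Proteins", "Arch", "Products"])
    parser.add_argument("--replication", type=int, default=1)
    test(parser.parse_args())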
def train_hetu(num_epoch):
    ctx = ndarray.gpu(0)
    feed_dict = {}
    nparts = 4

    # Partition the graph into `nparts` subgraphs; the message-passing matrix
    # is then stored block-wise, one block per (source, destination) subgraph pair.
    graph.add_self_loop()
    norm = graph.gcn_norm(True)
    graphs, edge_list, reindexed_edges = graph.part_graph(nparts)

    x_val = np.concatenate(list(map(lambda g: g.x, graphs)))
    y_concat = np.concatenate(list(map(lambda g: g.y, graphs)))
    y_val = convert_to_one_hot(
        y_concat, max_val=graph.num_classes)  # shape=(n, num_classes)

    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    feed_dict[x_] = ndarray.array(x_val, ctx=ctx)
    feed_dict[y_] = ndarray.array(y_val, ctx=ctx)

    gcn1 = PCGCN(graph.num_features, 16, npart=nparts)
    gcn2 = PCGCN(16, graph.num_classes, npart=nparts)

    # use_sparse[i] selects dense vs. sparse storage for diagonal block i;
    # here every block is kept sparse.
    mp_val = [[None for j in range(nparts)] for i in range(nparts)]
    use_sparse = [True for g in graphs]
    for i in range(nparts):
        for j in range(nparts):
            if i == j:
                edges = graphs[i].edge_index
            else:
                edges = pick_edges(reindexed_edges, edge_list[i][j])
            if i == j and use_sparse[i] == False:
                mp_val[i][j] = sparse.csr_matrix(
                    (norm[edge_list[i][j]], (edges[1], edges[0])),
                    shape=(graphs[j].num_nodes, graphs[i].num_nodes)).toarray()
            else:
                mp_val[i][j] = ndarray.sparse_array(
                    values=norm[edge_list[i][j]],
                    indices=(edges[1], edges[0]),
                    shape=(graphs[j].num_nodes, graphs[i].num_nodes),
                    ctx=ctx)
            feed_dict[gcn1.mp[i][j]] = mp_val[i][j]
            feed_dict[gcn2.mp[i][j]] = mp_val[i][j]

    subgraph_size = list(map(lambda g: g.num_nodes, graphs))
    x = gcn1(x_, subgraph_size=subgraph_size, use_sparse=use_sparse)
    x = ad.relu_op(x)
    y = gcn2(x, subgraph_size=subgraph_size, use_sparse=use_sparse)

    # y_train = ad.slice_op(y, (0, 0), (train_split, graph.num_classes))
    # loss = ad.softmaxcrossentropy_op(y_train, y_)
    loss = ad.softmaxcrossentropy_op(y, y_)
    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)

    losses = []
    for i in range(num_epoch):
        loss_val, y_predicted, _ = executor.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        # Accuracy over all nodes, since the train/val slice above is commented out.
        acc = (y_predicted == y_concat).sum()
        losses.append(loss_val.asnumpy()[0])
        if i == 0:
            # Start timing after the first (warm-up) epoch.
            start_time = time.time()
        print("Train loss :", loss_val.asnumpy().mean())
        print("Val accuracy:", acc / len(y_predicted))
    # Per-epoch time over the remaining epochs (the "/ 199" assumes num_epoch == 200).
    print("Hetu time:", (time.time() - start_time) / 199)
    return losses
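# --- Usage sketch (not part of the original file) -----------------------------
# train_hetu relies on a module-level `graph` object (providing add_self_loop,
# gcn_norm, part_graph, num_features, num_classes) that is loaded elsewhere in
# the original script; this sketch only shows the call itself. 200 epochs
# matches the "/ 199" timing above, where the first epoch is excluded.
def _example_train_hetu():
    losses = train_hetu(200)
    print("final training loss:", losses[-1])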