def dfm_criteo(dense_input, sparse_input, y_):
    """Build a DeepFM graph for the Criteo CTR dataset.

    Three branches are summed into a single logit before the sigmoid:
    first-order FM (dense weights + 1-d sparse embeddings), second-order FM
    (pairwise interactions via the sum-square / square-sum identity), and a
    two-hidden-layer DNN over the flattened second-order embeddings.

    Parameters:
        dense_input: node with the 13 dense features.
        sparse_input: node with the sparse feature ids for embedding lookup.
        y_: node holding the binary labels.
    Returns:
        (loss, y, y_, train_op) — loss node, prediction node, the label
        placeholder passed in, and the SGD update op.
    """
    num_features = 33762577
    embed_size = 128
    lr = 0.01

    # ---- first-order FM term ----
    first_order_table = init.random_normal([num_features, 1], stddev=0.01, name="fst_order_embedding", ctx=ndarray.cpu(0))
    dense_weight = init.random_normal([13, 1], stddev=0.01, name="dense_parameter")
    first_order_sparse = ad.embedding_lookup_op(first_order_table, sparse_input, ctx=ndarray.cpu(0))
    dense_term = ad.matmul_op(dense_input, dense_weight)
    sparse_term = ad.reduce_sum_op(first_order_sparse, axes=1)
    first_order_out = dense_term + sparse_term

    # ---- second-order FM term: 0.5 * ((sum v)^2 - sum v^2) ----
    second_order_table = init.random_normal([num_features, embed_size], stddev=0.01, name="snd_order_embedding", ctx=ndarray.cpu(0))
    field_embeds = ad.embedding_lookup_op(second_order_table, sparse_input, ctx=ndarray.cpu(0))
    embed_sum = ad.reduce_sum_op(field_embeds, axes=1)
    square_of_sum = ad.mul_op(embed_sum, embed_sum)
    embed_square = ad.mul_op(field_embeds, field_embeds)
    sum_of_square = ad.reduce_sum_op(embed_square, axes=1)
    interaction = square_of_sum + -1 * sum_of_square
    interaction_half = interaction * 0.5
    second_order_out = ad.reduce_sum_op(interaction_half, axes=1, keepdims=True)

    # ---- DNN branch over the flattened second-order embeddings ----
    flat = ad.array_reshape_op(field_embeds, (-1, 26 * embed_size))
    W1 = init.random_normal([26 * embed_size, 256], stddev=0.01, name="W1")
    W2 = init.random_normal([256, 256], stddev=0.01, name="W2")
    W3 = init.random_normal([256, 1], stddev=0.01, name="W3")
    hidden1 = ad.relu_op(ad.matmul_op(flat, W1))
    hidden2 = ad.relu_op(ad.matmul_op(hidden1, W2))
    dnn_out = ad.matmul_op(hidden2, W3)

    # ---- fuse the three branches and set up training ----
    logits = (first_order_out + second_order_out) + dnn_out
    y = ad.sigmoid_op(logits)
    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=lr)
    train_op = opt.minimize(loss)
    return loss, y, y_, train_op
def neural_mf(user_input, item_input, y_, num_users, num_items):
    """Build the NeuMF (Neural Matrix Factorization) model graph.

    Combines a GMF branch (element-wise product of user/item embeddings)
    with an MLP branch (concatenated embeddings passed through three ReLU
    layers), then fuses both for a sigmoid interaction probability.

    Parameters:
        user_input: node of user ids used for embedding lookup.
        item_input: node of item ids used for embedding lookup.
        y_: node holding the binary labels.
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
    Returns:
        (loss, y, train_op) — scalar loss node, prediction node, SGD update op.
    """
    embed_dim = 8
    layers = [64, 32, 16, 8]
    learning_rate = 0.01
    # A single table per side holds both the GMF part (first embed_dim
    # columns) and the MLP part (remaining layers[0] // 2 columns); the two
    # parts are separated with slice_op below.
    User_Embedding = init.random_normal(
        (num_users, embed_dim + layers[0] // 2), stddev=0.01, name="user_embed", ctx=ndarray.cpu(0))
    Item_Embedding = init.random_normal(
        (num_items, embed_dim + layers[0] // 2), stddev=0.01, name="item_embed", ctx=ndarray.cpu(0))
    user_latent = ad.embedding_lookup_op(User_Embedding, user_input, ctx=ndarray.cpu(0))
    item_latent = ad.embedding_lookup_op(Item_Embedding, item_input, ctx=ndarray.cpu(0))
    mf_user_latent = ad.slice_op(user_latent, (0, 0), (-1, embed_dim))
    mlp_user_latent = ad.slice_op(user_latent, (0, embed_dim), (-1, -1))
    mf_item_latent = ad.slice_op(item_latent, (0, 0), (-1, embed_dim))
    mlp_item_latent = ad.slice_op(item_latent, (0, embed_dim), (-1, -1))
    W1 = init.random_normal((layers[0], layers[1]), stddev=0.1, name='W1')
    W2 = init.random_normal((layers[1], layers[2]), stddev=0.1, name='W2')
    W3 = init.random_normal((layers[2], layers[3]), stddev=0.1, name='W3')
    W4 = init.random_normal((embed_dim + layers[3], 1), stddev=0.1, name='W4')
    # GMF branch: element-wise product; MLP branch: concat then 3 FC+ReLU.
    mf_vector = ad.mul_op(mf_user_latent, mf_item_latent)
    mlp_vector = ad.concat_op(mlp_user_latent, mlp_item_latent, axis=1)
    fc1 = ad.matmul_op(mlp_vector, W1)
    relu1 = ad.relu_op(fc1)
    fc2 = ad.matmul_op(relu1, W2)
    relu2 = ad.relu_op(fc2)
    fc3 = ad.matmul_op(relu2, W3)
    relu3 = ad.relu_op(fc3)
    # Fuse the two branches and project to a single logit.
    concat_vector = ad.concat_op(mf_vector, relu3, axis=1)
    y = ad.matmul_op(concat_vector, W4)
    y = ad.sigmoid_op(y)
    loss = ad.binarycrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(learning_rate=learning_rate)
    train_op = opt.minimize(loss)
    return loss, y, train_op
def train_hetu(args):
    """Distributed (parameter-server) GraphSage training loop for one worker.

    Reads partition metadata from meta.yml, builds a two-layer GraphSage +
    linear classifier graph, then repeatedly trains on sampled subgraphs,
    synchronizing with the other workers through a PS worker barrier.

    Parameters:
        args: namespace with at least `path` (dataset directory containing
            meta.yml), `hidden_size` and `num_epoch`.
    """
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size
    num_epoch = args.num_epoch
    # Worker identity comes from the launcher's environment variables.
    rank = int(os.environ["WORKER_ID"])
    nrank = int(os.environ["DMLC_NUM_WORKER"])
    ctx = ndarray.gpu(rank)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    mask_ = ad.Variable(name="mask_")
    # Two GraphSage layers; the second consumes 2*hidden (presumably self ++
    # neighbor aggregation doubles the width — TODO confirm in GraphSage).
    gcn1 = GraphSage(meta["feature"], hidden_layer_size, activation="relu", dropout=0.1)
    gcn2 = GraphSage(2*hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)
    x = gcn1(x_)
    x = gcn2(x)
    # Final linear classifier on top of the GraphSage output.
    W = initializers.xavier_uniform(shape=(2*hidden_layer_size, meta["class"]))
    B = initializers.zeros(shape=(meta["class"],))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    # Mask out nodes that should not contribute to this batch's loss.
    loss = ad.mul_op(loss, mask_)
    loss = ad.reduce_mean_op(loss, [0])
    opt = optimizer.SGDOptimizer(0.1)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx, comm_mode='PS')
    distributed.ps_init(rank, nrank)
    batch_size = 4000
    with DistributedGraphSageSampler(args.path, batch_size, 2, 2, rank=rank, nrank=nrank) as sampler:
        epoch = 0
        nnodes = 0
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ndarray.gpu(rank))
            feed_dict = {
                gcn1.mp : mp_val,
                gcn2.mp : mp_val,
                mask_ : ndarray.array(mask, ctx=ctx),
                x_ : ndarray.array(g_sample.x, ctx=ctx),
                y_ : ndarray.array(convert_to_one_hot(g_sample.y, max_val=g_sample.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            # Accuracy of the current batch only (masked nodes excluded).
            acc = ((y_predicted == g_sample.y) * mask).sum()
            # Keep workers in lock-step once per batch.
            distributed.ps_get_worker_communicator().BarrierWorker()
            nnodes += batch_size
            # An "epoch" ends once this worker has seen as many sampled nodes
            # as its partition holds; stats reflect the last batch only.
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/mask.sum())
                start = time.time()
                if epoch >= num_epoch:
                    break
def cross_layer(x0, x1, embedding_len=26 * 128 + 13):
    """One cross layer of a Deep & Cross network: y = x0 * (x1 @ w) + x1 + b.

    Parameters:
        x0: input embedding feature, shape (batch_size, embedding_len).
        x1: output of the previous layer, shape (batch_size, embedding_len).
        embedding_len: feature width used for the layer parameters.
            Defaults to 26 * 128 + 13 (26 sparse fields * embedding size 128
            + 13 dense features), previously hard-coded; parameterized so the
            layer also works for other feature widths.
    Returns:
        node of shape (batch_size, embedding_len).
    """
    weight = init.random_normal(shape=(embedding_len, 1), stddev=0.01, name='weight')
    bias = init.random_normal(shape=(embedding_len, ), stddev=0.01, name='bias')
    x1w = ad.matmul_op(x1, weight)  # (batch_size, 1)
    # Scale x0 element-wise by the scalar projection of x1, then add the
    # residual connection (x1) and the bias.
    y = ad.mul_op(x0, ad.broadcastto_op(x1w, x0))
    y = y + x1 + ad.broadcastto_op(bias, y)
    return y
def train_hetu(num_epoch):
    """Single-GPU GraphSage training loop with periodic full-graph evaluation.

    Relies on module-level globals: `graph` (sampling source),
    `graph_full` (full graph for evaluation), `hidden_layer_size`, and
    `train_split` (index separating train from test nodes).

    Parameters:
        num_epoch: number of epochs to train before stopping.
    """
    ctx = ndarray.gpu(0)
    x_ = ad.Variable(name="x_")
    y_ = ad.Variable(name="y_")
    mask_ = ad.Variable(name="mask_")
    gcn1 = GraphSage(graph.num_features, hidden_layer_size, activation="relu", dropout=0.1)
    gcn2 = GraphSage(2*hidden_layer_size, hidden_layer_size, activation="relu", dropout=0.1)
    x = gcn1(x_)
    x = gcn2(x)
    # Linear classifier head on top of the GraphSage output.
    W = initializers.xavier_uniform(shape=(2*hidden_layer_size, graph.num_classes))
    B = initializers.zeros(shape=(graph.num_classes,))
    x = ad.matmul_op(x, W)
    y = x + ad.broadcastto_op(B, x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.mul_op(loss, mask_)
    # NOTE(review): unlike the PS variant, the loss is not mean-reduced here
    # before minimize — confirm the optimizer accepts a per-node loss vector.
    opt = optimizer.AdamOptimizer(0.01)
    train_op = opt.minimize(loss)
    executor = ad.Executor([loss, y, train_op], ctx=ctx)
    # NOTE(review): shadows the builtin `eval`; consider renaming.
    def eval():
        """Run the model on the full graph and print test-split accuracy."""
        start = time.time()
        # Switch dropout to inference mode for the forward pass.
        ad.Dropout.DropoutOp.phase = "eval"
        mp_val = mp_matrix(graph_full, ctx)
        feed_dict = {
            gcn1.mp : mp_val,
            gcn2.mp : mp_val,
            x_ : ndarray.array(graph_full.x, ctx=ctx),
        }
        executor_eval = ad.Executor([y], ctx=ctx)
        y_predicted, = executor_eval.run(feed_dict=feed_dict)
        y_predicted = y_predicted.asnumpy().argmax(axis=1)
        # Accuracy on nodes after train_split (the held-out portion).
        acc = (y_predicted == graph_full.y)[train_split:].sum()
        print("Test accuracy:", acc/len(y_predicted[train_split:]))
        ad.Dropout.DropoutOp.phase = "training"
    epoch = 0
    nnodes = 0
    batch_size = 1000
    with GraphSageSampler(graph, batch_size, depth=2, num_sample_thread=4) as sampler:
        start = time.time()
        while True:
            g_sample, mask = sampler.sample()
            mp_val = mp_matrix(g_sample, ctx)
            #print(time.time() - start)
            feed_dict = {
                gcn1.mp : mp_val,
                gcn2.mp : mp_val,
                mask_ : ndarray.array(mask,ctx=ctx),
                x_ : ndarray.array(g_sample.x, ctx=ctx),
                y_ : ndarray.array(convert_to_one_hot(g_sample.y, max_val=graph.num_classes), ctx=ctx)
            }
            loss_val, y_predicted, _ = executor.run(feed_dict = feed_dict)
            y_predicted = y_predicted.asnumpy().argmax(axis=1)
            # Accuracy of the current batch only (masked nodes excluded).
            acc = ((y_predicted == g_sample.y) * mask).sum()
            # print(i, "Train loss :", loss_val.asnumpy().mean())
            # print(i, "Train accuracy:", acc/len(y_predicted))
            nnodes += batch_size
            # An "epoch" ends once roughly graph_full.num_nodes samples have
            # been drawn; printed stats reflect the last batch only.
            if nnodes > graph_full.num_nodes:
                nnodes = 0
                epoch += 1
                print("Epoch :", epoch, time.time() - start)
                print("Train accuracy:", acc/mask.sum())
                eval()
                start = time.time()
                if epoch >= num_epoch:
                    break
def lstm(x, y_):
    '''
    LSTM model, for MNIST dataset.

    Unrolls a single-layer LSTM for nsteps = 28 steps; each step consumes a
    28-wide slice of the flattened input (presumably one image row — confirm
    against the data pipeline), and the final hidden state is projected to
    the 10 output classes.

    Parameters:
        x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims)
        y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    Return:
        loss: Variable(hetu.gpu_ops.Node.Node), shape (1,)
        y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes)
    '''
    print("Building LSTM model...")
    diminput = 28    # features consumed per time step
    dimhidden = 128  # LSTM hidden/cell size
    dimoutput = 10   # number of output classes
    nsteps = 28      # number of unrolled time steps
    # Per-gate parameters: *_w multiplies the current input, *_u the previous
    # hidden state, *_b is a bias broadcast over the batch.
    forget_gate_w = init.random_normal(shape=(diminput, dimhidden), stddev=0.1, name="lstm_forget_gate_w")
    forget_gate_u = init.random_normal(shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_forget_gate_u")
    forget_gate_b = init.random_normal(shape=(dimhidden, ), stddev=0.1, name="lstm_forget_gate_b")
    input_gate_w = init.random_normal(shape=(diminput, dimhidden), stddev=0.1, name="lstm_input_gate_w")
    input_gate_u = init.random_normal(shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_input_gate_u")
    input_gate_b = init.random_normal(shape=(dimhidden, ), stddev=0.1, name="lstm_input_gate_b")
    output_gate_w = init.random_normal(shape=(diminput, dimhidden), stddev=0.1, name="lstm_output_gate_w")
    output_gate_u = init.random_normal(shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_output_gate_u")
    output_gate_b = init.random_normal(shape=(dimhidden, ), stddev=0.1, name="lstm_output_gate_b")
    tanh_w = init.random_normal(shape=(diminput, dimhidden), stddev=0.1, name="lstm_tanh_w")
    tanh_u = init.random_normal(shape=(dimhidden, dimhidden), stddev=0.1, name="lstm_tanh_u")
    tanh_b = init.random_normal(shape=(dimhidden, ), stddev=0.1, name="lstm_tanh_b")
    # Output projection from the last hidden state to class logits.
    out_weights = init.random_normal(shape=(dimhidden, dimoutput), stddev=0.1, name="lstm_out_weight")
    out_bias = init.random_normal(shape=(dimoutput, ), stddev=0.1, name="lstm_out_bias")
    # A zero scalar that is broadcast below to seed h_0 and c_0.
    initial_state = ad.Variable(value=np.zeros((1, )).astype(np.float32), name='initial_state', trainable=False)
    for i in range(nsteps):
        cur_x = ad.slice_op(x, (0, i * diminput), (-1, diminput))
        # forget gate
        if i == 0:
            # First step: materialize zero c/h states with the right shape by
            # broadcasting initial_state against the input projection.
            temp = ad.matmul_op(cur_x, forget_gate_w)
            last_c_state = ad.broadcastto_op(initial_state, temp)
            last_h_state = ad.broadcastto_op(initial_state, temp)
            cur_forget = ad.matmul_op(last_h_state, forget_gate_u) + temp
        else:
            cur_forget = ad.matmul_op(last_h_state, forget_gate_u) + ad.matmul_op(
                cur_x, forget_gate_w)
        cur_forget = cur_forget + ad.broadcastto_op(forget_gate_b, cur_forget)
        cur_forget = ad.sigmoid_op(cur_forget)
        # input gate
        cur_input = ad.matmul_op(last_h_state, input_gate_u) + ad.matmul_op(
            cur_x, input_gate_w)
        cur_input = cur_input + ad.broadcastto_op(input_gate_b, cur_input)
        cur_input = ad.sigmoid_op(cur_input)
        # output gate
        cur_output = ad.matmul_op(last_h_state, output_gate_u) + ad.matmul_op(
            cur_x, output_gate_w)
        cur_output = cur_output + ad.broadcastto_op(output_gate_b, cur_output)
        cur_output = ad.sigmoid_op(cur_output)
        # tanh (candidate cell update)
        cur_tanh = ad.matmul_op(last_h_state, tanh_u) + ad.matmul_op(
            cur_x, tanh_w)
        cur_tanh = cur_tanh + ad.broadcastto_op(tanh_b, cur_tanh)
        cur_tanh = ad.tanh_op(cur_tanh)
        # c_t = f * c_{t-1} + i * candidate;  h_t = tanh(c_t) * o
        last_c_state = ad.mul_op(last_c_state, cur_forget) + ad.mul_op(
            cur_input, cur_tanh)
        last_h_state = ad.tanh_op(last_c_state) * cur_output
    # Project the final hidden state to class logits.
    x = ad.matmul_op(last_h_state, out_weights)
    y = x + ad.broadcastto_op(out_bias, x)
    loss = ad.softmaxcrossentropy_op(y, y_)
    loss = ad.reduce_mean_op(loss, [0])
    return loss, y