def train(args):
    # GraRep-style embedding: factorize the log K-step transition-probability matrices.
    _, A, _ = load_data(path=args.path, dataset=args.dataset)
    scaled_A = A / A.sum(axis=1)
    size = args.size
    K = args.Kstep
    assert size % K == 0
    dim = int(size / K)
    t1 = time.time()
    A_k = np.identity(scaled_A.shape[0])
    Rep = np.zeros((scaled_A.shape[0], size))
    for i in range(K):
        print("K:", i)
        A_k = np.dot(A_k, scaled_A)
        prob_trans = np.log(A_k / np.tile(np.sum(A_k, axis=0),
                                          (scaled_A.shape[0], 1))) - np.log(1.0 / scaled_A.shape[0])
        prob_trans[prob_trans < 0] = 0
        # `prob_trans == np.nan` is always False; use np.isnan to zero out NaN entries
        prob_trans[np.isnan(prob_trans)] = 0
        # keep the top-`dim` singular vectors as this step's representation
        U, S, VT = la.svd(prob_trans)
        Ud = U[:, 0:dim]
        Sd = S[0:dim]
        R_k = np.array(Ud) * np.power(Sd, 0.5).reshape(dim)
        R_k = normalize(R_k, axis=1, norm='l2')
        Rep[:, dim * i:dim * (i + 1)] = R_k[:, :]
    print("done.., cost: {}s".format(time.time() - t1))
    np.save(args.output + ".npy", np.asarray(Rep, dtype=np.float32))
    print("saved.")
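
# A minimal sketch of how train() above might be wired up from the command line.
# The argparse field names mirror exactly what train() reads (path, dataset, size,
# Kstep, output); the default values and output filename are illustrative
# assumptions, not the repository's actual defaults.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--path", type=str, default="data/")
    parser.add_argument("--dataset", type=str, default="cora")
    parser.add_argument("--size", type=int, default=128)  # total embedding dimension
    parser.add_argument("--Kstep", type=int, default=4)   # must divide --size evenly
    parser.add_argument("--output", type=str, default="workspace/grarep_embedding_cora")
    train(parser.parse_args())
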
def train_py(args):
    # GPU variant of train(): the same GraRep computation on PyTorch CUDA tensors.
    _, A, _ = load_data(path=args.path, dataset=args.dataset)
    scaled_A = A / A.sum(axis=1)
    size = args.size
    K = args.Kstep
    assert size % K == 0
    dim = int(size / K)
    t1 = time.time()
    A_k = np.identity(scaled_A.shape[0])
    Rep = np.zeros((scaled_A.shape[0], size))
    scaled_A = torch.FloatTensor(scaled_A).cuda()
    A_k = torch.FloatTensor(A_k).cuda()
    Rep = torch.FloatTensor(Rep).cuda()
    for i in range(K):
        print("K:", i)
        # torch.dot only accepts 1-D tensors; use torch.mm for the matrix product
        A_k = torch.mm(A_k, scaled_A)
        # torch.log does not accept a plain Python float, so compute the constant with np.log
        prob_trans = torch.log(A_k / torch.sum(A_k, dim=0).repeat(
            scaled_A.shape[0], 1)) - np.log(1.0 / scaled_A.shape[0])
        prob_trans[prob_trans < 0] = 0
        # comparing against np.nan never matches; use torch.isnan instead
        prob_trans[torch.isnan(prob_trans)] = 0
        U, S, VT = torch.svd(prob_trans)
        Ud = U[:, 0:dim]
        Sd = S[0:dim]
        R_k = Ud * torch.pow(Sd, 0.5).view(dim)
        R_k = F.normalize(R_k, p=2, dim=1)
        Rep[:, dim * i:dim * (i + 1)] = R_k[:, :]
    print("done.., cost: {}s".format(time.time() - t1))
    np.save(args.output + ".npy", Rep.cpu().numpy())
    print("saved.")
def train(args):
    _, A, _ = load_data(path=args.path, dataset=args.dataset)
    row, col = A.nonzero()
    edges = np.concatenate((row.reshape(-1, 1), col.reshape(-1, 1)),
                           axis=1).astype(dtype=np.dtype(str))
    print("build")
    t1 = time.time()
    # build an adjacency-list graph (no self-loops, sorted unique neighbours)
    G = {}
    for [i, j] in edges:
        if i not in G:
            G[i] = []
        if j not in G:
            G[j] = []
        G[i].append(j)
        G[j].append(i)
    for node in G:
        G[node] = list(sorted(set(G[node])))
        if node in G[node]:
            G[node].remove(node)
    nodes = list(sorted(G.keys()))
    print("len(G.keys()):", len(G.keys()), "\tnode_num:", A.shape[0])
    # generate the random-walk corpus; with probability alpha the walk restarts at its start node
    corpus = []
    for cnt in range(args.number_walks):
        random.shuffle(nodes)
        for idx, node in enumerate(nodes):
            path = [node]
            while len(path) < args.walk_length:
                cur = path[-1]
                if len(G[cur]) > 0:
                    if random.random() >= args.alpha:
                        path.append(random.choice(G[cur]))
                    else:
                        path.append(path[0])
                else:
                    break
            corpus.append(path)
    t2 = time.time()
    print("cost: {}s".format(t2 - t1))
    print("train...")
    model = Word2Vec(corpus,
                     size=args.size,
                     window=args.window,
                     min_count=0,
                     sg=1,
                     hs=1,
                     workers=args.workers)
    print("done.., cost: {}s".format(time.time() - t2))
    output = []
    for i in range(A.shape[0]):
        if str(i) in model.wv:
            output.append(model.wv[str(i)])
        else:
            output.append(np.zeros(args.size))
    np.save(args.output + ".npy", np.asarray(output, dtype=np.float32))
    print("saved.")
def deepWalk(_windowSize=5, _embeddingSize=128, _walkLength=35):
    X, A, y = data_utils_cora.load_data(dataset='cora')
    graph = Graph(A)
    walk = []
    vector = list(graph.vector)
    # build corpus from random walks, one walk per node
    for vect in range(0, len(vector)):
        walk.append(graph.randomWalk(_walkLength, vect))
    # set hyperparameters - word2vec with the skip-gram algorithm (sg=1) creates the node embeddings
    model = Word2Vec(walk,
                     size=_embeddingSize,
                     window=_windowSize,
                     min_count=0,
                     sg=1,
                     hs=1,
                     workers=4)
    G = Graph.sparse(A)
    # convert one-hot labels to class indices and collect one embedding vector per node
    y = np.ravel(np.array([np.where(y[i] == 1)[0] for i in range(y.shape[0])]))
    X = np.array([model.wv[str(i)] for i in range(len(G))])
    y_train, y_val, y_test, idx_train, idx_val, idx_test = data_utils_cora.get_splits(y)
    x_train, x_val, x_test, idx_train, idx_val, idx_test = data_utils_cora.get_splits(X)
    # evaluate the embeddings with a one-vs-rest logistic regression classifier
    test = LogisticRegression(max_iter=500,
                              multi_class='ovr').fit(X[idx_train], y[idx_train].ravel())
    test.fit(x_train, y_train)
    print(test.score(x_train, y_train))
    print(test.score(x_test, y_test))
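
# Graph is defined in a separate module; the sketch below only captures the minimal
# interface deepWalk() relies on (a .vector node list, a randomWalk() method that
# returns string node ids for Word2Vec, and a static sparse() helper). The internal
# details, such as the neighbour lookup, are assumptions for illustration.
import random
import scipy.sparse as sp


class GraphSketch:
    def __init__(self, A):
        self.A = sp.csr_matrix(A)
        self.vector = list(range(self.A.shape[0]))

    @staticmethod
    def sparse(A):
        return sp.csr_matrix(A)

    def randomWalk(self, walk_length, start):
        # uniform random walk of fixed length, stopping early at isolated nodes
        walk = [str(start)]
        cur = start
        for _ in range(walk_length - 1):
            neighbours = self.A[cur].indices
            if len(neighbours) == 0:
                break
            cur = random.choice(list(neighbours))
            walk.append(str(cur))
        return walk
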
def train(args):
    _, A, _ = load_data(path=args.path, dataset=args.dataset)
    row, col = A.nonzero()
    edges = np.concatenate((row.reshape(-1, 1), col.reshape(-1, 1)), axis=1)
    # alias tables for O(1) edge sampling and degree^0.75 negative node sampling
    edge_sampler = AliasSampling(probs=A.data / np.sum(A.data))
    node_weights = np.power(np.asarray(A.sum(axis=0)).flatten(), 0.75)
    node_sampler = AliasSampling(probs=node_weights / np.sum(node_weights))
    learning_rate = args.rho
    line = Line(A.shape[0], args.size)
    optimizer = optim.Adadelta(line.parameters(), lr=learning_rate)
    if args.gpu and torch.cuda.is_available():
        line.cuda()
    sampling_time, training_time = 0, 0
    line.train()
    for i in range(args.batch_num):
        t1 = time.time()
        u_i, u_j, label = get_batch(A,
                                    edges=edges,
                                    edge_sampler=edge_sampler,
                                    node_sampler=node_sampler,
                                    batch_size=args.batch_size,
                                    negative=args.negative)
        t2 = time.time()
        sampling_time += t2 - t1
        if args.gpu and torch.cuda.is_available():
            u_i, u_j, label = Variable(u_i.cuda()), Variable(u_j.cuda()), Variable(label.cuda())
        else:
            u_i, u_j, label = Variable(u_i), Variable(u_j), Variable(label)
        if i % 100 == 0 and i != 0:
            print('Batch_no: {:06d}'.format(i),
                  'loss: {:.4f}'.format(loss.data[0]),
                  'rho: {:.4f}'.format(learning_rate),
                  'sampling_time: {:.4f}'.format(sampling_time),
                  'training_time: {:.4f}'.format(training_time))
            sampling_time, training_time = 0, 0
        else:
            optimizer.zero_grad()
            loss = line(u_i, u_j, label)
            # loss = F.kl_div(output, label)
            # print("__loss: {:.4f}".format(loss.data[0]))
            loss.backward()
            # print("line.embeddings.weight.grad:", np.max(np.array(line.embeddings.weight.grad.data)))
            # if line.order == 2:
            #     print("line.context_embedding.weight.grad:", np.max(np.array(line.context_embedding.weight.grad.data)))
            optimizer.step()
            training_time += time.time() - t2
        # linearly decay the learning rate, with a floor of rho * 1e-4
        if learning_rate > args.rho * 1e-4:
            learning_rate = args.rho * (1 - i / args.batch_num)
        else:
            learning_rate = args.rho * 1e-4
        optimizer = optim.Adadelta(line.parameters(), lr=learning_rate)
    print("done..")
    if args.gpu and torch.cuda.is_available():
        np.save(args.output + "_" + str(args.order) + ".npy",
                F.normalize(line.embeddings.cpu().weight).data.numpy())
    else:
        np.save(args.output + "_" + str(args.order) + ".npy",
                F.normalize(line.embeddings.weight).data.numpy())
    print("saved.")
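
# The Line module is defined elsewhere in the repository; the sketch below is an
# assumption of what its forward pass could look like, inferred from how train()
# calls it: line(u_i, u_j, label) returns a scalar loss, label appears to be +1 for
# observed edges and -1 for negative samples, and the attribute names embeddings,
# context_embedding and order match the debug comments above. Not the actual model.
class LineSketch(nn.Module):
    def __init__(self, num_nodes, embedding_dim, order=2):
        super(LineSketch, self).__init__()
        self.order = order
        self.embeddings = nn.Embedding(num_nodes, embedding_dim)
        if order == 2:
            self.context_embedding = nn.Embedding(num_nodes, embedding_dim)

    def forward(self, u_i, u_j, label):
        v_i = self.embeddings(u_i)
        v_j = self.context_embedding(u_j) if self.order == 2 else self.embeddings(u_j)
        inner = torch.sum(v_i * v_j, dim=1)
        # negative-sampling objective: maximize log sigmoid(label * <v_i, v_j>)
        return -torch.mean(F.logsigmoid(label * inner))
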
np.random.seed(2018)
torch.manual_seed(2018)


class LogisticRegression(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x):
        out = self.linear(x)
        # out = F.relu(out)
        return F.log_softmax(out, dim=1)


X, A, y = load_data(dataset='cora')
y_train, y_val, y_test, idx_train, idx_val, idx_test = get_splits('cora', y)

METHOD = "line"
PARA = "_4_0.25" if METHOD == "node2vec" else ""
if METHOD == "line":
    PARA = "_2"
# embeddings = np.genfromtxt("workspace/vec_2nd_wo_norm10000.txt", skip_header=1, dtype=np.float32)[:, 1:]
# from sklearn.preprocessing import normalize
# embeddings = normalize(embeddings, axis=1)
embeddings = np.load("workspace/" + METHOD + "_embedding_cora" + PARA + ".npy")
# embeddings = np.load("workspace/line.tensorflow_7w.npy")
# print(embeddings[0:5])
def train(args):
    _, A, _ = load_data(path=args.path, dataset=args.dataset)
    row, col = A.nonzero()
    edges = np.concatenate((row.reshape(-1, 1), col.reshape(-1, 1)),
                           axis=1).astype(dtype=np.dtype(str))
    print("build")
    t1 = time.time()
    G, node_samplers, edge_samplers = {}, {}, {}
    for [i, j] in edges:
        if i not in G:
            G[i] = []
        if j not in G:
            G[j] = []
        G[i].append(j)
        G[j].append(i)
    for node in G:
        G[node] = list(sorted(set(G[node])))
        if node in G[node]:
            G[node].remove(node)
        # per-node alias table, used for the first step of each walk
        node_samplers[node] = alias_sampler(probs=A[int(node), :].data /
                                            np.sum(A[int(node), :].data))
    # per-edge alias tables implementing the node2vec search bias: returning to the
    # previous node is reweighted by 1/p, moving to a node not adjacent to the
    # previous node by 1/q, and common neighbours keep their original weight
    for [i, j] in edges:
        edge_weights = []
        for j_nbr in G[j]:
            if j_nbr == i:
                edge_weights.append(A[int(j), int(j_nbr)] / args.p)
            elif A[int(j_nbr), int(i)] >= 1e-4:
                edge_weights.append(A[int(j), int(j_nbr)])
            else:
                edge_weights.append(A[int(j), int(j_nbr)] / args.q)
        edge_weights = np.asarray(edge_weights, dtype=np.float32)
        edge_samplers[i + "-" + j] = alias_sampler(probs=edge_weights / edge_weights.sum())
    nodes = list(sorted(G.keys()))
    print("len(G.keys()):", len(G.keys()), "\tnode_num:", A.shape[0])
    # generate second-order biased random walks
    corpus = []
    for cnt in range(args.number_walks):
        random.shuffle(nodes)
        for idx, node in enumerate(nodes):
            path = [node]
            while len(path) < args.walk_length:
                cur = path[-1]
                if len(G[cur]) > 0:
                    if len(path) == 1:
                        path.append(G[cur][sampling(node_samplers[cur][0],
                                                    node_samplers[cur][1])])
                    else:
                        prev = path[-2]
                        path.append(G[cur][sampling(
                            edge_samplers[prev + "-" + cur][0],
                            edge_samplers[prev + "-" + cur][1])])
                else:
                    break
            corpus.append(path)
    t2 = time.time()
    print("cost: {}s".format(t2 - t1))
    print("train...")
    model = Word2Vec(corpus,
                     size=args.size,
                     window=args.window,
                     min_count=0,
                     sg=1,
                     workers=args.workers)
    print("done.., cost: {}s".format(time.time() - t2))
    output = []
    for i in range(A.shape[0]):
        if str(i) in model.wv:
            output.append(model.wv[str(i)])
        else:
            output.append(np.zeros(args.size))
    np.save(args.output + "_" + str(args.p) + "_" + str(args.q) + ".npy",
            np.asarray(output, dtype=np.float32))
    print("saved.")
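
# alias_sampler() and sampling() are provided elsewhere in the repository; the pair
# below is a minimal sketch of the standard alias method they appear to implement,
# assuming alias_sampler(probs) returns (alias_table, prob_table) and sampling()
# draws an index in O(1). Function names with the _sketch suffix are illustrative.
def alias_sampler_sketch(probs):
    n = len(probs)
    alias, prob = np.zeros(n, dtype=np.int64), np.zeros(n)
    scaled = np.asarray(probs, dtype=np.float64) * n
    small = [k for k, p in enumerate(scaled) if p < 1.0]
    large = [k for k, p in enumerate(scaled) if p >= 1.0]
    # pair each under-full bucket with an over-full one (Vose's method)
    while small and large:
        s, l = small.pop(), large.pop()
        prob[s], alias[s] = scaled[s], l
        scaled[l] = scaled[l] + scaled[s] - 1.0
        (small if scaled[l] < 1.0 else large).append(l)
    for leftover in small + large:
        prob[leftover] = 1.0
    return alias, prob


def sampling_sketch(alias, prob):
    # pick a bucket uniformly, then keep it or jump to its alias
    k = random.randint(0, len(prob) - 1)
    return k if random.random() < prob[k] else alias[k]
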