def evaluate_embeddings(embeddings):
    """Evaluate node embeddings on the wiki dataset via logistic regression.

    Trains on 80% of the labelled nodes and evaluates on the remaining 20%.

    Args:
        embeddings: mapping from node id to embedding vector.
    """
    train_fraction = 0.8
    nodes, node_labels = read_node_label('../data/wiki/wiki_labels.txt')
    print("Training classifier using {:.2f}% nodes...".format(
        train_fraction * 100))
    classifier = Classifier(embeddings=embeddings, clf=LogisticRegression())
    classifier.split_train_evaluate(nodes, node_labels, train_fraction)
def classify(vectors, args):
    """Evaluate node embeddings with a logistic-regression classifier on a
    dataset's predefined train/val/test splits.

    Args:
        vectors: mapping from node id to embedding vector.
        args: namespace providing ``classifydir`` — the dataset path prefix,
            used both to locate ``<prefix>_labels.txt`` and to derive the
            dataset name passed to ``load_dataset``.

    Returns:
        Dict of validation scores (e.g. 'micro'/'macro' F1), or a
        ``defaultdict`` of zeros when the dataset has no label file.
    """
    if not os.path.isfile(args.classifydir + '_labels.txt'):
        # No labels available for this dataset: report zero for every metric.
        return defaultdict(lambda: 0)
    X, Y = read_node_label(args.classifydir + '_labels.txt')
    clf = Classifier(vectors=vectors,
                     clf=LogisticRegression(solver="lbfgs", max_iter=4000))
    # The dataset name is the last path component of `classifydir`.
    features, labels, graph, idx_train, idx_val, idx_test = load_dataset(
        str(args.classifydir.split("/")[-1]))
    idx_train = list(idx_train)
    idx_test = list(idx_test)
    print("TRAINING SIZE", len(idx_train), "VALIDATION SIZE", len(idx_val),
          "TESTING SIZE: ", len(list(idx_test)))
    # Validation scores are what this function reports to the caller.
    scores = clf.split_train_evaluate_idx(X, Y, idx_train, idx_val)
    # Test scores are printed and accumulated for later inspection.
    test_scores = clf.split_train_evaluate_idx(X, Y, idx_train, idx_test)
    # NOTE(review): `test_x` is assumed to be a module-level list that
    # accumulates test macro-F1 across calls — confirm it is defined
    # elsewhere in this module.
    test_x.append(test_scores['macro'])
    print("micro:", test_scores['micro'], "macro:", test_scores['macro'])
    return scores
def main(args):
    """Train TADW embeddings on the input graph, save them, and evaluate
    them with a logistic-regression node classifier.

    Args:
        args: parsed CLI namespace (input/output paths, graph format,
            label and feature files, representation size, lambda, ratio).
    """
    start = time.time()
    print("Reading...")
    g = Graph()
    # Load the graph in whichever format the caller specified.
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input, weighted=args.weighted,
                        directed=args.directed)
    g.read_node_label(args.label_file)
    g.read_node_features(args.feature_file)
    model = tadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
    print(time.time() - start)  # training wall-clock time
    print("Saving embeddings...")
    model.save_embeddings(args.output)
    # Downstream evaluation: node classification on the learned vectors.
    X, Y = read_node_label(args.label_file)
    print("Training classifier using {:.2f}% nodes...".format(
        args.clf_ratio * 100))
    classifier = Classifier(vectors=model.vectors, clf=LogisticRegression())
    classifier.split_train_evaluate(X, Y, args.clf_ratio)
def output(self, task):
    """Run downstream evaluation of the learned embedding matrix.

    Args:
        task: 'class' runs node classification against the labels at
            ``self.label_path``; any other value runs link prediction.
    """
    X = self.embedding_mat
    node_num = self.node_num
    if task == 'class':
        Y = read_node_label(self.label_path, node_num)
        # NOTE(review): `eval` shadows the builtin — presumably a
        # project-level evaluation helper imported elsewhere; confirm.
        eval(X, Y)
    else:
        # NOTE(review): `test_path` is not defined in this scope — assumed
        # to be a module-level global naming the test-edge file; confirm.
        link_prediction(X, test_path)
def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10,
             negative_ratio=5, order=3, label_file=None, clf_ratio=0.5,
             auto_stop=True):
    """Train LINE embeddings.

    Args:
        graph: input graph wrapper.
        rep_size: dimensionality of the final embedding.
        batch_size: training batch size.
        epoch: number of training epochs.
        negative_ratio: negative samples per positive edge.
        order: 1 or 2 trains a single proximity order; 3 trains both a
            first- and a second-order model (each with rep_size/2 dims)
            whose vectors are concatenated.
        label_file: optional node-label file; when given, a classifier is
            trained and evaluated after every epoch.
        clf_ratio: train fraction for the per-epoch evaluation.
        auto_stop: when True, stop early and roll back to the previous
            epoch's vectors as soon as micro-F1 drops.
    """
    self.rep_size = rep_size
    self.order = order
    self.best_result = 0
    self.vectors = {}
    if order == 3:
        # Train first- and second-order models side by side. Integer
        # division keeps the sub-model dimension integral (plain `/`
        # yields a float under Python 3 and breaks downstream shape args).
        self.model1 = _LINE(graph, rep_size // 2, batch_size,
                            negative_ratio, order=1)
        self.model2 = _LINE(graph, rep_size // 2, batch_size,
                            negative_ratio, order=2)
        for i in range(epoch):
            self.model1.train_one_epoch()
            self.model2.train_one_epoch()
            if label_file:
                self.get_embeddings()
                X, Y = read_node_label(label_file)
                clf = Classifier(vectors=self.vectors,
                                 clf=LogisticRegression())
                result = clf.split_train_evaluate(X, Y, clf_ratio)
                if result['micro'] < self.best_result and auto_stop:
                    # Score degraded: roll back to the previous epoch's
                    # vectors and stop. (Was a Python-2 `print` statement,
                    # a SyntaxError under Python 3.)
                    # NOTE(review): `self.last_vectors` is assumed to be
                    # maintained by get_embeddings() — confirm.
                    self.vectors = self.last_vectors
                    print('Auto stop!')
                    return
                elif result['micro'] > self.best_result:
                    self.best_result = result['micro']
    else:
        self.model = _LINE(graph, rep_size, batch_size, negative_ratio,
                           order=self.order)
        for i in range(epoch):
            self.model.train_one_epoch()
            if label_file:
                self.get_embeddings()
                X, Y = read_node_label(label_file)
                clf = Classifier(vectors=self.vectors,
                                 clf=LogisticRegression())
                result = clf.split_train_evaluate(X, Y, clf_ratio)
                if result['micro'] < self.best_result and auto_stop:
                    self.vectors = self.last_vectors
                    print('Auto stop!')
                    return
                elif result['micro'] > self.best_result:
                    self.best_result = result['micro']
    self.get_embeddings()
def classify(vectors, args):
    """Train a logistic-regression node classifier on the embeddings and
    return its evaluation scores.

    Args:
        vectors: mapping from node id to embedding vector.
        args: namespace providing ``classifydir`` (dataset path prefix)
            and ``train_percent`` (train split fraction).

    Returns:
        Dict of scores, or a ``defaultdict`` of zeros when the dataset
        has no label file.
    """
    label_path = args.classifydir + '_labels.txt'
    if not os.path.isfile(label_path):
        return defaultdict(lambda: 0)
    X, Y = read_node_label(label_path)
    print("Training classifier using {:.2f}% nodes...".format(
        args.train_percent * 100))
    model = LogisticRegression(solver="lbfgs", max_iter=4000)
    clf = Classifier(vectors=vectors, clf=model)
    return clf.split_train_evaluate(X, Y, args.train_percent)
def main(args):
    """Run the evaluation tasks selected on the command line against a
    saved embedding file.

    Supported tasks: modularity, graph reconstruction, clustering, link
    prediction, and node classification. Clustering and classification
    require ``--label-file``.
    """
    node_embeddings = load_embeddings(args.embedding_file)
    labels = None
    if args.label_file:
        labels = read_node_label(args.label_file)
    if args.modularity:
        print("Modularity")
        modularity(args, node_embeddings, args.min_k, args.max_k)
    if args.reconstruction:
        print("Graph reconstruction")
        reconstr(args, node_embeddings, args.k_nbrs)
    if args.clustering:
        print("Clustering")
        clustering(node_embeddings, labels, args.exp_times)
    if args.link_prediction:
        print("Link prediction")
        link_prediction(args.input, node_embeddings)
    if args.classification:
        # Fail loudly instead of the NameError the unguarded access to
        # `labels` used to raise when no label file was supplied.
        if labels is None:
            raise ValueError("a label file is required for classification")
        X = list(labels.keys())
        Y = list(labels.values())
        print("Node classification")
        # Evaluate at train ratios 0.1 .. 0.9, `exp_times` shuffles each.
        # (The former `clf_ratio_list` parsed from args was unused.)
        result_list = {}
        train_ratio = np.asarray(range(1, 10)) * .1
        for clf_ratio in train_ratio:
            result_per_test = []
            for ti in range(args.exp_times):
                clf = Classifier(vectors=node_embeddings,
                                 clf=LogisticRegression())
                myresult = clf.split_train_evaluate(X, Y, float(clf_ratio))
                result_per_test.append(myresult)
            result_list[clf_ratio] = result_per_test
        print('-------------------')
        for clf_ratio in train_ratio:
            print('Train percent:', clf_ratio)
            results = result_list[clf_ratio]
            for index, result in enumerate(results):
                print('Shuffle #%d: ' % (index + 1), result)
            # Average every metric over the shuffles.
            avg_score = defaultdict(float)
            for score_dict in results:
                for metric, score in score_dict.items():
                    avg_score[metric] += score
            for metric in avg_score:
                avg_score[metric] /= len(results)
            print('Average score:', dict(avg_score))
            print('-------------------')
def node_classification(embeddings, label_path, name, size):
    """Evaluate embeddings by node classification over a sweep of train
    fractions, writing macro-F1 per fraction to a results file.

    Args:
        embeddings: mapping from node id to embedding vector.
        label_path: path to the node-label file.
        name: dataset/method name used in the results filename and rows.
        size: embedding size, used in the results filename.

    Returns:
        List of macro-F1 scores, one per train fraction (0.1 .. 0.9).
    """
    X, Y = read_node_label(embeddings, label_path,)
    all_ratio = []
    # `with` guarantees the results file is closed even on error
    # (the original leaked the handle).
    with open('results/%s_classification_%d.txt' % (name, size), 'w') as f_c:
        for tr_frac in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
            print(" Training classifier using {:.2f}% nodes...".format(
                tr_frac * 100))
            clf = Classifier(embeddings=embeddings, clf=LogisticRegression(),
                             name=name)
            results = clf.split_train_evaluate(X, Y, tr_frac)
            avg = 'macro'
            f_c.write(name + ' train percentage: ' + str(tr_frac) + ' F1-' +
                      avg + ' ' + str('%0.5f' % results[avg]))
            all_ratio.append(results[avg])
            f_c.write('\n')
    return all_ratio
def plot_embeddings(embeddings,):
    """Project the wiki node embeddings to 2-D with t-SNE and scatter-plot
    them, one colour per label class.

    Args:
        embeddings: mapping from node id to embedding vector.
    """
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    emb_list = np.array([embeddings[k] for k in X])
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)
    # Group node indices by their (first) label so each class gets one
    # scatter call and one legend entry.
    color_idx = {}
    for i, node_labels in enumerate(Y):
        color_idx.setdefault(node_labels[0], []).append(i)
    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()
def plot_embeddings(embeddings, label_path, name):
    """Project node embeddings to 2-D with t-SNE and save a labelled
    scatter plot to ``<name>_vis.pdf``.

    Args:
        embeddings: mapping from node id to embedding vector.
        label_path: path to the node-label file.
        name: dataset name used for the output filename.
    """
    X, Y = read_node_label(embeddings, label_path)
    emb_list = np.array([embeddings[k] for k in X])
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)
    # Group node indices by their (first) label so each class gets one
    # scatter call and one legend entry.
    color_idx = {}
    for i, node_labels in enumerate(Y):
        color_idx.setdefault(node_labels[0], []).append(i)
    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)  # c=node_colors)
    plt.axis('off')
    plt.legend(loc='upper right', prop={'size': 15},
               bbox_to_anchor=(1.15, 1), ncol=1)
    #plt.title('%s graph '%name)
    plt.savefig('%s_vis.pdf' % (name), bbox_inches='tight', dpi=100)
def main(args):
    """Learn xNetMF structural representations, then train TADW embeddings
    on the same graph and evaluate them by node classification.

    Args:
        args: parsed CLI namespace (input/output paths, graph format,
            label and feature files, representation size, lambda, ratio).
    """
    print("xnetmf", "begin...")
    t1 = time.time()
    print("Reading...")
    # Fixed: this read `agrs.input` (typo), a guaranteed NameError.
    nx_graph = nx.read_edgelist(args.input, nodetype=int, comments="%")
    adj_matrix = nx.adjacency_matrix(nx_graph).todense()
    print(adj_matrix)
    g = Graph(adj_matrix)
    rep_method = RepMethod(max_layer=2)
    # Learn representations with xNetMF. Can adjust parameters (e.g. as in REGAL)
    representations = src.xnetmf.get_representations(g, rep_method)
    print(representations)
    print(representations.shape)
    print("TAWD", "begin...")
    print("Reading...")
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input, weighted=args.weighted,
                        directed=args.directed)
    g.read_node_label(args.label_file)
    g.read_node_features(args.feature_file)
    model = xtadw.TADW(graph=g, dim=args.representation_size, lamb=args.lamb)
    t2 = time.time()
    print(t2 - t1)
    print("Saving embeddings...")
    model.save_embeddings(args.output)
    vectors = model.vectors
    # Downstream evaluation: node classification.
    X, Y = read_node_label(args.label_file)
    print("Training classifier using {:.2f}% nodes...".format(
        args.clf_ratio * 100))
    clf = Classifier(vectors=vectors, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, args.clf_ratio)
def train():
    """Train the RL sentence-selection agent (REINFORCE-style) on the wiki
    dataset using precomputed sentence embeddings and rewards.

    For each bag: sample `sampletimes` trajectories, use the mean reward as
    baseline, accumulate policy gradients, apply them, and soft-update the
    target parameters. The best-scoring parameters over all epochs are
    restored at the end and the model is checkpointed.
    """
    folder = '../data/wiki/'
    X, Y = read_node_label(folder + 'labels.txt')
    node_fea = read_node_features(folder + 'wiki.features')
    node_seq = read_node_sequences(folder + 'node_sequences_10_10.txt')
    node_bag_seq = read_bag_node_sequences(folder + 'node_sequences_10_10.txt')
    node_bag_list = read_bag_node_list(folder + 'node_sequences_10_10.txt')
    # Precomputed per-bag sentence embeddings and rewards.
    all_sentence_ebd = np.load('../model/wiki/all_sentence_ebd.npy')
    print(all_sentence_ebd.shape)
    #print("sentence length", len(all_sentence_ebd[0][0]))
    all_reward = np.load('../model/wiki/all_reward.npy')
    average_reward = np.load('../model/wiki/average_reward.npy')
    g_rl = tf.Graph()
    sess2 = tf.Session(graph=g_rl)
    env = environment(500)
    with g_rl.as_default():
        with sess2.as_default():
            myAgent = agent(0.03, 500)
            updaterate = 1
            num_epoch = 5
            sampletimes = 3
            best_reward = -100000
            init = tf.global_variables_initializer()
            sess2.run(init)
            saver = tf.train.Saver()
            #saver.restore(sess2, save_path='rlmodel/rl.ckpt')
            # Zero-initialise the buffer holding the best trainable variables.
            tvars_best = sess2.run(myAgent.tvars)
            for index, var in enumerate(tvars_best):
                tvars_best[index] = var * 0
            # Snapshot of the trainable variables from the previous step
            # (used for the soft target update below).
            tvars_old = sess2.run(myAgent.tvars)
            # Zero the gradient accumulation buffer.
            gradBuffer = sess2.run(myAgent.tvars)
            for index, grad in enumerate(gradBuffer):
                gradBuffer[index] = grad * 0
            g_rl.finalize()
            for epoch in range(num_epoch):
                all_list = list(range(len(all_sentence_ebd)))
                total_reward = []
                # shuffle bags
                random.shuffle(all_list)
                for batch in tqdm.tqdm(all_list):
                    #print("batch", batch)
                    #for batch in tqdm.tqdm(range(10000)):
                    # Fetch this bag's sentences and its reward.
                    # NOTE(review): `bath_node` (sic) is assigned but unused.
                    bath_node = node_bag_list[batch]
                    batch_sentence_ebd = all_sentence_ebd[batch]
                    #print("batch_sentence_ebd", batch_sentence_ebd.shape)
                    batch_reward = all_reward[batch]
                    batch_len = len(batch_sentence_ebd)
                    list_list_state = []
                    list_list_action = []
                    list_list_reward = []
                    avg_reward = 0
                    # Sample several trajectories to build a reward baseline.
                    for j in range(sampletimes):
                        #reset environment
                        # Reset returns the current state (current sentence
                        # embedding + running average of selected states).
                        state = env.reset(batch_sentence_ebd, batch_reward)
                        #print('state shape' ,state[0].shape, state[1].shape)
                        list_action = []
                        list_state = []
                        old_prob = []
                        #get action
                        #start = time.time()
                        for i in range(batch_len):
                            state_in = np.append(state[0], state[1])
                            # print("state num", i)
                            # print("state_in.shape", state_in.shape)
                            feed_dict = {}
                            #feed_dict[myAgent.node_seq] = [state[1]]
                            feed_dict[myAgent.state_in] = [state_in]
                            # Compute the action probability from the state
                            # and sample an action from it.
                            prob = sess2.run(myAgent.prob, feed_dict=feed_dict)
                            # print("prob", prob)
                            old_prob.append(prob[0])
                            action = get_action(prob)
                            #add produce data for training cnn model
                            # Record action and state for the later update.
                            list_action.append(action)
                            list_state.append(state)
                            # Advance the environment with the chosen action.
                            state = env.step(action)
                        #end = time.time()
                        #print ('get action:',end - start)
                        # If nothing was selected, fall back to the global
                        # average reward.
                        if env.num_selected == 0:
                            tmp_reward = average_reward
                        else:
                            tmp_reward = env.reward()
                        # Accumulate reward for the baseline.
                        avg_reward += tmp_reward
                        # Record this trajectory's states/actions/reward.
                        list_list_state.append(list_state)
                        list_list_action.append(list_action)
                        list_list_reward.append(tmp_reward)
                    avg_reward = avg_reward / sampletimes
                    # Replay each sampled trajectory and accumulate gradients
                    # with (reward - baseline) as the advantage.
                    for j in range(sampletimes):
                        list_state = list_list_state[j]
                        list_action = list_list_action[j]
                        reward = list_list_reward[j]
                        # compute gradient
                        # start = time.time()
                        list_reward = [
                            reward - avg_reward for x in range(batch_len)
                        ]
                        list_state_in = [
                            np.append(state[0], state[1])
                            for state in list_state
                        ]
                        feed_dict = {}
                        feed_dict[myAgent.state_in] = list_state_in
                        feed_dict[myAgent.reward_holder] = list_reward
                        feed_dict[myAgent.action_holder] = list_action
                        '''
                        loss =sess2.run(myAgent.loss, feed_dict=feed_dict)
                        if loss == float("-inf"):
                            probs,pis = sess2.run([myAgent.prob,myAgent.pi], feed_dict=feed_dict)
                            print(' ')
                            print ('batch:',batch)
                            print (old_prob)
                            print (list_action)
                            print(probs)
                            print (pis)
                            print('error!')
                            return 0
                        '''
                        # Compute and accumulate the policy gradients.
                        grads = sess2.run(myAgent.gradients, feed_dict=feed_dict)
                        for index, grad in enumerate(grads):
                            gradBuffer[index] += grad
                        #end = time.time()
                        #print('get loss and update:', end - start)
                    '''
                    print (len(list_state),len(list_action),len(list_reward),len(list_entity1),len(list_entity2))
                    print (list_action)
                    print (list_reward)
                    print (list_entity1)
                    print (list_entity2)
                    break
                    '''
                    #decide action and compute reward
                    # Greedy rollout with the current policy to measure it.
                    state = env.reset(batch_sentence_ebd, batch_reward)
                    old_prob = []
                    for i in range(batch_len):
                        # Decide the action greedily and collect the reward.
                        state_in = np.append(state[0], state[1])
                        feed_dict = {}
                        #feed_dict[myAgent.node_seq] = [state[1]]
                        feed_dict[myAgent.state_in] = [state_in]
                        prob = sess2.run(myAgent.prob, feed_dict=feed_dict)
                        old_prob.append(prob[0])
                        action = decide_action(prob)
                        state = env.step(action)
                    chosen_reward = [
                        batch_reward[x] for x in env.list_selected
                    ]
                    total_reward += chosen_reward
                    # Apply the accumulated gradients, then reset the buffer.
                    feed_dict = dictionary = dict(
                        zip(myAgent.gradient_holders, gradBuffer))
                    sess2.run(myAgent.update_batch, feed_dict=feed_dict)
                    for index, grad in enumerate(gradBuffer):
                        gradBuffer[index] = grad * 0
                    # Fetch the freshly updated trainable variables.
                    tvars_new = sess2.run(myAgent.tvars)
                    # update old variables of the target network
                    # Soft update: blend new and old parameters by updaterate.
                    tvars_update = sess2.run(myAgent.tvars)
                    for index, var in enumerate(tvars_update):
                        tvars_update[index] = updaterate * tvars_new[index] + (
                            1 - updaterate) * tvars_old[index]
                    feed_dict = dictionary = dict(
                        zip(myAgent.tvars_holders, tvars_update))
                    sess2.run(myAgent.update_tvar_holder, feed_dict)
                    tvars_old = sess2.run(myAgent.tvars)
                    #break
                #find the best parameters
                chosen_size = len(total_reward)
                total_reward = np.mean(np.array(total_reward))
                if (total_reward > best_reward):
                    best_reward = total_reward
                    tvars_best = tvars_old
                #print ('chosen sentence size:',chosen_size)
                #print ('total_reward:',total_reward)
                #print ('best_reward',best_reward)
            #set parameters = best_tvars
            # Restore the best parameters seen during training.
            feed_dict = dictionary = dict(
                zip(myAgent.tvars_holders, tvars_best))
            sess2.run(myAgent.update_tvar_holder, feed_dict)
            #save model
            saver.save(
                sess2,
                save_path='../model/wiki/stne_transformer_model_rl_model.ckpt')
def select(save_path):
    """Run the trained RL agent greedily over all cora bags and save the
    node sequences it selects to ``selected_seq.npy``.

    Args:
        save_path: checkpoint path of the trained RL agent to restore.
    """
    folder = '../data/cora/'
    X, Y = read_node_label(folder + 'labels.txt')
    node_fea = read_node_features(folder + 'cora.features')
    node_seq = read_node_sequences(folder + 'node_sequences_10_10.txt')
    node_bag_seq = read_bag_node_sequences(folder + 'node_sequences_10_10.txt')
    node_bag_list = read_bag_node_list(folder + 'node_sequences_10_10.txt')
    # Precomputed per-bag sentence embeddings and rewards.
    all_sentence_ebd = np.load('../model/cora/all_sentence_ebd.npy')
    all_reward = np.load('../model/cora/all_reward.npy')
    average_reward = np.load('../model/cora/average_reward.npy')
    selected_seq = []
    print("selected_seq")
    g_rl = tf.Graph()
    sess2 = tf.Session(graph=g_rl)
    env = environment(500)
    with g_rl.as_default():
        with sess2.as_default():
            myAgent = agent(0.02, 500)
            init = tf.global_variables_initializer()
            sess2.run(init)
            saver = tf.train.Saver()
            # Restore the trained policy before selection.
            saver.restore(sess2, save_path=save_path)
            g_rl.finalize()
            # Single pass over all bags (no training here).
            for epoch in range(1):
                total_reward = []
                num_chosen = 0
                all_list = list(range(len(all_sentence_ebd)))
                for batch in tqdm.tqdm(all_list):
                    batch_node = node_bag_list[batch]
                    batch_sentence_ebd = all_sentence_ebd[batch]
                    batch_reward = all_reward[batch]
                    batch_len = len(batch_sentence_ebd)
                    batch_seq = node_bag_seq[batch_node]
                    # reset environment
                    state = env.reset(batch_sentence_ebd, batch_reward)
                    old_prob = []
                    # get action
                    # start = time.time()
                    for i in range(batch_len):
                        state_in = np.append(state[0], state[1])
                        feed_dict = {}
                        feed_dict[myAgent.state_in] = [state_in]
                        prob = sess2.run(myAgent.prob, feed_dict=feed_dict)
                        old_prob.append(prob[0])
                        # Greedy (deterministic) action from the policy.
                        action = decide_action(prob)
                        # produce data for training cnn model
                        state = env.step(action)
                        if action == 1:
                            num_chosen += 1
                    #print (old_prob)
                    chosen_reward = [
                        batch_reward[x] for x in env.list_selected
                    ]
                    total_reward += chosen_reward
                    # Keep the node sequences the agent selected in this bag.
                    selected_seq += [batch_seq[x] for x in env.list_selected]
            print(num_chosen)
    selected_seq = np.array(selected_seq)
    np.save('../model/cora/selected_seq.npy', selected_seq)
X_train_idx, Y_train, X_test_idx, Y_test, Y_all, testnum=5) print(res) if __name__ == '__main__': import os datafile = 'citeseer' edge_file = os.path.join("datasets", datafile, "graph.txt") label_file = os.path.join("datasets", datafile, "group.txt") single_label = True X, Y = read_node_label(label_file) G = read_graph_as_matrix(nodeids=X, edge_file=edge_file) removed_class = ['0', '1'] X_train_idx, X_test_idx, Y_train, Y_test, X_train_cid_idx, Y_train_cid = completely_imbalanced_split_train( X, Y, train_precent=0.5, removed_class=removed_class) print('completely-imbalanced train number', len(X_train_cid_idx)) vectors = run_RSDNE(G, X_train_cid_idx, Y_train_cid) res = evaluate_RSNDE(vectors, X_train_idx, X_test_idx, Y_train, Y_test, Y_all=Y)
def train(num_epoch, full_seq_name):
    """Jointly train the RL sentence-selection agent and update the STNE
    encoder on the wiki dataset.

    Per epoch: sample trajectories per bag and accumulate policy gradients
    (baseline = mean sampled reward), collect the greedily-selected
    sequences, fine-tune the STNE model on them, regenerate embeddings and
    rewards, then apply the accumulated gradients with a soft parameter
    update. Checkpoints both models and saves all selected sequences.

    Args:
        num_epoch: number of joint training epochs.
        full_seq_name: output ``.npy`` path for the accumulated selected
            sequences across all epochs.
    """
    folder = '../data/wiki/'
    X, Y = read_node_label(folder + 'labels.txt')
    node_fea = read_node_features(folder + 'wiki.features')
    node_seq = read_node_sequences(folder + 'node_sequences_10_10.txt')
    node_bag_seq = read_bag_node_sequences(folder + 'node_sequences_10_10.txt')
    node_bag_list = read_bag_node_list(folder + 'node_sequences_10_10.txt')
    node_degree = read_node_degree(folder + 'node_degree.txt')
    all_sentence_ebd = np.load('../model/wiki/all_sentence_ebd.npy')
    print(all_sentence_ebd.shape)
    #print("sentence length", len(all_sentence_ebd[0][0]))
    all_reward = np.load('../model/wiki/all_reward.npy')
    average_reward = np.load('../model/wiki/average_reward.npy')
    # Two separate TF graphs/sessions: one for the STNE encoder, one for
    # the RL agent.
    g_stne = tf.Graph()
    g_rl = tf.Graph()
    sess1 = tf.Session(graph=g_stne)
    sess2 = tf.Session(graph=g_rl)
    with g_stne.as_default():
        with sess1.as_default():
            interact = stne_transformer_new_reward.interaction(
                sess1, save_path='../model/wiki/stne_model_transformer.ckpt')
            # Zero-initialised buffer for the best STNE variables.
            tvars_best_cnn = interact.tvars()
            for index, var in enumerate(tvars_best_cnn):
                tvars_best_cnn[index] = var * 0
            g_stne.finalize()
    env = environment(500)
    best_score = -100000
    with g_rl.as_default():
        with sess2.as_default():
            myAgent = agent(0.02, 500)
            updaterate = 0.01
            #num_epoch = 10
            sampletimes = 3
            best_reward = -100000
            init = tf.global_variables_initializer()
            sess2.run(init)
            saver = tf.train.Saver()
            # Warm-start the agent from the pretraining checkpoint.
            saver.restore(
                sess2,
                save_path='../model/wiki/stne_transformer_model_rl_model.ckpt')
            # Zero-initialised buffer for the best RL variables.
            tvars_best_rl = sess2.run(myAgent.tvars)
            for index, var in enumerate(tvars_best_rl):
                tvars_best_rl[index] = var * 0
            # Previous-step parameter snapshot for the soft target update.
            tvars_old = sess2.run(myAgent.tvars)
            # Zeroed gradient accumulation buffer.
            gradBuffer = sess2.run(myAgent.tvars)
            for index, grad in enumerate(gradBuffer):
                gradBuffer[index] = grad * 0
            g_rl.finalize()
            trained_node_set = set()
            update_full_seq = []
            for epoch in range(num_epoch):
                update_seq = []
                all_list = list(range(len(all_sentence_ebd)))
                total_reward = []
                # shuffle bags
                random.shuffle(all_list)
                print('update the rlmodel')
                for batch in tqdm.tqdm(all_list):
                    #for batch in tqdm.tqdm(range(10000)):
                    batch_node = node_bag_list[batch]
                    batch_sentence_ebd = all_sentence_ebd[batch]
                    batch_reward = all_reward[batch]
                    batch_len = len(batch_sentence_ebd)
                    batch_seq = node_bag_seq[batch_node]
                    list_list_state = []
                    list_list_action = []
                    list_list_reward = []
                    avg_reward = 0
                    # Sample several trajectories to build a reward baseline.
                    for j in range(sampletimes):
                        #reset environment
                        state = env.reset(batch_sentence_ebd, batch_reward)
                        list_action = []
                        list_state = []
                        old_prob = []
                        #get action
                        #start = time.time()
                        for i in range(batch_len):
                            state_in = np.append(state[0], state[1])
                            feed_dict = {}
                            feed_dict[myAgent.state_in] = [state_in]
                            prob = sess2.run(myAgent.prob, feed_dict=feed_dict)
                            old_prob.append(prob[0])
                            # Stochastic action while exploring.
                            action = get_action(prob)
                            '''
                            if action == None:
                                print (123)
                                action = 1
                            '''
                            #add produce data for training cnn model
                            list_action.append(action)
                            list_state.append(state)
                            state = env.step(action)
                        #end = time.time()
                        #print ('get action:',end - start)
                        # Fall back to the global average reward when the
                        # agent selected nothing.
                        if env.num_selected == 0:
                            tmp_reward = average_reward
                        else:
                            tmp_reward = env.reward()
                        avg_reward += tmp_reward
                        list_list_state.append(list_state)
                        list_list_action.append(list_action)
                        list_list_reward.append(tmp_reward)
                    avg_reward = avg_reward / sampletimes
                    # Replay each trajectory; advantage = reward - baseline.
                    for j in range(sampletimes):
                        list_state = list_list_state[j]
                        list_action = list_list_action[j]
                        reward = list_list_reward[j]
                        # compute gradient
                        # start = time.time()
                        list_reward = [
                            reward - avg_reward for x in range(batch_len)
                        ]
                        list_state_in = [
                            np.append(state[0], state[1])
                            for state in list_state
                        ]
                        feed_dict = {}
                        feed_dict[myAgent.state_in] = list_state_in
                        feed_dict[myAgent.reward_holder] = list_reward
                        feed_dict[myAgent.action_holder] = list_action
                        # Accumulate policy gradients; applied per epoch.
                        grads = sess2.run(myAgent.gradients, feed_dict=feed_dict)
                        for index, grad in enumerate(grads):
                            gradBuffer[index] += grad
                        #end = time.time()
                        #print('get loss and update:', end - start)
                    #decide action and compute reward
                    # Greedy rollout to pick the sequences used for the
                    # STNE update.
                    state = env.reset(batch_sentence_ebd, batch_reward)
                    old_prob = []
                    for i in range(batch_len):
                        state_in = np.append(state[0], state[1])
                        feed_dict = {}
                        feed_dict[myAgent.state_in] = [state_in]
                        prob = sess2.run(myAgent.prob, feed_dict=feed_dict)
                        old_prob.append(prob[0])
                        action = decide_action(prob)
                        state = env.step(action)
                    chosen_reward = [
                        batch_reward[x] for x in env.list_selected
                    ]
                    total_reward += chosen_reward
                    update_seq += [batch_seq[x] for x in env.list_selected]
                    # if epoch == 0:
                    #     pass
                    # else:
                    #     update_full_seq += [batch_seq[x] for x in env.list_selected]
                    update_full_seq += [
                        batch_seq[x] for x in env.list_selected
                    ]
                print('finished')
                #print (len(update_word),len(update_pos1),len(update_pos2),len(update_y),updaterate)
                #train and update cnnmodel
                # Fine-tune the STNE encoder on the selected sequences.
                print('update the stnemodel')
                interact.update_stne(update_seq, updaterate)
                print('finished')
                # classification result
                print('classification result')
                f1_mi = interact.classification_result()
                for f1 in f1_mi:
                    print(f1)
                # classification new result
                # print('classification new result')
                # f1_mi_new = interact.classification_selected_result(trained_node_set, np.array(update_seq))
                # for f1_new in f1_mi_new:
                #     print(f1_new)
                # print('finished')
                #produce new embedding
                # Regenerate embeddings/rewards from the updated encoder for
                # the next epoch.
                print('produce new embedding')
                average_reward, all_sentence_ebd, all_reward = interact.produce_new_embedding(
                )
                np.save('../model/wiki/average_reward_new.npy', average_reward)
                np.save('../model/wiki/all_sentence_ebd_new.npy',
                        all_sentence_ebd)
                np.save('../model/wiki/all_reward_new.npy', all_reward)
                average_score = average_reward
                print('finished')
                #update the rlmodel
                #apply gradient
                # Apply the epoch's accumulated gradients, then reset.
                feed_dict = dictionary = dict(
                    zip(myAgent.gradient_holders, gradBuffer))
                sess2.run(myAgent.update_batch, feed_dict=feed_dict)
                for index, grad in enumerate(gradBuffer):
                    gradBuffer[index] = grad * 0
                #get tvars_new
                tvars_new = sess2.run(myAgent.tvars)
                # update old variables of the target network
                # Soft update: blend new and old parameters by updaterate.
                tvars_update = sess2.run(myAgent.tvars)
                for index, var in enumerate(tvars_update):
                    tvars_update[index] = updaterate * tvars_new[index] + (
                        1 - updaterate) * tvars_old[index]
                feed_dict = dictionary = dict(
                    zip(myAgent.tvars_holders, tvars_update))
                sess2.run(myAgent.update_tvar_holder, feed_dict)
                tvars_old = sess2.run(myAgent.tvars)
                #break
                #find the best parameters
                chosen_size = len(total_reward)
                total_reward = np.mean(np.array(total_reward))
                if (total_reward > best_reward):
                    best_reward = total_reward
                    tvars_best_rl = tvars_old
                if average_score > best_score:
                    best_score = average_score
                    #tvars_best_rl = tvars_old
                print('epoch:', epoch)
                print('chosen seq size:', chosen_size)
                print('total_reward:', total_reward)
                print('best_reward', best_reward)
                print('average score', average_score)
                print('best score', best_score)
            #set parameters = best_tvars
            # Restore the best RL parameters seen during training.
            feed_dict = dictionary = dict(
                zip(myAgent.tvars_holders, tvars_best_rl))
            sess2.run(myAgent.update_tvar_holder, feed_dict)
            #save model
            saver.save(sess2, save_path='../model/wiki/union_rl_model.ckpt')
            update_full_seq = np.array(update_full_seq)
            np.save(full_seq_name, update_full_seq)
            #interact.update_tvars(tvars_best_cnn)
            interact.save_stnemodel(
                save_path='../model/wiki/union_cnn_model.ckpt')
W[row, col] = 0 W[col, row] = 0 print('dissimilar type G', type(W), np.count_nonzero(W)) temp = (W + W.T) / 2.0 Lw = csgraph.laplacian(temp, normed=False) # L = D - temp return Lw if __name__ == '__main__': import os from classify import read_node_label from label_utils_functions import completely_imbalanced_split_train from basic_functions import get_data_file, read_graph_as_matrix edge_file, label_file, feature_file = get_data_file() X, Y = read_node_label(label_file) removed_class = ['0', '1'] X_train_idx, X_test_idx, Y_train, Y_test, X_train_cid_idx, Y_train_cid = completely_imbalanced_split_train( X, Y, 0.1, removed_class) print('train number', len(X_train_cid_idx)) '''node_num = len(X_train_idx) + len(X_test_idx) print('node num:', node_num) Ls = build_Ls_matrix(node_num, X_train_cid_idx, Y_train_cid, orgk=5) print(Ls)''' nodeids, nouse = read_node_label(label_file) G = read_graph_as_matrix(nodeids, edge_file) Lw = build_Lw_matrix(G, X_train_cid_idx, Y_train_cid) print(Lw) '''
def main(args):
    """Compress the input graph at several similarity thresholds, embed
    each compressed graph with node2vec, and evaluate the embeddings by
    node classification over a range of train ratios.

    A threshold of 1 means "no compression" (baseline).
    """
    print("number-walks " + str(args.number_walks))
    print("representation-size " + str(args.representation_size))
    print("walk-length " + str(args.walk_length))
    print("inout_fle " + str(args.input))
    print("******")
    g = Graph()
    deepw = False  # node2vec mode (dw=False); True would run DeepWalk
    # similarity thresholds for compression
    trsl = [0.45, 0.495, 0.5, 0.55, 0.6, 0.7, 0.8, 1]
    # trsl=[ 0.5 ]
    learn = True  # True: train embeddings; False: load them from disk
    X, Y = read_node_label(args.label_file)
    seed = 0
    clsratio = [0.01, 0.05, 0.07, 0.1, 0.25, 0.5, 0.7, 0.8]
    # clsratio=[ 0.1,0.2,0.4, 0.6,0.7,0.8,0.9]#,0.7,0.8]# use for blogcatalog
    # Fixed shuffle shared by every evaluation run for comparability.
    np.random.seed(seed)
    shuffle_indices = np.random.permutation(np.arange(len(X)))
    f = open(args.input + "shu.txt", "w")
    f.writelines(str(item) + "\n" for item in shuffle_indices)
    f.close()
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input, directed=args.directed)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input, weighted=args.weighted,
                        directed=args.directed)
    G = g.G
    print("before spar, n: " + str(len(G.nodes())) + " m: " +
          str(len(G.edges())))
    # compute similarity score for compression
    t1 = time.time()
    p = pC(G, 0.45)
    scoreNode = p.ScoreCompute()
    t3 = time.time()
    f = open(args.input + "score.txt", "w")
    f.writelines(
        str(n[0]) + " " + str(n[1]) + " " + str(scoreNode[n]) + "\n"
        for n in scoreNode)
    f.close()
    print("total scorecom time: " + str(t3 - t1))
    # read similarity scores from file
    # f=open(args.input+"score.txt","r")
    # scoreNode=dict()
    # for x in f:
    #     l=x.split()
    #     scoreNode[((l[0]),(l[1]))] = float(l[2])
    for kk in range(0, len(trsl)):
        if learn:
            # do embedding
            ths = trsl[kk]  #args.trs
            print("threshold is ...", ths)
            # Re-read the full graph: compression mutates it in place.
            if args.graph_format == 'adjlist':
                g.read_adjlist(filename=args.input, directed=args.directed)
            elif args.graph_format == 'edgelist':
                g.read_edgelist(filename=args.input, weighted=args.weighted,
                                directed=args.directed)
            if ths != 1:
                # compression
                t1 = time.time()
                G = g.G
                G, nl2 = makeCompression(G, scoreNode, ths)
                f = open(args.input + "af_spar.txt", "w")
                f.writelines(str(n) + " " + str(nl2[n]) + "\n" for n in nl2)
                f.close()
                writeg(G, args)
                t2 = time.time()
                print("total_sparc_time: " + str(t2 - t1))
            # embedding
            t1 = time.time()
            print("After_compresing,n,m " + str(len(g.G.nodes())) + " " +
                  str(len(g.G.edges())))
            model = node2vec.Node2vec(graph=g, path_length=args.walk_length,
                                      num_paths=args.number_walks,
                                      dim=args.representation_size,
                                      workers=args.workers, p=args.p, q=args.q,
                                      window=args.window_size, dw=deepw)
            t2 = time.time()
            print("total_embeding_time " + str(t2 - t1))
            vectors = model.vectors
            if ths != 1:
                # add embedding of removed nodes in compression
                addBack(nl2, vectors)
            np.save(args.output + "_" + str(ths) + ".npy", vectors)
        else:
            # load embeddings
            # NOTE(review): in this branch `ths` is never assigned in this
            # iteration (it is only set under `if learn:`), so this load
            # would raise NameError — confirm intended usage.
            vectors = np.load(args.output + "_" + str(ths) + ".npy")
            vectors = vectors.item(0)
            print("file_loaded")
        #print("Training classifier")
        #split_train_evaluate2 for single label (cora and wiki)
        #split_train_evaluate for multi lable (dblp and blogcatalog)
        for r in clsratio:
            clfa = Classifier(vectors,
                              clf=LogisticRegression(solver='liblinear'))
            res = clfa.split_train_evaluate2(X, Y, r,
                                             shuffle_indices)  # args.clf_ratio)
            print(str(r) + " " + str(res["macro"]) + " " + str(res["micro"]))
dropout = 0.2 # Dropout ration clf_ratio = [0.1, 0.2, 0.3, 0.4, 0.5] # Ration of training samples in subsequent classification # b_s = 128 # Size of batches lr = 0.001 # Learning rate of RMSProp keep_prob = 0.5 attention_size = 1000 # max_iters = 20000 print_every_k_iterations = 100 idx = 0 loss_1 = 0 loss_2 = 0 config = Config() start = time.time() fobj = open(fn, 'w') X, Y = read_node_label(folder + 'labels.txt') node_fea = read_node_features(folder + 'wiki.features') node_seq = read_node_sequences(folder + 'node_sequences_10_10.txt') nx_G = read_graph(folder + 'wiki.edgelist') nodes = nx_G.nodes() N = len(nodes) X_1 = read_node_features(folder + 'wiki.features') print(node_fea.shape[1], 'CCCCCCCCCCCCCCCCCCCC') # with tf.Session() as sess: model = STNE(config, hidden_dim=h_dim, nx_G=nx_G, X_1=X_1, seq_len=s_len, attention_size=100, depth=dpt, node_fea=node_fea, node_fea_trainable=trainable, node_num=node_fea.shape[0], fea_dim=node_fea.shape[1]) init = tf.global_variables_initializer() sess = tf.Session(config=config_tf) sess.run(init)
Author : haxu date: 2019/4/3 ------------------------------------------------- Change Activity: 2019/4/3: ------------------------------------------------- """ __author__ = 'haxu' import networkx as nx from deepwalk import DeepWalk from classify import read_node_label, Classifier from sklearn.linear_model import LogisticRegression if __name__ == '__main__': G = nx.read_edgelist('../data/Wiki_edgelist.txt', create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)]) model = DeepWalk(G, walk_length=30, num_walks=80, workers=4) model.train(window_size=5, iter=3) embeddings = model.get_embeddings() X, Y = read_node_label('../data/wiki_labels.txt') tr_frac = 0.8 clf = Classifier(embeddings=embeddings, clf=LogisticRegression()) clf.split_train_evaluate(X, Y, tr_frac)
def train_stne(node_seq):
    """Train an STNE sequence-to-sequence node-embedding model on the cora data.

    Runs `epc` epochs of mini-batch training over `node_seq` inside a single
    TF1 session, logging the loss every 10 steps, and saves a checkpoint after
    each epoch.

    Parameters
    ----------
    node_seq : sequence of node-id sequences fed to `model.input_seqs`
        (assumes each entry has length `s_len` = 10 — TODO confirm against
        the sequence file this is read from).

    NOTE(review): relies on module-level names `time`, `tf`, `np`,
    `read_node_label`, `read_node_features`, `STNE`, and
    `check_all_node_trained` — none are defined in this block.
    NOTE(review): `fobj` is opened for writing but never written to or closed
    in the active code (only in the commented-out block), so result.txt is
    truncated and the handle leaks; `lr` is defined but the model is built
    with the literal `lr=0.001`.
    """
    # --- hyper-parameters / paths -------------------------------------------
    folder = '../data/cora/'
    fn = '../data/cora/result.txt'
    dpt = 1  # Depth of both the encoder and the decoder layers (MultiCell RNN)
    h_dim = 500  # Hidden dimension of encoder LSTMs
    s_len = 10  # Length of input node sequence
    epc = 20  # Number of training epochs
    trainable = False  # Node features trainable or not
    dropout = 0.2  # Dropout ratio
    clf_ratio = [0.1, 0.2, 0.3, 0.4, 0.5
                 ]  # Ratio of training samples in subsequent classification
    b_s = 128  # Size of batches
    lr = 0.001  # Learning rate of RMSProp
    start = time.time()
    fobj = open(fn, 'w')
    # Labels / features for the (currently disabled) periodic evaluation.
    X, Y = read_node_label(folder + 'labels.txt')
    node_fea = read_node_features(folder + 'cora.features')
    with tf.Session() as sess:
        # Build the STNE graph; node features are frozen (trainable=False).
        model = STNE(hidden_dim=h_dim,
                     node_fea_trainable=trainable,
                     seq_len=s_len,
                     depth=dpt,
                     node_fea=node_fea,
                     node_num=node_fea.shape[0],
                     fea_dim=node_fea.shape[1],
                     lr=0.001)
        #train_op = tf.train.RMSPropOptimizer(lr).minimize(model.loss_ce, global_step=model.global_step)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        # Track which nodes have appeared in a batch, until all have been seen.
        trained_node_set = set()
        all_trained = False
        for epoch in range(epc):
            start_idx, end_idx = 0, b_s
            print('Epoch,\tStep,\tLoss,\t#Trained Nodes')
            # Full-size batches; any remainder is handled after the loop.
            while end_idx < len(node_seq):
                _, loss, step = sess.run(
                    [model.train_op, model.loss_ce, model.global_step],
                    feed_dict={
                        model.input_seqs: node_seq[start_idx:end_idx],
                        model.dropout: dropout
                    })
                if not all_trained:
                    all_trained = check_all_node_trained(
                        trained_node_set, node_seq[start_idx:end_idx],
                        node_fea.shape[0])
                if step % 10 == 0:
                    print(epoch, '\t', step, '\t', loss, '\t',
                          len(trained_node_set))
                    # if all_trained:
                    #     f1_mi = []
                    #     for ratio in clf_ratio:
                    #         f1_mi.append(node_classification(session=sess, bs=b_s, seqne=model, sequences=node_seq,
                    #                                          seq_len=s_len, node_n=node_fea.shape[0], samp_idx=X,
                    #                                          label=Y, ratio=ratio))
                    #     print('step ', step)
                    #     fobj.write('step ' + str(step) + ' ')
                    #     for f1 in f1_mi:
                    #         print(f1)
                    #         fobj.write(str(f1) + ' ')
                    #     fobj.write('\n')
                start_idx, end_idx = end_idx, end_idx + b_s
            # Train on the final partial batch, if any.
            if start_idx < len(node_seq):
                sess.run(
                    [model.train_op, model.loss_ce, model.global_step],
                    feed_dict={
                        model.input_seqs: node_seq[start_idx:len(node_seq)],
                        model.dropout: dropout
                    })
            minute = np.around((time.time() - start) / 60)
            print('\nepoch ', epoch, ' finished in ', str(minute),
                  ' minutes\n')
            # Checkpoint after every epoch (overwrites the same path).
            saver.save(sess, save_path="../model/cora/select_stne_model.ckpt")
dest="neurons_hiddenlayer", default=[128, 64, 32], type=list, help='Number of Neurons AE.') args = parser.parse_args() assert args.algorithm in { 'rmsprop', 'sgd', 'adagrad', 'adadelta', 'adam', 'adamax' } assert args.domain in {'restaurant', 'beer'} if args.seed > 0: np.random.seed(args.seed) ############################################################################################################################### ## Read Data X, Y = read_node_label(args.label_file) node_size = len(X) ############################################################################################################################### ## Building model sentence_input = Input(shape=(args.kstep, node_size), dtype='float32', name='sentence_inputt') neg_input = Input(shape=(args.neg_size, args.kstep, node_size), dtype='float32', name='neg_inputt') predict = Input(shape=(17, ), dtype='int32', name='predictt') e_w = sentence_input y_s = Average()(sentence_input)
def main(args):
    """Read a graph, train an SDNE embedding model, save the embeddings, and
    optionally evaluate them with a logistic-regression node classifier.

    Parameters
    ----------
    args : argparse.Namespace
        Expects: graph_format, input, weighted, directed, method,
        encoder_list, alpha, beta, nu1, nu2, bs, epochs, lr, output,
        label_file, clf_ratio.

    NOTE(review): relies on module-level names `time`, `Graph`, `ast`,
    `sdne`, `read_node_label`, `Classifier`, and `LogisticRegression`.
    If `args.method` is none of the sdne variants, `model` is never bound
    and the save step raises NameError — same behavior as before.
    """
    t1 = time.time()
    g = Graph()
    print("Reading...")
    if args.graph_format == 'adjlist':
        g.read_adjlist(filename=args.input)
    elif args.graph_format == 'edgelist':
        g.read_edgelist(filename=args.input,
                        weighted=args.weighted,
                        directed=args.directed)

    # The three SDNE method names were previously three byte-identical
    # branches; they are kept as aliases of one configuration to remove
    # the duplication without changing behavior.
    if args.method in ('sdne', 'sdne_binary_loss', 'sdne_meta_path'):
        encoder_layer_list = ast.literal_eval(args.encoder_list)
        model = sdne.SDNE(g,
                          encoder_layer_list=encoder_layer_list,
                          alpha=args.alpha,
                          beta=args.beta,
                          nu1=args.nu1,
                          nu2=args.nu2,
                          batch_size=args.bs,
                          epoch=args.epochs,
                          learning_rate=args.lr)

    t2 = time.time()
    print('cost time is : {}'.format(t2 - t1))

    # 'gcn' evaluates internally and produces no separate embedding file.
    if args.method != 'gcn':
        print("Saving embeddings...")
        model.save_embeddings(args.output)

    if args.label_file and args.method != 'gcn':
        vectors = model.vectors
        X, Y = read_node_label(args.label_file)
        print("Training classifier using {:.2f}% nodes...".format(
            args.clf_ratio * 100))
        clf = Classifier(vectors=vectors, clf=LogisticRegression())
        clf.split_train_evaluate(X, Y, args.clf_ratio)