def run(self):
    """Iteratively add candidate edges to the adjacency matrix to improve
    validation performance.

    Each iteration: embed the current graph, generate candidate edges,
    score them, and add the best ones. Stops after ``self.tao`` iterations
    or after ``self.early_stop`` consecutive non-improving iterations.

    Returns:
        (best_adj, best_performance): the best adjacency matrix found and
        its validation performance.
    """
    A_ = self.adj
    # Keep an independent copy so later updates to A_ cannot mutate the best snapshot.
    best_adj = copy.deepcopy(A_)
    adde = 0
    # Persist the starting graph for later comparison/reproducibility.
    Preprocess.savedata('{}_initadj.npz'.format(self.taskname), self.adj,
                        self.features, self.labels)
    embds1, per1 = self.test(self.adj)
    init_performance = best_performance = per1
    self.output('initial performace: {}, initial edges: {}'.format(
        init_performance, A_.nnz), f=True)
    early_it = 0  # consecutive iterations without improvement
    for i in range(self.tao):
        # Re-embed on the current (possibly grown) graph each iteration.
        embd_, _ = self.test(A_)
        cans = self.cangen.cans(A_, embd_)
        edges_p = self.edgeEval.eval(cans, A_)
        new_adj, new_perf, addededgenum = self.edgeUpdate.update(
            edges_p, A_)
        self.output(
            'time: {}, it: {}, performance: {}, init:{}, best:{}, added {} edges'
            .format(time.asctime(time.localtime(time.time())), i, new_perf,
                    init_performance, best_performance, addededgenum), f=True)
        if new_perf < best_performance:
            early_it += 1
            if early_it >= self.early_stop:
                # Early stop: save the best graph seen so far and abort the loop.
                self.output(
                    '\nearly stop at it: {}, performance: {}, init: {}\n'.
                    format(i, best_performance, init_performance), f=True)
                Preprocess.savedata(
                    '{}_finaladj.npz'.format(self.taskname), best_adj,
                    self.features, self.labels)
                break
        else:
            # New best (or tie): record it and reset the patience counter.
            best_performance = new_perf
            best_adj = new_adj
            adde = addededgenum
            early_it = 0
        # NOTE(review): A_ advances to new_adj even on non-improving
        # iterations — presumably intentional (search continues from the
        # latest graph); confirm against the caller's expectations.
        A_ = new_adj
    # Final evaluation on the unlabeled/test split for both the initial
    # and the best graph.
    init_un_perf = self.final_result(self.adj)
    unlabeled_perf = self.final_result(best_adj)
    nnz_init = self.adj.nnz
    nnz_final = best_adj.nnz
    self.output(
        'init performace, test set {}, val set: {}, init {} edges'.format(
            init_un_perf, init_performance, nnz_init), f=True)
    # nnz counts both directions of an undirected edge, hence the /2.
    self.output(
        'final performace, test set {}, val set: {}, final {} edges, added {} edges'
        .format(unlabeled_perf, best_performance, nnz_final,
                (nnz_final - nnz_init) / 2), f=True)
    return best_adj, best_performance
def test():
    """Run SGCtest on every configured dataset with a freshly built model."""
    for ds_name in datasets:
        graph, feats, lbls = Preprocess.loaddata(
            'data/{}-default.npz'.format(ds_name), llc=False)
        splits = Preprocess.load_split(
            'data/{}-default-split.pkl'.format(ds_name))
        model = ialmodel_gcn(graph, feats, lbls, splits, sgc=usesgc)
        SGCtest(graph, model)
def __init__(self, Adj, features, labels, layersize=16, split_t=None,
             seed=-1, dropout=0.5, sGCN=False):
    """Wrap a GCN model over a preprocessed graph.

    Args:
        Adj: sparse adjacency matrix, N x N.
        features: node feature matrix.
        labels: integer class labels, one per node.
        layersize: hidden layer width.
        split_t: optional (train, val, unlabeled) index triple; if None a
            fresh split is generated.
        seed: random seed passed to the GCN (-1 means unseeded).
        dropout: dropout rate for the GCN.
        sGCN: if True use the GCN_s variant, otherwise GCN_n.
    """
    self.Adj = Adj
    self.features = features
    self.labels = labels
    _N = Adj.shape[0]
    _K = labels.max() + 1
    # One-hot encoding of the labels.
    self._Z_obs = np.eye(_K)[labels]
    self.sizes = [layersize, _K]
    self.seed = seed
    self.dropout = dropout
    self.GCN = GCN_s if sGCN else GCN_n
    # Fix: compare with `is None` (PEP 8) instead of `== None`, which can
    # misbehave for array-like split containers.
    if split_t is None:
        self.split_train, self.split_val, self.split_unlabeled = \
            Preprocess.splitdata(_N, self.labels)
    else:
        # Fix: isinstance() instead of type() == comparison.
        assert isinstance(split_t, tuple) and len(split_t) == 3
        self.split_train, self.split_val, self.split_unlabeled = split_t
    # Normalize the adjacency before handing it to the model.
    adj = utils.preprocess_graph(self.Adj)
    self.model = self.GCN(self.sizes, adj, self.features, "gcn_orig",
                          gpu_id=None, seed=self.seed,
                          params_dict={'dropout': self.dropout})
indir = '/root/hux/npz/'
# indir = dirprefix
outdir = '/root/hux/data/'
dataset = 'cora'

if __name__ == "__main__":
    # Re-save every matching .npz under a normalized name, adding a
    # self-loop to any isolated node so the graph has no empty rows.
    dirs = os.listdir(indir)
    for p in percents:
        count = 0
        for d in dirs:
            print(d)
            # Only process files whose name mentions this percentage.
            if p not in d:
                continue
            ff = outdir + dataset
            if 'init' in d:
                ff += 'init-{}+{}'.format(p, count)
            else:
                ff += 'final-{}+{}'.format(p, count)
            print(ff)
            count += 1
            adj, feature, label = Preprocess.loaddata(indir + d)
            adj = adj.tolil()
            for j in range(feature.shape[0]):
                # Fix: `not adj[j].nonzero()` was always False because
                # nonzero() returns a (row_idx, col_idx) tuple, which is
                # truthy even when both arrays are empty. Check nnz instead
                # so isolated nodes actually receive a self-loop.
                if adj[j].nnz == 0:
                    adj[j, j] = 1
            savedata(adj.tocsr(), feature, label, ff)
pkldir = [ '/Users/davidhu/Desktop/pkl/cora-25/', '/Users/davidhu/Desktop/pkl/cora-50/', '/Users/davidhu/Desktop/pkl/cora-75/', '/Users/davidhu/Desktop/pkl/cora-100/', ] ramainsize = [0.7554,0.9375,1.04657,1.102] ramainsize_ci = [0.88,0.8921,0.6841,0.985] if __name__ == "__main__": # adj, features, labels = Preprocess.loaddata('data/{}.npz'.format(dataset), llc=False) # _A_prev = _A_obs evaltimes = 20 adj, features, labels = Preprocess.loaddata('data/{}-default.npz'.format(dataset)) train, val, test = split_t = Preprocess.load_split('data/{}-default-split.pkl'.format(dataset)) ial = ialmodel_gcn(adj, features, labels, split_t) for ii,ds in enumerate(pkldir): if ii == 0 or ii == 1: continue print('dir name {}'.format(ds)) pkl_loader = file_loader_generator(ds) try: while True: adj_p = next(pkl_loader) mask = upper_triangular_mask(adj_p.shape) actual = delll = 0 results = [] for i in range(evaltimes): # adj_sam = top(a, mask, 2714+500*9)
def run(self):
    """Iteratively add edges to the adjacency matrix to improve validation
    performance.

    Per iteration: (optionally) embed the current graph, generate
    candidates from the not-yet-added edge pool, score them, and add the
    best ones. Early-stop bookkeeping records when improvement stalls, and
    the loop terminates once the minimum edge budget is reached after the
    stall point.

    Returns:
        (best_adj, best_performance): best adjacency found (by validation
        performance) and that performance.
    """
    init_test_perf, init_val_perf, _ = self.gemodel.multitest(self.adj)
    init_performance = best_performance = init_val_perf
    self.output('init performace, test set {}, val set: {}'.format(
        init_test_perf, init_val_perf), f=True)
    # Fix: use `is not None` — `!= None` on a scipy sparse matrix invokes
    # elementwise comparison and is unreliable.
    if self.disturbadj_before is not None:
        A_, _ = self.deleteedges()
    else:
        A_ = self.adj
    # Independent copy so later updates cannot mutate the best snapshot.
    best_adj = copy.deepcopy(A_)
    Preprocess.savedata('{}_initadj.npz'.format(self.taskname), self.adj,
                        self.features, self.labels)
    early_stop_best = 0  # best performance at the moment patience ran out
    early_stop_it = 0    # iteration index where patience ran out
    early_it = 0         # consecutive non-improving iterations
    perfs = []
    for i in range(self.tao):
        # Random candidate generation needs no embedding.
        if self.cg == 'ran':
            embd_ = 0
        else:
            embd_, _ = self.gemodel.singleTrain(A_)
        t1 = time.time()
        cans = self.cangen.cans(A_, embd_, self.edgesunadded)
        t2 = time.time()
        edges_p = self.edgeEval.eval(cans, A_)
        t3 = time.time()
        new_adj, new_perf, addededges = self.edgeUpdate.update(edges_p, A_)
        t4 = time.time()
        # Edges just added can never be candidates again.
        for e in addededges:
            self.edgesunadded.discard(e)
        print('self.edges unadded len: {}'.format(len(self.edgesunadded)))
        test_perf, val_perf, train_perf = self.gemodel.multitest(new_adj)
        t5 = time.time()
        print('time consuming: {} {} {} {}'.format(t2 - t1, t3 - t2,
                                                   t4 - t3, t5 - t4))
        self.output(
            'time: {}, test res: {}, val res: {}, train res: {}'.format(
                time.asctime(time.localtime(time.time())), test_perf,
                val_perf, train_perf), f=True)
        self.output(
            'time: {}, it: {}, performance: {}, init:{}, best:{}, added {} edges'
            .format(time.asctime(time.localtime(time.time())), i, val_perf,
                    init_performance, best_performance,
                    (new_adj.nnz - self.adj.nnz) / 2), f=True)
        if val_perf <= best_performance:
            early_it += 1
            if early_stop_best == 0 and early_it >= self.early_stop:
                # Record (not enforce) the early-stop point; the loop keeps
                # going until the minimum edge budget is met below.
                early_stop_it = i
                early_stop_best = best_performance
                self.output(
                    '\nearly stop at it: {}, performance: {}, init: {}\n'.
                    format(i, best_performance, init_performance), f=True)
                # break
        else:
            best_performance = val_perf
            best_adj = new_adj
            if early_stop_best == 0:
                early_it = 0
        # After the recorded stall point, stop once enough edges were added.
        if early_stop_best != 0 and \
                (new_adj.nnz - self.adj.nnz) / 2 >= self.minedges:
            break
        A_ = new_adj
    # unlabeled_perf, val_perf, train_perf = self.final_result(best_adj)
    unlabeled_perf, val_perf, train_perf = self.gemodel.multitest(best_adj)
    nnz_init = self.adj.nnz
    nnz_final = best_adj.nnz
    self.output(
        'init performace, test set {}, val set: {}, init {} edges'.format(
            init_test_perf, init_val_perf, nnz_init), f=True)
    # nnz counts both directions of an undirected edge, hence the /2.
    self.output(
        'final performace, test set {}, val set: {}, final {} edges, added {} edges'
        .format(unlabeled_perf, best_performance, nnz_final,
                (nnz_final - nnz_init) / 2), f=True)
    Preprocess.savedata('{}_finaladj.npz'.format(self.taskname), best_adj,
                        self.features, self.labels)
    return best_adj, best_performance
def __init__(self, adj, features, labels, tao, minedges, randomedgenum=1000,
             gemodel='GCN', cangen='knn', edgeEval='max', edgeUpdate='easy',
             early_stop=20, seed=-1, dropout=0.5, deleted_edges=None,
             completeadj=None, disturbadj_before=None, params=None,
             dataset=('cora', 1), testindex=1, split_share=(0.1, 0.1),
             expectEdgeNum=-1, spaceF=10, simtype='node', split_seed=-1,
             poolnum=2):
    '''
    args:
        adj: init adj matrix, N*N
        feature: N*D
        tao: iter times
        n: candidate patch size
        s: one patch size
        params: (edgenumPit2add, cannumPit, knn, subsetnum)
                e2a, cand, knn, se
    '''
    self.adj = adj
    self.adjlil = self.adj.copy().tolil()
    self.features = features
    self.tao = tao
    self.labels = labels
    self.early_stop = early_stop
    self.seed = seed
    self.deleted_edges = deleted_edges
    self.dropout = dropout
    self.split_share = split_share
    self.randomedgenum = randomedgenum
    self.cg = cangen
    self.minedges = minedges
    self.disturbadj_before = disturbadj_before
    # Fix: `is None` instead of `== None`.
    if params is None:
        self.params = (20, 20, 20, 20, 5)
    else:
        self.params = params
    (self.edgenumPit2add, self.seedEdgeNum, self.knn, self.subsetnum,
     self.evalPerEdge) = self.params
    self.poolnum = poolnum
    print('iterAddlinks: params:{} start'.format(self.params))
    timenow = time.asctime(time.localtime(time.time()))
    self.taskname = 'ial_res_{}_{}_{}_{}_{}'.format(
        dataset, edgeEval, self.params, testindex, timenow)
    self.outfile = open('{}.txt'.format(self.taskname), 'w')
    split_ss = 123 if split_seed == -1 else split_seed
    _N = self.adj.shape[0]
    self.split_train, self.split_val, self.split_unlabeled = \
        Preprocess.splitdata(_N, self.labels, seed=split_ss,
                             share=self.split_share)
    # SGC evaluation reserves half of the validation split for itself.
    if edgeEval == 'SGC':
        self.sgc_val = self.split_val[int(len(self.split_val) / 2):]
        self.split_val = self.split_val[:int(len(self.split_val) / 2)]
    self.split_t = (self.split_train, self.split_val, self.split_unlabeled)
    if gemodel == 'GCN':
        self.gemodel = ialmodel_gcn(self.adj, self.features, self.labels,
                                    self.split_t)
    elif isinstance(gemodel, ialmodel):
        self.gemodel = gemodel
    else:
        print('wrong gemodel, expected type ialmodel, actually type {}'.
              format(type(gemodel)))
        exit(0)
    # Fix: `is not None` — `!= None` on a scipy sparse matrix performs an
    # elementwise comparison and is unreliable.
    if completeadj is not None:
        # testp, valp, trainp = self.final_result(initadj)
        testp, valp, trainp = self.gemodel.multitest(completeadj)
        self.output(
            'complete adj performance test: {}, val: {}, train: {}'.format(
                testp, valp, trainp), f=True)
    if disturbadj_before is not None:
        testp, valp, trainp = self.gemodel.multitest(disturbadj_before)
        self.output(
            'disturbed before adj performance test: {}, val: {}, train: {}'
            .format(testp, valp, trainp), f=True)
    if cangen == 'knn':
        self.cangen = canGen_knn(self.seedEdgeNum, self.poolnum, self.knn,
                                 simtype=simtype)
    elif cangen == 'ran':
        self.cangen = canGen_ran(self.randomedgenum, _N)
    else:
        self.output('cangen params err')
        exit(0)
    if edgeEval == 'max':
        self.edgeEval = edgeEval_max(self.adj, self.features, self.labels,
                                     self.split_t, self.poolnum, self.knn,
                                     self.evalPerEdge, seed=self.seed,
                                     dropout=self.dropout)
    elif edgeEval == 'SGC':
        self.edgeEval = edgeEval_SGC(self.adj, self.features, self.labels,
                                     self.split_t, self.sgc_val,
                                     poolnum=self.poolnum)
    else:
        self.output('edgeeval params err')
        exit(0)
    if edgeUpdate == 'easy':
        self.edgeUpdate = edgesUpdate_easy(self.adj, self.features,
                                           self.labels, self.split_t,
                                           self.edgenumPit2add, self.poolnum,
                                           self.subsetnum, self.seed,
                                           self.dropout,
                                           expectEdgeNum=expectEdgeNum,
                                           spaceF=spaceF)
    elif edgeUpdate == 'topK':
        self.edgeUpdate = edgesUpdate_k(self.edgenumPit2add)
    else:
        self.output('edgeUpdation params err')
        exit(0)
    # Build the pool of node pairs not yet connected. For large graphs,
    # subsample pairs with a random stride to bound memory.
    self.edgesunadded = set()
    if _N > 5000:
        kf = int((_N * _N) / 10000000)
        for i in range(_N):
            s = random.randint(1, kf)
            for j in range(i + s, _N, kf):
                self.edgesunadded.add((i, j))
    else:
        for i in range(_N):
            for j in range(i + 1, _N):
                self.edgesunadded.add((i, j))
    # Remove pairs that are already edges in the input graph.
    t = self.adj.nonzero()
    rows = t[0]
    cols = t[1]
    print('prev unadded edges size: {}'.format(len(self.edgesunadded)))
    for i in range(len(rows)):
        self.edgesunadded.discard((rows[i], cols[i]))
    print('after unadded edges size: {}'.format(len(self.edgesunadded)))
def deleteedges(self):
    """Rebuild the graph from an empty adjacency by iteratively re-adding
    the original edges that most help validation performance.

    Returns:
        (best_adj, best_performance): best adjacency found and its
        validation performance.
    """
    # A_del, _, __= spa.delete_edges(self.adj, k=1)
    print('delete edges begin')
    # Start from a completely empty graph of the same shape.
    A_del = sp.csr_matrix(([], ([], [])), shape=self.adj.shape)
    # Candidate pool: every (upper-triangular) edge of the original graph.
    edges2add = set()
    t = self.adj.nonzero()
    rows = t[0]
    cols = t[1]
    for i in range(len(rows)):
        if rows[i] <= cols[i]:
            edges2add.add((rows[i], cols[i]))
    init_test_perf, init_val_perf, _ = self.gemodel.multitest(A_del)
    init_performance = best_performance = init_val_perf
    self.output('init performace, test set {}, val set: {}'.format(
        init_test_perf, init_val_perf), f=True)
    # Fix: initialize best_adj before the loop. Previously it was only
    # assigned inside the improving branch, so if no iteration improved on
    # the initial performance the final multitest(best_adj) raised
    # NameError.
    best_adj = A_del
    early_stop_best = 0  # best performance at the moment patience ran out
    early_stop_it = 0    # iteration index where patience ran out
    early_it = 0         # consecutive non-improving iterations
    perfs = []
    for i in range(self.tao):
        if self.cg == 'ran':
            embd_ = 0
        else:
            embd_, _ = self.gemodel.singleTrain(A_del)
        cans = list(edges2add)
        edges_p = self.edgeEval.eval(cans, A_del)
        new_adj, new_perf, addededges = self.edgeUpdate.update(edges_p,
                                                               A_del, p=0.1)
        # Edges just added leave the candidate pool.
        for e in addededges:
            edges2add.discard(e)
        print('self.edges unadded len: {}'.format(len(edges2add)))
        test_perf, val_perf, train_perf = self.gemodel.multitest(new_adj)
        self.output(
            'time: {}, test res: {}, val res: {}, train res: {}'.format(
                time.asctime(time.localtime(time.time())), test_perf,
                val_perf, train_perf), f=True)
        self.output(
            'time: {}, it: {}, performance: {}, init:{}, best:{}, added {} edges'
            .format(time.asctime(time.localtime(time.time())), i, val_perf,
                    init_performance, best_performance,
                    (new_adj.nnz) / 2), f=True)
        if val_perf <= best_performance:
            early_it += 1
            if early_stop_best == 0 and early_it >= self.early_stop:
                # Record (not enforce) the early-stop point; the loop keeps
                # going until the minimum edge budget is met below.
                early_stop_it = i
                early_stop_best = best_performance
                self.output(
                    '\nearly stop at it: {}, performance: {}, init: {}\n'.
                    format(i, best_performance, init_performance), f=True)
                # break
        else:
            best_performance = val_perf
            best_adj = new_adj
            if early_stop_best == 0:
                early_it = 0
        if early_stop_best != 0 and (new_adj.nnz) / 2 >= self.minedges:
            break
        A_del = new_adj
    unlabeled_perf, val_perf, train_perf = self.gemodel.multitest(best_adj)
    nnz_final = best_adj.nnz
    self.output(
        'init performace, test set {}, val set: {}, init {} edges'.format(
            init_test_perf, init_val_perf, 0), f=True)
    # nnz counts both directions of an undirected edge, hence the /2.
    self.output(
        'final performace, test set {}, val set: {}, final {} edges, added {} edges'
        .format(unlabeled_perf, best_performance, nnz_final,
                (nnz_final) / 2), f=True)
    Preprocess.savedata(
        '{}_delete_edges_finaladj.npz'.format(self.taskname), best_adj,
        self.features, self.labels)
    return best_adj, best_performance
def __init__(self, adj, features, labels, tao, n, s, gemodel='GCN',
             edge_Rec='MLE', trainsize=0.5, early_stop=10, seed=-1,
             dropout=0.5, deleted_edges=None, initadj=None, params=None,
             dataset=('cora', 1), testindex=1):
    '''
    args:
        adj: init adj matrix, N*N
        feature: N*D
        tao: iter times
        n: candidate patch size
        s: one patch size
        params: (edgenumPit2add, cannumPit, knn, subsetnum)
                e2a, cand, knn, se
    '''
    self.adj = adj
    self.features = features
    self.tao = tao
    self.n = n
    self.s = s
    self.labels = labels
    self.trainsize = trainsize
    self.early_stop = early_stop
    self.seed = seed
    self.deleted_edges = deleted_edges
    self.dropout = dropout
    # Fix: `is None` instead of `== None`.
    if params is None:
        self.params = (20, 20, 20, 20, 5)
    else:
        self.params = params
    print('iterAddlinks: params:{} start'.format(self.params))
    self.outfile = open(
        'ial_res_{}_{}_{}_{}.txt'.format(dataset, edge_Rec, self.params,
                                         testindex), 'w')
    _N = self.adj.shape[0]
    self.split_train, self.split_val, self.split_unlabeled = \
        Preprocess.splitdata(_N, self.labels)
    # Fix: `is not None` — `!= None` on a scipy sparse matrix performs an
    # elementwise comparison and is unreliable.
    if initadj is not None:
        e, p = self.test(initadj)
        self.output('complete adj performance: {}'.format(p), f=True)
    # Fix: `is None` instead of `== None`.
    if gemodel is None:
        self.gemodel = model_i()
    elif gemodel == 'GCN':
        # self.gemodel = gemodel_GCN(self.adj, self.features, self.labels, seed=self.seed, dropout=0)
        self.gemodel = None
    else:
        print('ERR: wrong graph embedding class')
        exit(-1)
    # Select the edge-reconstruction strategy.
    if edge_Rec == 'rand':
        self.edgeRecMethod = addEdges_random()
    elif edge_Rec == 'rand_test':
        self.edgeRecMethod = addEdges_random_test(
            self.features, self.labels, self.split_train, self.split_val,
            self.split_unlabeled, self.deleted_edges, self.seed)
    elif edge_Rec == 'MLE':
        self.edgeRecMethod = addEdges_MLE(self.features, self.labels,
                                          self.split_train, self.split_val,
                                          self.split_unlabeled)
    elif edge_Rec == 'KNN':
        self.edgeRecMethod = addEdges_KNN(self.features, self.labels,
                                          self.split_train, self.split_val,
                                          self.split_unlabeled,
                                          hyperp=self.params)
    else:
        print('ERR: wrong edge reconstruction class')
        exit(-1)
dataset = 'cora' percent = 0.5 share = (0.052, 0.3693) ps = [0.1, 0.25, 0.5, 0.75, 1] edgenumPit2adds = [10, 20, 50, 100] cannumpits = [20, 50, 100, 200, 500] knns = [20, 50, 100] subsetevalnum = [20, 50, 100, 200, 300] etimeperedge = [5, 10, 20, 30] hps = [edgenumPit2adds, cannumpits, knns, subsetevalnum, etimeperedge] if __name__ == "__main__": _A_obs, feas, labels = Preprocess.loaddata('data/{}.npz'.format(dataset), llc=False) _A_prev = _A_obs # adj, remained, deleted = spa.delete_edges(_A_obs, k=percent) # print('preprocess, delete some edges, remaind edges num(bi): {}'.format(adj.nnz)) # t = IALGE(adj, feas, labels, 100, 10, 10, edge_Rec='rand') # t = IALGE(adj, feas, labels, 100, 10, 10, seed=1, dropout=0) # t = IALGE(adj, feas, labels, 100, 10, 10, seed=1, dropout=0, edge_Rec='rand_test', deleted_edges=deleted, gemodel=None) # t = IALGE(adj, feas, labels, 100, 10, 10, seed=1, dropout=0, edge_Rec='MLE', deleted_edges=deleted, gemodel=None) # t = IALGE(adj, feas, labels, 100, 10, 10) def testhp(index=1, testtimes=3): hp = [20, 20, 50, 50, 10] ds = (dataset, percent)
def __init__(self, adj, features, labels, tao, n, s, gemodel='GCN',
             cangen='knn', edgeEval='max', edgeUpdate='easy', early_stop=20,
             seed=-1, dropout=0.5, deleted_edges=None, initadj=None,
             params=None, dataset=('cora', 1), testindex=1,
             split_share=(0.1, 0.1)):
    '''
    args:
        adj: init adj matrix, N*N
        feature: N*D
        tao: iter times
        n: candidate patch size
        s: one patch size
        params: (edgenumPit2add, cannumPit, knn, subsetnum)
                e2a, cand, knn, se
    '''
    self.adj = adj
    self.features = features
    self.tao = tao
    self.n = n
    self.s = s
    self.labels = labels
    self.early_stop = early_stop
    self.seed = seed
    self.deleted_edges = deleted_edges
    self.dropout = dropout
    self.split_share = split_share
    # Fix: `is None` instead of `== None`.
    if params is None:
        self.params = (20, 20, 20, 20, 5)
    else:
        self.params = params
    (self.edgenumPit2add, self.seedEdgeNum, self.knn, self.subsetnum,
     self.evalPerEdge) = self.params
    self.poolnum = 20
    print('iterAddlinks: params:{} start'.format(self.params))
    timenow = time.asctime(time.localtime(time.time()))
    self.taskname = 'ial_res_{}_{}_{}_{}_{}'.format(
        dataset, edgeEval, self.params, testindex, timenow)
    self.outfile = open('{}.txt'.format(self.taskname), 'w')
    _N = self.adj.shape[0]
    self.split_train, self.split_val, self.split_unlabeled = \
        Preprocess.splitdata(_N, self.labels, share=self.split_share)
    self.split_t = (self.split_train, self.split_val, self.split_unlabeled)
    # Fix: `is not None` — `!= None` on a scipy sparse matrix performs an
    # elementwise comparison and is unreliable.
    if initadj is not None:
        e, p = self.test(initadj)
        self.output('complete adj performance: {}'.format(p), f=True)
    # if gemodel == None:
    #     self.gemodel = model_i()
    # elif gemodel == 'GCN':
    #     # self.gemodel = gemodel_GCN(self.adj, self.features, self.labels, seed=self.seed, dropout=0)
    #     self.gemodel = None
    # else:
    #     print('ERR: wrong graph embedding class')
    #     exit(-1)
    if cangen == 'knn':
        self.cangen = canGen_knn(self.seedEdgeNum, self.poolnum, self.knn)
    else:
        self.output('cangen params err')
        exit(0)
    if edgeEval == 'max':
        self.edgeEval = edgeEval_max(self.adj, self.features, self.labels,
                                     self.split_t, self.poolnum, self.knn,
                                     self.evalPerEdge, seed=self.seed,
                                     dropout=self.dropout)
    else:
        self.output('edgeeval params err')
        exit(0)
    if edgeUpdate == 'easy':
        self.edgeUpdate = edgesUpdate_easy(self.adj, self.features,
                                           self.labels, self.split_t,
                                           self.edgenumPit2add, self.poolnum,
                                           self.subsetnum, self.seed,
                                           self.dropout)
    else:
        self.output('edgeUpdation params err')
        exit(0)
import copy
from disturbEdges import distEdge_ran

# Experiment driver: delete varying fractions of edges from the input
# graph, then randomly disturb the remainder.
warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TensorFlow C++ logging
dataset = 'cora'
# dataset = 'pubmed'
# dataset = 'polblogs'
# dataset = 'citeseer'
percent = 0.85
stype = 'node'
# Dataset-specific (train, val) split shares.
share = (0.052, 0.3693) if dataset == 'cora' else (0.0362, 0.3006)
if dataset == 'pubmed':
    share = (0.003, 0.05)
_A_obs, feas, labels = Preprocess.loaddata('data/{}.npz'.format(dataset),
                                           llc=False)
_N = _A_obs.shape[0]
split_train, split_val, split_unlabeled = split_t = Preprocess.splitdata(
    _N, labels, seed=123, share=share)
distnum = 1000  # number of edges to disturb per run
dst = distEdge_ran(distnum)
deletesizes = [1, 0.8, 0.5, 0.2]  # fractions of edges to keep/delete per run
res = []
labelk = []
fo = '{}/{}'
for i in range(len(deletesizes)):
    # Delete a fraction of edges, then apply the random disturbance.
    adj, remained, deleted = spa.delete_edges(_A_obs, k=deletesizes[i])
    adj_d = dst.disturb(adj)
import time

# Experiment driver: time GCN training on the full vs. edge-deleted graph.
warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TensorFlow C++ logging
dataset = 'cora'
percent = 0.5


def run_time(func, *args, **kw):
    # Call func with the given arguments and print its wall-clock duration.
    start_time = time.time()
    func(*args, **kw)
    end_time = time.time()
    print('run time: {}s'.format(end_time - start_time))


if __name__ == "__main__":
    _A_obs, feas, labels = Preprocess.loaddata(
        'data/{}.npz'.format(dataset), llc=False)
    _A_prev = _A_obs
    # Remove a fraction of edges for the degraded-graph comparison.
    adj, remained, deleted = spa.delete_edges(_A_obs, k=percent)
    _N = _A_prev.shape[0]
    # split_train, split_val, split_unlabeled = Preprocess.splitdata(_N, labels) #seed share as default
    # split_t = (split_train, split_val, split_unlabeled)
    # gcn = gemodel_GCN(_A_prev, feas, labels, split_t=split_t, seed=1, dropout=0)
    # run_time(gcn.train)
    # print('performance: {}, acu: {}'.format(gcn.performance(), gcn.acu()))
    # gcn = gemodel_GCN(adj, feas, labels, split_t=split_t, seed=1, dropout=0)
    # gcn.train()
    # print('performance: {}, acu: {}'.format(gcn.performance(), gcn.acu()))
    split_train, split_val, split_unlabeled = Preprocess.splitdata(
        _N, labels, seed=12, share=(0.052, 0.3693))  # seed share as default