def get_degree_feature_list(self, origin_base_path, start_idx, duration, sep='\t', init_type='gaussian', std=1e-4):
    assert init_type in ['gaussian', 'adj', 'combine', 'one-hot']
    x_list = []
    max_degree = 0
    adj_list = []
    degree_list = []
    date_dir_list = sorted(os.listdir(origin_base_path))
    # find the maximal degree over the list of graphs
    for i in range(start_idx, min(start_idx + duration, self.max_time_num)):
        original_graph_path = os.path.join(origin_base_path, date_dir_list[i])
        adj = get_sp_adj_mat(original_graph_path, self.full_node_list, sep=sep)
        adj_list.append(adj)
        degrees = adj.sum(axis=1).astype(int)  # note: np.int was removed in NumPy >= 1.24
        max_degree = max(max_degree, degrees.max())
        degree_list.append(degrees)
    # generate degree-based features
    input_dim = 0
    for i, degrees in enumerate(degree_list):
        # other structural feature initialization techniques could also be tried to improve performance
        if init_type == 'gaussian':
            fea_list = []
            for degree in degrees:
                fea_list.append(np.random.normal(degree, std, max_degree + 1))
            fea_arr = np.array(fea_list).astype(np.float32)
            input_dim = fea_arr.shape[1]
            fea_tensor = torch.from_numpy(fea_arr).float()
            x_list.append(fea_tensor.cuda() if self.has_cuda else fea_tensor)
        elif init_type == 'adj':
            input_dim = self.node_num
            feat_tensor = sparse_mx_to_torch_sparse_tensor(adj_list[i])
            x_list.append(feat_tensor.cuda() if self.has_cuda else feat_tensor)
        elif init_type == 'combine':
            fea_list = []
            for degree in degrees:
                fea_list.append(np.random.normal(degree, std, max_degree + 1))
            sp_feat = sp.coo_matrix(np.array(fea_list))
            sp_feat = sp.hstack((sp_feat, adj_list[i])).astype(np.float32)
            input_dim = sp_feat.shape[1]
            feat_tensor = sparse_mx_to_torch_sparse_tensor(sp_feat)
            x_list.append(feat_tensor.cuda() if self.has_cuda else feat_tensor)
        else:  # one-hot degree feature
            data = np.ones(degrees.shape[0], dtype=int)
            row = np.arange(degrees.shape[0])
            col = degrees.flatten().A[0]
            spmat = sp.csr_matrix((data, (row, col)), shape=(degrees.shape[0], max_degree + 1))
            sptensor = sparse_mx_to_torch_sparse_tensor(spmat)
            x_list.append(sptensor.cuda() if self.has_cuda else sptensor)
            input_dim = max_degree + 1
    return x_list, input_dim
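# A quick standalone sanity check of the one-hot branch above, on a
# hypothetical 4-node path graph (numpy/scipy only; the names here are
# illustrative and not part of the original code):
import numpy as np
import scipy.sparse as sp

adj = sp.csr_matrix(np.array([[0, 1, 0, 0],
                              [1, 0, 1, 0],
                              [0, 1, 0, 1],
                              [0, 0, 1, 0]], dtype=np.float32))
degrees = np.asarray(adj.sum(axis=1)).astype(int).flatten()  # [1, 2, 2, 1]
max_degree = degrees.max()
onehot = sp.csr_matrix((np.ones(len(degrees)), (np.arange(len(degrees)), degrees)),
                       shape=(len(degrees), max_degree + 1))
print(onehot.toarray())  # each row has a single 1 in column = node degree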
def _preprocess_adj(self, normalization, adj, cuda):
    adj_normalizer = fetch_normalization(normalization)
    r_adj = adj_normalizer(adj)
    r_adj = sparse_mx_to_torch_sparse_tensor(r_adj).float()
    if cuda:
        r_adj = r_adj.cuda()
    return r_adj
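# Every snippet in this file assumes a sparse_mx_to_torch_sparse_tensor helper.
# A minimal sketch of the usual pyGCN-style conversion (an assumption; each
# project's own utility may differ, and a couple of snippets below index the
# result as a tuple instead):
import numpy as np
import scipy.sparse as sp
import torch

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse COO tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    return torch.sparse_coo_tensor(indices, values, torch.Size(sparse_mx.shape))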
def build_cluster_adj(self, clean=False):
    """Build an adjacency matrix that records how many edges each node has
    to each fake-label cluster."""
    adj = np.zeros((self.n_nodes, self.n_clusters), dtype=np.float64)
    for dst, src in self.edges.tolist():
        adj[src, self.fake_labels[dst]] += 1
        adj[dst, self.fake_labels[src]] += 1
    if self.mode in ('clusteradj',) and not clean:  # note: ('clusteradj') without the comma is a string, not a tuple
        # add noise for privacy, then clip negatives and row-normalize
        adj += get_noise(self.args.noise_type, self.n_nodes, self.n_clusters,
                         self.args.noise_seed, eps=self.args.epsilon, delta=self.args.delta)
        adj = np.clip(adj, a_min=0, a_max=None)
        adj = normalize(adj)
        return torch.FloatTensor(adj)
    # clean path: keep the matrix sparse
    adj = sp.coo_matrix(adj)
    adj = normalize(adj)
    return sparse_mx_to_torch_sparse_tensor(adj)
def evaluate_test(model, g, inputs, labels, test_mask, lp_dict, coeffs, meta):
    model.eval()
    with torch.no_grad():
        pred = model(g, inputs).squeeze()
    output = pred.cuda()
    labels = labels.cuda()
    idx_test = lp_dict['idx_test']
    idx_train = lp_dict['idx_train']
    adj = sparse_mx_to_torch_sparse_tensor(normalize(lp_dict['sp_adj']))
    labels, output, adj = labels.cpu(), output.cpu(), adj.cpu()
    loss = F.mse_loss(output[idx_test].squeeze(), labels[idx_test].squeeze())
    r2_test = compute_r2(output[idx_test], labels[idx_test])
    # refine predictions with label propagation using the learned coefficients
    lp_output = lp_refine(idx_test, idx_train, labels, output, adj,
                          torch.tanh(coeffs[0]).item(), torch.exp(coeffs[1]).item())
    lp_r2_test = compute_r2(lp_output, labels[idx_test])
    # refinement with the default covariance parameters, for comparison
    lp_output_raw_cov = lp_refine(idx_test, idx_train, labels, output, adj)
    lp_r2_test_raw_cov = compute_r2(lp_output_raw_cov, labels[idx_test])
    print("------------")
    print("election year {}".format(meta))
    print("loss:", loss.item())
    print("raw_r2:", r2_test)
    print("refined_r2:", lp_r2_test)
    print("refined_r2_raw_cov:", lp_r2_test_raw_cov)
    print("------------")
def evaluate_test(model, g, inputs, labels, test_mask, batch_size, device, lp_dict, meta):
    model.eval()
    with th.no_grad():
        pred = model.inference(g, inputs, batch_size, device).view(-1)
    output = pred.to(device)
    labels = labels.to(device)
    idx_test = lp_dict['idx_test']
    idx_train = lp_dict['idx_train']
    adj = sparse_mx_to_torch_sparse_tensor(normalize(lp_dict['sp_adj']))
    labels, output, adj = labels.cpu(), output.cpu(), adj.cpu()
    loss = F.mse_loss(output[idx_test].squeeze(), labels[idx_test].squeeze())
    r2_test = compute_r2(output[test_mask], labels[test_mask])
    lp_output = lp_refine(idx_test, idx_train, labels, output, adj)
    lp_r2_test = compute_r2(lp_output, labels[idx_test])
    print("------------")
    print("election year {}".format(meta))
    print("loss:", loss.item())
    print("raw_r2:", r2_test)
    print("refined_r2:", lp_r2_test)
    print("------------")
    model.train()
def get_core_adj_list(self, core_base_path, start_idx, duration, max_core=-1):
    date_dir_list = sorted(os.listdir(core_base_path))
    time_stamp_num = len(date_dir_list)
    assert start_idx < time_stamp_num
    core_adj_list = []
    for i in range(start_idx, min(start_idx + duration, self.max_time_num)):
        date_dir_path = os.path.join(core_base_path, date_dir_list[i])
        f_list = sorted(os.listdir(date_dir_path))
        core_file_num = len(f_list)
        tmp_adj_list = []
        if max_core == -1:
            max_core = core_file_num
        f_list = f_list[:max_core]  # select cores 1..max_core
        f_list = f_list[::-1]  # reverse order: max core, (max - 1) core, ..., 1 core
        # get the k-core adjacency matrices at the i-th timestamp
        spmat_list = []
        for j, f_name in enumerate(f_list):
            spmat = sp.load_npz(os.path.join(date_dir_path, f_name))
            spmat_list.append(spmat)
            if j == 0:
                spmat = spmat + sp.eye(spmat.shape[0])
            else:
                delta = spmat - spmat_list[j - 1]
                # skip cores identical to the previous one to reduce computation and memory cost
                if delta.sum() == 0:
                    continue
            # Normalization would shrink the self-loop weight and hurt performance, so it is omitted here.
            sptensor = sparse_mx_to_torch_sparse_tensor(spmat)
            tmp_adj_list.append(sptensor.cuda() if self.has_cuda else sptensor)
        core_adj_list.append(tmp_adj_list)
    return core_adj_list
def test(model, dataset, cfg, logger):
    if cfg.load_from:
        logger.info('load from {}'.format(cfg.load_from))
        load_checkpoint(model, cfg.load_from, strict=True, logger=logger)
    features = torch.FloatTensor(dataset.features)
    adj = sparse_mx_to_torch_sparse_tensor(dataset.adj)
    if not dataset.ignore_label:
        labels = torch.FloatTensor(dataset.labels)
    if cfg.cuda:
        model.cuda()
        features = features.cuda()
        adj = adj.cuda()
        if not dataset.ignore_label:  # guard: labels is undefined when labels are ignored
            labels = labels.cuda()
    model.eval()
    output, gcn_feat = model((features, adj), output_feat=True)
    if not dataset.ignore_label:
        loss = F.mse_loss(output, labels)
        loss_test = float(loss)
        logger.info('[Test] loss = {:.4f}'.format(loss_test))
    pred_confs = output.detach().cpu().numpy()
    gcn_feat = gcn_feat.detach().cpu().numpy()
    return pred_confs, gcn_feat
def get_feature_list(self, feature_base_path, start_idx, duration, sep='\t', shuffle=False):
    if feature_base_path is None:
        x_list = []
        for i in range(start_idx, min(start_idx + duration, self.max_time_num)):
            if shuffle:
                node_indices = np.random.permutation(np.arange(self.node_num))
                spmat = sp.coo_matrix((np.ones(self.node_num), (np.arange(self.node_num), node_indices)),
                                      shape=(self.node_num, self.node_num))
            else:
                spmat = sp.eye(self.node_num)
            sptensor = sparse_mx_to_torch_sparse_tensor(spmat)
            x_list.append(sptensor.cuda() if self.has_cuda else sptensor)
        input_dim = self.node_num
    else:
        feature_file_list = sorted(os.listdir(feature_base_path))
        x_list = []
        feature_arr_list = []
        max_feature_dim = 0
        # find the maximum feature dimension across timestamps
        for i in range(start_idx, min(start_idx + duration, self.max_time_num)):
            feature_file_path = os.path.join(feature_base_path, feature_file_list[i])
            df_feature = pd.read_csv(feature_file_path, sep=sep, header=0)
            max_feature_dim = max(max_feature_dim, df_feature.shape[1])
            feature_arr_list.append(df_feature.values)
        # zero-pad every feature matrix to the same dimension
        for feature_arr in feature_arr_list:
            batch_dim, feature_dim = feature_arr.shape
            expand_feature_arr = np.hstack(
                (feature_arr, np.zeros((batch_dim, max_feature_dim - feature_dim)))).astype(np.float32)
            fea_tensor = torch.from_numpy(expand_feature_arr).float()
            x_list.append(fea_tensor.cuda() if self.has_cuda else fea_tensor)
        input_dim = max_feature_dim
    return x_list, input_dim
def _single_train(model, dataset, cfg, logger):
    if cfg.gpus > 1:
        raise NotImplementedError('multi-GPU training is not supported')
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    features = torch.FloatTensor(dataset.features)
    adj = sparse_mx_to_torch_sparse_tensor(dataset.adj)
    labels = torch.FloatTensor(dataset.labels)
    if cfg.cuda:
        model.cuda()
        features = features.cuda()
        adj = adj.cuda()
        labels = labels.cuda()
    train_data = [[features, adj, labels]]
    runner.run(train_data, cfg.workflow, cfg.total_epochs)
def evaluate_test(model, features, labels, test_mask, lp_dict, coeffs, meta="2012"):
    model.eval()
    with torch.no_grad():
        output = model(features).squeeze()
    output = output.cuda()
    labels = labels.cuda()
    idx_test = lp_dict['idx_test']
    idx_train = lp_dict['idx_train']
    adj = sparse_mx_to_torch_sparse_tensor(normalize(lp_dict['sp_adj']))
    labels, output, adj = labels.cpu(), output.cpu(), adj.cpu()
    loss = F.mse_loss(output[idx_test].squeeze(), labels[idx_test].squeeze())
    r2_test = compute_r2(output[idx_test], labels[idx_test])
    lp_output = lp_refine(idx_test, idx_train, labels, output, adj,
                          torch.tanh(coeffs[0]).item(), torch.exp(coeffs[1]).item())
    lp_r2_test = compute_r2(lp_output, labels[idx_test])
    lp_output_raw_conv = lp_refine(idx_test, idx_train, labels, output, adj)
    lp_r2_test_raw_conv = compute_r2(lp_output_raw_conv, labels[idx_test])
    print("------------")
    print("election year {}".format(meta))
    print("loss:", loss.item())
    print("raw_r2:", r2_test)
    print("refined_r2:", lp_r2_test)
    print("refined_r2_raw_conv:", lp_r2_test_raw_conv)
    print("------------")
def prepare_for_pytorch(self):
    self.edge_index_lists = [0] * len(self.graphs)
    for i in range(len(self.graphs)):
        self.edge_index_lists[i] = adj2edgeindex(self.graphs[i])
    for i in self.layer2pooling_matrices:
        self.layer2pooling_matrices[i] = [
            sparse_mx_to_torch_sparse_tensor(spmat).t()
            for spmat in self.layer2pooling_matrices[i]
        ]
def run(args, seed):
    setup_seed(seed)
    adj, features, labels, idx_train, idx_val, idx_test = load_data(args['dataset'])
    node_num = features.size()[0]
    adj = adj.cuda()
    features = features.cuda()
    labels = labels.cuda()
    loss_func = nn.CrossEntropyLoss()
    loss_func_ss = nn.L1Loss()
    # zero out the diagonal so self-loops do not leak node identity into the
    # self-supervised task
    adj_raw = load_adj_raw(args['dataset']).tocsr()
    idx_mask = list(range(node_num))
    adj_mask = adj_raw
    adj_mask[idx_mask, idx_mask] = 0
    adj_mask = sparse_mx_to_torch_sparse_tensor(normalize_adj(adj_mask)).cuda()
    # self-supervised targets: leading singular vectors of the feature matrix
    reduced_dim = args['reduced_dimension']
    ss_labels, _, _ = features.svd()
    ss_labels = ss_labels[:, :reduced_dim].cuda()
    net_gcn = net.net_gcn_multitask(embedding_dim=args['embedding_dim'], ss_dim=args['reduced_dimension'])
    net_gcn = net_gcn.cuda()
    optimizer = torch.optim.Adam(net_gcn.parameters(), lr=args['lr'], weight_decay=args['weight_decay'])
    best_val = 0
    best_val_test = 0
    for epoch in range(500):
        optimizer.zero_grad()
        output, _ = net_gcn(features, adj)
        _, output_ss = net_gcn(features, adj_mask)
        loss_target = loss_func(output[idx_train], labels[idx_train])
        loss_ss = loss_func_ss(output_ss, ss_labels) * 1e2
        loss = loss_target + loss_ss * args['loss_weight']
        loss.backward()
        optimizer.step()
        # validation
        with torch.no_grad():
            output, _ = net_gcn(features, adj, val_test=True)
            acc_val = f1_score(labels[idx_val].cpu().numpy(),
                               output[idx_val].cpu().numpy().argmax(axis=1), average='micro')
            acc_test = f1_score(labels[idx_test].cpu().numpy(),
                                output[idx_test].cpu().numpy().argmax(axis=1), average='micro')
            if acc_val > best_val:
                best_val = acc_val
                best_val_test = acc_test
    return best_val, best_val_test
def forward(self, input, adj):
    # adj is extracted from the graph structure
    support = torch.mm(input, self.weight)
    # smoothed propagation: a convex combination of self features and
    # neighbor aggregation, (I + s * A) / (1 + s)
    I_n = sp.eye(adj.shape[0])
    I_n = sparse_mx_to_torch_sparse_tensor(I_n).cuda()
    output = torch.spmm((I_n + self.smooth * adj) / (1 + self.smooth), support)
    if self.bias is not None:
        return output + self.bias
    return output
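# A quick numeric check of the smoothing above, on a hypothetical 2-node
# graph (dense torch only, not part of the original code): with smooth = 0
# the layer passes features through unchanged, and as smooth grows the
# output approaches plain neighbor aggregation A @ X.
import torch

A = torch.tensor([[0., 1.], [1., 0.]])
X = torch.tensor([[1., 0.], [0., 1.]])
for s in (0.0, 1.0, 100.0):
    out = (torch.eye(2) + s * A) / (1 + s) @ X
    print(s, out.tolist())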
def build_adj_mat(self, mode='vanilla-clean'):
    if mode == 'vanilla-clean':
        adj = self.build_adj_original()
    elif mode == 'vanilla':
        adj = self.build_adj_vanilla()
    else:
        raise NotImplementedError('mode = {} not implemented!'.format(mode))
    adj = normalize(adj + sp.eye(adj.shape[0]))
    adj = sparse_mx_to_torch_sparse_tensor(adj) if mode == 'vanilla-clean' else torch.FloatTensor(adj)
    return adj
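# Several snippets here row-normalize the adjacency before conversion. A
# minimal sketch of the pyGCN-style `normalize` assumed above (the actual
# helper in each project may differ):
import numpy as np
import scipy.sparse as sp

def normalize(mx):
    """Row-normalize a matrix: D^-1 A."""
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.  # empty rows stay zero instead of dividing by zero
    r_mat_inv = sp.diags(r_inv)
    return r_mat_inv.dot(mx)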
def __init__(self, adj_mat, train_nodes, valid_nodes, test_nodes, device):
    self.adj_mat = adj_mat
    self.train_nodes = train_nodes
    self.valid_nodes = valid_nodes
    self.test_nodes = test_nodes
    self.device = device
    self.num_nodes = adj_mat.shape[0]
    self.num_train_nodes = len(self.train_nodes)
    self.lap_matrix = self.sym_normalize(adj_mat)
    # here sparse_mx_to_torch_sparse_tensor is evidently a variant returning
    # the (indices, values, shape) triple rather than a built tensor
    self.lap_tensor = sparse_mx_to_torch_sparse_tensor(self.lap_matrix)
    self.lap_tensor = torch.sparse.FloatTensor(
        self.lap_tensor[0], self.lap_tensor[1], self.lap_tensor[2]).to(device)
def load_data(path, name='BlogCatalog', exp_id='0'):
    data = sio.loadmat(path + name + exp_id + '.mat')
    C_list = data['T']
    Y1_true_list = data['Y1']
    Y0_true_list = data['Y0']
    idx_trn = data['trn_idx'][0]
    idx_val = data['val_idx'][0]
    idx_tst = data['tst_idx'][0]
    # load features X
    X = data['X'][0]
    Z_init = torch.zeros(X[0].shape[0], args.h_dim)
    X_list = []
    for t in range(len(X)):
        X_list.append(torch.tensor(X[t].todense(), dtype=torch.float32))
    # adjacency matrices A: keep both sparse tensors and dense copies
    sparse_A_list = []
    dense_A_list = []
    A = data['A'][0]
    for t in range(len(A)):
        dense_A_list.append(torch.tensor(A[t].todense()))
        A[t] = sp.csr_matrix(A[t])
        A[t] = utils.sparse_mx_to_torch_sparse_tensor(A[t])
        sparse_A_list.append(A[t])
    C_list = [torch.FloatTensor(C) for C in C_list]
    Y1_true_list = [torch.FloatTensor(y1) for y1 in Y1_true_list]
    Y0_true_list = [torch.FloatTensor(y0) for y0 in Y0_true_list]
    idx_trn = torch.LongTensor(idx_trn)
    idx_val = torch.LongTensor(idx_val)
    idx_tst = torch.LongTensor(idx_tst)
    # reuse the same split at every timestamp
    idx_trn_list = [idx_trn for _ in range(len(A))]
    idx_val_list = [idx_val for _ in range(len(A))]
    idx_tst_list = [idx_tst for _ in range(len(A))]
    return (X_list, sparse_A_list, dense_A_list, C_list, Y1_true_list, Y0_true_list,
            idx_trn_list, idx_val_list, idx_tst_list, Z_init)
def prepare(i_exp):
    # load data and init models
    X, A, T, Y1, Y0 = utils.load_data(args.path, name=args.dataset, original_X=False,
                                      exp_id=str(i_exp), extra_str=args.extrastr)
    n = X.shape[0]
    n_train = int(n * args.tr)
    n_test = int(n * 0.2)
    idx = np.random.permutation(n)
    idx_train, idx_test, idx_val = idx[:n_train], idx[n_train:n_train + n_test], idx[n_train + n_test:]
    X = utils.normalize(X)  # row-normalize features
    X = X.todense()
    X = Tensor(X)
    Y1 = Tensor(np.squeeze(Y1))
    Y0 = Tensor(np.squeeze(Y0))
    T = LongTensor(np.squeeze(T))
    A = utils.sparse_mx_to_torch_sparse_tensor(A, cuda=args.cuda)
    idx_train = LongTensor(idx_train)
    idx_val = LongTensor(idx_val)
    idx_test = LongTensor(idx_test)
    # model and optimizer
    model = GCN_DECONF(nfeat=X.shape[1], nhid=args.hidden, dropout=args.dropout,
                       n_out=args.nout, n_in=args.nin, cuda=args.cuda)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    return X, A, T, Y1, Y0, idx_train, idx_val, idx_test, model, optimizer
def get_mini_batch_dropedge(self, percent=0.8):
    # DropEdge: keep a random `percent` of the nonzero adjacency entries
    nnz = self.adj_mat.nnz
    perm = np.random.permutation(nnz)
    preserve_nnz = int(nnz * percent)
    perm = np.sort(perm[:preserve_nnz])
    adj_mat = self.adj_mat.tocoo()
    adj_mat = sp.coo_matrix(
        (adj_mat.data[perm], (adj_mat.row[perm], adj_mat.col[perm])),
        shape=adj_mat.shape)
    lap_matrix = self.sym_normalize(adj_mat)
    lap_tensor = sparse_mx_to_torch_sparse_tensor(lap_matrix)
    lap_tensor = torch.sparse.FloatTensor(lap_tensor[0], lap_tensor[1],
                                          lap_tensor[2]).to(self.device)
    return [self.train_nodes], lap_tensor
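# The two snippets above index the converted result as a triple, which implies
# a variant of the helper that returns (indices, values, shape) instead of a
# finished tensor. A sketch of what that variant presumably looks like (an
# inference from how lap_tensor[0..2] is consumed, not confirmed code):
import numpy as np
import scipy.sparse as sp
import torch

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Return (indices, values, shape) for deferred tensor construction."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return indices, values, shape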
def test(model, test_adj, test_feats, test_labels, batch_size, epoch):
    t = time.time()
    # convert inputs to tensors
    test_adj = [sparse_mx_to_torch_sparse_tensor(cur_adj) for cur_adj in test_adj]
    test_feats = [torch.FloatTensor(cur_feats) for cur_feats in test_feats]
    test_labels = torch.LongTensor(test_labels).max(1)[1]  # one-hot -> class indices
    model.eval()
    outputs = model(test_feats, test_adj)
    loss_test = F.nll_loss(outputs, test_labels)
    acc_test = accuracy(outputs, test_labels)
    return loss_test.item(), acc_test.item(), time.time() - t
def sgc_precompute(self, adj, features, mode='sgc-clean'):
    normalizer = fetch_normalization(self.args.norm)
    adj = sparse_mx_to_torch_sparse_tensor(normalizer(adj)).float().cuda()
    # for _ in range(self.args.degree):
    features = torch.spmm(adj, features)
    return features
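# SGC precomputation replaces a K-layer GCN with a single pass over features
# precomputed as S^K X for a normalized adjacency S. A toy check of the
# propagation step, assuming the args.degree loop above is re-enabled
# (hypothetical 3-node, row-normalized adjacency; not the original data):
import torch

adj = torch.tensor([[0.0, 0.5, 0.5],
                    [1.0, 0.0, 0.0],
                    [0.5, 0.5, 0.0]]).to_sparse()
x = torch.eye(3)
for _ in range(2):          # plays the role of args.degree
    x = torch.spmm(adj, x)  # S @ (S @ X) == S^2 X
print(x)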
def get_label_weights(opt, test_predictions, test_targets):
    # per-label weights: average number of Hi-C neighbors over the windows
    # where each label is active.
    # note: this function reads the module-level `args`; the `opt` parameter is unused.
    adj = pickle.load(open(
        '/bigtemp/jjl5sw/ChromeGCN/data/' + args.cell_type + '/hic/' + 'test'
        + '_graphs_min1000_samples' + args.hicsize + '_' + args.hicnorm + 'norm.pkl', 'rb'))
    test_data = torch.load('/bigtemp/jjl5sw/ChromeGCN/data/' + args.cell_type + '/test.pt')
    chrom_index_dict = {}
    for idx, sample in enumerate(test_data['loc']):
        chrom = sample[0]
        if chrom not in chrom_index_dict:
            chrom_index_dict[chrom] = []
        chrom_index_dict[chrom].append(idx)
    test_labels = torch.Tensor(test_data['tgt'])
    label_neighbor_count = torch.zeros(len(test_data['tgt'][0]))
    label_count = torch.zeros(len(test_data['tgt'][0]))
    for chrom in chrom_index_dict:
        chrom_indices = torch.Tensor(chrom_index_dict[chrom]).long()
        chrom_labels = torch.index_select(test_labels, 0, chrom_indices)
        chrom_adj = utils.sparse_mx_to_torch_sparse_tensor(adj[chrom].tocoo())
        chrom_adj_d = chrom_adj.to_dense()
        chrom_adj_d[chrom_adj_d > 1] = 1  # binarize contact counts
        for idx, sample_labels in enumerate(chrom_labels):
            sample_labels_nz = sample_labels.nonzero()
            sample_neighbors = chrom_adj_d[idx].sum()
            label_neighbor_count[sample_labels_nz] += sample_neighbors
            label_count[sample_labels_nz] += 1
    normalized_label_weights = label_neighbor_count.div(label_count)
    return normalized_label_weights
def perform_val(model, HEAD1, HEAD_test1, cfg, feature_dim, pair_a, pair_b):
    test_lb2idxs, test_idx2lb = read_meta(cfg.test_data['label_path'])
    test_inst_num = len(test_idx2lb)
    model.eval()
    HEAD1.eval()
    HEAD_test1.eval()
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.test_data)
    features = torch.FloatTensor(dataset.features)
    adj = sparse_mx_to_torch_sparse_tensor(dataset.adj)
    labels = torch.LongTensor(dataset.gt_labels)
    if cfg.cuda:
        features = features.cuda()
        adj = adj.cuda()
        labels = labels.cuda()
        HEAD_test1 = HEAD_test1.cuda()
    test_data = [features, adj, labels]
    HEAD_test1.load_state_dict(HEAD1.state_dict(), False)
    with torch.no_grad():
        output_feature = model(test_data)
        sum_acc = 0
        patch_num = 10
        patch_size = int(test_inst_num / patch_num)
        for i in range(patch_num):
            score = HEAD_test1(output_feature[pair_a[i * patch_size:(i + 1) * patch_size]],
                               output_feature[pair_b[i * patch_size:(i + 1) * patch_size]],
                               no_list=True)
            pre_labels = (score > 0.5).long()
            gt_labels = (labels[pair_a[i * patch_size:(i + 1) * patch_size]]
                         == labels[pair_b[i * patch_size:(i + 1) * patch_size]]).long()
            acc = (pre_labels == gt_labels).long().sum()
            sum_acc += acc
    avg_acc = float(sum_acc) / test_inst_num
    return avg_acc
def full_citation(dataset_str="cora"):
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        with open("dataset/ind.{}.{}".format(dataset_str, name), 'rb') as f:
            objects.append(pkl.load(f, encoding="latin1"))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("dataset/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)
    if dataset_str == "citeseer":
        # citeseer has isolated nodes in the graph; pad tx/ty with zero rows
        # so the indices line up (note the +1, as in the standard Planetoid loader)
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    # symmetrize the adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)
    features = normalize(features)
    # porting to pytorch
    features = torch.FloatTensor(np.array(features.todense())).float()
    labels = torch.LongTensor(labels)
    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)
    adj = sys_normalized_adjacency(adj)
    adj = sparse_mx_to_torch_sparse_tensor(adj)
    return adj, features, labels, idx_train, idx_val, idx_test
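# Both full_citation above and full_load_data below push the graph through
# sys_normalized_adjacency. A sketch of the usual GCN-style symmetric
# normalization D^{-1/2} (A + I) D^{-1/2} it presumably implements (the
# project's own helper may differ in detail):
import numpy as np
import scipy.sparse as sp

def sys_normalized_adjacency(adj):
    adj = sp.coo_matrix(adj) + sp.eye(adj.shape[0])  # add self-loops
    row_sum = np.array(adj.sum(1)).flatten()
    d_inv_sqrt = np.power(row_sum, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return (d_mat_inv_sqrt @ adj @ d_mat_inv_sqrt).tocoo()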
def load_ABIDE(graph_type):
    atlas = "ho"
    connectivity = "correlation"
    # get class labels
    subject_IDs = get_ids()
    labels = get_subject_score(subject_IDs, score="DX_GROUP")
    labels = np.array(list(map(int, list(labels.values())))) - 1
    num_nodes = len(subject_IDs)
    # compute feature vectors (vectorised connectivity networks)
    features = get_networks(subject_IDs, kind=connectivity, atlas_name=atlas)
    # compute the population graph from phenotypic features
    if graph_type == "original":
        final_graph = create_weighted_adjacency()
    elif graph_type == "graph_no_features":
        final_graph = create_weighted_adjacency()
        features = np.identity(num_nodes)
    elif graph_type == "graph_random":
        # random graph with roughly the same edge density as the original
        ones = get_num_edges() / (len(labels) * len(labels))
        final_graph = np.random.choice([0, 1], size=(len(labels), len(labels)), p=[1 - ones, ones])
        final_graph = (final_graph + final_graph.T) / 2
    elif graph_type == "graph_identity":
        final_graph = np.zeros((num_nodes, num_nodes))
        final_graph = normalize(final_graph)
    adj = sp.coo_matrix(final_graph)
    adj = adj + sp.eye(adj.shape[0])
    adj = normalize(adj)
    features = sp.csr_matrix(features)
    features = normalize(features)
    # convert to tensors
    adj = sparse_mx_to_torch_sparse_tensor(adj)
    features = torch.FloatTensor(np.array(features.todense())).float()
    labels = torch.LongTensor(labels)
    return adj, features, labels
def load_trained_vector(epoch, number, n2i_f, file_homes):
    global node2index
    node2index = cPickle.load(n2i_f)
    node_count = len(node2index)
    node_dim = 128
    n_repr = 128
    gcn = GCN(node_count, node_dim, n_repr)
    gcn.load_state_dict(
        torch.load(file_homes + '/networks/GCN_%d_%d.pth' % (number, epoch),
                   map_location='cpu'))
    with open(file_homes + '/networks/adj_matrix_%d_full' % (number), 'rb') as f:
        full_adj_matrix = cPickle.load(f)
    full_adj_matrix = sparse_mx_to_torch_sparse_tensor(full_adj_matrix)
    init_input = torch.LongTensor([j for j in range(0, node_count)])
    gcn.eval()
    rp_matrix = gcn(init_input, full_adj_matrix)
    return rp_matrix.double()
def get_date_adj_list(self, origin_base_path, start_idx, duration, sep='\t',
                      normalize=False, row_norm=False, add_eye=False, data_type='tensor'):
    assert data_type in ['tensor', 'matrix']
    date_dir_list = sorted(os.listdir(origin_base_path))
    date_adj_list = []
    for i in range(start_idx, min(start_idx + duration, self.max_time_num)):
        original_graph_path = os.path.join(origin_base_path, date_dir_list[i])
        spmat = get_sp_adj_mat(original_graph_path, self.full_node_list, sep=sep)
        if add_eye:
            spmat = spmat + sp.eye(spmat.shape[0])
        if normalize:
            spmat = get_normalized_adj(spmat, row_norm=row_norm)
        if data_type == 'tensor':
            sptensor = sparse_mx_to_torch_sparse_tensor(spmat)
            date_adj_list.append(sptensor.cuda() if self.has_cuda else sptensor)
        else:  # data_type == 'matrix'
            date_adj_list.append(spmat)
    return date_adj_list
def build_adj_mat(self, edges, mode='vanilla-clean'):
    if mode in ('vanilla-clean', 'degcn-clean'):
        adj = self.build_adj_original(edges)
    elif mode in ('vanilla', 'degcn'):
        adj = self.build_adj_vanilla()
    else:
        raise NotImplementedError('mode = {} not implemented!'.format(mode))
    if mode == 'degcn':
        # rebuild the undirected edge list from the perturbed adjacency
        self.edges = []
        for u, v in zip(*np.where(adj)):
            if u > v:
                continue
            self.edges.append((u, v))
        print(len(self.edges))
    adj = normalize(adj + sp.eye(adj.shape[0]))
    adj = (sparse_mx_to_torch_sparse_tensor(adj)
           if mode in ('vanilla-clean', 'degcn-clean') else torch.FloatTensor(adj))
    return adj
def load_reg_data(args):
    path = './data/county/election/2012'
    adj = np.load(path + "/A.npy")
    labels = np.load(path + "/labels.npy")
    features = np.load(path + "/feats.npy")
    # index files are presumably 1-based; shift to 0-based
    idx_train = np.load(path + "/train_idx.npy") - 1
    idx_val = np.load(path + "/val_idx.npy") - 1
    idx_test = np.load(path + "/test_idx.npy") - 1
    n = len(adj)
    train_mask = np.zeros(n).astype(bool)
    train_mask[idx_train] = True
    val_mask = np.zeros(n).astype(bool)
    val_mask[idx_val] = True
    test_mask = np.zeros(n).astype(bool)
    test_mask[idx_test] = True
    n_classes = 1
    sp_adj = sp.coo_matrix(adj)
    g = dgl.graph((torch.LongTensor(sp_adj.row), torch.LongTensor(sp_adj.col)))
    lp_dict = {
        'idx_test': torch.LongTensor(idx_test),
        'idx_train': torch.LongTensor(idx_train),
        'sp_adj': sp_adj.astype(float),
        'adj': sparse_mx_to_torch_sparse_tensor(normalize(sp_adj.astype(float)))
    }
    features = torch.FloatTensor(features)
    labels = torch.FloatTensor(labels)
    train_mask = torch.BoolTensor(train_mask)
    val_mask = torch.BoolTensor(val_mask)
    test_mask = torch.BoolTensor(test_mask)
    # out-of-distribution features/labels from the 2016 election
    path = './data/county/election/2016'
    ind_features = torch.FloatTensor(np.load(path + "/feats.npy"))
    ind_labels = torch.FloatTensor(np.load(path + "/labels.npy"))
    return (g, features, labels, n_classes, train_mask, val_mask, test_mask,
            lp_dict, ind_features, ind_labels)
def prepare_data(self):
    if self.mode in ('sgc-clean', 'sgc'):
        if self.dataset in ('reddit', 'flickr', 'ppi', 'ppi-large', 'cora', 'citeseer', 'pubmed'):
            self.features_train = self.sgc_precompute(self.adj_train, self.features_train, mode=self.mode)
            self.features = self.sgc_precompute(self.adj_full, self.features, mode=self.mode)
            self.adj = self.adj_train = None
        elif self.transfer:
            self.features_1 = self.sgc_precompute(self.adj_1, self.features_1, mode=self.mode)
            self.features_2 = self.sgc_precompute(self.adj_2, self.features_2, mode=self.mode)
            self.adj_1 = self.adj_2 = None
        else:
            raise NotImplementedError(f'dataset = {self.dataset} not implemented!')
        print('SGC Precomputing done!')
    elif self.mode in ('clusteradj', 'clusteradj-clean'):
        self.generate_fake_labels()
        if self.args.break_down:
            self.break_down()
        self.prj = self.build_cluster_prj()
        self.adj = self.build_cluster_adj(fnormalize=self.args.fnormalize)
    elif self.mode in ('vanilla', 'vanilla-clean', 'cs'):
        if self.dataset in ('reddit', 'flickr', 'ppi', 'ppi-large', 'cora', 'citeseer', 'pubmed') \
                or self.dataset.startswith('twitch-train'):
            if self.mode == 'vanilla':
                self.adj_full = self.perturb_adj(self.adj_full, self.args.perturb_type)
                self.adj_train = self.perturb_adj(self.adj_train, self.args.perturb_type)
                print('perturbing done!')
            # normalize adjacency matrix
            if self.dataset not in ('cora', 'citeseer', 'pubmed'):
                normalizer = fetch_normalization(self.args.norm)
                self.adj_train = normalizer(self.adj_train)
                self.adj_full = normalizer(self.adj_full)
            self.adj_train = sparse_mx_to_torch_sparse_tensor(self.adj_train)
            self.adj_full = sparse_mx_to_torch_sparse_tensor(self.adj_full)
        elif self.transfer:
            if self.mode == 'vanilla':
                self.adj_1 = self.perturb_adj(self.adj_1, self.args.perturb_type)
                self.adj_2 = self.perturb_adj(self.adj_2, self.args.perturb_type)
                print('perturbing done!')
            elif self.mode == 'cs':
                self.adj_1 = compressive_sensing(self.args, self.adj_1)
                self.adj_2 = compressive_sensing(self.args, self.adj_2)
                print('compressive sensing done!')
            # normalize adjacency matrix
            normalizer = fetch_normalization(self.args.norm)
            self.adj_1 = sparse_mx_to_torch_sparse_tensor(normalizer(self.adj_1))
            self.adj_2 = sparse_mx_to_torch_sparse_tensor(normalizer(self.adj_2))
        else:
            raise NotImplementedError(f'dataset = {self.dataset} not implemented!')
        print('Normalizing Adj done!')
    elif self.mode in ('degree_mlp', 'basic_mlp'):
        self.adj = None
    elif self.mode in ('degcn', 'degcn-clean'):
        self.adj = self.build_adj_mat(self.edges, mode=self.mode)
        self.decompose_graph()
    else:
        raise NotImplementedError('mode = {} not implemented!'.format(self.mode))
    # move whatever was built to the GPU
    if torch.cuda.is_available():
        if hasattr(self, 'adj') and self.adj is not None:
            self.adj = self.adj.cuda()
        if hasattr(self, 'adj_train') and self.adj_train is not None:
            self.adj_train = self.adj_train.cuda()
            self.adj_full = self.adj_full.cuda()
        if hasattr(self, 'adj_1') and self.adj_1 is not None:
            self.adj_1 = self.adj_1.cuda()
            self.adj_2 = self.adj_2.cuda()
        if hasattr(self, 'prj'):
            self.prj = self.prj.cuda()
        if hasattr(self, 'sub_adj'):
            for i in range(len(self.sub_adj)):
                self.sub_adj[i] = self.sub_adj[i].cuda()
def full_load_data(dataset_name, splits_file_path=None):
    if dataset_name in {'cora', 'citeseer', 'pubmed'}:
        adj, features, labels, _, _, _ = full_load_citation(dataset_name)
        labels = np.argmax(labels, axis=-1)
        features = features.todense()
        G = nx.DiGraph(adj)
    else:
        graph_adjacency_list_file_path = os.path.join('new_data', dataset_name, 'out1_graph_edges.txt')
        graph_node_features_and_labels_file_path = os.path.join('new_data', dataset_name,
                                                                'out1_node_feature_label.txt')
        G = nx.DiGraph()
        graph_node_features_dict = {}
        graph_labels_dict = {}
        with open(graph_node_features_and_labels_file_path) as graph_node_features_and_labels_file:
            graph_node_features_and_labels_file.readline()  # skip header
            for line in graph_node_features_and_labels_file:
                line = line.rstrip().split('\t')
                assert len(line) == 3
                assert int(line[0]) not in graph_node_features_dict and int(line[0]) not in graph_labels_dict
                graph_node_features_dict[int(line[0])] = np.array(line[1].split(','), dtype=np.uint8)
                graph_labels_dict[int(line[0])] = int(line[2])
        with open(graph_adjacency_list_file_path) as graph_adjacency_list_file:
            graph_adjacency_list_file.readline()  # skip header
            for line in graph_adjacency_list_file:
                line = line.rstrip().split('\t')
                assert len(line) == 2
                if int(line[0]) not in G:
                    G.add_node(int(line[0]), features=graph_node_features_dict[int(line[0])],
                               label=graph_labels_dict[int(line[0])])
                if int(line[1]) not in G:
                    G.add_node(int(line[1]), features=graph_node_features_dict[int(line[1])],
                               label=graph_labels_dict[int(line[1])])
                G.add_edge(int(line[0]), int(line[1]))
        adj = nx.adjacency_matrix(G, sorted(G.nodes()))
        features = np.array(
            [features for _, features in sorted(G.nodes(data='features'), key=lambda x: x[0])])
        labels = np.array(
            [label for _, label in sorted(G.nodes(data='label'), key=lambda x: x[0])])
    features = preprocess_features(features)
    g = adj
    with np.load(splits_file_path) as splits_file:
        train_mask = splits_file['train_mask']
        val_mask = splits_file['val_mask']
        test_mask = splits_file['test_mask']
    num_features = features.shape[1]
    num_labels = len(np.unique(labels))
    assert np.array_equal(np.unique(labels), np.arange(len(np.unique(labels))))
    features = th.FloatTensor(features)
    labels = th.LongTensor(labels)
    train_mask = th.BoolTensor(train_mask)
    val_mask = th.BoolTensor(val_mask)
    test_mask = th.BoolTensor(test_mask)
    g = sys_normalized_adjacency(g)
    g = sparse_mx_to_torch_sparse_tensor(g)
    return g, features, labels, train_mask, val_mask, test_mask, num_features, num_labels