def load(dataset):
    """Load a citation dataset, caching the processed arrays under data/<dataset>.

    On a cache miss the dataset is downloaded and converted (dense adjacency,
    PPR diffusion matrix, features, labels, train/val/test index vectors), and
    each array is persisted as a .npy file; on a hit the arrays are read back
    from disk. The diffusion matrix is then sparsified with the threshold whose
    surviving-entry density best matches the average degree, min-max scaled,
    and the adjacency is normalized with added self-loops.

    Returns:
        (ori_adj, adj, diff, feat, labels, idx_train, idx_val, idx_test)
        where ori_adj is the unnormalized adjacency and adj the normalized one.
    """
    datadir = os.path.join('data', dataset)
    keys = ('adj', 'diff', 'feat', 'labels', 'idx_train', 'idx_val', 'idx_test')

    if os.path.exists(datadir):
        # Cache hit: read every array back from disk.
        cache = {k: np.load(f'{datadir}/{k}.npy') for k in keys}
    else:
        os.makedirs(datadir)
        ds = download(dataset)
        cache = {
            'adj': nx.to_numpy_array(ds.graph),
            'diff': compute_ppr(ds.graph, 0.2),
            'feat': ds.features[:],
            'labels': ds.labels[:],
            'idx_train': np.argwhere(ds.train_mask == 1).reshape(-1),
            'idx_val': np.argwhere(ds.val_mask == 1).reshape(-1),
            'idx_test': np.argwhere(ds.test_mask == 1).reshape(-1),
        }
        for k in keys:
            np.save(f'{datadir}/{k}.npy', cache[k])

    adj = cache['adj']
    diff = cache['diff']
    feat = cache['feat']
    labels = cache['labels']
    idx_train = cache['idx_train']
    idx_val = cache['idx_val']
    idx_test = cache['idx_test']

    if dataset == 'citeseer':
        feat = preprocess_features(feat)

    # Choose the sparsification threshold whose kept-entry count per row is
    # closest to the graph's average degree, then zero everything below it.
    avg_degree = np.sum(adj) / adj.shape[0]
    epsilon = min(
        [1e-5, 1e-4, 1e-3, 1e-2],
        key=lambda e: abs(
            avg_degree - np.argwhere(diff >= e).shape[0] / diff.shape[0]),
    )
    diff[diff < epsilon] = 0.0
    diff = MinMaxScaler().fit_transform(diff)

    ori_adj = adj.copy()
    # print(ori_adj)
    adj = normalize_adj(adj + sp.eye(adj.shape[0])).todense()
    return ori_adj, adj, diff, feat, labels, idx_train, idx_val, idx_test
def process_raw_data(self):
    """Parse the raw TU-benchmark text files for this dataset into NetworkX graphs.

    Reads ``<name>_graph_indicator.txt``, optional ``<name>_node_labels.txt`` /
    ``<name>_node_attributes.txt``, ``<name>_graph_labels.txt`` and
    ``<name>_A.txt`` from ``dataset/<graph_type>/<name>/`` relative to the
    package root.

    Returns:
        (graphs, aug_view): ``graphs`` is a list of nx.Graph objects with
        node-level 'label' (one-hot) and/or 'feat' attributes plus a
        graph-level 'label'; ``aug_view`` holds ``compute_ppr(graph, alpha=0.2)``
        for each kept graph, index-aligned with ``graphs``.
    """
    root = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.path.pardir))
    prefix = os.path.join(root, 'dataset', self.__graph_type, self.__name,
                          self.__name)
    print(prefix)

    # node id (1-based) -> graph id (1-based)
    graph_node_dict = {}
    with open('{0}_graph_indicator.txt'.format(prefix), 'r') as f:
        for idx, line in enumerate(f):
            graph_node_dict[idx + 1] = int(line.strip('\n'))
    # Node count of the largest graph (most common graph id's frequency).
    max_nodes = Counter(graph_node_dict.values()).most_common(1)[0][1]

    node_labels = []
    if os.path.exists('{0}_node_labels.txt'.format(prefix)):
        with open('{0}_node_labels.txt'.format(prefix), 'r') as f:
            for line in f:
                node_labels += [int(line.strip('\n'))]
        # NOTE(review): assumes labels are contiguous ints starting at 0;
        # a dataset using -1 (the original comment hinted at this) would
        # need an offset here — verify per dataset.
        num_unique_node_labels = max(node_labels) + 1
    else:
        print('No node labels')

    node_attrs = []
    if os.path.exists('{0}_node_attributes.txt'.format(prefix)):
        with open('{0}_node_attributes.txt'.format(prefix), 'r') as f:
            for line in f:
                # Fixes vs. original: raw regex string (so \s is the
                # whitespace class, not an invalid escape), plain .strip()
                # instead of .strip('\s\n') which removed the literal
                # characters '\', 's' and newline, and dtype=float because
                # np.float was removed in NumPy 1.20+.
                node_attrs.append(
                    np.array([
                        float(attr)
                        for attr in re.split(r"[,\s]+", line.strip())
                        if attr
                    ], dtype=float))
    else:
        print('No node attributes')

    graph_labels = []
    unique_labels = set()
    with open('{0}_graph_labels.txt'.format(prefix), 'r') as f:
        for line in f:
            val = int(line.strip('\n'))
            if val not in unique_labels:
                unique_labels.add(val)
            graph_labels.append(val)
    # Remap raw label values onto 0..C-1 class indices.
    label_idx_dict = {val: idx for idx, val in enumerate(unique_labels)}
    graph_labels = np.array(
        [label_idx_dict[element] for element in graph_labels])

    # Edges contained in each graph.
    adj_list = {idx: [] for idx in range(1, len(graph_labels) + 1)}
    # Nodes contained in each graph (0-based after the dedup pass below).
    index_graph = {idx: [] for idx in range(1, len(graph_labels) + 1)}
    with open('{0}_A.txt'.format(prefix), 'r') as f:
        for line in f:
            u, v = tuple(map(int, line.strip('\n').split(',')))
            adj_list[graph_node_dict[u]].append((u, v))
            index_graph[graph_node_dict[u]] += [u, v]
    for k in index_graph.keys():
        index_graph[k] = [u - 1 for u in set(index_graph[k])]

    graphs, aug_view = [], []
    for idx in range(1, 1 + len(adj_list)):
        graph = nx.from_edgelist(adj_list[idx])
        # Skip graphs larger than the most common size.
        if max_nodes is not None and graph.number_of_nodes() > max_nodes:
            continue
        graph.graph['label'] = graph_labels[idx - 1]
        for u in graph.nodes():
            if len(node_labels) > 0:
                node_label_one_hot = [0] * num_unique_node_labels
                node_label = node_labels[u - 1]
                node_label_one_hot[node_label] = 1
                graph.nodes[u]['label'] = node_label_one_hot
            if len(node_attrs) > 0:
                graph.nodes[u]['feat'] = node_attrs[u - 1]
        if len(node_attrs) > 0:
            graph.graph['feat_dim'] = node_attrs[0].shape[0]
        # Relabel nodes onto a contiguous 0..n-1 range.
        mapping = {}
        for node_idx, node in enumerate(graph.nodes()):
            mapping[node] = node_idx
        graphs.append(nx.relabel_nodes(graph, mapping))
        aug_view.append(compute_ppr(graph, alpha=0.2))

    # No raw attributes available: fall back to a one-hot degree vector
    # (prefixed with the one-hot node label when present) as 'feat'.
    if 'feat_dim' not in graphs[0].graph:
        max_deg = max(
            [max(dict(graph.degree).values()) for graph in graphs])
        for graph in graphs:
            for u in graph.nodes(data=True):
                f = np.zeros(max_deg + 1)
                f[graph.degree[u[0]]] = 1.0
                if 'label' in u[1]:
                    f = np.concatenate(
                        (np.array(u[1]['label'], dtype=float), f))
                graph.nodes[u[0]]['feat'] = f
    return graphs, aug_view
# NOTE(review): this `return 0` is the tail of a definition whose header is
# not visible in this chunk — left byte-identical.
return 0


if __name__ == '__main__':
    import warnings
    warnings.filterwarnings("ignore")
    a = 1  # scratch variable; appears unused
    # print(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
    # Smoke test: load the PubMed citation graph and time one PPR computation.
    ds = CitationGraphDataset(name='pubmed')
    # nx.draw(ds.graph, node_size=10)
    # plt.show()
    # print(nx.to_numpy_array(ds.graph).shape)
    print('start')
    start = time.time()
    # spectral_perturbation(ds.graph)
    compute_ppr(ds.graph, 0.2)  # result discarded; only the wall time is reported
    finish = time.time()
    print('finish ', finish - start)
    # print(ds)
    # print(nx.to_numpy_array(ds.graph).shape)
    # print(n)
    # print(ds.features)
    # path = 'D:/VSrepos/spectral-graph-augmentation/dataset/node'
    # if not os.path.exists(path):
    #     print('create')
    #     os.makedirs(path)
    # else:
    #     print('exist')
    # GraphData('PTC_MR').process_raw_data()