def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)

    edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl'
    t = time.perf_counter()
    if not osp.exists(edge_path):
        log.info('Converting adjacency matrix...')
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = edge_index.T

        # Symmetrize: add a reversed copy of every edge, then dedupe.
        edges_new = np.zeros((edge_index.shape[0], 2))
        edges_new[:, 0] = edge_index[:, 1]
        edges_new[:, 1] = edge_index[:, 0]
        edge_index = np.vstack((edge_index, edges_new))
        edge_index = np.unique(edge_index, axis=0)

        graph = Graph(edge_index, sorted=True)
        graph.adj_dst_index
        graph.dump(edge_path)
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

    np.random.seed(self.seed)
    self.train_idx = dataset.get_idx_split('train')
    np.random.shuffle(self.train_idx)

    self.val_idx = dataset.get_idx_split('valid')
    self.test_idx = dataset.get_idx_split('test')

    self.x = dataset.paper_feat
    self.y = dataset.all_paper_label

    self.graph = Graph.load(edge_path, mmap_mode='r+')

    log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
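# A minimal numpy-only sketch of the symmetrize-and-dedupe step above (added
# for illustration; toy edge ids, not MAG240M data): every directed edge gets
# a reversed copy, and np.unique drops duplicates, including pre-existing
# reciprocal pairs.
import numpy as np

edge_index = np.array([[0, 1], [1, 0], [1, 2]])           # (num_edges, 2)
reversed_edges = edge_index[:, ::-1]                       # swap src/dst
symmetric = np.unique(np.vstack((edge_index, reversed_edges)), axis=0)
print(symmetric)  # [[0 1] [1 0] [1 2] [2 1]] -- each edge in both directions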
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)
    log.info(dataset.num_authors)
    log.info(dataset.num_institutions)
    author_path = f'{dataset.dir}/author_feat_year.npy'
    path = f'{dataset.dir}/institution_feat_year.npy'
    t = time.perf_counter()
    if not osp.exists(path):
        log.info('get institution_feat...')
        author_feat = np.memmap(author_path,
                                dtype=np.int32,
                                mode='r',
                                shape=(dataset.num_authors, ))
        author_feat = author_feat[:]
        author_feat = np.expand_dims(author_feat, axis=1)

        # author -> institution edges
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        log.info(edge_index.shape)
        institution_graph = Graph(edge_index,
                                  num_nodes=dataset.num_institutions)
        institution_graph.tensor()
        log.info('finish institution graph')

        institution_x = np.memmap(path,
                                  dtype=np.int32,
                                  mode='w+',
                                  shape=(dataset.num_institutions, ))

        degree = paddle.zeros(shape=[dataset.num_institutions, 1],
                              dtype='float32')
        degree += 1e-10  # guard against zero in-degree, as in the author aggregation
        temp_one = paddle.ones(shape=[edge_index.shape[0], 1],
                               dtype='float32')
        degree = scatter(degree,
                         overwrite=False,
                         index=institution_graph.edges[:, 1],
                         updates=temp_one)
        log.info('finish degree')

        # Mean-aggregate author years into institution features.
        inputs = author_feat
        inputs = paddle.to_tensor(inputs, dtype='float32')
        outputs = institution_graph.send_recv(inputs)
        outputs = outputs / degree
        outputs = outputs.astype('int32').numpy()
        del inputs
        save_col_slice(x_src=outputs,
                       x_dst=institution_x,
                       start_row_idx=0,
                       end_row_idx=dataset.num_institutions)
        del outputs

        institution_x.flush()
        del institution_x
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
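# The send_recv / degree pattern above computes a per-institution mean of
# author features. A numpy-only sketch of that mean aggregation (toy sizes;
# np.add.at plays the role of the scatter/send_recv pair, and the epsilon
# guards isolated nodes):
import numpy as np

num_institutions = 3
edges = np.array([[0, 0], [1, 0], [2, 1], [3, 1], [4, 1]])  # (author, institution)
author_feat = np.arange(5, dtype='float32')[:, None]         # (5, 1)

sums = np.zeros((num_institutions, 1), dtype='float32')
degree = np.zeros((num_institutions, 1), dtype='float32')
np.add.at(sums, edges[:, 1], author_feat[edges[:, 0]])       # scatter-add features
np.add.at(degree, edges[:, 1], 1.0)                          # scatter-add counts
mean_feat = sums / np.maximum(degree, 1e-10)
print(mean_feat.ravel())  # [0.5 3.  0. ]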
def __init__(self,
             edges,
             num_nodes=None,
             node_types=None,
             node_feat=None,
             edge_feat=None,
             **kwargs):
    self._edges_dict = edges

    if isinstance(node_types, list):
        self._node_types = np.array(node_types, dtype=object)[:, 1]
    else:
        self._node_types = node_types

    if num_nodes is None:
        self._num_nodes = len(node_types)
    else:
        self._num_nodes = num_nodes

    self._nodes_type_dict = {}
    for ntype in np.unique(self._node_types):
        self._nodes_type_dict[ntype] = np.where(
            self._node_types == ntype)[0]

    if node_feat is not None:
        self._node_feat = node_feat
    else:
        self._node_feat = {}

    if edge_feat is not None:
        self._edge_feat = edge_feat
    else:
        self._edge_feat = {}

    # When a prebuilt multi_graph is passed (e.g. from `load`), `edges` may be
    # None and the per-etype graphs are not rebuilt.
    if "multi_graph" in kwargs.keys():
        self._multi_graph = kwargs["multi_graph"]
    else:
        self._multi_graph = {}
        for etype, _edges in self._edges_dict.items():
            if not self._edge_feat:
                edge_feat = None
            else:
                edge_feat = self._edge_feat[etype]
            self._multi_graph[etype] = Graph(
                edges=_edges,
                num_nodes=self._num_nodes,
                node_feat=copy.deepcopy(self._node_feat),
                edge_feat=edge_feat)

    self._edge_types = self.edge_types_info()
    self._nodes = None

    # All subgraphs share one backend; checking the first is enough.
    for etype, g in self._multi_graph.items():
        if g.is_tensor():
            self._is_tensor = True
        else:
            self._is_tensor = False
        break
def load(cls, path, mmap_mode="r"):
    """Load HeterGraph from path and return a HeterGraph instance in numpy.

    Args:

        path: The directory path of the stored HeterGraph.

        mmap_mode: Default :code:`mmap_mode="r"`. If not None, memory-map the graph.
    """
    _node_types = np.load(os.path.join(path, "node_types.npy"),
                          allow_pickle=True)

    with open(os.path.join(path, "edge_types.pkl"), "rb") as f:
        _edge_types = pkl.load(f)

    _multi_graph = {}
    for etype in _edge_types:
        sub_path = os.path.join(path, etype)
        _multi_graph[etype] = Graph.load(sub_path, mmap_mode)

    return cls(
        edges=None,
        node_types=_node_types,
        multi_graph=_multi_graph, )
def subgraph(graph,
             nodes,
             eid=None,
             edges=None,
             with_node_feat=True,
             with_edge_feat=True):
    """Generate a subgraph with the given nodes and edge ids.

    This function will generate a :code:`pgl.graph.Subgraph` object and
    copy all corresponding node and edge features. Nodes and edges will
    be reindexed from 0. ``eid`` and ``edges`` can't both be None.

    WARNING: ALL NODES IN EID MUST BE INCLUDED BY NODES

    Args:
        nodes: Node ids which will be included in the subgraph.
        eid (optional): Edge ids which will be included in the subgraph.
        edges (optional): Edge(src, dst) list which will be included in the subgraph.
        with_node_feat: Whether to inherit node features from the parent graph.
        with_edge_feat: Whether to inherit edge features from the parent graph.

    Return:
        A :code:`pgl.Graph` object.
    """
    assert not graph.is_tensor(), "You must call Graph.numpy() first."

    if eid is None and edges is None:
        raise ValueError("Eid and edges can't be None at the same time.")

    # Map parent-graph node ids onto the compact 0..len(nodes)-1 range.
    reindex = {}
    for ind, node in enumerate(nodes):
        reindex[node] = ind

    sub_edge_feat = {}
    if edges is None:
        edges = graph._edges[eid]
    else:
        edges = np.array(edges, dtype="int64")

    if with_edge_feat:
        for key, value in graph._edge_feat.items():
            if eid is None:
                raise ValueError("Eid can not be None with edge features.")
            sub_edge_feat[key] = value[eid]

    sub_edges = pgl.graph_kernel.map_edges(
        np.arange(len(edges), dtype="int64"), edges, reindex)

    sub_node_feat = {}
    if with_node_feat:
        for key, value in graph._node_feat.items():
            sub_node_feat[key] = value[nodes]

    g = Graph(edges=sub_edges,
              num_nodes=len(nodes),
              node_feat=sub_node_feat,
              edge_feat=sub_edge_feat)
    return g
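# What pgl.graph_kernel.map_edges does, sketched in plain numpy (added for
# illustration; toy ids): remap the kept edges onto the subgraph's compact
# 0..len(nodes)-1 id space.
import numpy as np

nodes = [0, 2, 3, 4]
reindex = {node: ind for ind, node in enumerate(nodes)}
edges = np.array([[3, 4], [0, 2]])                 # edges in parent-graph ids
sub_edges = np.vectorize(reindex.get)(edges)       # edges in subgraph ids
print(sub_edges)  # [[2 3] [0 1]]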
def build_graph(num_nodes, edge_path):
    filelist = []
    if os.path.isfile(edge_path):
        filelist = [edge_path]
    elif os.path.isdir(edge_path):
        filelist = [
            os.path.join(dp, f) for dp, dn, filenames in os.walk(edge_path)
            for f in filenames
        ]
    else:
        raise ValueError(edge_path + " not supported")

    # Edges are stored once per direction; weights are duplicated accordingly.
    edges, edge_weight = [], []
    for name in filelist:
        with open(name) as inf:
            for line in inf:
                slots = line.strip("\n").split()
                edges.append([slots[0], slots[1]])
                edges.append([slots[1], slots[0]])
                if len(slots) > 2:
                    edge_weight.extend([float(slots[2]), float(slots[2])])
    edges = np.array(edges, dtype="int64")
    assert num_nodes > edges.max(
    ), "Node id in any edge should be smaller than num_nodes!"

    edge_feat = dict()
    if len(edge_weight) == len(edges):
        edge_feat["weight"] = np.array(edge_weight)

    graph = Graph(num_nodes, edges, edge_feat=edge_feat)
    log.info("Build graph done")

    graph.outdegree()

    del edges, edge_feat
    log.info("Build graph index done")

    if "weight" in graph.edge_feat:
        graph.node_feat["alias"], graph.node_feat[
            "events"] = graph_alias_sample_table(graph, "weight")
        log.info("Build graph alias sample table done")
    return graph
def _load_data(self): """Load data""" content = os.path.join(self.path, 'cora.content') cite = os.path.join(self.path, 'cora.cites') node_feature = [] paper_ids = [] y = [] y_dict = {} with open(content, 'r') as f: for line in f: line = line.strip().split() paper_id = int(line[0]) paper_class = line[-1] if paper_class not in y_dict: y_dict[paper_class] = len(y_dict) feature = [int(i) for i in line[1:-1]] feature_array = np.array(feature, dtype="float32") # Normalize feature_array = feature_array / (np.sum(feature_array) + 1e-15) node_feature.append(feature_array) y.append(y_dict[paper_class]) paper_ids.append(paper_id) paper2vid = dict([(v, k) for (k, v) in enumerate(paper_ids)]) num_nodes = len(paper_ids) node_feature = np.array(node_feature, dtype="float32") all_edges = [] with open(cite, 'r') as f: for line in f: u, v = line.split() u = paper2vid[int(u)] v = paper2vid[int(v)] all_edges.append((u, v)) if self.symmetry_edges: all_edges.append((v, u)) if self.self_loop: for i in range(num_nodes): all_edges.append((i, i)) all_edges = list(set(all_edges)) self.graph = Graph(num_nodes=num_nodes, edges=all_edges, node_feat={"words": node_feature}) perm = np.arange(0, num_nodes) #np.random.shuffle(perm) self.train_index = perm[:140] self.val_index = perm[200:500] self.test_index = perm[500:1500] self.y = np.array(y, dtype="int64") self.num_classes = len(y_dict)
def _load_data(self):
    np.random.seed(self.np_random_seed)
    edge_path = os.path.join(self.path, 'ca-AstroPh.txt')
    bi_edges = set()
    self.neg_edges = []
    self.pos_edges = []
    self.node2id = dict()

    def node_id(node):
        if node not in self.node2id:
            self.node2id[node] = len(self.node2id)
        return self.node2id[node]

    with io.open(edge_path) as inf:
        # Skip the four header lines.
        for _ in range(4):
            inf.readline()
        for line in inf:
            u, v = line.strip('\n').split('\t')
            u, v = node_id(u), node_id(v)
            if u < v:
                bi_edges.add((u, v))
            else:
                bi_edges.add((v, u))
    num_nodes = len(self.node2id)

    # Rejection-sample negative edges until we have half as many as positives.
    while len(self.neg_edges) < len(bi_edges) // 2:
        random_edges = np.random.choice(num_nodes, [len(bi_edges), 2])
        for (u, v) in random_edges:
            if u != v and (u, v) not in bi_edges and (v, u) not in bi_edges:
                self.neg_edges.append((u, v))
                if len(self.neg_edges) == len(bi_edges) // 2:
                    break

    bi_edges = list(bi_edges)
    np.random.shuffle(bi_edges)
    self.pos_edges = bi_edges[:len(bi_edges) // 2]
    bi_edges = bi_edges[len(bi_edges) // 2:]

    all_edges = []
    for edge in bi_edges:
        u, v = edge
        all_edges.append((u, v))
        all_edges.append((v, u))
    self.graph = Graph(num_nodes=num_nodes, edges=all_edges)
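# The rejection loop above, reduced to a self-contained numpy sketch (toy
# edges, hypothetical sizes): draw random pairs and keep those that are
# neither self-loops nor existing edges, until half as many negatives as
# positives are collected.
import numpy as np

rng = np.random.default_rng(0)
bi_edges = {(0, 1), (1, 2), (2, 3), (3, 4)}
num_nodes, target = 5, len(bi_edges) // 2
neg_edges = []
while len(neg_edges) < target:
    u, v = rng.integers(num_nodes, size=2)
    if u != v and (u, v) not in bi_edges and (v, u) not in bi_edges:
        neg_edges.append((int(u), int(v)))
print(neg_edges)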
def __init__(self,
             num_nodes,
             edges,
             node_types=None,
             node_feat=None,
             edge_feat=None):
    self._num_nodes = num_nodes
    self._edges_dict = edges

    if isinstance(node_types, list):
        self._node_types = np.array(node_types, dtype=object)[:, 1]
    else:
        self._node_types = node_types

    self._nodes_type_dict = {}
    for n_type in np.unique(self._node_types):
        self._nodes_type_dict[n_type] = np.where(
            self._node_types == n_type)[0]

    if node_feat is not None:
        self._node_feat = node_feat
    else:
        self._node_feat = {}

    if edge_feat is not None:
        self._edge_feat = edge_feat
    else:
        self._edge_feat = {}

    self._multi_graph = {}
    for key, value in self._edges_dict.items():
        if not self._edge_feat:
            edge_feat = None
        else:
            edge_feat = self._edge_feat[key]
        self._multi_graph[key] = Graph(num_nodes=self._num_nodes,
                                       edges=value,
                                       node_feat=self._node_feat,
                                       edge_feat=edge_feat)

    self._edge_types = self.edge_types_info()
def build_graph(self, x_batch):
    """build graph"""
    B, T, n, _ = x_batch.shape
    batch = B * T
    # Offset the template edges for each of the B*T copies so the batched
    # graphs stay disjoint.
    batch_edges = []
    for i in range(batch):
        batch_edges.append(self.edges + (i * n))
    batch_edges = np.vstack(batch_edges)
    num_nodes = B * T * n
    node_feat = {'norm': np.tile(self.norm, [batch, 1])}
    edge_feat = {'weights': np.tile(self.weights, [batch, 1])}
    graph = Graph(num_nodes=num_nodes,
                  edges=batch_edges,
                  node_feat=node_feat,
                  edge_feat=edge_feat)
    return graph
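# The edge-offset trick above, shown on toy numbers: stacking `batch` copies
# of an n-node template graph into one disjoint graph by shifting node ids by
# i * n for copy i.
import numpy as np

n = 3
template_edges = np.array([[0, 1], [1, 2]])
batch = 2
batch_edges = np.vstack([template_edges + i * n for i in range(batch)])
print(batch_edges)  # [[0 1] [1 2] [3 4] [4 5]] -- two disjoint copies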
def add_self_loop(graph, sub_nodes=None):
    '''add_self_loop_for_subgraph
    '''
    assert not graph.is_tensor(), "You must call Graph.numpy() first."

    if sub_nodes is not None:
        self_loop_edges = np.zeros((sub_nodes.shape[0], 2))
        self_loop_edges[:, 0] = self_loop_edges[:, 1] = sub_nodes
    else:
        self_loop_edges = np.zeros((graph.num_nodes, 2))
        self_loop_edges[:, 0] = self_loop_edges[:, 1] = np.arange(
            graph.num_nodes)

    edges = np.vstack((graph.edges, self_loop_edges))
    edges = np.unique(edges, axis=0)
    new_g = Graph(
        edges=edges,
        num_nodes=graph.num_nodes, )
    return new_g
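# A numpy-only sketch of add_self_loop on a toy edge list: append (i, i) for
# every node, then dedupe in case some self loops already existed.
import numpy as np

num_nodes = 3
edges = np.array([[0, 1], [1, 1]])
loops = np.stack([np.arange(num_nodes)] * 2, axis=1)   # [[0 0] [1 1] [2 2]]
edges = np.unique(np.vstack((edges, loops)), axis=0)
print(edges)  # [[0 0] [0 1] [1 1] [2 2]]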
def _load_data(self):
    edge_path = os.path.join(self.path, 'edges.csv')
    node_path = os.path.join(self.path, 'nodes.csv')
    group_edge_path = os.path.join(self.path, 'group-edges.csv')

    all_edges = []

    with io.open(node_path) as inf:
        num_nodes = len(inf.readlines())

    # Multi-hot group membership as the node feature.
    node_feature = np.zeros((num_nodes, self.num_groups))

    with io.open(group_edge_path) as inf:
        for line in inf:
            node_id, group_id = line.strip('\n').split(',')
            node_id, group_id = int(node_id) - 1, int(group_id) - 1
            node_feature[node_id][group_id] = 1

    with io.open(edge_path) as inf:
        for line in inf:
            u, v = line.strip('\n').split(',')
            u, v = int(u) - 1, int(v) - 1
            all_edges.append((u, v))
            if self.symmetry_edges:
                all_edges.append((v, u))

    if self.self_loop:
        for i in range(num_nodes):
            all_edges.append((i, i))

    all_edges = list(set(all_edges))
    self.graph = Graph(num_nodes=num_nodes,
                       edges=all_edges,
                       node_feat={"group_id": node_feature})

    perm = np.arange(0, num_nodes)
    np.random.shuffle(perm)
    train_num = int(num_nodes * 0.5)
    self.train_index = perm[:train_num]
    self.test_index = perm[train_num:]
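# The group-edges file above becomes a multi-hot membership matrix. A tiny
# numpy sketch of that construction (toy ids; 1-based ids shifted to 0-based,
# as in the loader):
import numpy as np

num_nodes, num_groups = 3, 2
group_edges = [(1, 1), (1, 2), (3, 2)]     # (node_id, group_id), 1-based
node_feature = np.zeros((num_nodes, num_groups))
for node_id, group_id in group_edges:
    node_feature[node_id - 1][group_id - 1] = 1
print(node_feature)  # [[1. 1.] [0. 0.] [0. 1.]]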
def _load_data(self, normalize=True, symmetry=True):
    from sklearn.preprocessing import StandardScaler
    import scipy.sparse as sp
    data = np.load(os.path.join(self.path, "reddit.npz"))
    adj = sp.load_npz(os.path.join(self.path, "reddit_adj.npz"))
    if symmetry:
        adj = adj + adj.T
    adj = adj.tocoo()
    src = adj.row
    dst = adj.col

    num_classes = 41
    train_label = data['y_train']
    val_label = data['y_val']
    test_label = data['y_test']

    train_index = data['train_index']
    val_index = data['val_index']
    test_index = data['test_index']

    feature = data["feats"].astype("float32")

    if normalize:
        # Fit the scaler on training nodes only to avoid information leakage.
        scaler = StandardScaler()
        scaler.fit(feature[train_index])
        feature = scaler.transform(feature)

    graph = Graph(num_nodes=feature.shape[0], edges=list(zip(src, dst)))

    self.graph = graph
    self.train_index = train_index
    self.train_label = train_label
    self.val_label = val_label
    self.val_index = val_index
    self.test_index = test_index
    self.test_label = test_label
    self.feature = feature
    self.num_classes = num_classes
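# Fit-on-train-only standardization, as used above, in a self-contained
# sklearn sketch (toy data; StandardScaler is the same class the loader uses):
import numpy as np
from sklearn.preprocessing import StandardScaler

feature = np.array([[0.], [2.], [4.], [100.]], dtype='float32')
train_index = np.array([0, 1, 2])          # statistics come from these rows only
scaler = StandardScaler().fit(feature[train_index])
feature = scaler.transform(feature)        # row 3 is scaled, never re-fit
print(feature.ravel())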
class GraphTest(unittest.TestCase):
    def setUp(self):
        num_nodes = 5
        edges = [(0, 1), (1, 2), (3, 4)]
        feature = np.random.randn(5, 100)
        edge_feature = np.random.randn(3, 100)
        self.graph = Graph(num_nodes=num_nodes,
                           edges=edges,
                           node_feat={"feature": feature},
                           edge_feat={"edge_feature": edge_feature})

    def test_subgraph_consistency(self):
        node_index = [0, 2, 3, 4]
        eid = [2]
        subgraph = self.graph.subgraph(node_index, eid)

        for key, value in subgraph.node_feat.items():
            diff = value - self.graph.node_feat[key][node_index]
            diff = np.sqrt(np.sum(diff * diff))
            self.assertLessEqual(diff, 1e-6)

        for key, value in subgraph.edge_feat.items():
            diff = value - self.graph.edge_feat[key][eid]
            diff = np.sqrt(np.sum(diff * diff))
            self.assertLessEqual(diff, 1e-6)
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)
    log.info(dataset.num_authors)
    log.info(dataset.num_papers)
    path = f'{dataset.dir}/author_feat.npy'
    t = time.perf_counter()
    if not osp.exists(path):
        log.info('get author_feat...')
        paper_feat = dataset.paper_feat

        # paper -> author edges
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        edge_index = np.stack([col, row], axis=1)
        log.info(edge_index.shape)
        author_graph = Graph(edge_index, num_nodes=dataset.num_authors)
        author_graph.tensor()
        log.info('finish author graph')

        author_x = np.memmap(path,
                             dtype=np.float16,
                             mode='w+',
                             shape=(dataset.num_authors, self.num_features))
        dim_chunk_size = 64

        degree = paddle.zeros(shape=[dataset.num_authors, 1], dtype='float32')
        degree += 1e-10  # guard against zero in-degree
        temp_one = paddle.ones(shape=[edge_index.shape[0], 1], dtype='float32')
        degree = scatter(degree,
                         author_graph.edges[:, 1],
                         temp_one,
                         overwrite=False)
        log.info('finish degree')

        # Aggregate paper features into author features in column chunks to
        # bound peak memory.
        for i in tqdm(range(0, self.num_features, dim_chunk_size)):
            j = min(i + dim_chunk_size, self.num_features)
            inputs = get_col_slice(paper_feat,
                                   start_row_idx=0,
                                   end_row_idx=dataset.num_papers,
                                   start_col_idx=i,
                                   end_col_idx=j)
            inputs = paddle.to_tensor(inputs, dtype='float32')
            outputs = author_graph.send_recv(inputs)
            outputs = outputs / degree
            outputs = outputs.astype('float16').numpy()
            del inputs
            save_col_slice(x_src=outputs,
                           x_dst=author_x,
                           start_row_idx=0,
                           end_row_idx=dataset.num_authors,
                           start_col_idx=i,
                           end_col_idx=j)
            del outputs

        author_x.flush()
        del author_x
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)

    graph_file_list = []
    paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl_split'
    graph_file_list.append(paper_edge_path)
    t = time.perf_counter()
    if not osp.exists(paper_edge_path):
        log.info('Converting adjacency matrix...')
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = edge_index.T

        edges_new = np.zeros((edge_index.shape[0], 2))
        edges_new[:, 0] = edge_index[:, 1]
        edges_new[:, 1] = edge_index[:, 0]
        edge_index = np.vstack((edge_index, edges_new))

        edge_types = np.full([edge_index.shape[0], ], 0, dtype='int32')
        graph = Graph(edge_index,
                      num_nodes=dataset.num_papers,
                      edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(paper_edge_path)
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

    author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_src'
    graph_file_list.append(author_edge_path)
    t = time.perf_counter()
    if not osp.exists(author_edge_path):
        log.info('Converting author matrix...')
        # author -> paper edges (edge_type 1)
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers

        edge_types = np.full(row.shape, 1, dtype='int32')
        edge_index = np.stack([row, col], axis=1)

        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(author_edge_path)
        log.info(f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

    author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_dst'
    graph_file_list.append(author_edge_path)
    t = time.perf_counter()
    if not osp.exists(author_edge_path):
        log.info('Converting author matrix...')
        # paper -> author edges (edge_type 2)
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers

        edge_types = np.full(row.shape, 2, dtype='int32')
        edge_index = np.stack([col, row], axis=1)

        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(author_edge_path)
        log.info(f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

    institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_src'
    graph_file_list.append(institution_edge_path)
    t = time.perf_counter()
    if not osp.exists(institution_edge_path):
        log.info('Converting institution matrix...')
        # author -> institution edges (edge_type 3)
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors

        # edge_type
        log.info('building edge type')
        edge_types = np.full(row.shape, 3, dtype='int32')
        edge_index = np.stack([row, col], axis=1)

        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(institution_edge_path)
        log.info(
            f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]')

    institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_dst'
    graph_file_list.append(institution_edge_path)
    t = time.perf_counter()
    if not osp.exists(institution_edge_path):
        log.info('Converting institution matrix...')
        # institution -> author edges (edge_type 4)
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors

        # edge_type
        log.info('building edge type')
        edge_types = np.full(row.shape, 4, dtype='int32')
        edge_index = np.stack([col, row], axis=1)

        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(institution_edge_path)
        log.info(
            f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.dir}/full_feat.npy'
    author_feat_path = f'{dataset.dir}/author_feat.npy'
    institution_feat_path = f'{dataset.dir}/institution_feat.npy'

    t = time.perf_counter()
    if not osp.exists(path):  # Will take ~3 hours...
        print('Generating full feature matrix...')

        node_chunk_size = 100000
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        paper_feat = dataset.paper_feat
        author_feat = np.memmap(author_feat_path,
                                dtype=np.float16,
                                shape=(dataset.num_authors,
                                       self.num_features),
                                mode='r')
        institution_feat = np.memmap(institution_feat_path,
                                     dtype=np.float16,
                                     shape=(dataset.num_institutions,
                                            self.num_features),
                                     mode='r')
        x = np.memmap(path,
                      dtype=np.float16,
                      mode='w+',
                      shape=(N, self.num_features))

        print('Copying paper features...')
        start_idx = 0
        end_idx = dataset.num_papers
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = paper_feat[i:j]
        del paper_feat

        print('Copying author features...')
        start_idx = dataset.num_papers
        end_idx = dataset.num_papers + dataset.num_authors
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = author_feat[i - start_idx:j - start_idx]
        del author_feat

        print('Copying institution features...')
        start_idx = dataset.num_papers + dataset.num_authors
        end_idx = dataset.num_papers + dataset.num_authors + dataset.num_institutions
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = institution_feat[i - start_idx:j - start_idx]
        del institution_feat

        x.flush()
        del x
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    np.random.seed(self.seed)
    self.train_idx = dataset.get_idx_split('train')
    self.val_idx = dataset.get_idx_split('valid')
    valid_name = os.path.join(self.valid_path, self.valid_name)
    self.val_idx_cv = np.load(valid_name)
    log.info(self.train_idx.shape)
    log.info(self.val_idx.shape)
    log.info(self.val_idx_cv.shape)
    self.test_idx = dataset.get_idx_split('test')
    ##self.val_idx = np.load('valid_idx_eval.npy')

    def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
        def cal_angle(position, hid_idx):
            return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

        def get_posi_angle_vec(position):
            return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

        sinusoid_table = np.array(
            [get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
        return sinusoid_table

    N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
    self.x = np.memmap(f'{dataset.dir}/full_feat.npy',
                       dtype=np.float16,
                       mode='r',
                       shape=(N, self.num_features))
    self.id_x = np.memmap(f'{dataset.dir}/{self.m2v_file}',
                          dtype=np.float16,
                          mode='r',
                          shape=(N, self.m2v_dim))
    self.y = dataset.all_paper_label

    self.graph = [
        Graph.load(edge_path, mmap_mode='r+')
        for edge_path in graph_file_list
    ]

    self.pos = get_sinusoid_encoding_table(200, 768)
    #self.year = dataset.all_paper_year
    year_file = f'{dataset.dir}/all_feat_year.npy'
    self.year = np.memmap(year_file, dtype=np.int32, mode='r', shape=(N, ))
    self.num_papers = dataset.num_papers
    self.train_idx_label = None
    self.train_idx_data = None
    log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
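# get_sinusoid_encoding_table above is the standard Transformer positional
# encoding. A quick numpy check on a tiny table (toy sizes): even columns hold
# sin terms, odd columns cos terms, and position 0 encodes to sin(0)=0 / cos(0)=1.
import numpy as np

def sinusoid_table(n_position, d_hid):
    angle = np.array([[pos / np.power(10000, 2 * (j // 2) / d_hid)
                       for j in range(d_hid)] for pos in range(n_position)])
    angle[:, 0::2] = np.sin(angle[:, 0::2])
    angle[:, 1::2] = np.cos(angle[:, 1::2])
    return angle

table = sinusoid_table(4, 6)
print(table[0])          # [0. 1. 0. 1. 0. 1.]
print(table.shape)       # (4, 6)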
def _load_data(self): """Load data """ import networkx as nx objnames = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(objnames)): with open("{}/ind.{}.{}".format(self.path, self.name, objnames[i]), 'rb') as f: objects.append(_pickle_load(f)) x, y, tx, ty, allx, ally, _graph = objects test_idx_reorder = _parse_index_file("{}/ind.{}.test.index".format( self.path, self.name)) test_idx_range = np.sort(test_idx_reorder) allx = allx.todense() tx = tx.todense() if self.name == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = np.zeros((len(test_idx_range_full), x.shape[1]), dtype="float32") tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]), dtype="float32") ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = np.vstack([allx, tx]) features[test_idx_reorder, :] = features[test_idx_range, :] features = features / (np.sum(features, axis=-1) + 1e-15) features = np.array(features, dtype="float32") _graph = nx.DiGraph(nx.from_dict_of_lists(_graph)) onehot_labels = np.vstack((ally, ty)) onehot_labels[test_idx_reorder, :] = onehot_labels[test_idx_range, :] labels = np.argmax(onehot_labels, 1) idx_test = test_idx_range.tolist() idx_train = range(len(y)) idx_val = range(len(y), len(y) + 500) all_edges = [] for i in _graph.edges(): u, v = tuple(i) all_edges.append((u, v)) if self.symmetry_edges: all_edges.append((v, u)) if self.self_loop: for i in range(_graph.number_of_nodes()): all_edges.append((i, i)) all_edges = list(set(all_edges)) self.graph = Graph(num_nodes=_graph.number_of_nodes(), edges=all_edges, node_feat={"words": features}) self.y = np.array(labels, dtype="int64") self.num_classes = onehot_labels.shape[1] self.train_index = np.array(idx_train, dtype="int32") self.val_index = np.array(idx_val, dtype="int32") self.test_index = np.array(idx_test, dtype="int32")
def build_graph(num_nodes, edge_path, output_path, undigraph=True):
    """ build_graph
    """
    edge_file = os.path.join(output_path, "edge.npy")
    edge_weight_file = os.path.join(output_path, "edge_weight.npy")
    alias_file = os.path.join(output_path, "alias.npy")
    events_file = os.path.join(output_path, "events.npy")

    if os.path.isfile(edge_file):
        edges = np.load(edge_file)
        edge_feat = dict()
        if os.path.isfile(edge_weight_file):
            log.info("Loading weight from cache")
            edge_feat["weight"] = np.load(edge_weight_file, allow_pickle=True)
        node_feat = dict()
        if os.path.isfile(alias_file):
            log.info("Loading alias from cache")
            node_feat["alias"] = np.load(alias_file, allow_pickle=True)
        if os.path.isfile(events_file):
            log.info("Loading events from cache")
            node_feat["events"] = np.load(events_file, allow_pickle=True)
    else:
        filelist = get_file_list(edge_path)
        edges, edge_weight = [], []
        log.info("Reading edge files")
        for name in filelist:
            with open(name) as inf:
                for line in inf:
                    slots = line.strip("\n").split()
                    edges.append([slots[0], slots[1]])
                    if len(slots) > 2:
                        edge_weight.append(slots[2])
        edges = np.array(edges, dtype="int64")
        assert num_nodes > edges.max(
        ), "Node id in any edge should be smaller than num_nodes!"
        log.info("Read edge files done.")
        edge_feat = dict()
        node_feat = dict()
        if len(edge_weight) == len(edges):
            edge_feat["weight"] = np.array(edge_weight, dtype="float32")

    if undigraph is True:
        # Double the edges (and weights) to make the graph undirected.
        edges = np.concatenate([edges, edges[:, [1, 0]]], 0)
        if "weight" in edge_feat:
            edge_feat["weight"] = np.concatenate(
                [edge_feat["weight"], edge_feat["weight"]],
                0).astype("float64")

    graph = Graph(num_nodes, edges, node_feat, edge_feat=edge_feat)
    log.info("Build graph done")
    graph.outdegree()
    log.info("Build graph index done")

    if "weight" in graph.edge_feat and "alias" not in graph.node_feat and "events" not in graph.node_feat:
        graph.node_feat["alias"], graph.node_feat[
            "events"] = graph_alias_sample_table(graph, "weight")
        log.info(
            "Build graph alias sample table done, and saving alias & events cache")
        np.save(alias_file, graph.node_feat["alias"])
        np.save(events_file, graph.node_feat["events"])
    return graph
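# graph_alias_sample_table precomputes alias tables for O(1) weighted edge
# sampling. A generic numpy sketch of the alias method itself (Vose's
# construction; illustrative only, not PGL's internal table layout):
import numpy as np

def build_alias(probs):
    n = len(probs)
    scaled = np.asarray(probs, dtype='float64') * n
    alias, accept = np.zeros(n, dtype='int64'), np.zeros(n)
    small = [i for i, p in enumerate(scaled) if p < 1.0]
    large = [i for i, p in enumerate(scaled) if p >= 1.0]
    while small and large:
        s, l = small.pop(), large.pop()
        accept[s], alias[s] = scaled[s], l
        scaled[l] -= 1.0 - scaled[s]            # donate mass to the small bucket
        (small if scaled[l] < 1.0 else large).append(l)
    for i in small + large:                     # leftovers are full buckets
        accept[i] = 1.0
    return accept, alias

def alias_draw(accept, alias, rng):
    i = rng.integers(len(accept))               # pick a bucket uniformly
    return i if rng.random() < accept[i] else alias[i]

accept, alias = build_alias([0.5, 0.3, 0.2])
rng = np.random.default_rng(0)
samples = [alias_draw(accept, alias, rng) for _ in range(10000)]
print(np.bincount(samples) / 10000)  # approximately [0.5 0.3 0.2]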
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)

    paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl'
    t = time.perf_counter()
    if not osp.exists(paper_edge_path):
        log.info('Converting adjacency matrix...')
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = edge_index.T

        edges_new = np.zeros((edge_index.shape[0], 2))
        edges_new[:, 0] = edge_index[:, 1]
        edges_new[:, 1] = edge_index[:, 0]
        edge_index = np.vstack((edge_index, edges_new))
        # edge_index = np.unique(edge_index, axis=0)

        graph = Graph(edge_index)
        graph.adj_dst_index
        graph.dump(paper_edge_path)
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

    edge_path = f'{dataset.dir}/full_edge_symmetric_pgl'
    t = time.perf_counter()
    if not osp.exists(edge_path):
        log.info('Converting adjacency matrix...')

        # paper
        log.info('adding paper edges')
        paper_graph = Graph.load(paper_edge_path, mmap_mode='r+')
        rows, cols = [paper_graph.edges[:, 0]], [paper_graph.edges[:, 1]]

        # author
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        row += dataset.num_papers
        rows += [row, col]
        cols += [col, row]

        # institution
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        rows += [row, col]
        cols += [col, row]

        # edge_type
        log.info('building edge type')
        edge_types = [
            np.full(x.shape, i, dtype='int32') for i, x in enumerate(rows)
        ]
        edge_types = np.concatenate(edge_types, axis=0)

        log.info('building edges')
        row = np.concatenate(rows, axis=0)
        del rows
        col = np.concatenate(cols, axis=0)
        del cols
        edge_index = np.stack([row, col], axis=1)

        N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
        full_graph = Graph(edge_index,
                           num_nodes=N,
                           edge_feat={'edge_type': edge_types})
        full_graph.adj_dst_index
        full_graph.dump(edge_path)
        log.info(f'Done! finish full_edge [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.dir}/full_feat.npy'
    author_feat_path = f'{dataset.dir}/author_feat.npy'
    institution_feat_path = f'{dataset.dir}/institution_feat.npy'

    t = time.perf_counter()
    if not osp.exists(path):  # Will take ~3 hours...
        print('Generating full feature matrix...')

        node_chunk_size = 100000
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        paper_feat = dataset.paper_feat
        author_feat = np.memmap(author_feat_path,
                                dtype=np.float16,
                                shape=(dataset.num_authors,
                                       self.num_features),
                                mode='r')
        institution_feat = np.memmap(institution_feat_path,
                                     dtype=np.float16,
                                     shape=(dataset.num_institutions,
                                            self.num_features),
                                     mode='r')
        x = np.memmap(path,
                      dtype=np.float16,
                      mode='w+',
                      shape=(N, self.num_features))

        print('Copying paper features...')
        start_idx = 0
        end_idx = dataset.num_papers
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = paper_feat[i:j]
        del paper_feat

        print('Copying author features...')
        start_idx = dataset.num_papers
        end_idx = dataset.num_papers + dataset.num_authors
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = author_feat[i - start_idx:j - start_idx]
        del author_feat

        print('Copying institution features...')
        start_idx = dataset.num_papers + dataset.num_authors
        end_idx = dataset.num_papers + dataset.num_authors + dataset.num_institutions
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = institution_feat[i - start_idx:j - start_idx]
        del institution_feat

        x.flush()
        del x
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    np.random.seed(self.seed)
    self.train_idx = dataset.get_idx_split('train')
    np.random.shuffle(self.train_idx)
    self.val_idx = dataset.get_idx_split('valid')
    self.test_idx = dataset.get_idx_split('test')

    N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
    self.x = np.memmap(f'{dataset.dir}/full_feat.npy',
                       dtype=np.float16,
                       mode='r',
                       shape=(N, self.num_features))
    self.y = dataset.all_paper_label
    self.graph = Graph.load(edge_path, mmap_mode='r+')
    self.graph._edge_feat['edge_type'] = self.graph._edge_feat[
        'edge_type'].astype('int32')
    log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
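# The chunked copy pattern above, self-contained on tiny arrays: write into a
# float16 on-disk memmap in fixed-size row chunks so only one chunk is in
# memory at a time (paths here are temporary stand-ins, not dataset files).
import numpy as np, tempfile, os

src = np.arange(10, dtype=np.float16).reshape(5, 2)
path = os.path.join(tempfile.mkdtemp(), 'x.npy')
x = np.memmap(path, dtype=np.float16, mode='w+', shape=(5, 2))
chunk = 2
for i in range(0, 5, chunk):
    j = min(i + chunk, 5)
    x[i:j] = src[i:j]          # copy one row chunk
x.flush()                       # persist to disk
print(np.memmap(path, dtype=np.float16, mode='r', shape=(5, 2))[:])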
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)

    graph_file_list = []
    paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl_split'
    graph_file_list.append(paper_edge_path)
    t = time.perf_counter()
    if not osp.exists(paper_edge_path):
        log.info('Converting adjacency matrix...')
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = edge_index.T

        edges_new = np.zeros((edge_index.shape[0], 2))
        edges_new[:, 0] = edge_index[:, 1]
        edges_new[:, 1] = edge_index[:, 0]
        edge_index = np.vstack((edge_index, edges_new))

        edge_types = np.full([edge_index.shape[0], ], 0, dtype='int32')
        graph = Graph(edge_index,
                      num_nodes=dataset.num_papers,
                      edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(paper_edge_path)
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

    author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_src'
    graph_file_list.append(author_edge_path)
    t = time.perf_counter()
    if not osp.exists(author_edge_path):
        log.info('Converting author matrix...')
        # author -> paper edges (edge_type 1)
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers

        edge_types = np.full(row.shape, 1, dtype='int32')
        edge_index = np.stack([row, col], axis=1)

        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(author_edge_path)
        log.info(f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

    author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_dst'
    graph_file_list.append(author_edge_path)
    t = time.perf_counter()
    if not osp.exists(author_edge_path):
        log.info('Converting author matrix...')
        # paper -> author edges (edge_type 2)
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers

        edge_types = np.full(row.shape, 2, dtype='int32')
        edge_index = np.stack([col, row], axis=1)

        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(author_edge_path)
        log.info(f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

    institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_src'
    graph_file_list.append(institution_edge_path)
    t = time.perf_counter()
    if not osp.exists(institution_edge_path):
        log.info('Converting institution matrix...')
        # author -> institution edges (edge_type 3)
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors

        # edge_type
        log.info('building edge type')
        edge_types = np.full(row.shape, 3, dtype='int32')
        edge_index = np.stack([row, col], axis=1)

        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(institution_edge_path)
        log.info(
            f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]')

    institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_dst'
    graph_file_list.append(institution_edge_path)
    t = time.perf_counter()
    if not osp.exists(institution_edge_path):
        log.info('Converting institution matrix...')
        # institution -> author edges (edge_type 4)
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors

        # edge_type
        log.info('building edge type')
        edge_types = np.full(row.shape, 4, dtype='int32')
        edge_index = np.stack([col, row], axis=1)

        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(institution_edge_path)
        log.info(
            f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.dir}/full_feat.npy'
    author_feat_path = f'{dataset.dir}/author_feat.npy'
    institution_feat_path = f'{dataset.dir}/institution_feat.npy'

    t = time.perf_counter()
    if not osp.exists(path):  # Will take ~3 hours...
        print('Generating full feature matrix...')

        node_chunk_size = 100000
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        paper_feat = dataset.paper_feat
        author_feat = np.memmap(author_feat_path,
                                dtype=np.float16,
                                shape=(dataset.num_authors,
                                       self.num_features),
                                mode='r')
        institution_feat = np.memmap(institution_feat_path,
                                     dtype=np.float16,
                                     shape=(dataset.num_institutions,
                                            self.num_features),
                                     mode='r')
        x = np.memmap(path,
                      dtype=np.float16,
                      mode='w+',
                      shape=(N, self.num_features))

        print('Copying paper features...')
        start_idx = 0
        end_idx = dataset.num_papers
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = paper_feat[i:j]
        del paper_feat

        print('Copying author features...')
        start_idx = dataset.num_papers
        end_idx = dataset.num_papers + dataset.num_authors
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = author_feat[i - start_idx:j - start_idx]
        del author_feat

        print('Copying institution features...')
        start_idx = dataset.num_papers + dataset.num_authors
        end_idx = dataset.num_papers + dataset.num_authors + dataset.num_institutions
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = institution_feat[i - start_idx:j - start_idx]
        del institution_feat

        x.flush()
        del x
        print(f'feature x Done! [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.dir}/all_feat_year.npy'
    author_year_path = f'{dataset.dir}/author_feat_year.npy'
    institution_year_path = f'{dataset.dir}/institution_feat_year.npy'

    t = time.perf_counter()
    if not osp.exists(path):  # Will take ~3 hours...
        print('Generating full year matrix...')

        node_chunk_size = 100000
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        paper_year_feat = dataset.all_paper_year
        author_year_feat = np.memmap(author_year_path,
                                     dtype=np.int32,
                                     shape=(dataset.num_authors, ),
                                     mode='r')
        institution_year_feat = np.memmap(institution_year_path,
                                          dtype=np.int32,
                                          shape=(dataset.num_institutions, ),
                                          mode='r')
        x = np.memmap(path, dtype=np.int32, mode='w+', shape=(N, ))

        print('Copying paper years...')
        start_idx = 0
        end_idx = dataset.num_papers
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = paper_year_feat[i:j]
        del paper_year_feat

        print('Copying author years...')
        start_idx = dataset.num_papers
        end_idx = dataset.num_papers + dataset.num_authors
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = author_year_feat[i - start_idx:j - start_idx]
        del author_year_feat

        print('Copying institution years...')
        start_idx = dataset.num_papers + dataset.num_authors
        end_idx = dataset.num_papers + dataset.num_authors + dataset.num_institutions
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = institution_year_feat[i - start_idx:j - start_idx]
        del institution_year_feat

        x.flush()
        del x
        print(f'year feature Done! [{time.perf_counter() - t:.2f}s]')