# Requires module-level: import os, os.path as osp, pickle, numpy as np, torch;
# Data comes from the library's data module.
def __init__(self, args=None):
    dataset = "jknet_cora"
    path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
    if not osp.exists(path):
        os.makedirs(path)
    super(CoraDataset, self).__init__(path)

    with open(self.processed_paths[0], "rb") as fin:
        load_data = pickle.load(fin)
    self.num_nodes = load_data["node_num"]

    data = Data()
    data.x = load_data["xs"]
    data.y = load_data["ys"]

    # Random 80/20 train/test split; the validation mask deliberately
    # reuses the test split (val and test are the same set of nodes).
    train_size = int(self.num_nodes * 0.8)
    train_mask = np.zeros((self.num_nodes,), dtype=bool)
    train_idx = np.random.choice(np.arange(self.num_nodes), size=train_size, replace=False)
    train_mask[train_idx] = True
    test_mask = np.ones((self.num_nodes,), dtype=bool)
    test_mask[train_idx] = False
    val_mask = test_mask

    # Edges are stored as a list of (src, dst) pairs; transpose to the
    # 2 x num_edges layout expected by edge_index.
    edges = np.array(load_data["edges"], dtype=int).transpose((1, 0))
    data.edge_index = torch.from_numpy(edges)
    data.train_mask = torch.from_numpy(train_mask)
    data.test_mask = torch.from_numpy(test_mask)
    data.val_mask = torch.from_numpy(val_mask)
    data.x = torch.Tensor(data.x)
    data.y = torch.LongTensor(data.y)
    self.data = data
    self.num_classes = torch.max(self.data.y).item() + 1
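# A minimal usage sketch (hypothetical setup): the pickled jknet_cora file
# is assumed to already exist under data/jknet_cora.
dataset = CoraDataset()
data = dataset.data
# The random masks partition all nodes: 80% train, 20% test (== val here).
assert int(data.train_mask.sum() + data.test_mask.sum()) == dataset.num_nodes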
import os.path as osp

import torch

def read_triplet_data(folder):
    """Read train/valid/test triples and pack them into a single Data
    object whose edge order (train, then valid, then test) defines the
    split masks."""
    filenames = ["train2id.txt", "valid2id.txt", "test2id.txt"]
    count = 0
    edge_index = []
    edge_attr = []
    count_list = []
    for filename in filenames:
        with open(osp.join(folder, filename), "r") as f:
            _ = int(f.readline().strip())  # first line is the triple count
            for line in f:
                items = line.strip().split()
                edge_index.append([int(items[0]), int(items[1])])  # (head, tail)
                edge_attr.append(int(items[2]))  # relation id
                count += 1
            count_list.append(count)

    edge_index = torch.LongTensor(edge_index).t()
    edge_attr = torch.LongTensor(edge_attr)
    data = Data()
    data.edge_index = edge_index
    data.edge_attr = edge_attr

    def generate_mask(start, end):
        mask = torch.zeros(count, dtype=torch.bool)
        mask[start:end] = True
        return mask

    data.train_mask = generate_mask(0, count_list[0])
    data.val_mask = generate_mask(count_list[0], count_list[1])
    data.test_mask = generate_mask(count_list[1], count_list[2])
    return data
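# Usage sketch (dataset path assumed): each *2id.txt file starts with a
# count line followed by "head tail relation" lines, as in the
# OpenKE-style *2id format.
data = read_triplet_data("./data/FB15k")
# The three masks tile the edge list in file order: train, valid, test.
assert int(data.train_mask.sum() + data.val_mask.sum() + data.test_mask.sum()) == data.edge_index.size(1)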
# Requires module-level: import numpy as np, scipy.sparse as sp, torch;
# Data comes from the library's data module.
def get_subgraph(self, phase, require_norm=True):
    """Generate one minibatch for the model.

    In the 'train' phase, one minibatch corresponds to one sampled
    subgraph of the training graph. In the 'val' or 'test' phase, one
    batch corresponds to the full graph (i.e., full-batch rather than
    minibatch evaluation for the validation / test sets).

    Inputs:
        phase           str, one of 'train', 'val', 'test'
        require_norm    bool, whether to attach normalization terms

    Outputs:
        data            Data object modeling the sampled subgraph
        data.norm_aggr  aggregation normalization
        data.norm_loss  loss normalization
    """
    if phase in ["val", "test"]:
        # Full-batch evaluation: the "subgraph" is the whole graph.
        node_subgraph = np.arange(self.data.num_nodes)
        data = self.data
        if require_norm:
            data.norm_aggr = torch.ones(self.num_edges)
            data.norm_loss = self.norm_loss_test
    else:
        # Refill the pool of pre-sampled subgraphs if it is empty.
        if len(self.subgraphs_nodes) == 0:
            self.gen_subgraph()

        node_subgraph = self.subgraphs_nodes.pop()
        edge_subgraph = self.subgraphs_edge_index.pop()
        num_nodes_subgraph = node_subgraph.size
        adj = sp.csr_matrix(
            (self.subgraphs_data.pop(), self.subgraphs_indices.pop(), self.subgraphs_indptr.pop()),
            shape=(num_nodes_subgraph, num_nodes_subgraph),
        )

        if require_norm:
            adj.data[:] = self.norm_aggr_train[edge_subgraph][:]
            # Row-normalize the subgraph adjacency: A <- D^{-1} A.
            D = adj.sum(1).flatten()
            norm_diag = sp.dia_matrix((1 / D, 0), shape=adj.shape)
            adj = norm_diag.dot(adj)
            adj.sort_indices()

        adj = adj.tocoo()
        data = Data(
            self.data.x[node_subgraph],
            torch.LongTensor(np.vstack((adj.row, adj.col))),
            None if self.data.edge_attr is None else self.data.edge_attr[edge_subgraph],
            self.data.y[node_subgraph],
            None if self.data.pos is None else self.data.pos[node_subgraph],
        )
        if require_norm:
            data.norm_aggr = torch.FloatTensor(adj.data)
            data.norm_loss = self.norm_loss_train[node_subgraph]
        data.train_mask = self.data.train_mask[node_subgraph]
        data.val_mask = self.data.val_mask[node_subgraph]
        data.test_mask = self.data.test_mask[node_subgraph]
    return data
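# Hedged training-loop sketch (GraphSAINT-style): `model`, `optimizer`,
# `num_batches`, and the sampler object exposing get_subgraph are all
# assumed to exist; norm_loss reweights the per-node loss to correct the
# bias introduced by subgraph sampling.
import torch.nn.functional as F

for _ in range(num_batches):
    batch = sampler.get_subgraph("train")
    out = model(batch.x, batch.edge_index)
    loss_per_node = F.nll_loss(out, batch.y, reduction="none")
    loss = (loss_per_node * batch.norm_loss)[batch.train_mask].sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()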
import pickle as pkl

import torch

def read_planetoid_data(folder, prefix):
    prefix = prefix.lower()
    names = ["x", "tx", "allx", "y", "ty", "ally", "graph", "test.index"]
    objects = []
    for item in names[:-1]:
        with open(f"{folder}/ind.{prefix}.{item}", "rb") as f:
            # latin1 is needed to unpickle files written under Python 2.
            # (The f-strings above already require Python 3, so the old
            # sys.version_info branch was dead code.)
            objects.append(pkl.load(f, encoding="latin1"))
    test_index = parse_index_file(f"{folder}/ind.{prefix}.{names[-1]}")
    test_index = torch.tensor(test_index, dtype=torch.long)
    test_index_reorder = test_index.sort()[0]

    x, tx, allx, y, ty, ally, graph = tuple(objects)
    x, tx, allx = tuple(torch.from_numpy(item.todense()).float() for item in [x, tx, allx])
    y, ty, ally = tuple(torch.from_numpy(item).float() for item in [y, ty, ally])

    train_index = torch.arange(y.size(0), dtype=torch.long)
    val_index = torch.arange(y.size(0), y.size(0) + 500, dtype=torch.long)

    if prefix == "citeseer":
        # There are some isolated nodes in the Citeseer graph, resulting in
        # non-consecutive test indices. We need to identify them and add them
        # as zero vectors to `tx` and `ty`.
        len_test_indices = (test_index.max() - test_index.min()).item() + 1
        tx_ext = torch.zeros(len_test_indices, tx.size(1))
        tx_ext[test_index_reorder - test_index.min(), :] = tx
        ty_ext = torch.zeros(len_test_indices, ty.size(1))
        ty_ext[test_index_reorder - test_index.min(), :] = ty
        tx, ty = tx_ext, ty_ext

    x = torch.cat([allx, tx], dim=0).float()
    y = torch.cat([ally, ty], dim=0).max(dim=1)[1].long()

    # The test features/labels are stored in shuffled order; put them back
    # at their original node positions.
    x[test_index] = x[test_index_reorder]
    y[test_index] = y[test_index_reorder]

    train_mask = index_to_mask(train_index, size=y.size(0))
    val_mask = index_to_mask(val_index, size=y.size(0))
    test_mask = index_to_mask(test_index, size=y.size(0))

    edge_index = edge_index_from_dict(graph, num_nodes=y.size(0))

    data = Data(x=x, edge_index=edge_index, y=y)
    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask
    return data
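# `parse_index_file` and `index_to_mask` are referenced above but not shown
# here; minimal sketches consistent with their call sites:
def parse_index_file(filename):
    # One integer node index per line.
    with open(filename, "r") as f:
        return [int(line.strip()) for line in f]

def index_to_mask(index, size):
    mask = torch.zeros(size, dtype=torch.bool)
    mask[index] = True
    return mask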
import os.path as osp

import torch

def read_triplet_data(folder):
    """Extended variant of the triplet reader: besides the Data object
    with split masks, also returns the raw (head, relation, tail) triples,
    the start offset of each split within that list, and the number of
    distinct entities and relations."""
    filenames = ["train2id.txt", "valid2id.txt", "test2id.txt"]
    count = 0
    edge_index = []
    edge_attr = []
    count_list = []
    triples = []
    num_entities = 0
    num_relations = 0
    entity_dic = {}
    relation_dic = {}
    for filename in filenames:
        with open(osp.join(folder, filename), "r") as f:
            _ = int(f.readline().strip())  # first line is the triple count
            # Record where this split begins in the flat `triples` list.
            if "train" in filename:
                train_start_idx = len(triples)
            elif "valid" in filename:
                valid_start_idx = len(triples)
            elif "test" in filename:
                test_start_idx = len(triples)
            for line in f:
                items = line.strip().split()
                edge_index.append([int(items[0]), int(items[1])])  # (head, tail)
                edge_attr.append(int(items[2]))  # relation id
                triples.append((int(items[0]), int(items[2]), int(items[1])))
                # Assign a fresh index to each previously unseen entity/relation.
                if items[0] not in entity_dic:
                    entity_dic[items[0]] = num_entities
                    num_entities += 1
                if items[1] not in entity_dic:
                    entity_dic[items[1]] = num_entities
                    num_entities += 1
                if items[2] not in relation_dic:
                    relation_dic[items[2]] = num_relations
                    num_relations += 1
                count += 1
            count_list.append(count)

    edge_index = torch.LongTensor(edge_index).t()
    edge_attr = torch.LongTensor(edge_attr)
    data = Data()
    data.edge_index = edge_index
    data.edge_attr = edge_attr

    def generate_mask(start, end):
        mask = torch.zeros(count, dtype=torch.bool)
        mask[start:end] = True
        return mask

    data.train_mask = generate_mask(0, count_list[0])
    data.val_mask = generate_mask(count_list[0], count_list[1])
    data.test_mask = generate_mask(count_list[1], count_list[2])
    return data, triples, train_start_idx, valid_start_idx, test_start_idx, num_entities, num_relations
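# Usage sketch (dataset path assumed): the extended reader also returns the
# raw (head, relation, tail) triples plus the start offset of each split.
data, triples, train_start, valid_start, test_start, n_ent, n_rel = read_triplet_data("./data/FB15k-237")
train_triples = triples[train_start:valid_start]
valid_triples = triples[valid_start:test_start]
test_triples = triples[test_start:]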
def process(self):
    data = np.load(osp.join(self.raw_dir, "reddit_data.npz"))
    x = torch.from_numpy(data["feature"]).to(torch.float)
    y = torch.from_numpy(data["label"]).to(torch.long)
    split = torch.from_numpy(data["node_types"])

    adj = sp.load_npz(osp.join(self.raw_dir, "reddit_graph.npz"))
    row = torch.from_numpy(adj.row).to(torch.long)
    col = torch.from_numpy(adj.col).to(torch.long)
    edge_index = torch.stack([row, col], dim=0)
    # Deduplicate and sort the edge list.
    edge_index, _ = coalesce(edge_index, None, x.size(0), x.size(0))

    data = Data(x=x, edge_index=edge_index, y=y)
    # node_types encodes the split: 1 = train, 2 = val, 3 = test.
    data.train_mask = split == 1
    data.val_mask = split == 2
    data.test_mask = split == 3

    torch.save(self.collate([data]), self.processed_paths[0])
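# Hedged sanity check (a loaded `dataset` wrapper built on the class above
# is assumed): the masks derived from node_types should be mutually disjoint.
data = dataset[0]
assert not bool((data.train_mask & data.val_mask).any())
assert not bool((data.train_mask & data.test_mask).any())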
def read_planetoid_data(folder, prefix):
    names = ['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']
    items = [read_file(folder, prefix, name) for name in names]
    x, tx, allx, y, ty, ally, graph, test_index = items
    train_index = torch.arange(y.size(0), dtype=torch.long)
    val_index = torch.arange(y.size(0), y.size(0) + 500, dtype=torch.long)
    sorted_test_index = test_index.sort()[0]

    if prefix.lower() == 'citeseer':
        # There are some isolated nodes in the Citeseer graph, resulting in
        # non-consecutive test indices. We need to identify them and add them
        # as zero vectors to `tx` and `ty`.
        len_test_indices = (test_index.max() - test_index.min()).item() + 1
        tx_ext = torch.zeros(len_test_indices, tx.size(1))
        tx_ext[sorted_test_index - test_index.min(), :] = tx
        ty_ext = torch.zeros(len_test_indices, ty.size(1))
        ty_ext[sorted_test_index - test_index.min(), :] = ty
        tx, ty = tx_ext, ty_ext

    x = torch.cat([allx, tx], dim=0)
    y = torch.cat([ally, ty], dim=0).max(dim=1)[1]

    # Test features/labels are stored in shuffled order; restore them to
    # their original node positions.
    x[test_index] = x[sorted_test_index]
    y[test_index] = y[sorted_test_index]

    train_mask = sample_mask(train_index, num_nodes=y.size(0))
    val_mask = sample_mask(val_index, num_nodes=y.size(0))
    test_mask = sample_mask(test_index, num_nodes=y.size(0))

    edge_index = edge_index_from_dict(graph, num_nodes=y.size(0))

    data = Data(x=x, edge_index=edge_index, y=y)
    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask
    return data
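# `read_file`, `sample_mask`, and `edge_index_from_dict` are helpers not
# shown here; a minimal sketch of `sample_mask` consistent with its calls:
def sample_mask(index, num_nodes):
    mask = torch.zeros(num_nodes, dtype=torch.bool)
    mask[index] = True
    return mask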