import os

import numpy as np
import scipy.sparse as sp
import torch
import torch_geometric.datasets as geo_data


def load_data(data_name='cora', normalize_feature=True, missing_rate=0, cuda=False):
    # Other Planetoid datasets work too, but some don't ship split masks.
    # DATA_ROOT, normalize_adj_row, and row_l1_normalize come from elsewhere
    # in the repo; see the sketch below this function.
    data = geo_data.Planetoid(os.path.join(DATA_ROOT, data_name), data_name).data
    # original split
    data.train_mask = data.train_mask.type(torch.bool)
    data.val_mask = data.val_mask.type(torch.bool)
    # data.test_mask = data.test_mask.type(torch.bool)
    # expand test_mask to all remaining nodes
    data.test_mask = ~(data.train_mask | data.val_mask)

    # build a symmetric adjacency matrix with self-loops
    n = len(data.x)
    adj = sp.csr_matrix((np.ones(data.edge_index.shape[1]), data.edge_index),
                        shape=(n, n))
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) \
        + sp.eye(adj.shape[0])
    adj = normalize_adj_row(adj)  # symmetric normalization works poorly here, but why? Test more.
    # data.adj = to_torch_sparse(adj)
    data.adj = torch.FloatTensor(np.array(adj.todense()))

    # normalize features
    if normalize_feature:
        data.x = row_l1_normalize(data.x)

    # generate (or reload) the missing-feature setting
    indices_dir = os.path.join(DATA_ROOT, data_name, 'indices')
    if not os.path.isdir(indices_dir):
        os.mkdir(indices_dir)
    missing_indices_file = os.path.join(
        indices_dir, "indices_missing_rate={}.npy".format(missing_rate))
    if not os.path.exists(missing_indices_file):
        # keep the training set at full feature size
        erasing_pool = torch.arange(n)[~data.train_mask]
        size = int(len(erasing_pool) * (missing_rate / 100))
        idx_erased = np.random.choice(erasing_pool, size=size, replace=False)
        np.save(missing_indices_file, idx_erased)
    else:
        idx_erased = np.load(missing_indices_file)
    # erase the features of the randomly chosen nodes
    if missing_rate > 0:
        data.x[idx_erased] = 0

    if cuda:
        data.x = data.x.cuda()
        data.y = data.y.cuda()
        data.adj = data.adj.cuda()
    return data
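# normalize_adj_row and row_l1_normalize are repo helpers that are not part of
# this snippet. A minimal sketch of what they plausibly do, assuming row
# normalization D^-1 A for the adjacency and unit-L1 rows for the features
# (these definitions are assumptions, not the original implementations):
def normalize_adj_row(adj):
    """Row-normalize a scipy sparse matrix: A_hat = D^-1 A."""
    rowsum = np.array(adj.sum(1)).flatten()
    r_inv = np.power(rowsum, -1.0)
    r_inv[np.isinf(r_inv)] = 0.0  # guard isolated nodes
    return sp.diags(r_inv).dot(adj)


def row_l1_normalize(x):
    """Scale each feature row of a dense torch tensor to unit L1 norm."""
    norm = x.sum(dim=1, keepdim=True).clamp(min=1e-12)
    return x / norm

# Hypothetical call, assuming DATA_ROOT points at the dataset directory:
# data = load_data('cora', normalize_feature=True, missing_rate=20)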
import os.path as osp

import numpy as np
import scipy.sparse as sp
import torch
import torch_geometric.datasets as geo_data


def load_data(args):
    DATA_ROOT = 'datasets'
    path = osp.join(DATA_ROOT, args.data)
    data = geo_data.Planetoid(path, args.data)[0]
    # data.train_mask = data.train_mask.type(torch.bool)
    # data.val_mask = data.val_mask.type(torch.bool)
    # data.test_mask = data.test_mask.type(torch.bool)
    # expand test_mask to all remaining nodes
    # data.test_mask = ~(data.train_mask + data.val_mask)

    # build a symmetric adjacency matrix with self-loops
    n = len(data.x)
    adj = sp.csr_matrix((np.ones(data.edge_index.shape[1]), data.edge_index),
                        shape=(n, n))
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) \
        + sp.eye(adj.shape[0])
    # symmetric normalization works poorly here, but why? Test more.
    adj, degree = normalize_adj_row(adj)
    data.adj = to_torch_sparse(adj)
    return data, degree
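# In this variant, normalize_adj_row returns the degree vector as well, and
# to_torch_sparse converts the result into a torch sparse tensor. Neither is
# defined in the snippet; a plausible sketch (an assumption, not the original
# code):
def normalize_adj_row(adj):
    """Row-normalize A_hat = D^-1 A and also return the degree vector."""
    degree = np.array(adj.sum(1)).flatten()
    r_inv = np.power(degree, -1.0)
    r_inv[np.isinf(r_inv)] = 0.0
    return sp.diags(r_inv).dot(adj), degree


def to_torch_sparse(adj):
    """Convert a scipy sparse matrix to a torch sparse COO tensor."""
    adj = adj.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((adj.row, adj.col)).astype(np.int64))
    values = torch.from_numpy(adj.data)
    return torch.sparse_coo_tensor(indices, values, torch.Size(adj.shape))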
import torch
import torch_geometric
import torch_geometric.data as gdata
import torch_geometric.datasets as gdatasets
import torch_geometric.transforms as gtransforms

from config import batch_size

transform = gtransforms.AddSelfLoops()

# test if transform works
# cora = gdatasets.KarateClub(transform=transform)
# cora_loader = gdata.DataLoader(cora, batch_size=1, shuffle=True)

cora = gdatasets.Planetoid(root='./Planetoid/Cora', name='Cora', transform=transform)
cora_data = cora[0]
cora_data.train_mask = torch.zeros(cora_data.num_nodes, dtype=torch.bool)
cora_data.train_mask[:cora_data.num_nodes - 1000] = True
cora_data.val_mask = None
cora_data.test_mask = torch.zeros(cora_data.num_nodes, dtype=torch.bool)
cora_data.test_mask[cora_data.num_nodes - 500:] = True
# We only need the train part of the graph for training.

num_features = cora.num_features
num_classes = cora.num_classes

# information about the given dataset/batch
# if __name__ == '__main__':
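# A quick sanity check one might run under the commented-out main guard above
# (hypothetical, not part of the original script):
if __name__ == '__main__':
    print('nodes:', cora_data.num_nodes)
    print('features:', num_features, 'classes:', num_classes)
    print('train nodes:', int(cora_data.train_mask.sum()))
    print('test nodes:', int(cora_data.test_mask.sum()))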
import os

import numpy as np
import torch
import torch_geometric.datasets as datasets
from sklearn.model_selection import train_test_split


def load_data(dataset, trn_ratio, verbose=False, seed=0):
    """Read a dataset based on its name."""
    root = 'data'
    root_cached = os.path.join(root, 'cached', dataset)
    if not os.path.exists(root_cached):
        if dataset == 'cora':
            data = datasets.Planetoid(root, 'Cora')
        elif dataset == 'citeseer':
            data = datasets.Planetoid(root, 'CiteSeer')
        elif dataset == 'pubmed':
            data = datasets.Planetoid(root, 'PubMed')
        elif dataset == 'cora-ml':
            data = datasets.CitationFull(root, 'Cora_ML')
        elif dataset == 'dblp':
            data = datasets.CitationFull(root, 'DBLP')
        elif dataset == 'amazon':
            data = datasets.Amazon(os.path.join(root, 'Amazon'), 'Photo')
        else:
            raise ValueError(dataset)

        # replace all-zero feature rows, then row-normalize the features
        node_x = data.data.x
        node_x[node_x.sum(dim=1) == 0] = 1
        node_x = node_x / node_x.sum(dim=1, keepdim=True)
        node_y = data.data.y
        edges = preprocess_edges(data.data.edge_index)  # helper defined elsewhere; see the sketch below

        os.makedirs(root_cached, exist_ok=True)
        np.save(os.path.join(root_cached, 'x'), node_x)
        np.save(os.path.join(root_cached, 'y'), node_y)
        np.save(os.path.join(root_cached, 'edges'), edges)

    edges = np.load(os.path.join(root_cached, 'edges.npy'))
    node_x = np.load(os.path.join(root_cached, 'x.npy'))
    node_y = np.load(os.path.join(root_cached, 'y.npy'))

    # stratified 80/10/10 split, then subsample the training set to trn_ratio
    indices = np.arange(node_x.shape[0])
    trn_nodes, test_nodes = train_test_split(indices, test_size=0.1,
                                             random_state=seed, stratify=node_y)
    trn_nodes, val_nodes = train_test_split(trn_nodes, test_size=0.1111,
                                            random_state=seed,
                                            stratify=node_y[trn_nodes])
    trn_nodes, _ = train_test_split(trn_nodes, train_size=trn_ratio / 0.8,
                                    random_state=seed,
                                    stratify=node_y[trn_nodes])

    edges = torch.from_numpy(edges)
    node_x = torch.from_numpy(node_x)
    node_y = torch.from_numpy(node_y)
    trn_nodes = torch.from_numpy(trn_nodes)
    val_nodes = torch.from_numpy(val_nodes)
    test_nodes = torch.from_numpy(test_nodes)

    if verbose:
        print('Number of nodes:', node_x.size(0))
        print('Number of features:', node_x.size(1))
        print('Number of edges:', edges.size(1) // 2)
        print('Number of classes:', node_y.max().item() + 1)

    return edges, node_x, node_y, trn_nodes, val_nodes, test_nodes
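# preprocess_edges is defined elsewhere in the repo. A plausible sketch (an
# assumption): drop self-loops and store both directions of every edge, so the
# cached array describes an undirected graph with shape (2, num_edges).
def preprocess_edges(edge_index):
    edges = edge_index.numpy()
    edges = edges[:, edges[0] != edges[1]]  # remove self-loops
    both = np.concatenate([edges, edges[::-1]], axis=1)  # add reverse edges
    return np.unique(both, axis=1)  # deduplicate columns

# Hypothetical call with 10% of all nodes used for training:
# edges, x, y, trn, val, tst = load_data('cora', trn_ratio=0.1, verbose=True)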
'''Makes some PyTorch-Geometric datasets available in LynxKite.'''
import torch_geometric.datasets as ds

from . import util

op = util.Op()
name = op.params["name"]
print('loading dataset', name)
if name == 'Karate Club':
    data = ds.KarateClub().data
else:
    data = ds.Planetoid('/tmp/' + name, name).data
op.output_vs('vs', len(data.x))
op.output_es('es', data.edge_index)
op.output('x', data.x, type=util.DoubleVectorAttribute)
op.output('y', data.y, type=util.DoubleAttribute)
import torch_geometric.datasets as datasets


def load_pubmed():
    """Load PubMed and return features, labels, and the standard split masks."""
    ple = datasets.Planetoid(root="./datasets/", name="PubMed")
    data = ple[0]
    return data.x, data.y, data.train_mask, data.val_mask, data.test_mask
import torch_geometric.datasets as datasets


def get_cora():
    dataset = datasets.Planetoid(root='./dataset/Cora', name='Cora')
    return dataset.data
def get_citeseer():
    dataset = datasets.Planetoid(root='./dataset/Citeseer', name='CiteSeer')
    return dataset.data
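# Illustrative use of the two loaders above (this printout is an example,
# not part of the original code):
if __name__ == '__main__':
    cora = get_cora()
    print('Cora:', cora.num_nodes, 'nodes,', cora.num_edges, 'edges')
    citeseer = get_citeseer()
    print('CiteSeer:', citeseer.num_nodes, 'nodes,', citeseer.num_edges, 'edges')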