import os
import os.path as osp
import time

import torch
from torch_geometric.data import Data
from torch_geometric.datasets import Flickr, Yelp
from torch_geometric.utils import k_hop_subgraph


def sub_data_maker(data_name):
    name = 'Sub_{}'.format(data_name)
    path = osp.join(osp.dirname(osp.realpath(__file__)), 'data', data_name)
    # The second positional argument of Flickr/Yelp is `transform`, so the
    # dataset name must not be passed there.
    if data_name == "Flickr":
        dataset = Flickr(path)
    else:
        dataset = Yelp(path)
    print(data_name)

    start = time.perf_counter()
    f_data = []
    for i in range(0, 80000, 100):
        # k_hop_subgraph returns (subset, edge_index, mapping, edge_mask);
        # call it once per node instead of twice.
        index, adj, _, _ = k_hop_subgraph(i, 1, dataset.data.edge_index,
                                          relabel_nodes=True)
        feature = dataset.data.x[index]
        label = dataset.data.y[index]
        f_data.append(Data(x=feature, edge_index=adj, y=label))

    os.makedirs('./data/{}/processed'.format(name), exist_ok=True)
    torch.save(f_data, './data/{}/processed/data.pt'.format(name))
    end = time.perf_counter()
    print("time consuming {:.2f}".format(end - start))
    print(f_data[:10])
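# Usage sketch (an assumption, not from the source): build the 1-hop
# subgraph list for Flickr, then reload the saved list. Assumes the raw
# Flickr files are available under ./data/Flickr.
if __name__ == '__main__':
    sub_data_maker('Flickr')
    sub_flickr = torch.load('./data/Sub_Flickr/processed/data.pt')
    print(len(sub_flickr), sub_flickr[0])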
def load_dataset(dataset='flickr'):
    """
    Args:
        dataset: str, name of the dataset. The raw files are assumed to live
            in ./data/your_dataset/raw; torch_geometric.datasets will
            preprocess them automatically and save the processed dataset
            into ./data/your_dataset/processed.
    Returns:
        dataset
    """
    path = osp.join(osp.dirname(osp.realpath(__file__)), 'data', dataset)
    if dataset == 'flickr':
        dataset = Flickr(path)
    elif dataset == 'reddit':
        dataset = Reddit(path)
    elif dataset in ('ppi', 'ppi-large'):
        dataset = PPI(path)
    elif dataset == 'yelp':
        dataset = Yelp(path)
    else:
        raise KeyError('Dataset name error')
    return dataset
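# Minimal usage sketch (an assumption, not from the source):
dataset = load_dataset('flickr')
data = dataset[0]
print(data.num_nodes, dataset.num_classes)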
def __init__(self, path: str):
    pyg_dataset = Flickr(os.path.join(path, '_pyg'))
    if hasattr(pyg_dataset, "__data_list__"):
        delattr(pyg_dataset, "__data_list__")
    if hasattr(pyg_dataset, "_data_list"):
        delattr(pyg_dataset, "_data_list")
    pyg_data = pyg_dataset[0]
    static_graph = GeneralStaticGraphGenerator.create_homogeneous_static_graph(
        {
            'x': pyg_data.x,
            'y': pyg_data.y,
            'train_mask': pyg_data.train_mask,
            'val_mask': pyg_data.val_mask,
            'test_mask': pyg_data.test_mask,
        }, pyg_data.edge_index)
    super(FlickrDataset, self).__init__([static_graph])
def get_dataset(dataset_name):
    """Retrieve the dataset corresponding to the given name."""
    path = join('dataset', dataset_name)
    if dataset_name == 'reddit':
        dataset = Reddit(path)
    elif dataset_name == 'flickr':
        dataset = Flickr(path)
    elif dataset_name == 'zinc':
        dataset = ZINC(root='dataset', subset=True, split='train')
    elif dataset_name == 'QM9':
        dataset = QM9(root='dataset')
    elif dataset_name == 'github':
        dataset = GitHub(path)
    elif dataset_name == 'ppi':
        dataset = PPI(path)
    elif dataset_name in ['amazon_comp', 'amazon_photo']:
        name = "Computers" if dataset_name == 'amazon_comp' else "Photo"
        dataset = Amazon(path, name, T.NormalizeFeatures())
        data = dataset.data
        # Amazon ships without a public split; build a 60/20/20 random split.
        # Note: the "masks" stored below are index tensors, not boolean masks.
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])),
                                               test_size=0.4, random_state=42)
        idx_val, idx_test = train_test_split(idx_test, test_size=0.5,
                                             random_state=42)
        data.train_mask = torch.tensor(idx_train)
        data.val_mask = torch.tensor(idx_val)
        data.test_mask = torch.tensor(idx_test)
        dataset.data = data
    elif dataset_name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(path, name=dataset_name, split="public",
                            transform=T.NormalizeFeatures())
    else:
        raise NotImplementedError
    return dataset
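# Usage sketch (an assumption, not from the source). Because the Amazon
# branch stores random index-based splits, downstream code must treat
# data.train_mask as node indices rather than a boolean mask:
dataset = get_dataset('amazon_photo')
data = dataset.data
print(data.train_mask[:5])  # first five training node indices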
parser.add_argument('--ratio_graph', type=int, default=90)
parser.add_argument("--draw", type=int, default=100)
# argparse's type=bool parses any non-empty string (including "False") as
# True, so expose this as a flag instead.
parser.add_argument('--use_gdc', action='store_true')
parser.add_argument('--save_file', type=str, default="model.pth.tar")
parser.add_argument('--lookback', type=int, default=3)
parser.add_argument("--thres", type=float, default=0.0)
parser.add_argument("--dataset", type=str, default="CiteSeer")
parser.add_argument("--log", type=str, default="{:05d}")
args = parser.parse_args()

dataset = args.dataset
logging.basicConfig(filename=f"test_{dataset}_mask_change.txt",
                    level=logging.DEBUG)
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)
if dataset == "F":  # shorthand for Flickr
    dataset = Flickr(path, transform=T.NormalizeFeatures())
    print(len(dataset))
else:
    dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
    # print(f"Number of graphs in {dataset} dataset:", len(dataset))
data = dataset[0]

model, data = Net(dataset, data, args).to(device), data.to(device)
checkpoint = torch.load(f"./pretrain_pytorch/{args.dataset}_model.pth.tar")
model.load_state_dict(checkpoint)
loss = lambda m: F.nll_loss(m()[data.train_mask], data.y[data.train_mask])

# Construct ADMM training: prune over the two sparse adjacency supports.
support1 = model.adj1  # sparse
support2 = model.adj2  # sparse
partial_adj_mask = support1.clone()
adj_variables = [support1, support2]
import os.path as osp

import torch
import torch.nn.functional as F
from torch.nn import Linear
from torch.utils.data import DataLoader

from torch_geometric.datasets import Flickr
import torch_geometric.transforms as T

K = 2
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Flickr')
# T.SIGN(K) precomputes the K diffused feature matrices used by SIGN.
transform = T.Compose([T.NormalizeFeatures(), T.SIGN(K)])
dataset = Flickr(path, transform=transform)
data = dataset[0]

train_idx = data.train_mask.nonzero(as_tuple=False).view(-1)
val_idx = data.val_mask.nonzero(as_tuple=False).view(-1)
test_idx = data.test_mask.nonzero(as_tuple=False).view(-1)

train_loader = DataLoader(train_idx, batch_size=16 * 1024, shuffle=True)
val_loader = DataLoader(val_idx, batch_size=32 * 1024)
test_loader = DataLoader(test_idx, batch_size=32 * 1024)


class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # One linear projection per propagation step (including hop 0).
        self.lins = torch.nn.ModuleList()
        for _ in range(K + 1):
            self.lins.append(Linear(dataset.num_node_features, 1024))
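        # The constructor is truncated above; a minimal sketch of how a
        # SIGN-style head is typically completed (the output layer and the
        # forward pass below are assumptions, not from the source):
        self.lin = Linear((K + 1) * 1024, dataset.num_classes)

    def forward(self, xs):
        # xs = [data.x, data.x1, ..., data.xK], precomputed by T.SIGN(K).
        hs = [F.relu(lin(x)) for x, lin in zip(xs, self.lins)]
        return self.lin(torch.cat(hs, dim=-1)).log_softmax(dim=-1)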
import os.path as osp
import argparse

import torch
import torch.nn.functional as F

from torch_geometric.datasets import Flickr
from torch_geometric.loader import GraphSAINTRandomWalkSampler
from torch_geometric.nn import GraphConv
from torch_geometric.utils import degree

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Flickr')
dataset = Flickr(path)
data = dataset[0]
row, col = data.edge_index
data.edge_weight = 1. / degree(col, data.num_nodes)[col]  # Norm by in-degree.

parser = argparse.ArgumentParser()
parser.add_argument('--use_normalization', action='store_true')
args = parser.parse_args()

loader = GraphSAINTRandomWalkSampler(data, batch_size=6000, walk_length=2,
                                     num_steps=5, sample_coverage=100,
                                     save_dir=dataset.processed_dir,
                                     num_workers=4)


class Net(torch.nn.Module):
    def __init__(self, hidden_channels):
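        # The class body is truncated here; a minimal continuation sketch
        # using the imported GraphConv (the layer layout and forward pass
        # are assumptions, not from the source):
        super().__init__()
        self.conv1 = GraphConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.lin = torch.nn.Linear(2 * hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, edge_weight=None):
        x1 = F.relu(self.conv1(x, edge_index, edge_weight))
        x2 = F.relu(self.conv2(x1, edge_index, edge_weight))
        # Jumping-knowledge style concatenation of both layer outputs.
        return F.log_softmax(self.lin(torch.cat([x1, x2], dim=-1)), dim=-1)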
def __init__(self, path):
    # Instantiate Flickr only for its side effect: downloading and
    # processing the raw files into `path` before the parent class loads them.
    Flickr(path)
    super(FlickrDataset, self).__init__(path)
def load_dataset(device, args):
    """Load dataset and move graph and features to device."""
    if args.dataset in [
            "reddit", "cora", "ppi", "ppi_large", "yelp", "flickr"
    ]:
        if args.dataset == "reddit":
            from dgl.data import RedditDataset
            data = RedditDataset(self_loop=True)
            g = data[0]
            g = dgl.add_self_loop(g)
            n_classes = data.num_classes
        elif args.dataset == "cora":
            from dgl.data import CitationGraphDataset
            data = CitationGraphDataset(
                'cora', raw_dir=os.path.join(args.data_dir, 'cora'))
            g = data[0]
            g = dgl.remove_self_loop(g)
            g = dgl.add_self_loop(g)
            n_classes = data.num_classes
        elif args.dataset == "ppi":
            data = load_ppi_data(args.data_dir)
            g = data.g
            n_classes = data.num_classes
        elif args.dataset == "ppi_large":
            data = load_ppi_large_data()
            g = data.g
            n_classes = data.num_classes
        elif args.dataset == "yelp":
            from torch_geometric.datasets import Yelp
            pyg_data = Yelp(os.path.join(args.data_dir, 'yelp'))[0]
            feat = pyg_data.x
            labels = pyg_data.y
            u, v = pyg_data.edge_index
            g = dgl.graph((u, v))
            g.ndata['feat'] = feat
            g.ndata['label'] = labels
            g.ndata['train_mask'] = pyg_data.train_mask
            g.ndata['val_mask'] = pyg_data.val_mask
            g.ndata['test_mask'] = pyg_data.test_mask
            # Yelp is multi-label, so the label width gives the class count.
            n_classes = labels.size(1)
        elif args.dataset == "flickr":
            from torch_geometric.datasets import Flickr
            pyg_data = Flickr(os.path.join(args.data_dir, "flickr"))[0]
            feat = pyg_data.x
            labels = pyg_data.y
            u, v = pyg_data.edge_index
            g = dgl.graph((u, v))
            g.ndata['feat'] = feat
            g.ndata['label'] = labels
            g.ndata['train_mask'] = pyg_data.train_mask
            g.ndata['val_mask'] = pyg_data.val_mask
            g.ndata['test_mask'] = pyg_data.test_mask
            n_classes = labels.max().item() + 1

        train_mask = g.ndata['train_mask']
        val_mask = g.ndata['val_mask']
        test_mask = g.ndata['test_mask']
        train_nid = train_mask.nonzero().squeeze().long()
        val_nid = val_mask.nonzero().squeeze().long()
        test_nid = test_mask.nonzero().squeeze().long()
        g = g.to(device)
        labels = g.ndata['label']
    else:
        dataset = DglNodePropPredDataset(name=args.dataset,
                                         root=args.data_dir)
        splitted_idx = dataset.get_idx_split()
        train_nid = splitted_idx["train"]
        val_nid = splitted_idx["valid"]
        test_nid = splitted_idx["test"]
        g, labels = dataset[0]
        n_classes = dataset.num_classes
        g = g.to(device)

        if args.dataset == "ogbn-arxiv":
            g = dgl.add_reverse_edges(g, copy_ndata=True)
            g = dgl.add_self_loop(g)
            g.ndata['feat'] = g.ndata['feat'].float()
        elif args.dataset == "ogbn-papers100M":
            g = dgl.add_reverse_edges(g, copy_ndata=True)
            g.ndata['feat'] = g.ndata['feat'].float()
            labels = labels.long()
        elif args.dataset == "ogbn-mag":
            # MAG is a heterogeneous graph; the task is to make
            # predictions for paper nodes.
            path = os.path.join(args.emb_path,
                                f"{args.pretrain_model}_mag")
            labels = labels["paper"]
            train_nid = train_nid["paper"]
            val_nid = val_nid["paper"]
            test_nid = test_nid["paper"]
            features = g.nodes['paper'].data['feat']
            author_emb = torch.load(
                os.path.join(path, "author.pt"),
                map_location=torch.device("cpu")).float()
            topic_emb = torch.load(
                os.path.join(path, "field_of_study.pt"),
                map_location=torch.device("cpu")).float()
            institution_emb = torch.load(
                os.path.join(path, "institution.pt"),
                map_location=torch.device("cpu")).float()
            g.nodes["author"].data["feat"] = author_emb.to(device)
            g.nodes["institution"].data["feat"] = institution_emb.to(device)
            g.nodes["field_of_study"].data["feat"] = topic_emb.to(device)
            g.nodes["paper"].data["feat"] = features.to(device)
            paper_dim = g.nodes["paper"].data["feat"].shape[1]
            author_dim = g.nodes["author"].data["feat"].shape[1]
            if paper_dim != author_dim:
                # Randomly project paper features so all node types share
                # the same feature dimension.
                paper_feat = g.nodes["paper"].data.pop("feat")
                rand_weight = torch.Tensor(paper_dim,
                                           author_dim).uniform_(-0.5, 0.5)
                g.nodes["paper"].data["feat"] = torch.matmul(
                    paper_feat, rand_weight.to(device))
                print(f"Randomly project paper feature from dimension "
                      f"{paper_dim} to {author_dim}")
            labels = labels.to(device).squeeze()
            n_classes = int(labels.max() - labels.min()) + 1
        else:
            g.ndata['feat'] = g.ndata['feat'].float()
            labels = labels.squeeze()

    evaluator = get_evaluator(args.dataset)

    print(f"# Nodes: {g.number_of_nodes()}\n"
          f"# Edges: {g.number_of_edges()}\n"
          f"# Train: {len(train_nid)}\n"
          f"# Val: {len(val_nid)}\n"
          f"# Test: {len(test_nid)}\n"
          f"# Classes: {n_classes}")

    return g, labels, n_classes, train_nid, val_nid, test_nid, evaluator
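# Usage sketch (an assumption, not from the source): `args` must provide at
# least `dataset` and `data_dir` (plus `emb_path`/`pretrain_model` for
# ogbn-mag).
g, labels, n_classes, train_nid, val_nid, test_nid, evaluator = load_dataset(
    torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'), args)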
elif DATASET == "Reddit": real_data = Reddit(root=input_path) elif DATASET == "Amazon Computers": real_data = Amazon(root=input_path, name="Computers") elif DATASET == "Amazon Photos": real_data = Amazon(root=input_path, name="Photo") elif DATASET == "CLUSTER": real_data = GNNBenchmarkDataset(root=input_path, name="CLUSTER", split="test") elif DATASET == "PATTERN": real_data = GNNBenchmarkDataset(root=input_path, name="PATTERN", split="test") elif DATASET == "Flickr": real_data = Flickr(root=input_path) elif DATASET == "OGB Products": real_data = PygNodePropPredDataset(name='ogbn-products') split_idx = real_data.get_idx_split() elif DATASET == "GitHub Network": gitGraph = from_networkx(load_graph(input_path + '/musae_git_edges.csv')) gitGraph.x = torch.tensor( load_features(input_path + '/musae_git_features.json')) gitGraph.y = torch.tensor( load_targets(input_path + '/musae_git_target.csv')) elif DATASET == "SBM": # Size of blocks COMMUNITY_SIZE = 400 # Number of clusters
def prepare_data(self):
    path = osp.join(osp.dirname(osp.realpath(__file__)), "..", "..", "data",
                    self.NAME)
    self.dataset = Flickr(path)
    self.data = self.dataset[0]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='0')
    parser.add_argument('--model', type=str,
                        default='indGCN')  # indGCN, GraphSAGE
    parser.add_argument('--dataset', type=str,
                        default='Reddit')  # Reddit; Flickr
    parser.add_argument('--batch', type=int, default=512)  # 512; 1024
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--hidden', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--binarize', action='store_true')
    args = parser.parse_args()
    print(args)

    assert args.dataset in ['Flickr', 'Reddit'], \
        'For dataset, only Flickr and Reddit are available'
    path = '/home/wangjunfu/dataset/graph/' + str(args.dataset)
    if args.dataset == 'Flickr':
        dataset = Flickr(path)
    else:
        dataset = Reddit(path)
    data = dataset[0]

    train_loader = NeighborSampler(data.edge_index, node_idx=data.train_mask,
                                   sizes=[25, 10], batch_size=args.batch,
                                   shuffle=True, num_workers=12)
    subgraph_loader = NeighborSampler(data.edge_index, node_idx=None,
                                      sizes=[-1], batch_size=args.batch,
                                      shuffle=False, num_workers=12)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')
    assert args.model in ['indGCN', 'GraphSAGE'], \
        'Only indGCN and GraphSAGE are available.'
    model = NeighborSamplingGCN(args.model, dataset.num_features, args.hidden,
                                dataset.num_classes, args.binarize,
                                args.dropout).to(device)

    test_accs = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        best_val, best_test = 0.0, 0.0
        for epoch in range(1, args.epochs + 1):
            loss, acc = train(model, data, train_loader, optimizer, device)
            train_f1, val_f1, test_f1 = test(model, data, subgraph_loader,
                                             device)
            if val_f1 > best_val:
                best_val = val_f1
                best_test = test_f1
            if args.runs == 1:
                print("Epoch: {:d}, Loss: {:.4f}, Train f1: {:.4f}, "
                      "Val f1: {:.4f}, Test f1: {:.4f}".format(
                          epoch, loss, train_f1, val_f1, test_f1))
        test_accs.append(best_test)
        print("Run: {:d}, best_test: {:.4f}".format(run, best_test))

    test_accs = torch.tensor(test_accs)
    print("Average test f1 score: {:.4f} ± {:.4f}".format(
        test_accs.mean(), test_accs.std()))
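# Example invocation (the script name is a placeholder; the Flickr values
# follow the inline comments above):
#   python train_sampling.py --dataset Flickr --model GraphSAGE --batch 1024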
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", type=str, default='0')
    parser.add_argument('--model', type=str, default='GraphSAINT')
    parser.add_argument('--dataset', type=str,
                        default='Reddit')  # Reddit or Flickr
    parser.add_argument('--batch', type=int,
                        default=2000)  # Reddit: 2000, Flickr: 6000
    parser.add_argument('--walk_length', type=int,
                        default=4)  # Reddit: 4, Flickr: 2
    parser.add_argument('--sample_coverage', type=int,
                        default=50)  # Reddit: 50, Flickr: 100
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--epochs', type=int, default=100)  # 100, 50
    parser.add_argument('--lr', type=float, default=0.01)  # 0.01, 0.001
    parser.add_argument('--weight_decay', type=float, default=0.0005)
    parser.add_argument('--hidden', type=int, default=256)  # 128, 256
    parser.add_argument('--dropout', type=float, default=0.1)  # 0.1, 0.2
    parser.add_argument('--use_normalization', action='store_true')
    parser.add_argument('--binarize', action='store_true')
    args = parser.parse_args()

    assert args.model in ['GraphSAINT']
    assert args.dataset in ['Flickr', 'Reddit']
    path = '/home/wangjunfu/dataset/graph/' + str(args.dataset)
    if args.dataset == 'Flickr':
        dataset = Flickr(path)
    else:
        dataset = Reddit(path)
    data = dataset[0]
    row, col = data.edge_index
    data.edge_weight = 1. / degree(col,
                                   data.num_nodes)[col]  # Norm by in-degree.

    loader = GraphSAINTRandomWalkSampler(data, batch_size=args.batch,
                                         walk_length=args.walk_length,
                                         num_steps=5,
                                         sample_coverage=args.sample_coverage,
                                         save_dir=dataset.processed_dir,
                                         num_workers=0)
    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')
    model = SAINT(data.num_node_features, args.hidden, dataset.num_classes,
                  args.dropout, args.binarize).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    val_f1s, test_f1s = [], []
    for run in range(1, args.runs + 1):
        best_val, best_test = 0, 0
        model.reset_parameters()
        start_time = time.time()
        for epoch in range(1, args.epochs + 1):
            loss = train(model, loader, optimizer, device,
                         args.use_normalization)
            accs = test(model, data, device, args.use_normalization)
            if accs[1] > best_val:
                best_val = accs[1]
                best_test = accs[2]
            if args.runs == 1:
                print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, '
                      f'Train: {accs[0]:.4f}, Val: {accs[1]:.4f}, '
                      f'Test: {accs[2]:.4f}')
        test_f1s.append(best_test)
        print("Run: {:d}, best val: {:.4f}, best test: {:.4f}, "
              "time cost: {:d}s".format(run, best_val, best_test,
                                        int(time.time() - start_time)))

    test_f1s = torch.tensor(test_f1s)
    print("{:.4f} ± {:.4f}".format(test_f1s.mean(), test_f1s.std()))
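# Example invocation for Flickr (script name is a placeholder; the
# hyperparameters follow the inline comments above):
#   python graphsaint.py --dataset Flickr --batch 6000 --walk_length 2 \
#       --sample_coverage 100 --epochs 50 --lr 0.001 --hidden 256 --dropout 0.2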