def create_data(contour, translation, img_path_0, img_path_1, osvos_model, k): """Returns a torch_geometric.data.Data object. The data object consists of: * x: Node feature matrix of shape [num_nodes, num_node_features]. The features of each node are the concatenated OSVOS feature vectors of the current and the next frame. * edge_index: Graph connectivity in COO format of shape (2, num_edges) and type torch.long. Each node is connected to its K nearest neighbours. * edge_attr: Edge feature matrix with shape [num_edges, num_edge_features]. The feature of each edge is the inverse distance between the two nodes it connects. * y, optional: The target of each node is the displacement of the node between the current and the next frame. Parameters ---------- contour : ndarray Array of shape (num_contour_points, 2) containing contour points translation : ndarray Array of shape (num_contour_points, 2) containing translations from current to next contour point img_path_0 : str Path to current image img_path_1 : str Path to next image osvos_model : torch.nn.Sequential OSVOS model from which feature vectors can be extracted k : int Number of neighbours used to compute the KNN graph of contour points Returns ------- data : torch_geometric.data.Data Data object consisting of x, edge_index, edge_attr, and (optionally) y """ contour = torch.from_numpy(contour) img_0 = np.moveaxis(imread(img_path_0), 2, 0).astype(np.float64) img_0 = np.expand_dims(img_0, axis=0) img_0 = torch.from_numpy(img_0) img_1 = np.moveaxis(imread(img_path_1), 2, 0).astype(np.float64) img_1 = np.expand_dims(img_1, axis=0) img_1 = torch.from_numpy(img_1) # x: Node feature matrix x_1 = get_OSVOS_feature_vectors(contour, img_0, osvos_model) x_2 = get_OSVOS_feature_vectors(contour, img_1, osvos_model) x = torch.cat((x_1, x_2), 1) # edge_index: Graph connectivity in COO format edge_index = knn_graph(contour, k) edge_index = to_undirected(edge_index) # edge_attr: Edge feature matrix edge_attr = get_edge_attribute(contour, edge_index) # Create data object if translation is None: data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, contour=contour) else: # The target of each node is the displacement of the node between the current and the next frame y = torch.from_numpy(translation.astype(np.float64)) data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y, contour=contour) return data
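# A minimal sketch (hypothetical, not from the source) of the `get_edge_attribute`
# helper assumed above: the feature of each edge is the inverse distance between
# the two contour points it connects, returned with shape [num_edges, 1].
import torch

def get_edge_attribute(contour, edge_index, eps=1e-8):
    pos = contour.float()  # (num_contour_points, 2)
    src, dst = pos[edge_index[0]], pos[edge_index[1]]
    dist = (src - dst).norm(p=2, dim=1)
    # Guard against zero-length edges before inverting the distance.
    return (1.0 / (dist + eps)).unsqueeze(1)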
def tree_decomposition(mol): r"""The tree decomposition algorithm of molecules from the `"Junction Tree Variational Autoencoder for Molecular Graph Generation" <https://arxiv.org/abs/1802.04364>`_ paper. Returns the graph connectivity of the junction tree, the assignment mapping of each atom to the clique in the junction tree, and the number of cliques. Args: mol (rdkit.Chem.Mol): A :obj:`rdkit` molecule. :rtype: (LongTensor, LongTensor, int) """ if Chem is None: raise ImportError('Package `rdkit` could not be found.') # Cliques = rings and bonds. cliques = [list(x) for x in Chem.GetSymmSSSR(mol)] for bond in mol.GetBonds(): if not bond.IsInRing(): cliques.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()]) # Generate `atom2clique` mappings. atom2clique = [[] for i in range(mol.GetNumAtoms())] for c in range(len(cliques)): for atom in cliques[c]: atom2clique[atom].append(c) # Merge rings that share more than 2 atoms as they form bridged compounds. for c1 in range(len(cliques)): for atom in cliques[c1]: for c2 in atom2clique[atom]: if c1 >= c2 or len(cliques[c1]) <= 2 or len(cliques[c2]) <= 2: continue if len(set(cliques[c1]) & set(cliques[c2])) > 2: cliques[c1] = set(cliques[c1]) | set(cliques[c2]) cliques[c2] = [] cliques = [c for c in cliques if len(c) > 0] # Update `atom2clique` mappings. atom2clique = [[] for i in range(mol.GetNumAtoms())] for c in range(len(cliques)): for atom in cliques[c]: atom2clique[atom].append(c) # Add singleton cliques in case there are more than 2 intersecting # cliques. We further compute the "initial" clique graph. edges = {} for atom in range(mol.GetNumAtoms()): cs = atom2clique[atom] if len(cs) <= 1: continue # Number of bond clusters that the atom lies in. bonds = [c for c in cs if len(cliques[c]) == 2] # Number of ring clusters that the atom lies in. rings = [c for c in cs if len(cliques[c]) > 4] if len(bonds) > 2 or (len(bonds) == 2 and len(cs) > 2): cliques.append([atom]) c2 = len(cliques) - 1 for c1 in cs: edges[(c1, c2)] = 1 elif len(rings) > 2: cliques.append([atom]) c2 = len(cliques) - 1 for c1 in cs: edges[(c1, c2)] = 99 else: for i in range(len(cs)): for j in range(i + 1, len(cs)): c1, c2 = cs[i], cs[j] count = len(set(cliques[c1]) & set(cliques[c2])) edges[(c1, c2)] = min(count, edges.get((c1, c2), 99)) if len(edges) > 0: edge_index_T, weight = zip(*edges.items()) row, col = torch.tensor(edge_index_T).t() inv_weight = 100 - torch.tensor(weight) clique_graph = SparseTensor(row=row, col=col, value=inv_weight, sparse_sizes=(len(cliques), len(cliques))) junc_tree = minimum_spanning_tree(clique_graph.to_scipy('csr')) row, col, _ = SparseTensor.from_scipy(junc_tree).coo() edge_index = torch.stack([row, col], dim=0) edge_index = to_undirected(edge_index, num_nodes=len(cliques)) else: edge_index = torch.empty((2, 0), dtype=torch.long) rows = [[i] * len(atom2clique[i]) for i in range(mol.GetNumAtoms())] row = torch.tensor(list(chain.from_iterable(rows))) col = torch.tensor(list(chain.from_iterable(atom2clique))) atom2clique = torch.stack([row, col], dim=0) return edge_index, atom2clique, len(cliques)
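# Usage sketch for `tree_decomposition` (a variant of it also ships as
# `torch_geometric.utils.tree_decomposition`), assuming `rdkit` is installed:
from rdkit import Chem

mol = Chem.MolFromSmiles('C1=CC=CC=C1O')  # phenol: a benzene ring plus a C-O bond
edge_index, atom2clique, num_cliques = tree_decomposition(mol)
print(num_cliques)       # e.g. 2: the ring clique and the C-O bond clique
print(edge_index.shape)  # (2, num_junction_tree_edges), undirected
print(atom2clique)       # row 0: atom indices, row 1: their clique assignments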
def main(): parser = argparse.ArgumentParser(description='OGBN-Arxiv (Full-Batch)') parser.add_argument('--device', type=int, default=0) parser.add_argument('--log_steps', type=int, default=1) parser.add_argument('--use_sage', action='store_true') parser.add_argument('--num_layers', type=int, default=3) parser.add_argument('--hidden_channels', type=int, default=256) parser.add_argument('--dropout', type=float, default=0.5) parser.add_argument('--lr', type=float, default=0.01) parser.add_argument('--epochs', type=int, default=500) parser.add_argument('--runs', type=int, default=10) args = parser.parse_args() print(args) device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' device = torch.device(device) dataset = PygNodePropPredDataset(name='ogbn-arxiv') split_idx = dataset.get_idx_split() data = dataset[0] x = data.x.to(device) y_true = data.y.to(device) train_idx = split_idx['train'].to(device) edge_index = data.edge_index.to(device) edge_index = to_undirected(edge_index, data.num_nodes) adj = SparseTensor(row=edge_index[0], col=edge_index[1]) if args.use_sage: model = SAGE(data.x.size(-1), args.hidden_channels, dataset.num_classes, args.num_layers, args.dropout).to(device) else: model = GCN(data.x.size(-1), args.hidden_channels, dataset.num_classes, args.num_layers, args.dropout).to(device) # Pre-compute GCN normalization. adj = adj.set_diag() deg = adj.sum(dim=1).to(torch.float) deg_inv_sqrt = deg.pow(-0.5) deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0 adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1) evaluator = Evaluator(name='ogbn-arxiv') logger = Logger(args.runs, args) for run in range(args.runs): model.reset_parameters() optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) for epoch in range(1, 1 + args.epochs): loss = train(model, x, adj, y_true, train_idx, optimizer) result = test(model, x, adj, y_true, split_idx, evaluator) logger.add_result(run, result) if epoch % args.log_steps == 0: train_acc, valid_acc, test_acc = result print(f'Run: {run + 1:02d}, ' f'Epoch: {epoch:02d}, ' f'Loss: {loss:.4f}, ' f'Train: {100 * train_acc:.2f}%, ' f'Valid: {100 * valid_acc:.2f}% ' f'Test: {100 * test_acc:.2f}%') logger.print_statistics(run) logger.print_statistics()
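# The script above assumes `train` and `test` helpers. A minimal sketch of what
# they might look like (hypothetical; it assumes the model returns
# log-probabilities and mirrors the calls `train(model, x, adj, y_true,
# train_idx, optimizer)` and `test(model, x, adj, y_true, split_idx, evaluator)`):
import torch
import torch.nn.functional as F

def train(model, x, adj, y_true, train_idx, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(x, adj)[train_idx]
    loss = F.nll_loss(out, y_true.squeeze(1)[train_idx])
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def test(model, x, adj, y_true, split_idx, evaluator):
    model.eval()
    y_pred = model(x, adj).argmax(dim=-1, keepdim=True)
    accs = []
    for split in ['train', 'valid', 'test']:
        accs.append(evaluator.eval({
            'y_true': y_true[split_idx[split]],
            'y_pred': y_pred[split_idx[split]],
        })['acc'])
    return tuple(accs)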
def main(): args = ArgsInit().save_exp() if args.use_gpu: device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu") else: device = torch.device('cpu') dataset = PygNodePropPredDataset(name=args.dataset) data = dataset[0] split_idx = dataset.get_idx_split() evaluator = Evaluator(args.dataset) x = data.x.to(device) y_true = data.y.to(device) train_idx = split_idx['train'].to(device) edge_index = data.edge_index.to(device) edge_index = to_undirected(edge_index, data.num_nodes) if args.self_loop: edge_index = add_self_loops(edge_index, num_nodes=data.num_nodes)[0] sub_dir = 'SL_{}'.format(args.self_loop) args.in_channels = data.x.size(-1) args.num_tasks = dataset.num_classes logging.info('%s' % args) model = DeeperGCN(args).to(device) logging.info(model) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) results = {'highest_valid': 0, 'final_train': 0, 'final_test': 0, 'highest_train': 0} start_time = time.time() for epoch in range(1, args.epochs + 1): # epoch_loss = train(model, x, edge_index, y_true, train_idx, optimizer) epoch_loss = train_flag(model, x, edge_index, y_true, train_idx, optimizer, device, args) logging.info('Epoch {}, training loss {:.4f}'.format(epoch, epoch_loss)) model.print_params(epoch=epoch) result = test(model, x, edge_index, y_true, split_idx, evaluator) logging.info(result) train_accuracy, valid_accuracy, test_accuracy = result if train_accuracy > results['highest_train']: results['highest_train'] = train_accuracy if valid_accuracy > results['highest_valid']: results['highest_valid'] = valid_accuracy results['final_train'] = train_accuracy results['final_test'] = test_accuracy save_ckpt(model, optimizer, round(epoch_loss, 4), epoch, args.model_save_path, sub_dir, name_post='valid_best') logging.info("%s" % results) end_time = time.time() total_time = end_time - start_time logging.info('Total time: {}'.format(time.strftime('%H:%M:%S', time.gmtime(total_time))))
def step1_inductive(task, name, ego_size=128, num_iter=1000, log_steps=10000, num_workers=16, method='acl'): dataset = create_dataset(name=f'{task}-{name}') data = dataset[0] N = data.num_nodes edge_index = data.edge_index edge_index = to_undirected(edge_index) if hasattr(data, "edge_index_train"): edge_index_train = data.edge_index_train edge_index_train = to_undirected(edge_index_train) else: edge_index_train = edge_index adj = csr_matrix((np.ones(edge_index.shape[1]), edge_index), shape=(N, N)) adj_train = csr_matrix( (np.ones(edge_index_train.shape[1]), edge_index_train), shape=(N, N)) idx_split = dataset.get_idx_split() train_idx = idx_split["train"].cpu().numpy() valid_idx = idx_split["valid"].cpu().numpy() test_idx = idx_split["test"].cpu().numpy() global graphlocal global graphlocal_train graphlocal = GraphLocal.from_sparse_adjacency(adj) graphlocal_train = GraphLocal.from_sparse_adjacency(adj_train) print('graphlocal generated') with multiprocessing.Pool(num_workers) as pool: ego_graphs_train, conds_train = zip( *pool.imap(calc_inductive_train, [(i, log_steps, num_iter, ego_size, method) for i in train_idx], chunksize=512)) with multiprocessing.Pool(num_workers) as pool: ego_graphs_valid, conds_valid = zip( *pool.imap(calc_inductive, [(i, log_steps, num_iter, ego_size, method) for i in valid_idx], chunksize=512)) with multiprocessing.Pool(num_workers) as pool: ego_graphs_test, conds_test = zip( *pool.imap(calc_inductive, [(i, log_steps, num_iter, ego_size, method) for i in test_idx], chunksize=512)) ego_graphs = [] conds = [] ego_graphs.extend(ego_graphs_train) ego_graphs.extend(ego_graphs_valid) ego_graphs.extend(ego_graphs_test) conds.extend(conds_train) conds.extend(conds_valid) conds.extend(conds_test) if method == 'acl': np.save(f"data/{name}-lc-ego-graphs-{ego_size}.npy", ego_graphs) np.save(f"data/{name}-lc-conds-{ego_size}.npy", conds) else: np.save(f"data/{name}-lc-{method}-ego-graphs-{ego_size}.npy", ego_graphs) np.save(f"data/{name}-lc-{method}-conds-{ego_size}.npy", conds)
def test_to_undirected(): row = torch.tensor([0, 1, 1]) col = torch.tensor([1, 0, 2]) edge_index = to_undirected(torch.stack([row, col], dim=0)) assert edge_index.tolist() == [[0, 1, 1, 2], [1, 0, 2, 1]]
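# `to_undirected` not only adds reversed edges but also coalesces the result,
# so duplicate pairs collapse into a single sorted edge list:
import torch
from torch_geometric.utils import to_undirected

edge_index = torch.tensor([[0, 1, 0], [1, 0, 1]])  # (0, 1) appears twice
assert to_undirected(edge_index).tolist() == [[0, 1], [1, 0]]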
args.data_appendix += '_mnph{}'.format(args.max_nodes_per_hop) args.res_dir = os.path.join('results/{}{}'.format(args.dataset, args.save_appendix)) print('Results will be saved in ' + args.res_dir) if not os.path.exists(args.res_dir): os.makedirs(args.res_dir) if "ogbl" in args.dataset: dataset = PygLinkPropPredDataset(name=args.dataset) data = dataset[0] split_edge = dataset.get_edge_split() if args.use_valedges_as_input: val_edge_index = split_edge['valid']['edge'].t() val_edge_index = to_undirected(val_edge_index) data.edge_index = torch.cat([data.edge_index, val_edge_index], dim=-1) val_edge_weight = torch.ones([val_edge_index.size(1), 1], dtype=int) data.edge_weight = torch.cat([data.edge_weight, val_edge_weight], 0) else: if args.dataset_file is None: print("Dataset file required.") sys.exit() s, d, w = [], [], [] with open(args.dataset_file, 'r') as f: for index, line in enumerate(f): t1, t2, t3 = line.strip().split(" ") s.append(t1) d.append(t2)
def test_edges(data, cold_mask_node, val_ratio=0.05, test_ratio=0.1): r"""Builds the test split for cold-start nodes: gathers the positive and negative edges incident to :attr:`cold_mask_node`, stores them as `test_pos_edge_index` and `test_neg_edge_index`, and keeps the remaining (undirected) edges in `total_edge_index`. Args: data (Data): The data object. cold_mask_node (LongTensor): Cold-start nodes whose incident edges form the test split. val_ratio (float, optional): Unused, kept for interface compatibility. (default: :obj:`0.05`) test_ratio (float, optional): Unused, kept for interface compatibility. (default: :obj:`0.1`) :rtype: :class:`torch_geometric.data.Data` """ assert 'batch' not in data # No batch-mode. device = data.x.device num_nodes = data.num_nodes row, col = data.edge_index # data.edge_index = None # Return upper triangular portion. mask = row < col row, col = row[mask], col[mask] # Select train nodes edge_row, edge_col = row, col # print(data.edge_index.size()) data.cold_mask_node = cold_mask_node # Select test edges incident to the cold-start nodes for ind, i in enumerate(cold_mask_node): index = (row == i).nonzero() if (ind == 0): indice = index else: indice = torch.cat((indice, index), 0) test_indice = indice.squeeze() # print(test_indice.size()) a_r, a_c = row[test_indice], col[test_indice] data.test_pos_edge_index = torch.stack([a_r, a_c], dim=0) # print(data.test_pos_edge_index.size()) edge_mask = torch.Tensor(edge_row.size(0)).type(torch.bool).to( data.x.device) # print(edge_mask.size()) edge_mask[edge_mask < 1] = True # print(edge_mask.sum()) edge_mask = edge_mask.scatter_(0, test_indice, False) # print(edge_mask.sum()) # print(edge_row.size()) edge_row = edge_row[edge_mask] edge_col = edge_col[edge_mask] # print(edge_row.size()) data.total_edge_index = torch.stack((edge_row, edge_col), dim=0) # print(data.total_edge_index.size()) # Negative edges. neg_adj_mask = torch.ones(num_nodes, num_nodes, dtype=torch.uint8) neg_adj_mask = neg_adj_mask.triu(diagonal=1).to(torch.bool) neg_adj_mask[row, col] = 0 # mask out existing edges (inverse adjacency) neg_row, neg_col = neg_adj_mask.nonzero().t() # test negative for ind, i in enumerate(cold_mask_node): index = (neg_row == i).nonzero() if (ind == 0): indice = index else: indice = torch.cat((indice, index), 0) neg_test_indice = indice.squeeze() # perm_test = random.sample(range(neg_test_indice.size(0)), # test_indice.size(0)) # perm_test = torch.tensor(perm_test).to(torch.long) # neg_a_r, neg_a_c = neg_row[perm_val], neg_col[perm_val] # data.test_neg_edge_index = torch.stack([neg_a_r, neg_a_c], dim=0).to(device) neg_a_r, neg_a_c = neg_row[neg_test_indice], neg_col[neg_test_indice] data.test_neg_edge_index = torch.stack([neg_a_r, neg_a_c], dim=0).to(device) data.total_edge_index = to_undirected(data.total_edge_index) return data
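# The negative-edge bookkeeping above builds an upper-triangular mask with the
# existing edges zeroed out; here is that pattern on a toy 4-node path graph:
import torch

num_nodes = 4
row = torch.tensor([0, 1, 2])
col = torch.tensor([1, 2, 3])
neg_adj_mask = torch.ones(num_nodes, num_nodes, dtype=torch.uint8)
neg_adj_mask = neg_adj_mask.triu(diagonal=1).to(torch.bool)
neg_adj_mask[row, col] = 0  # remove the positive (upper-triangular) edges
neg_row, neg_col = neg_adj_mask.nonzero(as_tuple=False).t()
print(torch.stack([neg_row, neg_col]).tolist())  # [[0, 0, 1], [2, 3, 3]]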
else: dataset.graph['edge_index'] = to_sparse_tensor( dataset.graph['edge_index'], dataset.graph['edge_feat'], dataset.graph['num_nodes']) dataset.graph['node_feat'] = dataset.graph['edge_index'].mean(dim=1) dataset.graph['edge_index'].set_value_(None) dataset.graph['edge_feat'] = None n = dataset.graph['num_nodes'] # infer the number of classes for non one-hot and one-hot labels c = max(dataset.label.max().item() + 1, dataset.label.shape[1]) d = dataset.graph['node_feat'].shape[1] # whether or not to symmetrize if not args.directed and args.dataset != 'ogbn-proteins': dataset.graph['edge_index'] = to_undirected(dataset.graph['edge_index']) dataset.graph['edge_index'], dataset.graph['node_feat'] = \ dataset.graph['edge_index'].to(device), dataset.graph['node_feat'].to(device) print(f"num nodes {n} | num classes {c} | num node feats {d}") ### Load method ### model = parse_method(args, dataset, n, c, d, device) # using rocauc as the eval function if args.rocauc or args.dataset in ('yelp-chi', 'twitch-e', 'ogbn-proteins'): criterion = nn.BCEWithLogitsLoss() eval_func = eval_rocauc else: criterion = nn.NLLLoss()
def main(): parser = argparse.ArgumentParser(description="OGBL-Citation2 (GraphSAINT)") parser.add_argument("--device", type=int, default=0) parser.add_argument("--log_steps", type=int, default=1) parser.add_argument("--num_layers", type=int, default=3) parser.add_argument("--hidden_channels", type=int, default=256) parser.add_argument("--dropout", type=float, default=0.0) parser.add_argument("--batch_size", type=int, default=16 * 1024) parser.add_argument("--walk_length", type=int, default=3) parser.add_argument("--lr", type=float, default=0.001) parser.add_argument("--epochs", type=int, default=200) parser.add_argument("--num_steps", type=int, default=100) parser.add_argument("--eval_steps", type=int, default=10) parser.add_argument("--runs", type=int, default=10) args = parser.parse_args() print(args) device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu" device = torch.device(device) dataset = PygLinkPropPredDataset(name="ogbl-citation2") split_edge = dataset.get_edge_split() data = dataset[0] data.edge_index = to_undirected(data.edge_index, data.num_nodes) loader = GraphSAINTRandomWalkSampler( data, batch_size=args.batch_size, walk_length=args.walk_length, num_steps=args.num_steps, sample_coverage=0, save_dir=dataset.processed_dir, ) # We randomly pick some training samples that we want to evaluate on: torch.manual_seed(12345) idx = torch.randperm(split_edge["train"]["source_node"].numel())[:86596] split_edge["eval_train"] = { "source_node": split_edge["train"]["source_node"][idx], "target_node": split_edge["train"]["target_node"][idx], "target_node_neg": split_edge["valid"]["target_node_neg"], } model = GCN( data.x.size(-1), args.hidden_channels, args.hidden_channels, args.num_layers, args.dropout, ).to(device) predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1, args.num_layers, args.dropout).to(device) evaluator = Evaluator(name="ogbl-citation2") logger = Logger(args.runs, args) run_idx = 0 while run_idx < args.runs: model.reset_parameters() predictor.reset_parameters() optimizer = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()), lr=args.lr) run_success = True for epoch in range(1, 1 + args.epochs): loss = train(model, predictor, loader, optimizer, device) print( f"Run: {run_idx + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}" ) if loss > 2.0: run_success = False logger.reset(run_idx) print("Learning failed. Rerun...") break if epoch > 49 and epoch % args.eval_steps == 0: result = test( model, predictor, data, split_edge, evaluator, batch_size=64 * 1024, device=device, ) logger.add_result(run_idx, result) train_mrr, valid_mrr, test_mrr = result print(f"Run: {run_idx + 1:02d}, " f"Epoch: {epoch:02d}, " f"Loss: {loss:.4f}, " f"Train: {train_mrr:.4f}, " f"Valid: {valid_mrr:.4f}, " f"Test: {test_mrr:.4f}") print("GraphSAINT") if run_success: logger.print_statistics(run_idx) run_idx += 1 print("GraphSAINT") logger.print_statistics()
def main(args): date_time = datetime.now().strftime('%m-%d-%H:%M:%S') log_path = os.path.join(args.log_root, args.log_path, args.save_name, date_time) load_func, subset = args.dataset.split('/')[0], args.dataset.split('/')[1] if load_func == 'WebKB': load_func = WebKB dataset = load_func(root=args.data_path, name=subset) elif load_func == 'WikipediaNetwork': load_func = WikipediaNetwork dataset = load_func(root=args.data_path, name=subset) elif load_func == 'WikiCS': load_func = WikiCS dataset = load_func(root=args.data_path) elif load_func == 'cora_ml': dataset = load_citation_link( root='../dataset/data/tmp/cora_ml/cora_ml.npz') elif load_func == 'citeseer': dataset = load_citation_link( root='../dataset/data/tmp/citeseer_npz/citeseer_npz.npz') # load telegram/synthetic here else: dataset = load_syn(args.data_path + args.dataset, None) if os.path.isdir(log_path) == False: os.makedirs(log_path) # load dataset if 'dataset' in locals(): data = dataset[0] edge_index = data.edge_index #feature = dataset[0].x.data size = torch.max(edge_index).item() + 1 # generate edge index dataset #if args.task == 2: # datasets = generate_dataset_2class(edge_index, splits = 10, test_prob = args.drop_prob) #else: save_file = args.data_path + args.dataset + '/' + subset datasets = generate_dataset_3class(edge_index, size, save_file, splits=10, probs=args.split_prob, task=args.task, label_dim=args.num_class_link) if args.task != 2: results = np.zeros((10, 4)) else: results = np.zeros((10, 4, 5)) for i in range(10): log_str_full = '' edges = datasets[i]['graph'] if args.to_undirected: edges = to_undirected(edges) ######################################## # initialize model and load dataset ######################################## #x = torch.ones(size).unsqueeze(-1).to(device) x = in_out_degree(edges, size).to(device) edges = edges.long().to(device) model = GCN_Link(x.size(-1), args.num_class_link, filter_num=args.num_filter, dropout=args.dropout).to(device) #model = nn.DataParallel(graphmodel) opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2) y_train = datasets[i]['train']['label'] y_val = datasets[i]['validate']['label'] y_test = datasets[i]['test']['label'] y_train = torch.from_numpy(y_train).long().to(device) y_val = torch.from_numpy(y_val).long().to(device) y_test = torch.from_numpy(y_test).long().to(device) train_index = torch.from_numpy( datasets[i]['train']['pairs']).to(device) val_index = torch.from_numpy( datasets[i]['validate']['pairs']).to(device) test_index = torch.from_numpy(datasets[i]['test']['pairs']).to(device) ################################# # Train/Validation/Test ################################# best_test_err = 1000.0 early_stopping = 0 for epoch in range(args.epochs): start_time = time.time() if early_stopping > 500: break #################### # Train #################### train_loss, train_acc = 0.0, 0.0 model.train() out = model(x, edges, train_index) train_loss = F.nll_loss(out, y_train) pred_label = out.max(dim=1)[1] train_acc = acc(pred_label, y_train) opt.zero_grad() train_loss.backward() opt.step() outstrtrain = 'Train loss: %.6f, acc: %.3f' % ( train_loss.detach().item(), train_acc) #################### # Validation #################### train_loss, train_acc = 0.0, 0.0 model.eval() out = model(x, edges, val_index) test_loss = F.nll_loss(out, y_val) pred_label = out.max(dim=1)[1] test_acc = acc(pred_label, y_val) outstrval = ' Validation loss: %.6f, acc: %.3f' % ( test_loss.detach().item(), test_acc) duration = "--- %.4f seconds ---" % (time.time() - start_time) log_str
= ( "%d / %d epoch" % (epoch, args.epochs)) + outstrtrain + outstrval + duration #print(log_str) log_str_full += log_str + '\n' #################### # Save weights #################### save_perform = test_loss.detach().item() if save_perform <= best_test_err: early_stopping = 0 best_test_err = save_perform torch.save(model.state_dict(), log_path + '/model' + str(i) + '.t7') else: early_stopping += 1 write_log(vars(args), log_path) torch.save(model.state_dict(), log_path + '/model_latest' + str(i) + '.t7') if args.task != 2: #################### # Testing #################### model.load_state_dict( torch.load(log_path + '/model' + str(i) + '.t7')) model.eval() out = model(x, edges, val_index)[:, :2] pred_label = out.max(dim=1)[1] val_acc = acc(pred_label, y_val) out = model(x, edges, test_index)[:, :2] pred_label = out.max(dim=1)[1] test_acc = acc(pred_label, y_test) model.load_state_dict( torch.load(log_path + '/model_latest' + str(i) + '.t7')) model.eval() out = model(x, edges, val_index)[:, :2] pred_label = out.max(dim=1)[1] val_acc_latest = acc(pred_label, y_val) out = model(x, edges, test_index)[:, :2] pred_label = out.max(dim=1)[1] test_acc_latest = acc(pred_label, y_test) #################### # Save testing results #################### log_str = ('val_acc: {val_acc:.4f}, ' + 'test_acc: {test_acc:.4f}, ') log_str1 = log_str.format(val_acc=val_acc, test_acc=test_acc) log_str_full += log_str1 log_str = ('val_acc_latest: {val_acc_latest:.4f}, ' + 'test_acc_latest: {test_acc_latest:.4f}, ') log_str2 = log_str.format(val_acc_latest=val_acc_latest, test_acc_latest=test_acc_latest) log_str_full += log_str2 + '\n' print(log_str1 + log_str2) results[i] = [val_acc, test_acc, val_acc_latest, test_acc_latest] else: model.load_state_dict( torch.load(log_path + '/model' + str(i) + '.t7')) model.eval() out_val = model(x, edges, val_index) out_test = model(x, edges, test_index) [[val_acc_full, val_acc, val_auc, val_f1_micro, val_f1_macro], [test_acc_full, test_acc, test_auc, test_f1_micro, test_f1_macro] ] = link_prediction_evaluation(out_val, out_test, y_val, y_test) model.load_state_dict( torch.load(log_path + '/model_latest' + str(i) + '.t7')) model.eval() out_val = model(x, edges, val_index) out_test = model(x, edges, test_index) [[ val_acc_full_latest, val_acc_latest, val_auc_latest, val_f1_micro_latest, val_f1_macro_latest ], [ test_acc_full_latest, test_acc_latest, test_auc_latest, test_f1_micro_latest, test_f1_macro_latest ]] = link_prediction_evaluation(out_val, out_test, y_val, y_test) #################### # Save testing results #################### log_str = ( 'val_acc_full:{val_acc_full:.4f}, val_acc: {val_acc:.4f}, Val_auc: {val_auc:.4f},' + 'val_f1_micro: {val_f1_micro:.4f}, val_f1_macro: {val_f1_macro:.4f}, ' + 'test_acc_full:{test_acc_full:.4f}, test_acc: {test_acc:.4f}, ' + 'test_f1_micro: {test_f1_micro:.4f}, test_f1_macro: {test_f1_macro:.4f}' ) log_str = log_str.format(val_acc_full=val_acc_full, val_acc=val_acc, val_auc=val_auc, val_f1_micro=val_f1_micro, val_f1_macro=val_f1_macro, test_acc_full=test_acc_full, test_acc=val_acc, test_f1_micro=val_f1_micro, test_f1_macro=val_f1_macro) log_str_full += log_str + '\n' print(log_str) log_str = ( 'val_acc_full_latest:{val_acc_full_latest:.4f}, val_acc_latest: {val_acc_latest:.4f}, Val_auc_latest: {val_auc_latest:.4f},' + 'val_f1_micro_latest: {val_f1_micro_latest:.4f}, val_f1_macro_latest: {val_f1_macro_latest:.4f},' + 'test_acc_full_latest:{test_acc_full_latest:.4f}, test_acc_latest: {test_acc_latest:.4f}, ' + 'test_f1_micro_latest: 
{test_f1_micro_latest:.4f}, test_f1_macro_latest: {test_f1_macro_latest:.4f}' ) log_str = log_str.format(val_acc_full_latest=val_acc_full_latest, val_acc_latest=val_acc_latest, val_auc_latest=val_auc_latest, val_f1_micro_latest=test_f1_micro_latest, val_f1_macro_latest=val_f1_macro_latest, test_acc_full_latest=test_acc_full_latest, test_acc_latest=val_acc, test_f1_micro_latest=test_f1_micro_latest, test_f1_macro_latest=test_f1_macro_latest) log_str_full += log_str + '\n' print(log_str) results[i] = [ [val_acc_full, val_acc, val_auc, val_f1_micro, val_f1_macro], [ test_acc_full, test_acc, test_auc, test_f1_micro, test_f1_macro ], [ val_acc_full_latest, val_acc_latest, val_auc_latest, val_f1_micro_latest, val_f1_macro_latest ], [ test_acc_full_latest, test_acc_latest, test_auc_latest, test_f1_micro_latest, test_f1_macro_latest ] ] with open(log_path + '/log' + str(i) + '.csv', 'w') as file: file.write(log_str_full) file.write('\n') torch.cuda.empty_cache() return results
def citation_datasets(path="./data", dataset='cora_ml', alpha=0.1, adj_type=None): # path = os.path.join(save_path, dataset) os.makedirs(path, exist_ok=True) dataset_path = os.path.join(path, '{}.npz'.format(dataset)) g = load_npz_dataset(dataset_path) adj, features, labels = g['A'], g['X'], g['z'] # Set new random splits: # * 20 * num_classes labels for training # * 500 labels for validation # * the rest for testing mask = train_test_split(labels, seed=1020, train_examples_per_class=20, val_size=500, test_size=None) mask['train'] = torch.from_numpy(mask['train']).bool() mask['val'] = torch.from_numpy(mask['val']).bool() mask['test'] = torch.from_numpy(mask['test']).bool() coo = adj.tocoo() values = coo.data indices = np.vstack((coo.row, coo.col)) indices = torch.from_numpy(indices).long() features = torch.from_numpy(features.todense()).float() labels = torch.from_numpy(labels).long() if adj_type == 'un': print("Processing to undirected adj") indices = to_undirected(indices) edge_index, edge_weight = get_undirected_adj(indices, features.shape[0], features.dtype) data = Data(x=features, edge_index=edge_index, edge_weight=edge_weight, y=labels) elif adj_type == 'pr': print("Processing pagerank adj matrix") edge_index, edge_weight = get_pr_directed_adj(alpha, indices, features.shape[0], features.dtype) data = Data(x=features, edge_index=edge_index, edge_weight=edge_weight, y=labels) elif adj_type == 'appr': print("Processing approximate personalized pagerank adj matrix") edge_index, edge_weight = get_appr_directed_adj( alpha, indices, features.shape[0], features.dtype) data = Data(x=features, edge_index=edge_index, edge_weight=edge_weight, y=labels) elif adj_type == 'ib': print("Processing first and second-order adj matrix") edge_index, edge_weight = get_appr_directed_adj( alpha, indices, features.shape[0], features.dtype) data = Data(x=features, edge_index=edge_index, edge_weight=edge_weight, y=labels) edge_index, edge_weight = get_second_directed_adj( indices, features.shape[0], features.dtype) data.edge_index2 = edge_index data.edge_weight2 = edge_weight elif adj_type == 'or': print("Processing to original directed adj") data = Data(x=features, edge_index=indices, edge_weight=None, y=labels) else: print("Unsupported adj type.") sys.exit() data.train_mask = mask['train'] data.val_mask = mask['val'] data.test_mask = mask['test'] return data
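# Usage sketch for the loader above (assuming the referenced `cora_ml.npz` file
# exists under ./data and the helper functions are importable):
data = citation_datasets(path='./data', dataset='cora_ml', alpha=0.1, adj_type='un')
print(data.train_mask.sum(), data.val_mask.sum(), data.test_mask.sum())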
def main(): parser = argparse.ArgumentParser( description='Link Prediction (Cluster-GCN)') parser.add_argument('--device', type=int, default=0) parser.add_argument('--dataset', type=str, default='ogbl-citation') parser.add_argument('--log_steps', type=int, default=1) parser.add_argument('--num_partitions', type=int, default=15000) parser.add_argument('--num_workers', type=int, default=4) parser.add_argument('--num_layers', type=int, default=3) parser.add_argument('--hidden_channels', type=int, default=256) parser.add_argument('--dropout', type=float, default=0.0) parser.add_argument('--batch_size', type=int, default=256) parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=200) parser.add_argument('--eval_steps', type=int, default=10) parser.add_argument('--runs', type=int, default=10) parser.add_argument('--negs', type=int, default=1) parser.add_argument('--gnn_type', type=str, default='gcn') args = parser.parse_args() print(args) device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' device = torch.device(device) dataset = PygLinkPropPredDataset(name=args.dataset) data = dataset[0] split_edge = dataset.get_edge_split() # needed by `test` below data.edge_index = to_undirected(data.edge_index, data.num_nodes) print(data.edge_index.shape, data.num_nodes) cluster_data = ClusterData(data, num_parts=args.num_partitions, recursive=False, save_dir=dataset.processed_dir) loader = ClusterLoader(cluster_data, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers) model = GCN(data.x.size(-1), args.hidden_channels, args.hidden_channels, args.num_layers, args.dropout, gnn_type=args.gnn_type).to(device) predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1, args.num_layers, args.dropout).to(device) evaluator = Evaluator(name=args.dataset) logger = Logger(args.runs, args) for run in range(args.runs): model.reset_parameters() predictor.reset_parameters() optimizer = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()), lr=args.lr) for epoch in range(1, 1 + args.epochs): t0 = time.time() loss = train(model, predictor, loader, optimizer, device, args.negs) tt = time.time() print(f'Epoch time: {tt - t0:.2f}s') if epoch % args.eval_steps == 0: result = test(model, predictor, data, split_edge, evaluator, 64 * 4 * args.batch_size, device) logger.add_result(run, result) if epoch % args.log_steps == 0: train_mrr, valid_mrr, test_mrr = result print(f'Run: {run + 1:02d}, ' f'Epoch: {epoch:02d}, ' f'Loss: {loss:.4f}, ' f'Train: {train_mrr:.4f}, ' f'Valid: {valid_mrr:.4f}, ' f'Test: {test_mrr:.4f}') logger.print_statistics(run) logger.print_statistics()
def main(): parser = argparse.ArgumentParser(description='Prepare data for Giant-XRT') parser.add_argument( '--raw-text-path', type=str, required=True, help="Path of raw text (.txt file, each row corresponds to a node)") parser.add_argument( '--vectorizer-config-path', type=str, required=True, help="a path to a json file that specifies the tfidf hyper-parameters") parser.add_argument('--data-root-dir', type=str, default="./dataset") parser.add_argument('--xrt-data-dir', type=str, default="./proc_data_xrt") parser.add_argument('--dataset', type=str, default="ogbn-arxiv") parser.add_argument('--max-deg', type=int, default=1000) args = parser.parse_args() print(args) # Save all processed data under args.xrt_data_dir/args.dataset save_data_dir = os.path.join(args.xrt_data_dir, args.dataset) dataset = PygNodePropPredDataset(name=args.dataset, root=args.data_root_dir) data = dataset[0] edge_index = data.edge_index # Make sure edge_index is undirected!!! if not is_undirected(edge_index): edge_index = to_undirected(edge_index) # Filter out nodes whose degree >= max_deg Degree = degree(edge_index[0]) Filtered_idx = torch.where(Degree < args.max_deg)[0] print('Number of original nodes:{}'.format(data.x.shape[0])) print('Number of filtered nodes:{}'.format(len(Filtered_idx))) # Construct and save the label matrix (adjacency matrix) Y. Y_csr_all = smat.csr_matrix(to_scipy_sparse_matrix(edge_index)) Y_csr_trn = Y_csr_all[Filtered_idx] smat_util.save_matrix(f"{save_data_dir}/Y.trn.npz", Y_csr_trn) smat_util.save_matrix(f"{save_data_dir}/Y.all.npz", Y_csr_all) print("Saved Y.trn.npz and Y.all.npz") # Apply the same filtering for raw text with open(args.raw_text_path, "r") as fin: node_text_list = fin.readlines() print("|node_text_list|={}".format(len(node_text_list))) count = 0 with open(f"{save_data_dir}/X.trn.txt", "w") as fout: for cur_idx, line in enumerate(node_text_list): if count < len(Filtered_idx) and Filtered_idx[count].item() == cur_idx: fout.writelines(line) count += 1 assert count == len(Filtered_idx), "count={}, len(Filtered_idx)={}".format( count, len(Filtered_idx)) print("Saved X.trn.txt") # Apply the same filtering for tfidf features vectorizer_config = Vectorizer.load_config_from_args( args) # using args.vectorizer_config_path preprocessor = Preprocessor.train(node_text_list, vectorizer_config, dtype=np.float32) preprocessor.save(f"{save_data_dir}/tfidf-model") X_tfidf_all = preprocessor.predict(node_text_list) X_tfidf_trn = X_tfidf_all[Filtered_idx] smat_util.save_matrix(f"{save_data_dir}/X.all.tfidf.npz", X_tfidf_all) smat_util.save_matrix(f"{save_data_dir}/X.trn.tfidf.npz", X_tfidf_trn) print("Saved X.trn.tfidf.npz and X.all.tfidf.npz")
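# The degree-based filtering above in miniature: keep only nodes whose degree
# stays below a threshold.
import torch
from torch_geometric.utils import degree

edge_index = torch.tensor([[0, 0, 0, 1], [1, 2, 3, 2]])
deg = degree(edge_index[0], num_nodes=4)  # per-node count of outgoing edges
keep = torch.where(deg < 3)[0]            # node 0 (degree 3) is filtered out
print(keep.tolist())                      # [1, 2, 3]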
def main(): parser = argparse.ArgumentParser(description='OGBL-Citation2 (NS)') parser.add_argument('--device', type=int, default=0) parser.add_argument('--log_steps', type=int, default=1) parser.add_argument('--num_workers', type=int, default=12) parser.add_argument('--num_layers', type=int, default=3) parser.add_argument('--hidden_channels', type=int, default=256) parser.add_argument('--dropout', type=float, default=0.0) parser.add_argument('--batch_size', type=int, default=512) parser.add_argument('--lr', type=float, default=0.0005) parser.add_argument('--epochs', type=int, default=150) parser.add_argument('--eval_steps', type=int, default=10) parser.add_argument('--runs', type=int, default=10) args = parser.parse_args() print(args) device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' device = torch.device(device) dataset = PygLinkPropPredDataset(name='ogbl-citation2') split_edge = dataset.get_edge_split() data = dataset[0] edge_index = to_undirected(data.edge_index, data.num_nodes) x = data.x.to(device) pos_loader = PositiveLinkNeighborSampler(edge_index, sizes=[15, 10, 5], num_nodes=x.size(0), batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers) neg_loader = NegativeLinkNeighborSampler(edge_index, sizes=[15, 10, 5], num_nodes=x.size(0), batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers) subgraph_loader = NeighborSampler(edge_index, node_idx=None, sizes=[-1], batch_size=4096, shuffle=False, num_workers=args.num_workers) # We randomly pick some training samples that we want to evaluate on: torch.manual_seed(12345) idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596] split_edge['eval_train'] = { 'source_node': split_edge['train']['source_node'][idx], 'target_node': split_edge['train']['target_node'][idx], 'target_node_neg': split_edge['valid']['target_node_neg'], } model = SAGE(x.size(-1), args.hidden_channels, args.hidden_channels, args.num_layers, args.dropout).to(device) predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1, args.num_layers, args.dropout).to(device) evaluator = Evaluator(name='ogbl-citation2') logger = Logger(args.runs, args) for run in range(args.runs): model.reset_parameters() predictor.reset_parameters() optimizer = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()), lr=args.lr) for epoch in range(1, 1 + args.epochs): loss = train(model, predictor, x, pos_loader, neg_loader, optimizer, device) print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}') if epoch > 49 and epoch % args.eval_steps == 0: result = test(model, predictor, x, subgraph_loader, split_edge, evaluator, batch_size=64 * 1024, device=device) logger.add_result(run, result) train_mrr, valid_mrr, test_mrr = result print(f'Run: {run + 1:02d}, ' f'Epoch: {epoch:02d}, ' f'Loss: {loss:.4f}, ' f'Train: {train_mrr:.4f}, ' f'Valid: {valid_mrr:.4f}, ' f'Test: {test_mrr:.4f}') print('Neighborsampling') logger.print_statistics(run) print('Neighborsampling') logger.print_statistics()
def process(self): ids, Ns = [], [] for r_path, p_path in zip(self.raw_paths, self.processed_paths): names = glob.glob(osp.join(r_path, '*.gexf')) # Get the graph IDs given by the file name: ids.append(sorted([int(i.split(os.sep)[-1][:-5]) for i in names])) data_list = [] # Convert graphs in .gexf format to a NetworkX Graph: for i, idx in enumerate(ids[-1]): i = i if len(ids) == 1 else i + len(ids[0]) G = nx.read_gexf(osp.join(r_path, f'{idx}.gexf')) mapping = {name: j for j, name in enumerate(G.nodes())} G = nx.relabel_nodes(G, mapping) Ns.append(G.number_of_nodes()) edge_index = torch.tensor(list(G.edges)).t().contiguous() if edge_index.numel() == 0: edge_index = torch.empty((2, 0), dtype=torch.long) edge_index = to_undirected(edge_index, num_nodes=Ns[-1]) data = Data(edge_index=edge_index, i=i) data.num_nodes = Ns[-1] # Create a one-hot encoded feature matrix denoting the atom # type for the AIDS700nef dataset: if self.name == 'AIDS700nef': x = torch.zeros(data.num_nodes, dtype=torch.long) for node, info in G.nodes(data=True): x[int(node)] = self.types.index(info['type']) data.x = F.one_hot(x, num_classes=len(self.types)).to( torch.float) if self.pre_filter is not None and not self.pre_filter(data): continue if self.pre_transform is not None: data = self.pre_transform(data) data_list.append(data) torch.save(self.collate(data_list), p_path) assoc = {idx: i for i, idx in enumerate(ids[0])} assoc.update({idx: i + len(ids[0]) for i, idx in enumerate(ids[1])}) path = osp.join(self.raw_dir, self.name, 'ged.pickle') mat = torch.full((len(assoc), len(assoc)), float('inf')) with open(path, 'rb') as f: obj = pickle.load(f) xs, ys, gs = [], [], [] for (x, y), g in obj.items(): xs += [assoc[x]] ys += [assoc[y]] gs += [g] x, y = torch.tensor(xs), torch.tensor(ys) g = torch.tensor(gs, dtype=torch.float) mat[x, y], mat[y, x] = g, g path = osp.join(self.processed_dir, f'{self.name}_ged.pt') torch.save(mat, path) # Calculate the normalized GEDs: N = torch.tensor(Ns, dtype=torch.float) norm_mat = mat / (0.5 * (N.view(-1, 1) + N.view(1, -1))) path = osp.join(self.processed_dir, f'{self.name}_norm_ged.pt') torch.save(norm_mat, path)
def main(): parser = argparse.ArgumentParser(description='OGBN-Arxiv (Full-Batch)') parser.add_argument('--device', type=int, default=1) parser.add_argument('--log_steps', type=int, default=10) parser.add_argument('--num_layers', type=int, default=16) parser.add_argument('--hidden_channels', type=int, default=256) parser.add_argument('--dropout', type=float, default=0.1) parser.add_argument('--weight_decay', type=float, default=0, help='weight decay (L2 loss on parameters).') parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=1000) parser.add_argument('--runs', type=int, default=10) parser.add_argument('--patience', type=int, default=200, help='patience') parser.add_argument('--alpha', type=float, default=0.5, help='alpha_l') parser.add_argument('--norm', default='bn', help='norm layer.') args = parser.parse_args() print(args) device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' device = torch.device(device) dataset = PygNodePropPredDataset(name='ogbn-arxiv') split_idx = dataset.get_idx_split() data = dataset[0] data = data.to(device) train_idx = split_idx['train'].to(device) data.edge_index = to_undirected(data.edge_index, data.num_nodes) Net = GCNIIdense_model evaluator = Evaluator(name='ogbn-arxiv') acc_list = [] for run in range(args.runs): model = Net(data.x.size(-1), args.hidden_channels, dataset.num_classes, args.num_layers, args.dropout, args.alpha, args.norm).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) bad_counter = 0 best_val = 0 final_test_acc = 0 for epoch in range(1, 1 + args.epochs): loss = train(model, data, train_idx, optimizer) result = test(model, data, data.y, split_idx, evaluator) train_acc, valid_acc, test_acc = result if epoch % args.log_steps == 0: train_acc, valid_acc, test_acc = result print(f'Run: {run + 1:02d}, ' f'Epoch: {epoch:02d}, ' f'Loss: {loss:.4f}, ' f'Train: {100 * train_acc:.2f}%, ' f'Valid: {100 * valid_acc:.2f}% ' f'Test: {100 * test_acc:.2f}%') if valid_acc > best_val: best_val = valid_acc final_test_acc = test_acc bad_counter = 0 else: bad_counter += 1 if bad_counter == args.patience: break acc_list.append(final_test_acc * 100) print(run + 1, ':', acc_list[-1]) acc_list = torch.tensor(acc_list) print(f'Avg Test: {acc_list.mean():.2f} ± {acc_list.std():.2f}')
def get_small_dataset(dataset_name, normalize_attributes=False, add_self_loops=False, remove_isolated_nodes=False, make_undirected=False, graph_availability=None, seed=0, create_adjacency_lists=True): """ Get the pytorch_geometric.data.Data object associated with the specified dataset name. :param dataset_name: str => One of the datasets mentioned below. :param normalize_attributes: Whether the attributes for each node should be normalized to sum to 1. :param add_self_loops: Add self loops to the input Graph. :param remove_isolated_nodes: Remove isolated nodes. :param make_undirected: Make the Graph undirected. :param graph_availability: Either inductive or transductive. If transductive, all the graph nodes are available during training. Otherwise, only training split nodes are available. :param seed: The random seed to use while splitting into train/val/test splits. :param create_adjacency_lists: Whether to process and store adjacency lists that can be used for efficient r-radius neighborhood sampling. :return: A pytorch_geometric.data.Data object for that dataset. """ assert dataset_name in { 'amazon-computers', 'amazon-photo', 'citeseer', 'coauthor-cs', 'coauthor-physics', 'cora', 'cora-full', 'ppi', 'pubmed', 'reddit' } assert graph_availability in {'inductive', 'transductive'} # Compose transforms that should be applied. transforms = [] if normalize_attributes: transforms.append(NormalizeFeatures()) if remove_isolated_nodes: transforms.append(RemoveIsolatedNodes()) if add_self_loops: transforms.append(AddSelfLoops()) transforms = Compose(transforms) if transforms else None # Load the specified dataset and apply transforms. root_dir = '/tmp/{dir}'.format(dir=dataset_name) processed_dir = os.path.join(root_dir, dataset_name, 'processed') # Remove any previously pre-processed data, so pytorch_geometric can pre-process it again.
if os.path.exists(processed_dir) and os.path.isdir(processed_dir): shutil.rmtree(processed_dir) data = None def split_function(y): return _get_train_val_test_masks(y.shape[0], y, 0.2, 0.2, seed) if dataset_name in ['citeseer', 'cora', 'pubmed']: data = Planetoid(root=root_dir, name=dataset_name, pre_transform=transforms, split='full').data if seed != 0: data.train_mask, data.val_mask, data.test_mask = split_function( data.y.numpy()) data.graphs = [data] elif dataset_name == 'cora-full': data = CoraFull(root=root_dir, pre_transform=transforms).data data.train_mask, data.val_mask, data.test_mask = split_function( data.y.numpy()) data.graphs = [data] elif dataset_name == 'amazon-computers': data = Amazon(root=root_dir, name='Computers', pre_transform=transforms).data data.train_mask, data.val_mask, data.test_mask = split_function( data.y.numpy()) data.graphs = [data] elif dataset_name == 'amazon-photo': data = Amazon(root=root_dir, name='Photo', pre_transform=transforms).data data.train_mask, data.val_mask, data.test_mask = split_function( data.y.numpy()) data.graphs = [data] elif dataset_name == 'coauthor-cs': data = Coauthor(root=root_dir, name='CS', pre_transform=transforms).data data.train_mask, data.val_mask, data.test_mask = split_function( data.y.numpy()) data.graphs = [data] elif dataset_name == 'coauthor-physics': data = Coauthor(root=root_dir, name='Physics', pre_transform=transforms).data data.train_mask, data.val_mask, data.test_mask = split_function( data.y.numpy()) data.graphs = [data] elif dataset_name == 'reddit': data = Reddit(root=root_dir, pre_transform=transforms).data if seed != 0: data.train_mask, data.val_mask, data.test_mask = split_function( data.y.numpy()) data.graphs = [data] elif dataset_name == 'ppi': data = SimpleNamespace() data.graphs = [] for split in ['train', 'val', 'test']: split_data = PPI(root=root_dir, split=split, pre_transform=transforms) x_idxs = split_data.slices['x'].numpy() edge_idxs = split_data.slices['edge_index'].numpy() split_data = split_data.data for x_start, x_end, e_start, e_end in zip(x_idxs, x_idxs[1:], edge_idxs, edge_idxs[1:]): graph = Data(split_data.x[x_start:x_end], split_data.edge_index[:, e_start:e_end], y=split_data.y[x_start:x_end]) graph.num_nodes = int(x_end - x_start) graph.split = split all_true = torch.ones(graph.num_nodes).bool() all_false = torch.zeros(graph.num_nodes).bool() graph.train_mask = all_true if split == 'train' else all_false graph.val_mask = all_true if split == 'val' else all_false graph.test_mask = all_true if split == 'test' else all_false data.graphs.append(graph) if seed != 0: temp_random = random.Random(seed) val_graphs = temp_random.sample(range(len(data.graphs)), 2) test_candidates = [ graph_idx for graph_idx in range(len(data.graphs)) if graph_idx not in val_graphs ] test_graphs = temp_random.sample(test_candidates, 2) for graph_idx, graph in enumerate(data.graphs): all_true = torch.ones(graph.num_nodes).bool() all_false = torch.zeros(graph.num_nodes).bool() graph.split = 'test' if graph_idx in test_graphs else 'val' if graph_idx in val_graphs else 'train' graph.train_mask = all_true if graph.split == 'train' else all_false graph.val_mask = all_true if graph.split == 'val' else all_false graph.test_mask = all_true if graph.split == 'test' else all_false if make_undirected: for graph in data.graphs: graph.edge_index = to_undirected(graph.edge_index, graph.num_nodes) LOG.info(f'Downloaded and transformed {len(data.graphs)} graph(s).') # Populate adjacency lists for efficient k-neighborhood sampling. 
# Only retain edges coming into a node and reverse the edges for the purpose of adjacency lists. LOG.info('Processing adjacency lists and degree information.') for graph in data.graphs: train_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64) val_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64) test_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64) adjacency_lists = defaultdict(list) not_val_test_mask = (~graph.val_mask & ~graph.test_mask).numpy() val_mask = graph.val_mask.numpy() test_mask = graph.test_mask.numpy() if create_adjacency_lists: num_edges = graph.edge_index[0].shape[0] sources, dests = graph.edge_index[0].numpy( ), graph.edge_index[1].numpy() for source, dest in tqdm(zip(sources, dests), total=num_edges, leave=False): if not_val_test_mask[dest] and not_val_test_mask[source]: train_in_degrees[dest] += 1 val_in_degrees[dest] += 1 elif val_mask[dest] and not test_mask[source]: val_in_degrees[dest] += 1 test_in_degrees[dest] += 1 adjacency_lists[dest].append(source) graph.adjacency_lists = dict(adjacency_lists) graph.train_in_degrees = torch.from_numpy(train_in_degrees).long() graph.val_in_degrees = torch.from_numpy(val_in_degrees).long() graph.test_in_degrees = torch.from_numpy(test_in_degrees).long() if graph_availability == 'transductive': graph.train_in_degrees = graph.test_in_degrees graph.val_in_degrees = graph.test_in_degrees graph.graph_availability = graph_availability # To accumulate any neighborhood perturbations to the graph. graph.perturbed_neighborhoods = defaultdict(set) graph.added_nodes = defaultdict(set) graph.modified_degrees = {} # For small datasets, cache the neighborhoods for all nodes for at least 3 different radii queries. graph.use_cache = True graph.neighborhood_cache = NeighborhoodCache(graph.num_nodes * 3) graph.train_mask_original = graph.train_mask graph.val_mask_original = graph.val_mask graph.test_mask_original = graph.test_mask graph.train_mask = torch.ones( graph.num_nodes).bool() & ~graph.val_mask & ~graph.test_mask return data
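# A minimal sketch (hypothetical) of the `_get_train_val_test_masks` helper the
# loader above calls as `_get_train_val_test_masks(y.shape[0], y, 0.2, 0.2, seed)`:
# a stratified split into three boolean masks.
import numpy as np
import torch
from sklearn.model_selection import train_test_split

def _get_train_val_test_masks(num_nodes, y, val_size, test_size, seed):
    idx = np.arange(num_nodes)
    train_idx, rest_idx = train_test_split(
        idx, test_size=val_size + test_size, random_state=seed, stratify=y)
    val_idx, test_idx = train_test_split(
        rest_idx, test_size=test_size / (val_size + test_size),
        random_state=seed, stratify=y[rest_idx])
    masks = []
    for chosen in (train_idx, val_idx, test_idx):
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        mask[chosen] = True
        masks.append(mask)
    return tuple(masks)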
def train_edges(data, mask_node, val_ratio=0.05, test_ratio=0.1): device = data.x.device num_nodes = data.num_nodes row, col = data.total_edge_index mask = row < col row, col = row[mask], col[mask] # Select train nodes edge_row, edge_col = row, col size = len(mask_node) # print(size) indice_size = 0 for i in range(size): r_index = (row == mask_node[i]).nonzero() c_index = (col == mask_node[i]).nonzero() index = torch.unique(torch.cat((r_index, c_index), 0)) if (i == 0): indice = index else: indice = torch.cat((indice, index), 0) # train_indice = indice.squeeze() train_indice = torch.unique(indice).squeeze() t_r, t_c = row[train_indice], col[train_indice] data.train_pos_edge_index = torch.stack([t_r, t_c], dim=0) edge_mask = torch.Tensor(edge_row.size(0)).type(torch.bool).to( data.x.device) edge_mask[edge_mask < 1] = True edge_mask = edge_mask.scatter_(0, train_indice, False) edge_row = edge_row[edge_mask] edge_col = edge_col[edge_mask] edge_index = torch.stack((edge_row, edge_col), dim=0) # Negative edges. neg_adj_mask = torch.ones(num_nodes, num_nodes, dtype=torch.uint8) neg_adj_mask = neg_adj_mask.triu(diagonal=1).to(torch.bool) neg_adj_mask[row, col] = 0 # mask out existing edges (inverse adjacency) neg_row, neg_col = neg_adj_mask.nonzero().t() # train negative for ind, i in enumerate(mask_node): r_index = (neg_row == i).nonzero() c_index = (neg_col == i).nonzero() index = torch.unique(torch.cat((r_index, c_index), 0)) if (ind == 0): indice = index else: indice = torch.cat((indice, index), 0) neg_train_indice = torch.unique(indice).squeeze() if (train_indice.dim() == 0): indice_size = 1 else: indice_size = train_indice.size(0) perm_train = random.sample(range(neg_train_indice.size(0)), indice_size) perm_train = torch.tensor(perm_train).to(torch.long).sort()[0] neg_t_r, neg_t_c = neg_row[perm_train], neg_col[perm_train] data.train_neg_edge_index = torch.stack([neg_t_r, neg_t_c], dim=0).to(device) data.train_edge_index = to_undirected(edge_index, num_nodes=num_nodes) return data
def negative_sampling(edge_index, num_nodes=None, num_neg_samples=None, force_undirected=False): r"""Samples random negative edges of a graph given by :attr:`edge_index`. Args: edge_index (LongTensor): The edge indices. num_nodes (int, optional): The number of nodes, *i.e.* :obj:`max_val + 1` of :attr:`edge_index`. (default: :obj:`None`) num_neg_samples (int, optional): The number of negative samples to return. If set to :obj:`None`, will try to return a negative edge for every positive edge. (default: :obj:`None`) force_undirected (bool, optional): If set to :obj:`True`, sampled negative edges will be undirected. (default: :obj:`False`) :rtype: LongTensor """ num_nodes = maybe_num_nodes(edge_index, num_nodes) num_neg_samples = num_neg_samples or edge_index.size(1) # Handle '|V|^2 - |E| < |E|' case for G = (V, E). num_neg_samples = min(num_neg_samples, num_nodes * num_nodes - edge_index.size(1)) if force_undirected: num_neg_samples = num_neg_samples // 2 # Upper triangle indices: N + ... + 1 = N (N + 1) / 2 rng = range((num_nodes * (num_nodes + 1)) // 2) # Remove edges in the lower triangle matrix. row, col = edge_index mask = row <= col row, col = row[mask], col[mask] # idx = N * i + j - i * (i+1) / 2 idx = (row * num_nodes + col - row * (row + 1) // 2).to('cpu') else: rng = range(num_nodes**2) # idx = N * i + j idx = (edge_index[0] * num_nodes + edge_index[1]).to('cpu') perm = torch.tensor(random.sample(rng, num_neg_samples)) mask = torch.from_numpy(np.isin(perm, idx)).to(torch.bool) rest = mask.nonzero().view(-1) while rest.numel() > 0: # pragma: no cover tmp = torch.tensor(random.sample(rng, rest.size(0))) mask = torch.from_numpy(np.isin(tmp, idx)).to(torch.bool) perm[rest] = tmp rest = rest[mask.nonzero().view(-1)] if force_undirected: # (-sqrt((2 * N + 1)^2 - 8 * perm) + 2 * N + 1) / 2 row = torch.floor((-torch.sqrt((2. * num_nodes + 1.)**2 - 8. * perm) + 2 * num_nodes + 1) / 2) col = perm - row * (2 * num_nodes - row - 1) // 2 neg_edge_index = torch.stack([row, col], dim=0).long() neg_edge_index = to_undirected(neg_edge_index) else: row = perm // num_nodes col = perm % num_nodes neg_edge_index = torch.stack([row, col], dim=0).long() return neg_edge_index.to(edge_index.device)
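# Usage sketch for `negative_sampling` above (assuming torch, random, and numpy
# are in scope as in the function body): sampled pairs never collide with the
# positive edges.
import torch

edge_index = torch.tensor([[0, 1, 2], [1, 2, 3]])
neg_edge_index = negative_sampling(edge_index, num_nodes=4, num_neg_samples=3)
pos = set(map(tuple, edge_index.t().tolist()))
neg = set(map(tuple, neg_edge_index.t().tolist()))
assert pos.isdisjoint(neg)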
def process(self): import networkx as nx ids, Ns = [], [] # Iterating over paths for raw and processed data (train + test): for r_path, p_path in zip(self.raw_paths, self.processed_paths): # Find the paths of all raw graphs: names = glob.glob(osp.join(r_path, '*.gexf')) # Get sorted graph IDs given filename: 123.gexf -> 123 ids.append(sorted([int(i.split(os.sep)[-1][:-5]) for i in names])) data_list = [] # Convert graphs in .gexf format to a NetworkX Graph: for i, idx in enumerate(ids[-1]): i = i if len(ids) == 1 else i + len(ids[0]) # Reading the raw `*.gexf` graph: G = nx.read_gexf(osp.join(r_path, f'{idx}.gexf')) # Mapping of nodes in `G` to a contiguous number: mapping = {name: j for j, name in enumerate(G.nodes())} G = nx.relabel_nodes(G, mapping) Ns.append(G.number_of_nodes()) edge_index = torch.tensor(list(G.edges)).t().contiguous() if edge_index.numel() == 0: edge_index = torch.empty((2, 0), dtype=torch.long) edge_index = to_undirected(edge_index, num_nodes=Ns[-1]) data = Data(edge_index=edge_index, i=i) data.num_nodes = Ns[-1] # Create a one-hot encoded feature matrix denoting the atom # type (for the `AIDS700nef` dataset): if self.name == 'AIDS700nef': x = torch.zeros(data.num_nodes, dtype=torch.long) for node, info in G.nodes(data=True): x[int(node)] = self.types.index(info['type']) data.x = F.one_hot(x, num_classes=len(self.types)).to( torch.float) if self.pre_filter is not None and not self.pre_filter(data): continue if self.pre_transform is not None: data = self.pre_transform(data) data_list.append(data) torch.save(self.collate(data_list), p_path) assoc = {idx: i for i, idx in enumerate(ids[0])} assoc.update({idx: i + len(ids[0]) for i, idx in enumerate(ids[1])}) # Extracting ground-truth GEDs from the GED pickle file path = osp.join(self.raw_dir, self.name, 'ged.pickle') # Initialize GEDs as float('inf'): mat = torch.full((len(assoc), len(assoc)), float('inf')) with open(path, 'rb') as f: obj = pickle.load(f) xs, ys, gs = [], [], [] for (x, y), g in obj.items(): xs += [assoc[x]] ys += [assoc[y]] gs += [g] # The pickle file does not contain GEDs for test graph pairs, i.e. # GEDs for (test_graph, test_graph) pairs are still float('inf'): x, y = torch.tensor(xs), torch.tensor(ys) ged = torch.tensor(gs, dtype=torch.float) mat[x, y], mat[y, x] = ged, ged path = osp.join(self.processed_dir, f'{self.name}_ged.pt') torch.save(mat, path) # Calculate the normalized GEDs: N = torch.tensor(Ns, dtype=torch.float) norm_mat = mat / (0.5 * (N.view(-1, 1) + N.view(1, -1))) path = osp.join(self.processed_dir, f'{self.name}_norm_ged.pt') torch.save(norm_mat, path)
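# The final normalization divides each GED by the mean size of the graph pair,
# i.e. nGED(G1, G2) = GED(G1, G2) / (0.5 * (|G1| + |G2|)). A toy check:
import torch

Ns = torch.tensor([10.0, 20.0])
mat = torch.tensor([[0.0, 6.0], [6.0, 0.0]])
norm_mat = mat / (0.5 * (Ns.view(-1, 1) + Ns.view(1, -1)))
print(norm_mat[0, 1].item())  # 6 / 15 = 0.4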
def data_downloader(dataset='Cora', data_dir='../data', data_type='static'):
    '''
    Download graph data.

    Parameters:
        dataset (:obj:`str`): Dataset name: 'Cora', 'CiteSeer', 'PubMed', or
            'Factset<year>' (e.g. 'Factset2018').
        data_type (:obj:`str`): 'static' or 'dynamic' (Factset only).

    Returns:
        data (torch_geometric.data.Data): The graph data.
    '''
    if dataset in ['Cora', 'CiteSeer', 'PubMed']:
        data = Planetoid(data_dir, dataset,
                         transform=T.NormalizeFeatures())[0]
    elif 'Factset' in dataset:
        year = dataset[-4:]
        print(f'processing Factset in year {year}.')
        if data_type == 'dynamic':
            df = pd.read_csv(
                data_dir +
                f'/Factset/node_features_{year}_dynamic_processed.csv'
            ).drop_duplicates(ignore_index=True, subset='code')
        else:
            df = pd.read_csv(
                data_dir + f'/Factset/node_features_{year}_processed.csv'
            ).drop_duplicates(ignore_index=True, subset='code')
        N = len(df)  # Number of nodes.

        # Map each sec_code to its node index.
        dic = {}
        for row in df.itertuples():
            dic[row[1]] = row[0]

        edge = pd.read_csv(
            data_dir + f'/Factset/edges_{year}.csv',
            usecols=['REL_TYPE', 'SOURCE_COMPANY_TICKER',
                     'TARGET_COMPANY_TICKER']
        ).rename(columns={
            'SOURCE_COMPANY_TICKER': 'source',
            'TARGET_COMPANY_TICKER': 'target'
        })
        edge = edge[(edge['REL_TYPE'] == 'CUSTOMER') |
                    (edge['REL_TYPE'] == 'SUPPLIER')]
        edge = edge[['source', 'target']].drop_duplicates(
            ignore_index=True, subset=['source', 'target'])
        # Drop one direction of any reciprocal (A -> B and B -> A) pair.
        for i in range(edge.shape[0]):
            if i in edge.index:
                source = edge.loc[i, 'source']
                target = edge.loc[i, 'target']
                edge = edge.drop(edge[(edge['source'] == target) &
                                      (edge['target'] == source)].index)
        edge = edge.applymap(lambda x: dic[x] if x in dic.keys() else np.nan)
        edge = edge.dropna(how='any').reset_index(drop=True)

        # Handle missing values.
        df = df.iloc[:, 5:]  # Exclude sec_code and the other ID columns.
        # df = df.dropna(thresh=100, axis=1)  # Keep columns with at least
        #                                     # `thresh` non-NaN entries.
        df = df.fillna(0)  # Fill remaining missing values with 0.
        df = (df - df.mean()) / df.std()  # Standardize each column.
        df = df.fillna(0)  # Zero-variance columns become NaN; reset to 0.

        # X to tensor.
        X = [[] for _ in range(N)]
        for row in df.itertuples():
            X[row[0]] = row[1:]
        X = torch.tensor(X, dtype=torch.float)

        # edge_index to tensor.
        edge_index = torch.tensor(edge.to_numpy().T, dtype=torch.long)

        # torch_geometric.data.Data
        data = Data(x=X, edge_index=edge_index)

    print(f'dataset {dataset} has been downloaded.')
    print(f'is undirected: {data.is_undirected()}')
    print(f'contains self loops: {data.contains_self_loops()}')
    print(f'num_nodes: {data.num_nodes}')
    print(f'num_edges: {data.num_edges}\n')

    if not data.is_undirected():
        data.edge_index = to_undirected(data.edge_index)
        print('The graph has been transformed into an undirected one.')

    return data
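
# Usage sketch (added for illustration): fetch Cora with normalized features;
# the function prints basic statistics and returns a `Data` object that is
# undirected after the conversion above.
cora_data = data_downloader(dataset='Cora', data_dir='../data')
assert cora_data.is_undirected()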
def main():
    parser = argparse.ArgumentParser(description='OGBN-papers100M (SIGN)')
    parser.add_argument('--file_name', type=str, default="test")
    parser.add_argument('--undirected_num_propagations', type=int, default=3)
    parser.add_argument('--directed_num_propagations', type=int, default=3)
    parser.add_argument('--undirected_dropedge_rate', type=float, default=0.4)
    parser.add_argument('--directed_dropedge_rate', type=float, default=0.2)
    parser.add_argument('--undirected', action='store_true')
    parser.add_argument('--directed', action='store_true')
    parser.add_argument('--undirected_asymm_norm', action='store_true')
    parser.add_argument('--directed_asymm_norm', action='store_true')
    parser.add_argument('--undirected_remove_diag', action='store_true')
    parser.add_argument('--undirected_set_diag', action='store_true')
    parser.add_argument('--directed_remove_diag', action='store_true')
    parser.add_argument('--directed_set_diag', action='store_true')
    args = parser.parse_args()

    if not args.directed and not args.undirected:
        raise ValueError('Please specify whether you want to use undirected '
                         'or directed operators (or both).')

    # pre-processing ######################################################

    dataset = PygNodePropPredDataset('ogbn-papers100M')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = data.x.numpy()
    N = data.num_nodes

    train_idx, valid_idx, test_idx = (split_idx['train'], split_idx['valid'],
                                      split_idx['test'])
    all_idx = torch.cat([train_idx, valid_idx, test_idx])
    mapped_train_idx = torch.arange(len(train_idx))
    mapped_valid_idx = torch.arange(len(train_idx),
                                    len(train_idx) + len(valid_idx))
    mapped_test_idx = torch.arange(
        len(train_idx) + len(valid_idx),
        len(train_idx) + len(valid_idx) + len(test_idx))

    op_dict = {}
    op_dict['label'] = data.y.data[all_idx].to(torch.long)
    op_dict['split_idx'] = {
        'train': mapped_train_idx,
        'valid': mapped_valid_idx,
        'test': mapped_test_idx
    }
    op_dict['op_embedding'] = []
    op_dict['op_embedding'].append(
        torch.from_numpy(x[all_idx]).to(torch.float))

    print('Start processing')

    if args.undirected:
        # Preprocess undirected operators.
        print('Preparing undirected operators...')

        # Subsample edges.
        print('Subsampling (dropping {} %)'.format(
            100 * args.undirected_dropedge_rate))
        edge_index, _ = dropout_adj(data.edge_index,
                                    p=args.undirected_dropedge_rate,
                                    num_nodes=data.num_nodes)

        # Convert to an undirected graph.
        print('Making the graph undirected')
        edge_index = to_undirected(edge_index, data.num_nodes)
        row, col = edge_index

        # Get the (normalized) adjacency matrix.
        print('Getting adj matrix')
        adj = get_adj(row, col, N, asymm_norm=args.undirected_asymm_norm,
                      set_diag=args.undirected_set_diag,
                      remove_diag=args.undirected_remove_diag)

        # Diffuse the node features; each hop becomes one SIGN operator.
        print('Diffusing node features')
        x = data.x.numpy()
        for _ in tqdm(range(args.undirected_num_propagations)):
            x = adj @ x
            op_dict['op_embedding'].append(
                torch.from_numpy(x[all_idx]).to(torch.float))

    if args.directed:
        # Preprocess directed operators.
        print('Preparing directed operators...')

        # Subsample edges.
        print('Subsampling (dropping {} %)'.format(
            100 * args.directed_dropedge_rate))
        edge_index, _ = dropout_adj(data.edge_index,
                                    p=args.directed_dropedge_rate,
                                    num_nodes=data.num_nodes)
        row, col = edge_index

        # Get the (normalized) adjacency matrix.
        print('Getting adj matrix')
        adj = get_adj(row, col, N, asymm_norm=args.directed_asymm_norm,
                      set_diag=args.directed_set_diag,
                      remove_diag=args.directed_remove_diag)

        # Diffuse the node features.
        print('Diffusing node features')
        x = data.x.numpy()
        for _ in tqdm(range(args.directed_num_propagations)):
            x = adj @ x
            op_dict['op_embedding'].append(
                torch.from_numpy(x[all_idx]).to(torch.float))

        # Get the transposed adjacency matrix for the reverse direction.
        print('Getting transpose adj matrix')
        adj = get_adj(col, row, N, asymm_norm=args.directed_asymm_norm,
                      set_diag=args.directed_set_diag,
                      remove_diag=args.directed_remove_diag)

        # Diffuse the node features along reverse edges.
        print('Diffusing node features')
        x = data.x.numpy()
        for _ in tqdm(range(args.directed_num_propagations)):
            x = adj @ x
            op_dict['op_embedding'].append(
                torch.from_numpy(x[all_idx]).to(torch.float))

    torch.save(op_dict, '{}.pt'.format(args.file_name))
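
# Hedged sketch (added; `get_adj` is not shown in this file, so this is an
# assumption, not the original implementation). A minimal version consistent
# with the call sites above returns a scipy CSR matrix with optional row
# normalization (D^{-1} A) or symmetric normalization (D^{-1/2} A D^{-1/2}),
# assuming `torch_sparse.SparseTensor` is available:
def get_adj_sketch(row, col, N, asymm_norm=False, set_diag=True,
                   remove_diag=False):
    adj = SparseTensor(row=row, col=col, sparse_sizes=(N, N))
    if set_diag:
        adj = adj.set_diag()
    elif remove_diag:
        adj = adj.remove_diag()
    deg = adj.sum(dim=1).to(torch.float)
    if asymm_norm:
        deg_inv = deg.pow(-1)
        deg_inv[deg_inv == float('inf')] = 0
        adj = deg_inv.view(-1, 1) * adj  # D^{-1} A
    else:
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        adj = (deg_inv_sqrt.view(-1, 1) * adj *
               deg_inv_sqrt.view(1, -1))  # D^{-1/2} A D^{-1/2}
    return adj.to_scipy(layout='csr')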
print(data)
edge_index_dict = data.edge_index_dict

# We need to add reverse edges to the heterogeneous graph.
r, c = edge_index_dict[('author', 'affiliated_with', 'institution')]
edge_index_dict[('institution', 'to', 'author')] = torch.stack([c, r])

r, c = edge_index_dict[('author', 'writes', 'paper')]
edge_index_dict[('paper', 'to', 'author')] = torch.stack([c, r])

r, c = edge_index_dict[('paper', 'has_topic', 'field_of_study')]
edge_index_dict[('field_of_study', 'to', 'paper')] = torch.stack([c, r])

# Convert the paper <-> paper relation to an undirected one.
edge_index = to_undirected(edge_index_dict[('paper', 'cites', 'paper')])
edge_index_dict[('paper', 'cites', 'paper')] = edge_index

if not os.path.exists(args.feat_dir):
    os.mkdir(args.feat_dir)

###### for field_of_study
print('###### for field_of_study')
rows = edge_index_dict[('field_of_study', 'to', 'paper')][0]
cols = edge_index_dict[('field_of_study', 'to', 'paper')][1]
v = torch.ones(rows.size())
m, n = data.num_nodes_dict['field_of_study'], data.num_nodes_dict['paper']
y = data.x_dict['paper']
out = gen_features(rows, cols, v, m, n, y)
np.save(f'{args.feat_dir}/field_of_study_FEAT.npy', out)

###### for author
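
# Hedged sketch (added; `gen_features` is defined elsewhere, so this is an
# assumption based only on its call signature above). A plausible version
# averages the feature matrix `y` of the `n`-side nodes over the incident
# edges of each of the `m` source nodes, using `torch_sparse.SparseTensor`:
def gen_features_sketch(rows, cols, v, m, n, y):
    adj = SparseTensor(row=rows, col=cols, value=v, sparse_sizes=(m, n))
    deg = adj.sum(dim=1).clamp(min=1).view(-1, 1)  # Avoid division by zero.
    return (adj @ y / deg).numpy()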
def train_test_split_edges(data, val_ratio=0.05, test_ratio=0.1):
    r"""Splits the edges of a :obj:`torch_geometric.data.Data` object into
    positive and negative train/val/test edges, and adds the attributes
    :obj:`train_pos_edge_index`, :obj:`train_neg_adj_mask`,
    :obj:`val_pos_edge_index`, :obj:`val_neg_edge_index`,
    :obj:`test_pos_edge_index`, and :obj:`test_neg_edge_index` to
    :attr:`data`.

    Args:
        data (Data): The data object.
        val_ratio (float, optional): The ratio of positive validation edges.
            (default: :obj:`0.05`)
        test_ratio (float, optional): The ratio of positive test edges.
            (default: :obj:`0.1`)

    :rtype: :class:`torch_geometric.data.Data`
    """
    assert 'batch' not in data  # No batch-mode.

    num_nodes = data.num_nodes
    row, col = data.edge_index
    data.edge_index = None

    # Keep only the upper triangular portion to avoid duplicate edges.
    mask = row < col
    row, col = row[mask], col[mask]

    n_v = int(math.floor(val_ratio * row.size(0)))
    n_t = int(math.floor(test_ratio * row.size(0)))

    # Positive edges.
    perm = torch.randperm(row.size(0))
    row, col = row[perm], col[perm]

    r, c = row[:n_v], col[:n_v]
    data.val_pos_edge_index = torch.stack([r, c], dim=0)
    r, c = row[n_v:n_v + n_t], col[n_v:n_v + n_t]
    data.test_pos_edge_index = torch.stack([r, c], dim=0)

    r, c = row[n_v + n_t:], col[n_v + n_t:]
    data.train_pos_edge_index = torch.stack([r, c], dim=0)
    data.train_pos_edge_index = to_undirected(data.train_pos_edge_index)

    # Negative edges.
    neg_adj_mask = torch.ones(num_nodes, num_nodes, dtype=torch.uint8)
    neg_adj_mask = neg_adj_mask.triu(diagonal=1).to(torch.bool)
    neg_adj_mask[row, col] = 0

    neg_row, neg_col = neg_adj_mask.nonzero(as_tuple=False).t()
    perm = torch.tensor(random.sample(range(neg_row.size(0)),
                                      min(n_v + n_t, neg_row.size(0))),
                        dtype=torch.long)
    neg_row, neg_col = neg_row[perm], neg_col[perm]

    neg_adj_mask[neg_row, neg_col] = 0
    data.train_neg_adj_mask = neg_adj_mask

    row, col = neg_row[:n_v], neg_col[:n_v]
    data.val_neg_edge_index = torch.stack([row, col], dim=0)

    row, col = neg_row[n_v:n_v + n_t], neg_col[n_v:n_v + n_t]
    data.test_neg_edge_index = torch.stack([row, col], dim=0)

    return data
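
# Usage sketch (added for illustration): split a toy 5-node cycle graph.
# Assumes `Data` is imported as in the other snippets in this file.
toy = Data(edge_index=torch.tensor([[0, 1, 1, 2, 2, 3, 3, 4, 4, 0],
                                    [1, 0, 2, 1, 3, 2, 4, 3, 0, 4]]))
toy.num_nodes = 5
toy = train_test_split_edges(toy, val_ratio=0.1, test_ratio=0.2)
# `toy` now carries train/val/test positive edges plus sampled negatives.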
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GAT Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument("--num-layers", type=int, default=3,
                        help="number of hidden layers")
    parser.add_argument("--lr", type=float, default=0.0029739421726400865,
                        help="learning rate")
    parser.add_argument('--weight-decay', type=float,
                        default=2.4222556964495987e-05, help="weight decay")
    parser.add_argument("--num-hidden", type=int, default=16,
                        help="number of hidden units")
    parser.add_argument("--dropout", type=float, default=0.18074706609292976,
                        help="Dropout to use")
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument("--eval", action='store_true',
                        help='If not set, we will only do the training part.')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = data.x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)
    edge_index, _ = remove_self_loops(edge_index)
    edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))

    model = GAT(num_layers=args.num_layers, in_feats=data.x.size(-1),
                num_hidden=args.num_hidden, num_classes=dataset.num_classes,
                heads=[4, 4, 4], dropout=args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.weight_decay)
        for epoch in range(1, 1 + args.epochs):
            t0 = time.time()
            loss = train(model, x, edge_index, y_true, train_idx, optimizer)
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))

            if not args.eval:
                continue

            result = test(model, x, edge_index, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')

        if args.eval:
            logger.print_statistics(run)

    if args.eval:
        logger.print_statistics()
def main():
    parser = argparse.ArgumentParser(description='OGBN-papers100M (MLP)')
    parser.add_argument('--num_propagations', type=int, default=3)
    parser.add_argument('--dropedge_rate', type=float, default=0.4)
    args = parser.parse_args()

    # SGC pre-processing #####################################################

    dataset = PygNodePropPredDataset('ogbn-papers100M')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = data.x.numpy()
    N = data.num_nodes

    print('Making the graph undirected.')
    # Randomly drop some edges to save computation.
    data.edge_index, _ = dropout_adj(data.edge_index, p=args.dropedge_rate,
                                     num_nodes=data.num_nodes)
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)
    print(data)

    row, col = data.edge_index

    print('Computing adj...')
    # Symmetric GCN normalization: D^{-1/2} (A + I) D^{-1/2}.
    adj = SparseTensor(row=row, col=col, sparse_sizes=(N, N))
    adj = adj.set_diag()
    deg = adj.sum(dim=1).to(torch.float)
    deg_inv_sqrt = deg.pow(-0.5)
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
    adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    adj = adj.to_scipy(layout='csr')

    train_idx, valid_idx, test_idx = (split_idx['train'], split_idx['valid'],
                                      split_idx['test'])
    all_idx = torch.cat([train_idx, valid_idx, test_idx])
    mapped_train_idx = torch.arange(len(train_idx))
    mapped_valid_idx = torch.arange(len(train_idx),
                                    len(train_idx) + len(valid_idx))
    mapped_test_idx = torch.arange(
        len(train_idx) + len(valid_idx),
        len(train_idx) + len(valid_idx) + len(test_idx))

    sgc_dict = {}
    sgc_dict['label'] = data.y.data[all_idx].to(torch.long)
    sgc_dict['split_idx'] = {
        'train': mapped_train_idx,
        'valid': mapped_valid_idx,
        'test': mapped_test_idx
    }
    sgc_dict['sgc_embedding'] = []
    sgc_dict['sgc_embedding'].append(
        torch.from_numpy(x[all_idx]).to(torch.float))

    print('Start SGC processing')
    for _ in tqdm(range(args.num_propagations)):
        x = adj @ x
        sgc_dict['sgc_embedding'].append(
            torch.from_numpy(x[all_idx]).to(torch.float))

    print(sgc_dict)
    torch.save(sgc_dict, 'sgc_dict.pt')
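
# Worked sketch (added for illustration, using only calls already present
# above): the normalization in `main` computes D^{-1/2} (A + I) D^{-1/2}.
# On a toy 3-node path graph 0 - 1 - 2, the (self-loop-augmented) degrees are
# [2, 3, 2], so e.g. entry (0, 1) becomes 1 / sqrt(2 * 3).
toy_row = torch.tensor([0, 1, 1, 2])
toy_col = torch.tensor([1, 0, 2, 1])
toy_adj = SparseTensor(row=toy_row, col=toy_col,
                       sparse_sizes=(3, 3)).set_diag()
toy_deg = toy_adj.sum(dim=1).to(torch.float)
toy_dis = toy_deg.pow(-0.5)
toy_adj = toy_dis.view(-1, 1) * toy_adj * toy_dis.view(1, -1)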
def main():
    args = ArgsInit().save_exp()

    if args.use_gpu:
        device = torch.device("cuda:" + str(args.device)) \
            if torch.cuda.is_available() else torch.device("cpu")
    else:
        device = torch.device('cpu')

    dataset = PygNodePropPredDataset(name=args.dataset)
    data = dataset[0]
    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)

    if args.self_loop:
        edge_index = add_self_loops(edge_index, num_nodes=data.num_nodes)[0]

    sub_dir = 'SL_{}'.format(args.self_loop)

    args.in_channels = data.x.size(-1)
    args.num_tasks = dataset.num_classes

    model = DeeperGCN(args).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    results = {'highest_valid': 0, 'final_train': 0,
               'final_test': 0, 'highest_train': 0}

    start_time = time.time()
    test_accuracy = 0.0

    for epoch in range(1, args.epochs + 1):
        epoch_loss = train(model, x, edge_index, y_true, train_idx,
                           optimizer, args)
        logging.info('Epoch {}, training loss {:.4f}'.format(epoch,
                                                             epoch_loss))
        model.print_params(epoch=epoch)

        result = test(model, x, edge_index, y_true, split_idx, evaluator)
        logging.info(result)
        train_accuracy, valid_accuracy, test_accuracy = result

        if train_accuracy > results['highest_train']:
            results['highest_train'] = train_accuracy

        if valid_accuracy > results['highest_valid']:
            results['highest_valid'] = valid_accuracy
            results['final_train'] = train_accuracy
            results['final_test'] = test_accuracy
            # save_ckpt(model, optimizer, round(epoch_loss, 4), epoch,
            #           args.model_save_path, sub_dir, name_post='valid_best')

        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' | ' +
              'Epoch:[{}/{}]\t Results LOSS:[{:.4f}] Train:[{:.2f}] '
              'Valid:[{:.2f}] Test:[{:.2f}] | Update Test:[{:.2f}]'.format(
                  epoch, args.epochs, epoch_loss, train_accuracy * 100,
                  valid_accuracy * 100, test_accuracy * 100,
                  results['final_test'] * 100))

    # save_ckpt(model, optimizer, round(epoch_loss, 4), epoch,
    #           args.model_save_path, sub_dir, name_post='last_epoch')

    end_time = time.time()
    total_time = end_time - start_time
    logging.info('Total time: {}'.format(
        time.strftime('%H:%M:%S', time.gmtime(total_time))))

    print('-' * 100)
    print("syd : Final Result Train:[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}]"
          .format(results['final_train'] * 100,
                  results['highest_valid'] * 100,
                  results['final_test'] * 100))
    print('-' * 100)
def run(RunnerObj, fID):
    '''
    Function to run the GCN algorithm.
    Requires the decoder parameter.
    '''
    rSeed = RunnerObj.randSeed
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    random.seed(rSeed)
    np.random.seed(rSeed)
    torch.manual_seed(rSeed)
    torch.cuda.manual_seed(rSeed)

    def train():
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_only_edge_index,
                                train_neg_edge_index)
        loss.backward()
        optimizer.step()
        return loss

    def test(pos_edge_index, neg_edge_index):
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            yTrue, yPred = model.test(z, pos_edge_index, neg_edge_index)
        return yTrue, yPred

    def val(pos_edge_index, neg_edge_index):
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            loss = model.recon_loss(z, pos_edge_index, neg_edge_index)
        return loss

    print("\n Running fold: ", fID)
    start = time.process_time()
    early_stopping = EarlyStopping(patience=100)

    read_time = time.process_time()
    print("Reading necessary input files...")
    exprDF = pd.read_csv(RunnerObj.inputDir.joinpath("normExp.csv"),
                         header=0, index_col=0)
    posE = np.load(RunnerObj.inputDir.joinpath("posE.npy"))
    negE = np.load(RunnerObj.inputDir.joinpath("negE.npy"))
    nodeDict = np.load(RunnerObj.inputDir.joinpath("nodeDict.npy"),
                       allow_pickle=True)
    geneTFDict = np.load(RunnerObj.inputDir.joinpath("GeneTFs.npy"),
                         allow_pickle=True)
    onlyGenes = geneTFDict.item().get('Gene')
    onlyTFs = geneTFDict.item().get('TF')

    foldData = np.load(RunnerObj.inputDir.joinpath(
        "{}CV/fold-".format(RunnerObj.CVType) + str(RunnerObj.randSeed) +
        "-" + str(fID) + ".npy"), allow_pickle=True)
    train_posIdx = foldData.item().get('train_posIdx')
    test_posIdx = foldData.item().get('test_posIdx')
    train_negIdx = foldData.item().get('train_negIdx')
    test_negIdx = foldData.item().get('test_negIdx')
    print("Done reading inputs...")
    logging.info("Reading input files took %.3f seconds"
                 % (time.process_time() - read_time))

    setup_time = time.process_time()
    # Hold out 10% of the training edges for validation.
    val_posIdx = random.sample(list(train_posIdx),
                               int(0.1 * len(train_posIdx)))
    train_posIdx = list(set(train_posIdx).difference(set(val_posIdx)))
    val_negIdx = random.sample(list(train_negIdx),
                               int(0.1 * len(train_negIdx)))
    train_negIdx = list(set(train_negIdx).difference(set(val_negIdx)))

    sourceNodes = posE[train_posIdx, 0]
    targetNodes = posE[train_posIdx, 1]
    # Additionally, keep copies of sourceNodes and targetNodes that contain
    # only the nodes present in the network, without the dummy edges added
    # below.
    sourceNodesCPY = posE[train_posIdx, 0]
    targetNodesCPY = posE[train_posIdx, 1]

    presentNodesSet = set(sourceNodes).union(set(targetNodes))
    allNodes = set(nodeDict.item().keys())
    missingSet = allNodes.difference(presentNodesSet)
    presentNodes = np.array(list(presentNodesSet))
    missingNodes = np.array(list(missingSet))
    missingTFs = np.array(list(missingSet.intersection(set(onlyTFs))))
    presentTFs = np.array(list(set(sourceNodes)))

    # For missing TFs, additionally add edges outgoing to present nodes.
    for tf in missingTFs:
        for node in presentNodes:
            sourceNodes = np.append(sourceNodes, tf)
            targetNodes = np.append(targetNodes, node)

    # Find unlinked genes and TFs and add incoming edges from all TFs, i.e.
    # add an edge from every TF to every gene that is missing from the
    # network. This connects those genes to the graph so that information can
    # be transferred from TFs to them, potentially yielding better
    # embeddings. Note: this is only one of several ways to make them part of
    # the graph.
    if RunnerObj.params['reconnect_disconnected_nodes']:
        for node in missingNodes:
            for tf in onlyTFs:
                sourceNodes = np.append(sourceNodes, tf)
                targetNodes = np.append(targetNodes, node)

    nodeFeatures = torch.Tensor(exprDF.values)
    if RunnerObj.params['encoder'] == 'GCN':
        eIndex = to_undirected(torch.LongTensor([sourceNodes, targetNodes]))
    elif RunnerObj.params['encoder'] == 'DGCN':
        eIndex = torch.LongTensor([sourceNodes, targetNodes])
    else:
        print("Invalid encoder name: ", RunnerObj.params['encoder'])
        sys.exit()
    data = Data(x=nodeFeatures, edge_index=eIndex)

    if RunnerObj.params['encoder'] == 'GCN':
        data.train_pos_edge_index = to_undirected(torch.stack(
            [torch.LongTensor(sourceNodes), torch.LongTensor(targetNodes)],
            dim=0))
    elif RunnerObj.params['encoder'] == 'DGCN':
        data.train_pos_edge_index = torch.stack(
            [torch.LongTensor(sourceNodes), torch.LongTensor(targetNodes)],
            dim=0)
    else:
        print("Invalid encoder name: ", RunnerObj.params['encoder'])
        sys.exit()
    data.train_pos_only_edge_index = torch.stack(
        [torch.LongTensor(sourceNodesCPY), torch.LongTensor(targetNodesCPY)],
        dim=0)

    data.test_pos_edge_index = torch.stack(
        [torch.LongTensor(posE[test_posIdx, 0]),
         torch.LongTensor(posE[test_posIdx, 1])], dim=0)
    data.val_pos_edge_index = torch.stack(
        [torch.LongTensor(posE[val_posIdx, 0]),
         torch.LongTensor(posE[val_posIdx, 1])], dim=0)
    data.train_neg_edge_index = torch.stack(
        [torch.LongTensor(negE[train_negIdx, 0]),
         torch.LongTensor(negE[train_negIdx, 1])], dim=0)
    data.test_neg_edge_index = torch.stack(
        [torch.LongTensor(negE[test_negIdx, 0]),
         torch.LongTensor(negE[test_negIdx, 1])], dim=0)
    data.val_neg_edge_index = torch.stack(
        [torch.LongTensor(negE[val_negIdx, 0]),
         torch.LongTensor(negE[val_negIdx, 1])], dim=0)
    print("Done setting up data structures...")
    logging.info("Setting up data structures took %.3f seconds"
                 % (time.process_time() - setup_time))

    channels = RunnerObj.params['channels']
    dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    h_sizes = [data.num_features]
    if RunnerObj.params['hidden'] >= 1:
        for i in reversed(range(RunnerObj.params['hidden'] - 1)):
            h_sizes.append((i + 2) * channels)
        h_sizes.append(channels)

    if RunnerObj.params['decoder'] == 'IP':
        model = GAEwithK(Encoder(h_sizes)).to(dev)
    elif RunnerObj.params['decoder'] == 'NW':
        model = GAEwithK(Encoder(h_sizes),
                         TFDecoder(data.num_nodes, onlyTFs)).to(dev)
    elif RunnerObj.params['decoder'] == 'RS':
        model = GAEwithK(Encoder(h_sizes), RESCALDecoder(channels)).to(dev)
    else:
        print("Invalid decoder name:", RunnerObj.params['decoder'])
        sys.exit()

    x = data.x.to(dev)
    train_pos_edge_index = data.train_pos_edge_index.to(dev)
    train_pos_only_edge_index = data.train_pos_only_edge_index.to(dev)
    train_neg_edge_index = data.train_neg_edge_index.to(dev)
    test_pos_edge_index = data.test_pos_edge_index.to(dev)
    test_neg_edge_index = data.test_neg_edge_index.to(dev)
    val_pos_edge_index = data.val_pos_edge_index.to(dev)
    val_neg_edge_index = data.val_neg_edge_index.to(dev)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001,
                                 weight_decay=5e-4)
    lossDict = {'epoch': [], 'TrLoss': [], 'valLoss': []}
    last10Models = []
    print("Running %s-%s..." % (RunnerObj.params['encoder'],
                                RunnerObj.params['decoder']))

    if not os.path.exists(RunnerObj.outPrefix):
        os.mkdir(RunnerObj.outPrefix)
    fullPath = Path(str(RunnerObj.outPrefix) + '/randID-' +
                    str(RunnerObj.randSeed) + '/' +
                    RunnerObj.params['encoder'] + '-' +
                    RunnerObj.params['decoder'])
    if not os.path.exists(fullPath):
        os.makedirs(fullPath)

    training_summary_path = os.path.join(
        fullPath, 'trainingSummary',
        'hiddenlayer-' + str(RunnerObj.params['hidden']))
    if not os.path.exists(training_summary_path):
        os.makedirs(training_summary_path)
    writer = SummaryWriter(os.path.join(training_summary_path,
                                        'fold-' + str(fID)))

    for epoch in tqdm(range(1, RunnerObj.params['epochs'])):
        TrLoss = train()
        valLoss = val(val_pos_edge_index, val_neg_edge_index)
        lossDict['epoch'].append(epoch)
        lossDict['TrLoss'].append(TrLoss.item())
        lossDict['valLoss'].append(valLoss.item())
        writer.add_scalar("TrainingLoss/train", TrLoss.item(), epoch)
        writer.add_scalar("ValLoss/train", valLoss.item(), epoch)
        print(TrLoss.item(), valLoss.item())
        early_stopping(valLoss.item())
        if early_stopping.early_stop:
            break
        # if (np.mean(lossDict['valLoss'][-10:]) - valLoss.item() <= 1e-6
        #         and epoch > RunnerObj.params['min_epochs']):
        #     break

    logging.info("[Fold %s]: %.3f seconds in %s epochs"
                 % (fID, time.process_time() - start, epoch))
    writer.flush()

    yTrue, yPred = test(data.test_pos_edge_index, data.test_neg_edge_index)
    torch.save(model.state_dict(),
               os.path.join(training_summary_path, 'fold-' + str(fID),
                            'model'))

    testIndices = torch.cat((data.test_pos_edge_index,
                             data.test_neg_edge_index),
                            axis=1).detach().cpu().numpy()
    edgeLength = testIndices.shape[1]
    outMatrix = np.vstack((
        testIndices, yTrue, yPred,
        np.array([fID] * edgeLength),
        np.array([RunnerObj.params['hidden']] * edgeLength),
        np.array([RunnerObj.params['channels']] * edgeLength)))

    output_path = fullPath / 'rankedEdges.csv'
    training_stats_file_name = fullPath / 'trainingstats.csv'
    outDF = pd.DataFrame(outMatrix.T,
                         columns=['Gene1', 'Gene2', 'TrueScore', 'PredScore',
                                  'CVID', 'HiddenLayers', 'Channels'])
    outDF = outDF.astype({'Gene1': int, 'Gene2': int, 'CVID': int,
                          'HiddenLayers': int, 'Channels': int})
    outDF['Gene1'] = outDF.Gene1.map(nodeDict.item())
    outDF['Gene2'] = outDF.Gene2.map(nodeDict.item())
    outDF.to_csv(output_path, index=False, mode='a',
                 header=not os.path.exists(output_path))

    stats_row = ('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t'
                 '{}\t{}\t{}\t{}\n').format(
        fID, RunnerObj.params['encoder'] + '-' + RunnerObj.params['decoder'],
        RunnerObj.randSeed, RunnerObj.params['hidden'],
        RunnerObj.params['channels'], len(presentNodesSet), len(allNodes),
        len(set(missingNodes)), len(missingTFs), len(presentTFs),
        len(onlyTFs), len(sourceNodesCPY), len(sourceNodes),
        len(train_negIdx), len(test_posIdx), len(test_negIdx),
        len(val_posIdx), len(val_negIdx))
    if os.path.isfile(training_stats_file_name):
        with open(training_stats_file_name, 'a') as training_stats_file:
            training_stats_file.write(stats_row)
    else:
        with open(training_stats_file_name, 'w') as training_stats_file:
            training_stats_file.write(
                'Fold\tAlgorithm\trandID\t#HiddenLayers\tChannels\t'
                'PresentNodes\tAllNodes\tMissingNodes\tMissingTFs\t'
                'PresentTFs\tOnlyTFs\tPositiveTrainingEdges\t'
                'PositiveTrainingEdgesWithDummyEdges\t'
                'NegativeTrainingEdges\tPositiveTestEdges\t'
                'NegativeTestEdges\tPositiveValidationEdges\t'
                'NegativeValidationEdges\n')
            training_stats_file.write(stats_row)

    writer.close()
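
# Hedged sketch (added; `EarlyStopping` is imported from elsewhere, so this is
# an assumption, not the original class). A minimal version consistent with
# the calls above tracks the best validation loss and sets `early_stop` after
# `patience` epochs without improvement:
class EarlyStoppingSketch:
    def __init__(self, patience=100, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        if val_loss < self.best - self.min_delta:
            # Improvement: remember the new best and reset the counter.
            self.best = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True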
parser.add_argument('--epochs', type=int, default=500)
parser.add_argument('--runs', type=int, default=10)
parser.add_argument('--rezero', action='store_true')
args = parser.parse_args()
print(args)

device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
device = torch.device(device)

dataset = PygNodePropPredDataset(name='ogbn-arxiv')
split_idx = dataset.get_idx_split()
data = dataset[0]

edge_index = data.edge_index.to(device)
edge_index = to_undirected(edge_index, data.num_nodes)
adj_0 = SparseTensor(row=edge_index[0], col=edge_index[1])

# Pre-compute GCN normalization: D^{-1/2} (A + I) D^{-1/2}.
adj = adj_0.set_diag()
deg = adj.sum(dim=1).to(torch.float)
deg_inv_sqrt = deg.pow(-0.5)
deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)


class l_GCN(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(l_GCN, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels