def __init__(self, args):
    """Seed every RNG for reproducibility, pick the compute device, and load data."""
    super(Trainer, self).__init__()
    # Seed all random sources so experiment runs are repeatable.
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # Trade cuDNN autotuning speed for deterministic kernels.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    self.args = args
    self.exp_name = self.set_experiment_name()
    # Use the requested GPU only when one is actually available.
    self.use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    if self.use_cuda:
        torch.cuda.set_device(args.gpu)
        self.args.device = 'cuda:{}'.format(args.gpu)
    else:
        self.args.device = 'cpu'
    self.dataset = self.load_data()
    self.evaluator = Evaluator(args.data)
def evaluate_network_sparse(model, device, data_loader, epoch):
    """Evaluate `model` on ogbg-molhiv batches from `data_loader`.

    Returns:
        (mean test loss over batches, ROC-AUC over all predictions).

    Fix: the loop counter was named `iter`, shadowing the builtin.
    """
    model.eval()
    epoch_test_loss = 0
    epoch_test_ROC = 0
    with torch.no_grad():
        list_scores = []
        list_labels = []
        for batch_idx, (batch_graphs, batch_labels, batch_snorm_n,
                        batch_snorm_e) in enumerate(data_loader):
            batch_x = batch_graphs.ndata['feat'].to(device)
            batch_e = batch_graphs.edata['feat'].to(device)
            batch_snorm_e = batch_snorm_e.to(device)
            batch_snorm_n = batch_snorm_n.to(device)
            batch_labels = batch_labels.to(device)
            batch_graphs = batch_graphs.to(device)
            batch_scores = model.forward(batch_graphs, batch_x, batch_e,
                                         batch_snorm_n, batch_snorm_e)
            loss = model.loss(batch_scores, batch_labels)
            epoch_test_loss += loss.detach().item()
            list_scores.append(batch_scores.detach())
            list_labels.append(batch_labels.detach().unsqueeze(-1))
        # Average over the number of batches seen.
        epoch_test_loss /= (batch_idx + 1)
        evaluator = Evaluator(name='ogbg-molhiv')
        epoch_test_ROC = evaluator.eval({
            'y_pred': torch.cat(list_scores),
            'y_true': torch.cat(list_labels)
        })['rocauc']
    return epoch_test_loss, epoch_test_ROC
def train_epoch_sparse(model, optimizer, device, data_loader, epoch):
    """Run one training epoch on ogbg-molhiv.

    Returns:
        (mean train loss, train ROC-AUC, optimizer).

    Fix: the loop counter was named `iter`, shadowing the builtin.
    """
    model.train()
    epoch_loss = 0
    epoch_train_ROC = 0
    list_scores = []
    list_labels = []
    for batch_idx, (batch_graphs, batch_labels, batch_snorm_n,
                    batch_snorm_e) in enumerate(data_loader):
        batch_x = batch_graphs.ndata['feat'].to(device)  # num x feat
        batch_e = batch_graphs.edata['feat'].to(device)
        batch_snorm_e = batch_snorm_e.to(device)
        batch_snorm_n = batch_snorm_n.to(device)
        batch_labels = batch_labels.to(device)
        batch_graphs = batch_graphs.to(device)
        optimizer.zero_grad()
        batch_scores = model.forward(batch_graphs, batch_x, batch_e,
                                     batch_snorm_n, batch_snorm_e)
        loss = model.loss(batch_scores, batch_labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.detach().item()
        list_scores.append(batch_scores.detach())
        list_labels.append(batch_labels.detach().unsqueeze(-1))
    # Average over the number of batches seen.
    epoch_loss /= (batch_idx + 1)
    evaluator = Evaluator(name='ogbg-molhiv')
    epoch_train_ROC = evaluator.eval({
        'y_pred': torch.cat(list_scores),
        'y_true': torch.cat(list_labels)
    })['rocauc']
    return epoch_loss, epoch_train_ROC, optimizer
def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'):
    """WILDS-style wrapper around the OGB-MolPCBA dataset.

    Fix: the PyG version gate compared version strings lexicographically,
    so e.g. '1.10.0' < '1.7.0' chose the wrong collater signature;
    versions are now compared as numeric tuples.
    """
    self._version = version
    if version is not None:
        raise ValueError(
            'Versioning for OGB-MolPCBA is handled through the OGB package. Please set version=none.'
        )
    # internally call ogb package
    self.ogb_dataset = PygGraphPropPredDataset(name='ogbg-molpcba', root=root_dir)
    # set variables
    self._data_dir = self.ogb_dataset.root
    if split_scheme == 'official':
        split_scheme = 'scaffold'
    self._split_scheme = split_scheme
    # although the task is binary classification, the prediction target
    # contains nan values, thus we need float
    self._y_type = 'float'
    self._y_size = self.ogb_dataset.num_tasks
    self._n_classes = self.ogb_dataset.__num_classes__
    self._split_array = torch.zeros(len(self.ogb_dataset)).long()
    split_idx = self.ogb_dataset.get_idx_split()
    self._split_array[split_idx['train']] = 0
    self._split_array[split_idx['valid']] = 1
    self._split_array[split_idx['test']] = 2
    self._y_array = self.ogb_dataset.data.y
    self._metadata_fields = ['scaffold']
    metadata_file_path = os.path.join(self.ogb_dataset.root, 'raw', 'scaffold_group.npy')
    if not os.path.exists(metadata_file_path):
        download_url(
            'https://snap.stanford.edu/ogb/data/misc/ogbg_molpcba/scaffold_group.npy',
            os.path.join(self.ogb_dataset.root, 'raw'))
    self._metadata_array = torch.from_numpy(
        np.load(metadata_file_path)).reshape(-1, 1).long()

    def _version_tuple(v):
        # Parse the leading numeric components of a version string,
        # stopping at the first non-numeric piece (e.g. '2.0.0rc1' -> (2, 0)).
        parts = []
        for piece in v.split('.'):
            digits = ''
            for ch in piece:
                if ch.isdigit():
                    digits += ch
                else:
                    break
            if not digits:
                break
            parts.append(int(digits))
        return tuple(parts)

    # PyG >= 1.7.0 requires the exclude_keys argument on its collater.
    if _version_tuple(torch_geometric.__version__) >= (1, 7, 0):
        self._collate = PyGCollater(follow_batch=[], exclude_keys=[])
    else:
        self._collate = PyGCollater(follow_batch=[])
    self._metric = Evaluator('ogbg-molpcba')
    super().__init__(root_dir, download, split_scheme)
def test(loader):
    """Score the global `model` on every batch in `loader`; return ROC-AUC."""
    model.eval()
    evaluator = Evaluator(name='ogbg-molhiv')
    preds, labels = [], []
    for data in loader:
        data = data.to(device)
        preds.append(model(data.x, data.edge_index, None, data.batch))
        labels.append(data.y)
    # The OGB evaluator expects stacked predictions and ground truth.
    result = evaluator.eval({
        'y_pred': torch.cat(preds),
        'y_true': torch.cat(labels)
    })
    return result['rocauc']
def train_epoch_sparse(model, optimizer, device, data_loader, epoch, distortion):
    """Train for one epoch, optionally perturbing stored eigenvector features.

    When `distortion` is non-zero, columns 1 and 2 of each graph's 'eig'
    node feature are perturbed by uniform noise scaled by the column's mean
    absolute value, and restored from a saved copy after the optimizer step.
    Returns (mean epoch loss, train ROC-AUC, optimizer).
    """
    model.train()
    epoch_loss = 0
    epoch_train_ROC = 0
    list_scores = []
    list_labels = []
    for iter, (batch_graphs, batch_labels, batch_snorm_n,
               batch_snorm_e) in enumerate(data_loader):
        batch_x = batch_graphs.ndata['feat'].to(device)  # num x feat
        batch_e = batch_graphs.edata['feat'].to(device)
        batch_snorm_e = batch_snorm_e.to(device)
        batch_snorm_n = batch_snorm_n.to(device)
        batch_labels = batch_labels.to(device)
        # NOTE(review): unlike the non-distortion variant, the graph itself is
        # never moved to `device` here — confirm this is intentional.
        if distortion > 1e-7:
            # Keep a copy so the stored features can be restored after the step.
            batch_graphs_eig = batch_graphs.ndata['eig'].clone()
            # Uniform noise in [-distortion, distortion], one value per node.
            dist = (torch.rand(batch_x[:, 0].shape) - 0.5) * 2 * distortion
            batch_graphs.ndata['eig'][:, 1] = torch.mul(
                dist,
                torch.mean(torch.abs(batch_graphs_eig[:, 1]),
                           dim=-1, keepdim=True)) + batch_graphs_eig[:, 1]
            batch_graphs.ndata['eig'][:, 2] = torch.mul(
                dist,
                torch.mean(torch.abs(batch_graphs_eig[:, 2]),
                           dim=-1, keepdim=True)) + batch_graphs_eig[:, 2]
        optimizer.zero_grad()
        batch_scores = model.forward(batch_graphs, batch_x, batch_e,
                                     batch_snorm_n, batch_snorm_e)
        loss = model.loss(batch_scores, batch_labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.detach().item()
        list_scores.append(batch_scores.detach())
        list_labels.append(batch_labels.detach().unsqueeze(-1))
        if distortion > 1e-7:
            # Undo the in-place perturbation so the dataset stays clean.
            batch_graphs.ndata['eig'] = batch_graphs_eig.detach()
    epoch_loss /= (iter + 1)
    evaluator = Evaluator(name='ogbg-molhiv')
    epoch_train_ROC = evaluator.eval({
        'y_pred': torch.cat(list_scores),
        'y_true': torch.cat(list_labels)
    })['rocauc']
    return epoch_loss, epoch_train_ROC, optimizer
def get_data(name, batch_size, rwr=False, cleaned=False):
    """Build train/val/test DataLoaders plus stats for the named benchmark."""
    if name == 'ogbg-molhiv':
        data_train, data_val, data_test, max_num_nodes = get_molhiv()
        num_classes = 2
    elif name == 'ZINC':
        data_train, data_val, data_test = get_mod_zinc(rwr)
        max_num_nodes, num_classes = 37, 1
    elif name == 'SMNIST':
        data_train, data_val, data_test = get_smnist(rwr)
        max_num_nodes, num_classes = 75, 10
    else:
        # TU datasets: derive the node-count bound and split manually.
        data = get_tudataset(name, rwr, cleaned=cleaned)
        num_classes = data.num_classes
        max_num_nodes = max((d.num_nodes for d in data), default=0)
        data_train, data_val, data_test = data_split(data)
    stats = {
        'num_features': data_train.num_node_features,
        'num_classes': num_classes,
        'max_num_nodes': max_num_nodes,
    }
    # Only molhiv comes with an official evaluator and edge encoding.
    if name == 'ogbg-molhiv':
        evaluator, encode_edge = Evaluator(name), True
    else:
        evaluator, encode_edge = None, False
    train_loader = DataLoader(data_train, batch_size, shuffle=True)
    val_loader = DataLoader(data_val, batch_size, shuffle=False)
    test_loader = DataLoader(data_test, batch_size, shuffle=False)
    return train_loader, val_loader, test_loader, stats, evaluator, encode_edge
def __init__(self, name, pos_enc_dim=0, norm='none', verbose=True):
    """Load ogbg-molhiv and wrap its official splits as HIVDGL datasets."""
    start = time.time()
    if verbose:
        print("[I] Loading dataset %s..." % (name))
    self.name = name
    # NOTE(review): loading is hard-wired to ogbg-molhiv regardless of `name`.
    self.dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
    self.split_idx = self.dataset.get_idx_split()
    for split_key, attr in (('train', 'train'), ('valid', 'val'), ('test', 'test')):
        setattr(self, attr,
                HIVDGL(self.dataset, self.split_idx[split_key],
                       norm=norm, pos_enc_dim=pos_enc_dim))
    self.evaluator = Evaluator(name='ogbg-molhiv')
    if verbose:
        print('train, test, val sizes :', len(self.train), len(self.test), len(self.val))
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time() - start))
def __init__(self, criterion=None):
    """Read hyperparameters from parameters.json and set up training state.

    Args:
        criterion: loss module. Defaults to a fresh BCEWithLogitsLoss with
            pos_weight=30, built lazily inside the call. (Fix: the default
            was previously constructed once at `def` time, so every
            instance shared — and could mutate — the same module.)
    """
    super().__init__()
    if criterion is None:
        criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([30]))
    # loading params
    with open('parameters.json') as json_file:
        parameters = json.load(json_file)
    self.configuration = parameters
    self.save_hyperparameters(
        dict(
            batch_size=parameters["batch_size"],
            lr=parameters["learning_rate"],
            weight_decay=parameters["weight_decay"],
            num_workers=parameters["num_workers"],
            criterion=criterion,
            epochs=parameters["epochs"],
        )
    )
    self._train_data = None
    self._collate_fn = None
    self._train_loader = None
    self.batch_size = self.configuration["batch_size"]
    self.num_workers = self.configuration["num_workers"]
    self.lr = self.configuration["learning_rate"]
    self.epochs = self.configuration["epochs"]
    self.weight_decay = self.configuration["weight_decay"]
    self.criterion = criterion
    self.evaluator = Evaluator(parameters["dataset_name"])
def __init__(self, name, pos_enc_dim=0, norm='none', verbose=True):
    """Load ogbg-molpcba and wrap its official splits as PCBADGL datasets."""
    start = time.time()
    if verbose:
        print("[I] Loading dataset %s..." % (name))
    self.name = name
    self.norm = norm
    dataset = DownloadPCBA(name='ogbg-molpcba')
    split_idx = dataset.get_idx_split()
    splits = {
        key: PCBADGL(dataset, split_idx[key], norm=norm, pos_enc_dim=pos_enc_dim)
        for key in ('train', 'valid', 'test')
    }
    self.train = splits['train']
    self.val = splits['valid']
    self.test = splits['test']
    # The raw dataset is no longer needed once the split views exist.
    del dataset
    del split_idx
    self.evaluator = Evaluator(name='ogbg-molpcba')
    if verbose:
        print('train, test, val sizes :', len(self.train), len(self.test), len(self.val))
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time() - start))
def task_data(args, dataset=None):
    """Prepare the OGB dataset, evaluator, and one DataLoader per split."""
    # DATA_ROOT = '/mnt/localdata/users/shengjie/ogb_ws/data/dataset'
    # step 0: setting for gpu
    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)
    # step 1: prepare dataset
    if dataset is None:
        dataset = DglGraphPropPredDataset(name=args.dataset, root=args.data_dir)
    splitted_idx = dataset.get_idx_split()

    # step 2: one loader per split; only the training split is shuffled
    def _make_loader(split, shuffle):
        return DataLoader(dataset[splitted_idx[split]],
                          batch_size=args.batch_size,
                          shuffle=shuffle,
                          collate_fn=collate_dgl,
                          num_workers=4)

    train_loader = _make_loader('train', True)
    valid_loader = _make_loader('valid', False)
    test_loader = _make_loader('test', False)
    evaluator = Evaluator(args.dataset)
    return dataset, evaluator, train_loader, valid_loader, test_loader
def __init__(self,
             architecture: str = "GCN",
             num_node_features: int = 300,
             activation: str = "prelu",
             num_conv_layers: int = 3,
             conv_size: int = 256,
             pool_method: str = "add",
             lin1_size: int = 128,
             lin2_size: int = 64,
             output_size: int = 128,
             lr: float = 0.001,
             weight_decay: float = 0,
             **kwargs):
    """Build the requested GNN backbone plus loss, evaluator, and metric state.

    Raises:
        ValueError: for an unknown `architecture`. (Fix: previously raised a
        bare `Exception`; ValueError is more precise and remains caught by
        any existing `except Exception` handlers.)
    """
    super().__init__()
    # this line ensures params passed to LightningModule will be saved to ckpt
    # it also allows to access params with 'self.hparams' attribute
    self.save_hyperparameters(logger=False)
    # init node embedding layer
    self.atom_encoder = AtomEncoder(emb_dim=self.hparams.num_node_features)
    # self.bond_encoder = BondEncoder(emb_dim=self.hparams.edge_emb_size)
    # init network architecture via a dispatch table (flat and extensible)
    builders = {
        "GCN": gcn.GCN,
        "GAT": gat.GAT,
        "GraphSAGE": graph_sage.GraphSAGE,
        "GIN": gin.GIN,
    }
    try:
        self.model = builders[self.hparams.architecture](hparams=self.hparams)
    except KeyError:
        raise ValueError("Incorrect architecture name!")
    # loss function
    self.criterion = torch.nn.BCEWithLogitsLoss()
    # metric
    self.evaluator = Evaluator(name="ogbg-molpcba")
    self.metric_hist = {
        "train/ap": [],
        "val/ap": [],
        "train/loss": [],
        "val/loss": [],
    }
def main():
    """Load a trained DeeperGCN checkpoint and report train/valid/test metrics."""
    args = ArgsInit().args
    # Use the requested GPU only if one exists and was asked for.
    use_cuda = args.use_gpu and torch.cuda.is_available()
    device = torch.device("cuda:" + str(args.device)) if use_cuda else torch.device("cpu")
    dataset = PygGraphPropPredDataset(name=args.dataset)
    args.num_tasks = dataset.num_tasks
    print(args)
    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    def _loader(split):
        return DataLoader(dataset[split_idx[split]],
                          batch_size=args.batch_size,
                          shuffle=False,
                          num_workers=args.num_workers)

    train_loader = _loader("train")
    valid_loader = _loader("valid")
    test_loader = _loader("test")
    model = DeeperGCN(args)
    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)
    metric = dataset.eval_metric
    train_result = eval(model, device, train_loader, evaluator)[metric]
    valid_result = eval(model, device, valid_loader, evaluator)[metric]
    test_result = eval(model, device, test_loader, evaluator)[metric]
    print({
        'Train': train_result,
        'Validation': valid_result,
        'Test': test_result
    })
    model.print_params(final=True)
def main():
    """Train a GIN baseline on an OGB graph-property dataset, evaluating per epoch."""
    # Training settings
    parser = argparse.ArgumentParser(description='GIN with Pytorch Geometrics')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset', type=str, default="ogbg-mol-tox21",
                        help='dataset name (default: ogbg-mol-tox21)')
    args = parser.parse_args()

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:" + str(args.device)) if use_cuda else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset)
    splitted_idx = dataset.get_idx_split()
    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    loader_kwargs = dict(batch_size=args.batch_size, num_workers=args.num_workers)
    train_loader = DataLoader(dataset[splitted_idx["train"]], shuffle=True, **loader_kwargs)
    valid_loader = DataLoader(dataset[splitted_idx["valid"]], shuffle=False, **loader_kwargs)
    test_loader = DataLoader(dataset[splitted_idx["test"]], shuffle=False, **loader_kwargs)

    model = GIN(num_task=dataset.num_tasks).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(1, args.epochs + 1):
        train(model, device, train_loader, optimizer)
        #print("Evaluating training...")
        #print(eval(model, device, train_loader, evaluator))
        print("Evaluating validation:")
        print(eval(model, device, valid_loader, evaluator))
def train_epoch(model, optimizer, device, data_loader, epoch):
    """One training epoch on ogbg-molpcba; returns (mean loss, train AP, optimizer).

    Fixes: the loop counter no longer shadows the builtin `iter`, and the
    bare `except:` clauses (which also swallowed KeyboardInterrupt) now
    catch only the missing-key error from the ndata lookup.
    """
    model.train()
    epoch_loss = 0
    epoch_train_AP = 0
    list_scores = []
    list_labels = []
    for batch_idx, (batch_graphs, batch_targets) in enumerate(data_loader):
        batch_graphs = batch_graphs.to(device)
        batch_x = batch_graphs.ndata['feat'].to(device)  # num x feat
        batch_e = batch_graphs.edata['feat'].to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        try:
            batch_lap_pos_enc = batch_graphs.ndata['lap_pos_enc'].to(device)
            # Randomly flip eigenvector signs (Laplacian PE sign ambiguity).
            sign_flip = torch.rand(batch_lap_pos_enc.size(1)).to(device)
            sign_flip[sign_flip >= 0.5] = 1.0
            sign_flip[sign_flip < 0.5] = -1.0
            batch_lap_pos_enc = batch_lap_pos_enc * sign_flip.unsqueeze(0)
        except KeyError:
            batch_lap_pos_enc = None
        try:
            batch_wl_pos_enc = batch_graphs.ndata['wl_pos_enc'].to(device)
        except KeyError:
            batch_wl_pos_enc = None
        batch_scores = model.forward(batch_graphs, batch_x, batch_e,
                                     batch_lap_pos_enc, batch_wl_pos_enc)
        # NaN targets mark unlabeled tasks; mask them out of the loss.
        is_labeled = batch_targets == batch_targets
        loss = model.loss(batch_scores[is_labeled],
                          batch_targets.float()[is_labeled])
        loss.backward()
        optimizer.step()
        epoch_loss += loss.detach().item()
        list_scores.append(batch_scores.detach().cpu())
        list_labels.append(batch_targets.detach().cpu())
    epoch_loss /= (batch_idx + 1)
    evaluator = Evaluator(name='ogbg-molpcba')
    epoch_train_AP = evaluator.eval({
        'y_pred': torch.cat(list_scores),
        'y_true': torch.cat(list_labels)
    })['ap']
    return epoch_loss, epoch_train_AP, optimizer
def eval_on(self, loader, trainer):
    """Extend the parent's metrics with the official OGB graph-property scores."""
    results_dict = super().eval_on(loader, trainer)
    evaluator = GraphPropEvaluator(name=self.task_name)
    trues, preds = [], []
    for batch in loader:
        if trainer.on_gpu:
            batch = batch.to("cuda")
        preds.append(self.model(batch).cpu().detach().numpy())
        trues.append(batch.y.cpu().detach().numpy())
    # Stack all batches before handing them to the evaluator.
    results_dict.update(
        evaluator.eval({
            "y_true": np.concatenate(trues, axis=0),
            "y_pred": np.concatenate(preds, axis=0),
        }))
    return results_dict
def main():
    """Evaluate a saved DeeperGCN checkpoint on train/valid/test splits."""
    args = ArgsInit().args
    use_cuda = args.use_gpu and torch.cuda.is_available()
    device = torch.device("cuda:" + str(args.device)) if use_cuda else torch.device("cpu")

    # Node features are either zero-filled or aggregated from edge features.
    if args.not_extract_node_feature:
        dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)
    else:
        extract_node_feature_func = partial(extract_node_feature, reduce=args.aggr)
        dataset = PygGraphPropPredDataset(name=args.dataset,
                                          transform=extract_node_feature_func)
    args.num_tasks = dataset.num_classes
    evaluator = Evaluator(args.dataset)
    split_idx = dataset.get_idx_split()

    def _loader(split):
        return DataLoader(dataset[split_idx[split]],
                          batch_size=args.batch_size,
                          shuffle=False,
                          num_workers=args.num_workers)

    train_loader = _loader("train")
    valid_loader = _loader("valid")
    test_loader = _loader("test")
    print(args)

    model = DeeperGCN(args)
    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)

    train_accuracy = eval(model, device, train_loader, evaluator)
    valid_accuracy = eval(model, device, valid_loader, evaluator)
    test_accuracy = eval(model, device, test_loader, evaluator)
    print({
        'Train': train_accuracy,
        'Validation': valid_accuracy,
        'Test': test_accuracy
    })
    model.print_params(final=True)
def __init__(self, train):
    """Wrap the ogbg-molhiv train or test split for a DNN pipeline.

    Args:
        train: truthy selects the official train split, otherwise test.

    Fixes: removed the unused Evaluator local and replaced the
    `== True` comparison with an idiomatic truthiness check.
    """
    super(Mol_pred_DNN_dataset, self).__init__()
    self.train = train
    dataset_name = 'ogbg-molhiv'
    mol_origin_dataset = PygGraphPropPredDataset(name=dataset_name)
    split_idx = mol_origin_dataset.get_idx_split()
    # NOTE: the validation split is intentionally not exposed here.
    split = "train" if self.train else "test"
    self.mol_origin_dataset = mol_origin_dataset[split_idx[split]]
def mol_pred_GNN_prepare(batch_size=50):
    """Create train/test DataLoaders for ogbg-molhiv.

    Returns:
        (train_loader, test_loader) — the function has always returned only
        these two loaders.

    Fixes: removed the unused Evaluator and validation-loader locals, which
    were constructed but never used or returned.
    """
    dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=batch_size, shuffle=False)
    return train_loader, test_loader
def evaluate_network(model, device, data_loader, epoch):
    """Evaluate on ogbg-molpcba; returns (mean test loss, average precision).

    Fixes: the loop counter no longer shadows the builtin `iter`, and the
    bare `except:` clauses now catch only the missing-key error from the
    ndata lookup.
    """
    model.eval()
    epoch_test_loss = 0
    epoch_test_AP = 0
    with torch.no_grad():
        list_scores = []
        list_labels = []
        for batch_idx, (batch_graphs, batch_targets) in enumerate(data_loader):
            batch_graphs = batch_graphs.to(device)
            batch_x = batch_graphs.ndata['feat'].to(device)
            batch_e = batch_graphs.edata['feat'].to(device)
            batch_targets = batch_targets.to(device)
            try:
                batch_lap_pos_enc = batch_graphs.ndata['lap_pos_enc'].to(device)
            except KeyError:
                batch_lap_pos_enc = None
            try:
                batch_wl_pos_enc = batch_graphs.ndata['wl_pos_enc'].to(device)
            except KeyError:
                batch_wl_pos_enc = None
            batch_scores = model.forward(batch_graphs, batch_x, batch_e,
                                         batch_lap_pos_enc, batch_wl_pos_enc)
            # NaN targets mark unlabeled tasks; mask them out of the loss.
            is_labeled = batch_targets == batch_targets
            loss = model.loss(batch_scores[is_labeled],
                              batch_targets.float()[is_labeled])
            epoch_test_loss += loss.detach().item()
            list_scores.append(batch_scores.detach().cpu())
            list_labels.append(batch_targets.detach().cpu())
        epoch_test_loss /= (batch_idx + 1)
        evaluator = Evaluator(name='ogbg-molpcba')
        epoch_test_AP = evaluator.eval({
            'y_pred': torch.cat(list_scores),
            'y_true': torch.cat(list_labels)
        })['ap']
    return epoch_test_loss, epoch_test_AP
def __init__(self, name, pos_enc_dim=0, norm='none', path='dataset/ogbg-molhiv', directions=['subgraphs'], verbose=True, **subgraph_params):
    """Load ogbg-molhiv, optionally with precomputed subgraph encodings."""
    start = time.time()
    if verbose:
        print("[I] Loading dataset %s..." % (name))
    self.name = name
    ##### MODIFIED CODE HERE
    if 'subgraphs' in directions:
        # Subgraph pipeline: custom preparation plus one-hot encoded counts.
        self.dataset, self.split_idx = prepare_dataset(path, name, **subgraph_params)
        print("One hot encoding substructure counts... ", end='')
        self.dataset, self.d_id = encode(self.dataset, subgraph_params['id_encoding'])
    else:
        # Plain OGB loading with the official split.
        self.dataset = DglGraphPropPredDataset(name=name, root=path)
        self.split_idx = self.dataset.get_idx_split()
        self.d_id = None
    shared = dict(norm=norm, pos_enc_dim=pos_enc_dim, directions=directions)
    self.train = HIVDGL(self.dataset, self.split_idx['train'], **shared, **subgraph_params)
    self.val = HIVDGL(self.dataset, self.split_idx['valid'], **shared, **subgraph_params)
    self.test = HIVDGL(self.dataset, self.split_idx['test'], **shared, **subgraph_params)
    ##### MODIFIED CODE HERE
    #import pdb;pdb.set_trace()
    self.evaluator = Evaluator(name='ogbg-molhiv')
    if verbose:
        print('train, test, val sizes :', len(self.train), len(self.test), len(self.val))
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time() - start))
def __init__(self, root_dir='data', download=False, split_scheme='official'):
    """WILDS-style wrapper around the OGB-MolPCBA dataset.

    Fix: the scaffold metadata fallback called `download_url('')` with an
    empty URL, which can never fetch the file; it now uses the official
    OGB location of scaffold_group.npy (matching the sibling wrapper).
    """
    # internally call ogb package
    self.ogb_dataset = PygGraphPropPredDataset(name='ogbg-molpcba', root=root_dir)
    # set variables
    self._dataset_name = 'ogbg-molpcba'
    self._data_dir = self.ogb_dataset.root
    if split_scheme == 'official':
        split_scheme = 'scaffold'
    self._split_scheme = split_scheme
    # although the task is binary classification, the prediction target
    # contains nan values, thus we need float
    self._y_type = 'float'
    self._y_size = self.ogb_dataset.num_tasks
    self._n_classes = self.ogb_dataset.__num_classes__
    self._split_array = torch.zeros(len(self.ogb_dataset)).long()
    split_idx = self.ogb_dataset.get_idx_split()
    self._split_array[split_idx['train']] = 0
    self._split_array[split_idx['valid']] = 1
    self._split_array[split_idx['test']] = 2
    self._y_array = self.ogb_dataset.data.y
    self._metadata_fields = ['scaffold']
    metadata_file_path = os.path.join(self.ogb_dataset.root, 'raw', 'scaffold_group.npy')
    if not os.path.exists(metadata_file_path):
        download_url(
            'https://snap.stanford.edu/ogb/data/misc/ogbg_molpcba/scaffold_group.npy',
            os.path.join(self.ogb_dataset.root, 'raw'))
    self._metadata_array = torch.from_numpy(
        np.load(metadata_file_path)).reshape(-1, 1).long()
    self._collate = PyGCollater(follow_batch=[])
    self._metric = Evaluator('ogbg-molpcba')
    super().__init__(root_dir, download, split_scheme)
def __init__(self, name):
    """Download ogbg-molpcba and materialize its three official splits."""
    start = time.time()
    print("[I] Loading dataset %s..." % (name))
    self.name = name
    dataset = DownloadPCBA(name='ogbg-molpcba')
    split_idx = dataset.get_idx_split()
    for split_key, attr in (('train', 'train'), ('valid', 'val'), ('test', 'test')):
        setattr(self, attr, PCBADGL(dataset, split_idx[split_key]))
    # Free the raw dataset once the split views are built.
    del dataset
    del split_idx
    self.evaluator = Evaluator(name='ogbg-molpcba')
    print('train, test, val sizes :', len(self.train), len(self.test), len(self.val))
    print("[I] Finished loading.")
    print("[I] Data load time: {:.4f}s".format(time.time() - start))
def get_data(name, batch_size):
    """Build loaders, stats, and the OGB evaluator for ogbg-molhiv."""
    # Guard clause: this variant only supports the molhiv benchmark.
    if name != "ogbg-molhiv":
        raise ValueError("dataset not supported")
    data_train, data_val, data_test, max_num_nodes = get_molhiv()
    stats = {
        "num_features": data_train.num_node_features,
        "num_classes": 2,
        "max_num_nodes": max_num_nodes,
    }
    evaluator = Evaluator(name)
    encode_edge = True
    train_loader = DataLoader(data_train, batch_size, shuffle=True)
    val_loader = DataLoader(data_val, batch_size, shuffle=False)
    test_loader = DataLoader(data_test, batch_size, shuffle=False)
    return train_loader, val_loader, test_loader, stats, evaluator, encode_edge
def __init__(self, name, re_split=False, pos_enc_dim=0, norm='none', verbose=True):
    """Load ogbg-molhiv; optionally replace the scaffold split with a random one."""
    start = time.time()
    if verbose:
        print("[I] Loading dataset %s..." % (name))
    self.name = name
    self.dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
    self.split_idx = self.dataset.get_idx_split()
    if re_split:
        # Uniformly random 32000 / 4564 / 4563 split over the 41127 graphs.
        ind = list(range(41127))
        rd.shuffle(ind)
        self.split_idx = {
            'test': torch.tensor(ind[36564:41127]),
            'train': torch.tensor(ind[:32000]),
            'valid': torch.tensor(ind[32000:36564])
        }
    self.train = HIVDGL(self.dataset, self.split_idx['train'],
                        norm=norm, pos_enc_dim=pos_enc_dim)
    self.val = HIVDGL(self.dataset, self.split_idx['valid'],
                      norm=norm, pos_enc_dim=pos_enc_dim)
    self.test = HIVDGL(self.dataset, self.split_idx['test'],
                       norm=norm, pos_enc_dim=pos_enc_dim)
    self.evaluator = Evaluator(name='ogbg-molhiv')
    if verbose:
        print('train, test, val sizes :', len(self.train), len(self.test), len(self.val))
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time() - start))
def run(args):
    """Train a hierarchical path network on ogbg-molhiv and save val/test ROC-AUC.

    Trains with BCE on sigmoid outputs, steps an LR-on-plateau scheduler on
    the validation loss, stops once the LR decays to 1% of its start value,
    then writes both ROC-AUC scores to '<args.out>.csv'.
    """
    from ogb.graphproppred import DglGraphPropPredDataset, Evaluator, collate_dgl
    from torch.utils.data import DataLoader
    dataset = DglGraphPropPredDataset(name="ogbg-molhiv")
    import os
    # Cache the heterograph conversion on disk — it is expensive to recompute.
    if not os.path.exists("heterographs.bin"):
        dataset.graphs = [hpno.heterograph(graph) for graph in dataset.graphs]
        from dgl.data.utils import save_graphs
        save_graphs("heterographs.bin", dataset.graphs)
    else:
        from dgl.data.utils import load_graphs
        dataset.graphs = load_graphs("heterographs.bin")[0]
    evaluator = Evaluator(name="ogbg-molhiv")
    in_features = 9
    out_features = 1
    split_idx = dataset.get_idx_split()
    # Validation/test are evaluated as single full-split batches.
    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=128,
                              drop_last=True, shuffle=True, collate_fn=collate_dgl)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=len(split_idx["valid"]), shuffle=False,
                              collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=len(split_idx["test"]), shuffle=False,
                             collate_fn=collate_dgl)
    model = hpno.HierarchicalPathNetwork(
        in_features=in_features,
        out_features=args.hidden_features,
        hidden_features=args.hidden_features,
        depth=args.depth,
        readout=hpno.GraphReadout(
            in_features=args.hidden_features,
            out_features=out_features,
            hidden_features=args.hidden_features,
        )
    )
    if torch.cuda.is_available():
        model = model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), args.learning_rate,
                                 weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", factor=0.5, patience=20)
    for idx_epoch in range(args.n_epochs):
        print(idx_epoch, flush=True)
        model.train()
        for g, y in train_loader:
            y = y.float()
            if torch.cuda.is_available():
                g = g.to("cuda:0")
                y = y.cuda()
            optimizer.zero_grad()
            y_hat = model.forward(g, g.nodes['n1'].data["feat"].float())
            loss = torch.nn.BCELoss()(
                input=y_hat.sigmoid(),
                target=y,
            )
            loss.backward()
            optimizer.step()
        model.eval()
        with torch.no_grad():
            # Single-batch validation pass drives the LR scheduler.
            g, y = next(iter(valid_loader))
            y = y.float()
            if torch.cuda.is_available():
                g = g.to("cuda:0")
                y = y.cuda()
            y_hat = model.forward(g, g.nodes['n1'].data["feat"].float())
            loss = torch.nn.BCELoss()(
                input=y_hat.sigmoid(),
                target=y,
            )
            scheduler.step(loss)
        # Early stop once the LR has decayed to 1% of its initial value.
        if optimizer.param_groups[0]["lr"] <= 0.01 * args.learning_rate:
            break
    # Final evaluation happens on CPU with the full val/test batches.
    model = model.cpu()
    g, y = next(iter(valid_loader))
    rocauc_vl = evaluator.eval(
        {
            "y_true": y.float(),
            "y_pred": model.forward(g, g.nodes['n1'].data["feat"].float()).sigmoid()
        }
    )["rocauc"]
    g, y = next(iter(test_loader))
    rocauc_te = evaluator.eval(
        {
            "y_true": y.float(),
            "y_pred": model.forward(g, g.nodes['n1'].data["feat"].float()).sigmoid()
        }
    )["rocauc"]
    import pandas as pd
    # NOTE(review): the column label uses args.data — confirm it is not meant
    # to be args.dataset.
    df = pd.DataFrame(
        {
            args.data: {
                "rocauc_te": rocauc_te,
                "rocauc_vl": rocauc_vl,
            }
        }
    )
    df.to_csv("%s.csv" % args.out)
def main():
    """Train GNN baselines on ogbg-ppi, tracking accuracy curves, and save the
    metrics of the best-validation epoch to `--filename` when given."""
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-ppi data with Pytorch Geometrics')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn', type=str, default='gin-virtual',
        help=
        'GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio', type=float, default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--num_layer', type=int, default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim', type=int, default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset', type=str, default="ogbg-ppi",
                        help='dataset name (default: ogbg-ppi)')
    parser.add_argument('--filename', type=str, default="",
                        help='filename to output result (default: )')
    args = parser.parse_args()
    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    ### automatic dataloading and splitting
    # ppi graphs have no node features; add_zeros supplies dummy ones
    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)
    splitted_idx = dataset.get_idx_split()
    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)
    train_loader = DataLoader(dataset[splitted_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[splitted_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[splitted_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)
    # ogbg-ppi is a 37-way classification task
    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', num_class=37, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=False).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', num_class=37, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=True).to(device)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', num_class=37, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=False).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', num_class=37, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=True).to(device)
    else:
        raise ValueError('Invalid GNN type')
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    valid_curve = []
    test_curve = []
    train_curve = []
    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer)
        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)
        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })
        train_curve.append(train_perf['acc'])
        valid_curve.append(valid_perf['acc'])
        test_curve.append(test_perf['acc'])
    # Model selection: report the test score at the best validation epoch.
    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)
    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))
    if not args.filename == '':
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)
# Train loop fragment: accumulate per-epoch loss and reset at epoch boundaries.
step = loss = 0
for batch in loader_tr:
    step += 1
    loss += train_step(*batch)
    if step == loader_tr.steps_per_epoch:
        step = 0
        print("Loss: {}".format(loss / loader_tr.steps_per_epoch))
        loss = 0

################################################################################
# Evaluate model
################################################################################
print("Testing model")
evaluator = Evaluator(name=dataset_name)
y_true = []
y_pred = []
for batch in loader_te:
    inputs, target = batch
    p = model(inputs, training=False)
    y_true.append(target)
    y_pred.append(p.numpy())
y_true = np.vstack(y_true)
y_pred = np.vstack(y_pred)
model_loss = loss_fn(y_true, y_pred)
ogb_score = evaluator.eval({"y_true": y_true, "y_pred": y_pred})
# Fix: the original print call was missing its closing parenthesis (SyntaxError).
print(
    "Done. Test loss: {:.4f}. ROC-AUC: {:.2f}".format(model_loss,
                                                      ogb_score["rocauc"]))
def main(_):
    """Train a GNN on ogbg-molhiv with TF/Keras and report final ROC-AUC.

    Builds packed tf.data pipelines for the train/valid splits, trains with
    optional gradient accumulation, restores the best-validation checkpoint,
    and scores the validation and test splits with the official OGB evaluator.
    """
    # Mixed precision is opt-in via the --dtype flag.
    tf.keras.mixed_precision.set_global_policy(
        "float16" if FLAGS.dtype == 'float16' else "float32")

    dset_name = 'ogbg-molhiv'
    dataset = GraphPropPredDataset(name=dset_name)
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = (
        split_idx["train"], split_idx["valid"], split_idx["test"])

    ds = data.get_tf_dataset(
        FLAGS.batch_size, [dataset[idx] for idx in train_idx], shuffle=True)
    val_ds = data.get_tf_dataset(
        FLAGS.batch_size, [dataset[idx] for idx in valid_idx], shuffle=False)

    strategy = xpu.configure_and_get_strategy()

    # Gradient accumulation lets the effective (total) batch size exceed the
    # per-step compute batch size.
    if FLAGS.total_batch_size is not None:
        gradient_accumulation_factor = FLAGS.total_batch_size // FLAGS.batch_size
    else:
        gradient_accumulation_factor = 1

    # Pre-calculated number of steps per epoch (note: will vary somewhat for
    # training, due to packing, but is found to be fairly consistent).
    # batch_size -> (train_steps, val_steps, test_steps)
    steps = {
        32: (1195, 162, 148),
        64: (585, 80, 73),
        128: (288, 40, 37),
        256: (143, 20, 18)
    }
    try:
        steps_per_epoch, val_steps_per_epoch, test_steps_per_epoch = steps[FLAGS.batch_size]
    except KeyError:
        print("Batch size should have the number of steps defined")
        # FIX: re-raise the original exception; the old `raise KeyError()`
        # discarded both the offending key and the traceback.
        raise

    # Steps per epoch must be divisible by the gradient accumulation factor.
    steps_per_epoch = gradient_accumulation_factor * (
        steps_per_epoch // gradient_accumulation_factor)

    # Linear learning-rate scaling with batch size, benchmarked against BS=128.
    batch_size = FLAGS.total_batch_size or FLAGS.batch_size
    lr = FLAGS.lr * batch_size / 128

    with strategy.scope():
        model = create_model()
        utils.print_trainable_variables(model)

        losses = tf.keras.losses.BinaryCrossentropy()
        if FLAGS.opt.lower() == 'sgd':
            opt = tf.keras.optimizers.SGD(learning_rate=lr)
        elif FLAGS.opt.lower() == 'adam':
            opt = tf.keras.optimizers.Adam(learning_rate=lr)
        else:
            raise NotImplementedError()

        callbacks = []
        if not os.path.isdir(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)
        # Randomly named directory so concurrent runs do not clobber each other.
        model_dir = os.path.join(FLAGS.model_dir, str(uuid.uuid4()))
        print(f"Saving weights to {model_dir}")
        model_path = os.path.join(model_dir, 'model')
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            model_path, monitor="val_loss", verbose=1, save_best_only=True,
            save_weights_only=True, mode="min", save_freq="epoch"))
        callbacks.append(ThroughputCallback(
            samples_per_epoch=steps_per_epoch * FLAGS.batch_size * gradient_accumulation_factor))

        if FLAGS.reduce_lr_on_plateau_patience > 0:
            callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss', mode='min',
                factor=FLAGS.reduce_lr_on_plateau_factor,
                patience=FLAGS.reduce_lr_on_plateau_patience,
                min_lr=1e-8, verbose=1))

        if FLAGS.early_stopping_patience > 0:
            print(f"Training will stop early after {FLAGS.early_stopping_patience} epochs without improvement.")
            callbacks.append(tf.keras.callbacks.EarlyStopping(
                monitor='val_loss', min_delta=0,
                patience=FLAGS.early_stopping_patience, verbose=1, mode='min',
                baseline=None, restore_best_weights=False))

        # Weighted metrics are used because of the batch packing.
        model.compile(optimizer=opt, loss=losses,
                      weighted_metrics=[tf.keras.metrics.BinaryAccuracy(),
                                        tf.keras.metrics.AUC()],
                      steps_per_execution=steps_per_epoch)
        # If the total batch size exceeds the compute batch size.
        model.set_gradient_accumulation_options(
            gradient_accumulation_steps_per_replica=gradient_accumulation_factor)
        model.fit(ds,
                  steps_per_epoch=steps_per_epoch,
                  epochs=FLAGS.epochs,
                  validation_data=val_ds,
                  validation_steps=val_steps_per_epoch,
                  callbacks=callbacks)

        # We will use the official AUC evaluator from the OGB repo, not the
        # Keras one; restore the best-validation checkpoint first.
        model.load_weights(model_path)
        print("Loaded best validation weights for evaluation")
        evaluator = Evaluator(name='ogbg-molhiv')
        # NOTE(review): loop variable renamed from `steps` to `n_steps` so it
        # no longer shadows the steps-per-epoch table above.
        for test_or_val, idx, n_steps in zip(
                ('validation', 'test'),
                (valid_idx, test_idx),
                (val_steps_per_epoch, test_steps_per_epoch)):
            prediction, ground_truth = get_predictions(model, dataset, idx, n_steps)
            result = evaluator.eval(
                {'y_true': ground_truth[:, None], 'y_pred': prediction[:, None]})
            print(f'Final {test_or_val} ROC-AUC {result["rocauc"]:.3f}')
def main():
    """Train a GCN on ogbg-molhiv with DGL, optionally evaluating every epoch.

    Runs `--runs` independent training runs; per-epoch validation/test ROC-AUC
    is computed only when `--eval` is given, and summarized via `Logger`.
    """
    # FIX: description said 'OGBN-MolHiv', but this is the graph-property
    # (ogbg) dataset, not a node-property (ogbn) one.
    parser = argparse.ArgumentParser(description='OGBG-MolHiv')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_layers', type=int, default=5)
    parser.add_argument('--emb_dim', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--eval', action='store_true',
                        help='If not set, we will only do the training part.')
    parser.add_argument('--eval_batch_size', type=int, default=2048)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(name='ogbg-molhiv')

    train_loader = GraphDataLoader(dataset[split_idx["train"]],
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)
    # FIX: evaluation loaders no longer shuffle — sample order is irrelevant
    # to ROC-AUC, and shuffling only added nondeterminism and overhead.
    val_loader = GraphDataLoader(dataset[split_idx["valid"]],
                                 batch_size=args.eval_batch_size,
                                 shuffle=False,
                                 num_workers=0)
    test_loader = GraphDataLoader(dataset[split_idx["test"]],
                                  batch_size=args.eval_batch_size,
                                  shuffle=False,
                                  num_workers=0)

    model = GCN(args.emb_dim,
                num_classes=dataset.num_tasks,
                num_layers=args.num_layers,
                dropout=args.dropout).to(device)

    logger = Logger(args.runs, args)
    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, args.epochs + 1):
            t0 = time.time()
            loss = train(model, device, train_loader, optimizer)
            # Exclude the first two epochs from the timing average: they carry
            # warm-up cost (worker spin-up, caching) that would skew the mean.
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))
            if not args.eval:
                continue
            val_rocauc = test(model, device, val_loader, evaluator)[dataset.eval_metric]
            test_rocauc = test(model, device, test_loader, evaluator)[dataset.eval_metric]
            # Train ROC-AUC is not computed (0.0 placeholder) to save time.
            logger.add_result(run, (0.0, val_rocauc, test_rocauc))
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Valid: {val_rocauc:.4f} '
                      f'Test: {test_rocauc:.4f}')
        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()