def test_train(tmpdir):
    """Smoke-test a short GraphGym training run through PyTorch Lightning."""
    import pytorch_lightning as pl

    load_cfg(cfg, args)
    # Keep every artifact inside the pytest-provided temporary directory:
    cfg.out_dir = osp.join(tmpdir, str(random.randrange(sys.maxsize)))
    cfg.run_dir = osp.join(tmpdir, str(random.randrange(sys.maxsize)))
    cfg.dataset.dir = osp.join(tmpdir, 'pyg_test_datasets', 'Planetoid')
    set_out_dir(cfg.out_dir, args.cfg_file)
    dump_cfg(cfg)
    set_printing()
    seed_everything(cfg.seed)
    auto_select_device()
    set_run_dir(cfg.out_dir)

    loaders = create_loader()
    model = create_model()
    cfg.params = params_count(model)

    callback = LoggerCallback()
    # A tiny run (1 epoch, 4 optimizer steps) is enough for a smoke test:
    trainer = pl.Trainer(max_epochs=1, max_steps=4, callbacks=callback,
                         log_every_n_steps=1)
    train_loader, val_loader = loaders[0], loaders[1]
    trainer.fit(model, train_loader, val_loader)

    shutil.rmtree(cfg.out_dir)
def main():
    """Train on IMDB-BINARY with a LightningDataset-backed DDP-spawn run."""
    seed_everything(42)

    path = osp.join('data', 'TUDataset')
    dataset = TUDataset(path, 'IMDB-BINARY',
                        pre_transform=T.OneHotDegree(135))
    dataset = dataset.shuffle()

    # 10% test / 10% validation / 80% training split:
    tenth = len(dataset) // 10
    two_tenths = 2 * len(dataset) // 10
    test_dataset = dataset[:tenth]
    val_dataset = dataset[tenth:two_tenths]
    train_dataset = dataset[two_tenths:]

    datamodule = LightningDataset(train_dataset, val_dataset, test_dataset,
                                  batch_size=64, num_workers=4)

    model = Model(dataset.num_node_features, dataset.num_classes)

    devices = torch.cuda.device_count()
    strategy = pl.strategies.DDPSpawnStrategy(find_unused_parameters=False)
    checkpoint = pl.callbacks.ModelCheckpoint(monitor='val_acc', save_top_k=1)
    trainer = pl.Trainer(strategy=strategy, accelerator='gpu',
                         devices=devices, max_epochs=50, log_every_n_steps=5,
                         callbacks=[checkpoint])

    trainer.fit(model, datamodule)
    trainer.test(ckpt_path='best', datamodule=datamodule)
def test_run_single_graphgym(auto_resume, skip_train_eval, use_trivial_metric):
    """Exercise the full Lightning-based GraphGym node-level pipeline once,
    optionally registering a trivial custom metric and skipping train eval."""
    Args = namedtuple('Args', ['cfg_file', 'opts'])
    this_dir = osp.join(osp.dirname(osp.realpath(__file__)))
    args = Args(osp.join(this_dir, 'example_node.yml'), [])
    load_cfg(cfg, args)
    cfg.out_dir = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    cfg.run_dir = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    cfg.dataset.dir = osp.join('/', 'tmp', 'pyg_test_datasets', 'Planetoid')
    cfg.train.auto_resume = auto_resume

    set_out_dir(cfg.out_dir, args.cfg_file)
    dump_cfg(cfg)
    set_printing()
    seed_everything(cfg.seed)
    auto_select_device()
    set_run_dir(cfg.out_dir)

    cfg.train.skip_train_eval = skip_train_eval
    cfg.train.enable_ckpt = use_trivial_metric and skip_train_eval
    if use_trivial_metric:
        if 'trivial' not in register.metric_dict:
            register.register_metric('trivial', trivial_metric)
        global num_trivial_metric_calls
        num_trivial_metric_calls = 0
        cfg.metric_best = 'trivial'
        cfg.custom_metrics = ['trivial']
    else:
        cfg.metric_best = 'auto'
        cfg.custom_metrics = []

    datamodule = GraphGymDataModule()
    assert len(datamodule.loaders) == 3

    model = create_model()
    assert isinstance(model, torch.nn.Module)
    assert isinstance(model.encoder, FeatureEncoder)
    assert isinstance(model.mp, GNNStackStage)
    assert isinstance(model.post_mp, GNNNodeHead)
    assert len(list(model.pre_mp.children())) == cfg.gnn.layers_pre_mp

    optimizer, scheduler = model.configure_optimizers()
    assert isinstance(optimizer[0], torch.optim.Adam)
    assert isinstance(scheduler[0], torch.optim.lr_scheduler.CosineAnnealingLR)

    cfg.params = params_count(model)
    assert cfg.params == 23883

    train(model, datamodule, logger=True,
          trainer_config={"enable_progress_bar": False})
    # The checkpoint directory must exist exactly when checkpointing is on:
    assert osp.isdir(get_ckpt_dir()) is cfg.train.enable_ckpt

    agg_runs(cfg.out_dir, cfg.metric_best)

    shutil.rmtree(cfg.out_dir)
def test_seed_everything():
    """After `seed_everything(0)`, the first two draws from `random`, NumPy
    and PyTorch must be reproducible."""
    seed_everything(0)

    # Draw order matches the original test exactly (two draws per RNG):
    assert [random.randint(0, 100) for _ in range(2)] == [49, 97]
    assert [np.random.randint(0, 100) for _ in range(2)] == [44, 47]
    assert [int(torch.randint(0, 100, (1, ))) for _ in range(2)] == [44, 39]
def main():
    """Train a relational GNN on OGB data on a single GPU and test the
    best checkpoint (selected by validation accuracy)."""
    seed_everything(42)

    datamodule = DataModule('../../data/OGB')
    model = RelationalGNN(datamodule.metadata(), hidden_channels=64,
                          out_channels=349, dropout=0.0)

    ckpt = ModelCheckpoint(monitor='val_acc', save_top_k=1)
    trainer = Trainer(accelerator='gpu', devices=1, max_epochs=20,
                      callbacks=[ckpt])
    trainer.fit(model, datamodule)
    trainer.test()
def test_graphgym_module(tmpdir):
    """The GraphGym model must be a LightningModule whose training,
    validation and test steps all emit the expected output dictionary."""
    import pytorch_lightning as pl

    load_cfg(cfg, args)
    cfg.out_dir = osp.join(tmpdir, str(random.randrange(sys.maxsize)))
    cfg.run_dir = osp.join(tmpdir, str(random.randrange(sys.maxsize)))
    cfg.dataset.dir = osp.join(tmpdir, 'pyg_test_datasets', 'Planetoid')
    set_out_dir(cfg.out_dir, args.cfg_file)
    dump_cfg(cfg)
    set_printing()
    seed_everything(cfg.seed)
    auto_select_device()
    set_run_dir(cfg.out_dir)

    loaders = create_loader()
    assert len(loaders) == 3

    model = create_model()
    assert isinstance(model, pl.LightningModule)

    optimizer, scheduler = model.configure_optimizers()
    assert isinstance(optimizer[0], torch.optim.Adam)
    assert isinstance(scheduler[0], torch.optim.lr_scheduler.CosineAnnealingLR)

    cfg.params = params_count(model)
    assert cfg.params == 23880

    expected_keys = {"loss", "true", "pred_score", "step_end_time"}
    # Run one batch through each of the train/val/test steps in order:
    steps = [model.training_step, model.validation_step, model.test_step]
    for loader, step in zip(loaders, steps):
        batch = next(iter(loader))
        batch.to(model.device)
        outputs = step(batch)
        assert set(outputs.keys()) == expected_keys
        assert isinstance(outputs["loss"], torch.Tensor)

    shutil.rmtree(cfg.out_dir)
def test_run_single_graphgym():
    """End-to-end (non-Lightning) GraphGym run: config, loaders, loggers,
    model, optimizer/scheduler, training, and result aggregation."""
    Args = namedtuple('Args', ['cfg_file', 'opts'])
    this_dir = osp.join(osp.dirname(osp.realpath(__file__)))
    args = Args(osp.join(this_dir, 'example_node.yml'), [])
    load_cfg(cfg, args)
    cfg.out_dir = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    cfg.run_dir = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    cfg.dataset.dir = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    dump_cfg(cfg)
    set_printing()
    seed_everything(cfg.seed)
    auto_select_device()
    set_run_dir(cfg.out_dir, args.cfg_file)

    loaders = create_loader()
    assert len(loaders) == 3
    loggers = create_logger()
    assert len(loggers) == 3

    model = create_model()
    assert isinstance(model, torch.nn.Module)
    assert isinstance(model.encoder, FeatureEncoder)
    assert isinstance(model.mp, GNNStackStage)
    assert isinstance(model.post_mp, GNNNodeHead)
    assert len(list(model.pre_mp.children())) == cfg.gnn.layers_pre_mp

    optimizer_config = OptimizerConfig(optimizer=cfg.optim.optimizer,
                                       base_lr=cfg.optim.base_lr,
                                       weight_decay=cfg.optim.weight_decay,
                                       momentum=cfg.optim.momentum)
    optimizer = create_optimizer(model.parameters(), optimizer_config)
    assert isinstance(optimizer, torch.optim.Adam)

    scheduler_config = SchedulerConfig(scheduler=cfg.optim.scheduler,
                                       steps=cfg.optim.steps,
                                       lr_decay=cfg.optim.lr_decay,
                                       max_epoch=cfg.optim.max_epoch)
    scheduler = create_scheduler(optimizer, scheduler_config)
    assert isinstance(scheduler, torch.optim.lr_scheduler.CosineAnnealingLR)

    cfg.params = params_count(model)
    assert cfg.params == 23880

    train(loggers, loaders, model, optimizer, scheduler)

    agg_runs(set_agg_dir(cfg.out_dir, args.cfg_file), cfg.metric_best)

    shutil.rmtree(cfg.out_dir)
    shutil.rmtree(cfg.dataset.dir)
def main():
    """Train GraphSAGE on Reddit on one GPU, checkpointing on `val_acc`."""
    seed_everything(42)
    datamodule = RedditDataModule('data/Reddit')
    model = GraphSAGE(datamodule.num_features, datamodule.num_classes)
    checkpoint_callback = ModelCheckpoint(monitor='val_acc', save_top_k=1)
    # `Trainer(gpus=...)` is deprecated and was removed in PyTorch Lightning
    # 2.0; use the accelerator/devices API, matching the other examples:
    trainer = Trainer(accelerator='gpu', devices=1, max_epochs=10,
                      callbacks=[checkpoint_callback])

    # Uncomment to train on multiple GPUs instead:
    # trainer = Trainer(accelerator='gpu', devices=2, strategy='ddp',
    #                   max_epochs=10, callbacks=[checkpoint_callback])

    trainer.fit(model, datamodule=datamodule)
    trainer.test()
def main():
    """Train on Reddit with neighbor sampling under a DDP-spawn strategy."""
    seed_everything(42)

    dataset = Reddit(osp.join('data', 'Reddit'))
    data = dataset[0]
    datamodule = LightningNodeData(data, data.train_mask, data.val_mask,
                                   data.test_mask, loader='neighbor',
                                   num_neighbors=[25, 10], batch_size=1024,
                                   num_workers=8)

    model = Model(dataset.num_node_features, dataset.num_classes)

    num_devices = torch.cuda.device_count()
    strategy = pl.strategies.DDPSpawnStrategy(find_unused_parameters=False)
    checkpoint = pl.callbacks.ModelCheckpoint(monitor='val_acc', save_top_k=1)
    trainer = pl.Trainer(strategy=strategy, accelerator='gpu',
                         devices=num_devices, max_epochs=20,
                         callbacks=[checkpoint])

    trainer.fit(model, datamodule)
    trainer.test(ckpt_path='best', datamodule=datamodule)
def main():
    """Train a model on IMDB-BINARY with PyTorch-Ignite, printing per-epoch
    metrics, checkpointing on validation accuracy, and logging to
    TensorBoard."""
    seed_everything(42)

    root = osp.join('data', 'TUDataset')
    dataset = TUDataset(root, 'IMDB-BINARY',
                        pre_transform=T.OneHotDegree(135))
    dataset = dataset.shuffle()

    # 10% test / 10% validation / 80% training split:
    test_dataset = dataset[:len(dataset) // 10]
    val_dataset = dataset[len(dataset) // 10:2 * len(dataset) // 10]
    train_dataset = dataset[2 * len(dataset) // 10:]

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=64, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=64, pin_memory=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Model(dataset.num_node_features, dataset.num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    metrics = {'acc': ignite.metrics.Accuracy()}

    def prepare_batch_fn(batch, device, non_blocking):
        # Ignite's supervised engines expect an `(input, target)` pair:
        return (batch.to(device, non_blocking=non_blocking),
                batch.y.to(device, non_blocking=non_blocking))

    trainer = ignite.engine.create_supervised_trainer(
        model=model,
        optimizer=optimizer,
        loss_fn=F.cross_entropy,
        device=device,
        prepare_batch=prepare_batch_fn,
        output_transform=lambda x, y, y_pred, loss: loss.item(),
        amp_mode='amp',
    )

    # Progress bar for each epoch:
    pbar = ignite.contrib.handlers.tqdm_logger.ProgressBar()
    pbar.attach(trainer, output_transform=lambda x: {'loss': x})

    def log_metrics(evaluator, loader, tag):
        # Build an epoch-completed handler that runs `evaluator` on `loader`
        # and prints the resulting accuracy:
        def logger(trainer):
            evaluator.run(loader)
            print(f'{tag:10} Epoch: {trainer.state.epoch:02d}, '
                  f'Acc: {evaluator.state.metrics["acc"]:.4f}')

        return logger

    train_evaluator = ignite.engine.create_supervised_evaluator(
        model=model,
        metrics=metrics,
        device=device,
        prepare_batch=prepare_batch_fn,
        output_transform=lambda x, y, y_pred: (y_pred, y),
        amp_mode='amp',
    )
    trainer.on(ignite.engine.Events.EPOCH_COMPLETED(every=1))(log_metrics(
        train_evaluator, train_loader, 'Training'))

    val_evaluator = ignite.engine.create_supervised_evaluator(
        model=model,
        metrics=metrics,
        device=device,
        prepare_batch=prepare_batch_fn,
        output_transform=lambda x, y, y_pred: (y_pred, y),
        amp_mode='amp',
    )
    trainer.on(ignite.engine.Events.EPOCH_COMPLETED(every=1))(log_metrics(
        val_evaluator, val_loader, 'Validation'))

    test_evaluator = ignite.engine.create_supervised_evaluator(
        model=model,
        metrics=metrics,
        device=device,
        prepare_batch=prepare_batch_fn,
        output_transform=lambda x, y, y_pred: (y_pred, y),
        amp_mode='amp',
    )
    trainer.on(ignite.engine.Events.EPOCH_COMPLETED(every=1))(log_metrics(
        test_evaluator, test_loader, 'Test'))

    # Save checkpoint of the model based on Accuracy on the validation set:
    checkpoint_handler = ignite.handlers.Checkpoint(
        {'model': model},
        'runs/gin',
        n_saved=2,
        score_name=list(metrics.keys())[0],
        filename_pattern='best-{global_step}-{score_name}-{score}.pt',
        global_step_transform=ignite.handlers.global_step_from_engine(trainer),
    )
    val_evaluator.add_event_handler(ignite.engine.Events.EPOCH_COMPLETED,
                                    checkpoint_handler)

    # Create a tensorboard logger to write logs:
    tb_logger = ignite.contrib.handlers.tensorboard_logger.TensorboardLogger(
        log_dir=osp.join('runs/example', 'tb_logs'))
    tb_logger.attach_output_handler(
        trainer, event_name=ignite.engine.Events.ITERATION_COMPLETED,
        tag='training', output_transform=lambda loss: {'loss_iteration': loss})
    tb_logger.attach_output_handler(
        trainer, event_name=ignite.engine.Events.EPOCH_COMPLETED,
        tag='training', output_transform=lambda loss: {'loss_epoch': loss})
    tb_logger.attach_output_handler(
        train_evaluator,
        event_name=ignite.engine.Events.EPOCH_COMPLETED,
        tag='training',
        metric_names='all',
        global_step_transform=ignite.handlers.global_step_from_engine(trainer),
    )
    tb_logger.attach_output_handler(
        val_evaluator,
        event_name=ignite.engine.Events.EPOCH_COMPLETED,
        tag='validation',
        metric_names='all',
        global_step_transform=ignite.handlers.global_step_from_engine(trainer),
    )
    tb_logger.attach_output_handler(
        test_evaluator,
        event_name=ignite.engine.Events.EPOCH_COMPLETED,
        tag='test',
        metric_names='all',
        global_step_transform=ignite.handlers.global_step_from_engine(trainer),
    )

    trainer.run(train_loader, max_epochs=50)

    # BUG FIX: the logger was previously closed *before* `trainer.run()`,
    # which shuts down the TensorBoard writer before any event fires and
    # silently discards all logged output. Close it only after training.
    tb_logger.close()
def test_run_single_graphgym(skip_train_eval, use_trivial_metric):
    """Full (non-Lightning) GraphGym pipeline, optionally with a trivial
    custom metric and with evaluation on the training split skipped."""
    Args = namedtuple('Args', ['cfg_file', 'opts'])
    this_dir = osp.join(osp.dirname(osp.realpath(__file__)))
    args = Args(osp.join(this_dir, 'example_node.yml'), [])
    load_cfg(cfg, args)
    cfg.out_dir = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    cfg.run_dir = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    cfg.dataset.dir = osp.join('/', 'tmp', str(random.randrange(sys.maxsize)))
    dump_cfg(cfg)
    set_printing()
    seed_everything(cfg.seed)
    auto_select_device()
    set_run_dir(cfg.out_dir, args.cfg_file)

    cfg.train.skip_train_eval = skip_train_eval
    cfg.train.enable_ckpt = use_trivial_metric and skip_train_eval
    if use_trivial_metric:
        if 'trivial' not in register.metric_dict:
            register.register_metric('trivial', trivial_metric)
        global num_trivial_metric_calls
        num_trivial_metric_calls = 0
        cfg.metric_best = 'trivial'
        cfg.custom_metrics = ['trivial']
    else:
        cfg.metric_best = 'auto'
        cfg.custom_metrics = []

    loaders = create_loader()
    assert len(loaders) == 3
    loggers = create_logger()
    assert len(loggers) == 3

    model = create_model()
    assert isinstance(model, torch.nn.Module)
    assert isinstance(model.encoder, FeatureEncoder)
    assert isinstance(model.mp, GNNStackStage)
    assert isinstance(model.post_mp, GNNNodeHead)
    assert len(list(model.pre_mp.children())) == cfg.gnn.layers_pre_mp

    optimizer = create_optimizer(model.parameters(), cfg.optim)
    assert isinstance(optimizer, torch.optim.Adam)
    scheduler = create_scheduler(optimizer, cfg.optim)
    assert isinstance(scheduler, torch.optim.lr_scheduler.CosineAnnealingLR)

    cfg.params = params_count(model)
    assert cfg.params == 23880

    train(loggers, loaders, model, optimizer, scheduler)

    if use_trivial_metric:
        # 6 total epochs, 4 eval epochs, 3 splits (1 training split).
        # BUG FIX: the original `assert n == 12 if skip_train_eval else 14`
        # parsed as `assert ((n == 12) if skip_train_eval else 14)`, so the
        # `else` branch asserted the truthy constant 14 and never checked
        # anything. Parenthesize the conditional so both branches compare.
        assert num_trivial_metric_calls == (12 if skip_train_eval else 14)
    assert osp.isdir(get_ckpt_dir()) is cfg.train.enable_ckpt

    agg_runs(set_agg_dir(cfg.out_dir, args.cfg_file), cfg.metric_best)

    shutil.rmtree(cfg.out_dir)
    shutil.rmtree(cfg.dataset.dir)
if __name__ == '__main__': # Load cmd line args args = parse_args() # Load config file load_cfg(cfg, args) set_out_dir(cfg.out_dir, args.cfg_file) # Set Pytorch environment torch.set_num_threads(cfg.num_threads) dump_cfg(cfg) # Repeat for different random seeds for i in range(args.repeat): set_run_dir(cfg.out_dir) set_printing() # Set configurations for each run cfg.seed = cfg.seed + 1 seed_everything(cfg.seed) auto_select_device() # Set machine learning pipeline datamodule = GraphGymDataModule() model = create_model() # Print model info logging.info(model) logging.info(cfg) cfg.params = params_count(model) logging.info('Num parameters: %s', cfg.params) train(model, datamodule, logger=True) # Aggregate results from different seeds agg_runs(cfg.out_dir, cfg.metric_best) # When being launched in batch mode, mark a yaml as done if args.mark_done:
max_epoch=cfg.optim.max_epoch) if __name__ == '__main__': # Load cmd line args args = parse_args() # Load config file load_cfg(cfg, args) # Set Pytorch environment torch.set_num_threads(cfg.num_threads) dump_cfg(cfg) set_printing() # Repeat for different random seeds for i in range(args.repeat): # Set configurations for each run seed_everything(cfg.seed + i) auto_select_device() set_run_dir(cfg.out_dir, args.cfg_file) # Set machine learning pipeline loaders = create_loader() loggers = create_logger() model = create_model() optimizer = create_optimizer(model.parameters(), new_optimizer_config(cfg)) scheduler = create_scheduler(optimizer, new_scheduler_config(cfg)) # Print model info logging.info(model) logging.info(cfg) cfg.params = params_count(model) logging.info('Num parameters: {}'.format(cfg.params)) # Start training
import argparse
from itertools import product

import torch

from datasets import get_dataset
from gcn import GCN
from gin import GIN
from graph_sage import GraphSAGE
from train_eval import eval_acc, train

from torch_geometric import seed_everything
from torch_geometric.loader import DataLoader
from torch_geometric.profile import get_stats_summary, profileit, timeit

# Fix all RNG seeds so profiling runs are comparable:
seed_everything(0)

parser = argparse.ArgumentParser()
parser.add_argument('--epochs', type=int, default=100)
parser.add_argument('--batch_size', type=int, default=128)
parser.add_argument('--lr', type=float, default=0.01)
parser.add_argument('--warmup_profile', type=int, default=1,
                    help='Skip the first few runs')
parser.add_argument('--goal_accuracy', type=int, default=1,
                    help='The goal test accuracy')
args = parser.parse_args()

layers = [1, 2, 3]