# Example 1
def test_ddp_configure_ddp():
    """Tests with ddp plugin.

    Verifies that ``DDPPlugin.configure_ddp`` wraps the ``LightningModule``
    in ``DistributedDataParallel`` only when the trainer state is FITTING;
    for any other state the model stays unwrapped.
    """
    model = BoringModel()
    ddp_plugin = DDPPlugin()
    trainer = Trainer(
        max_epochs=1,
        strategy=ddp_plugin,
    )
    # test wrap the model if fitting
    trainer.state.fn = TrainerFn.FITTING
    trainer.training_type_plugin.connect(model)
    trainer.accelerator.setup_environment()
    trainer.accelerator.setup(trainer)
    trainer.lightning_module.trainer = trainer
    # Before pre-dispatch the model is still the bare LightningModule.
    assert isinstance(trainer.model, LightningModule)
    trainer._pre_dispatch()
    # in DDPPlugin configure_ddp(), model wrapped by DistributedDataParallel
    assert isinstance(trainer.model, DistributedDataParallel)

    trainer = Trainer(
        max_epochs=1,
        strategy=ddp_plugin,
    )
    # test do not wrap the model if trainerFN is not fitting
    # (trainer.state.fn is deliberately NOT set to FITTING here)
    trainer.training_type_plugin.connect(model)
    trainer.accelerator.setup_environment()
    trainer.accelerator.setup(trainer)
    trainer.lightning_module.trainer = trainer
    trainer._pre_dispatch()
    # in DDPPlugin configure_ddp(), model are still LightningModule
    assert isinstance(trainer.model, LightningModule)
def test_ddp_post_local_sgd_comm_hook(tmpdir):
    """Test for DDP post-localSGD hook."""
    model = BoringModel()

    # Post-localSGD kicks in after 8 iterations; model averaging every 4.
    sgd_state = post_localSGD.PostLocalSGDState(
        process_group=None,
        subgroup=None,
        start_localSGD_iter=8,
    )
    plugin = DDPPlugin(
        ddp_comm_state=sgd_state,
        ddp_comm_hook=post_localSGD.post_localSGD_hook,
        model_averaging_period=4,
    )
    trainer = Trainer(
        fast_dev_run=True,
        gpus=2,
        strategy=plugin,
        default_root_dir=tmpdir,
        sync_batchnorm=True,
    )
    trainer.fit(model)
    # The wrapped DDP model reports which comm hook got registered.
    logging_data = trainer.accelerator.training_type_plugin._model.get_ddp_logging_data()
    assert logging_data.comm_hook == post_localSGD.post_localSGD_hook.__qualname__
    assert trainer.state.finished, f"Training failed with {trainer.state}"
# Example 3
def main():
    """Entry point: train a TrainingModule with DDP, optionally warm-starting
    from a pretrained single-image instance segmentation checkpoint."""
    args = get_parser().parse_args()
    cfg = get_cfg(args)

    trainloader, valloader = prepare_dataloaders(cfg)
    model = TrainingModule(cfg.convert_to_dict())

    if cfg.PRETRAINED.LOAD_WEIGHTS:
        # Load single-image instance segmentation model.
        pretrained_model_weights = torch.load(
            os.path.join(cfg.DATASET.DATAROOT, cfg.PRETRAINED.PATH), map_location='cpu'
        )['state_dict']

        # strict=False: the checkpoint only needs to cover a subset of keys.
        model.load_state_dict(pretrained_model_weights, strict=False)
        print(f'Loaded single-image model weights from {cfg.PRETRAINED.PATH}')

    # Unique run directory: timestamp + hostname + experiment tag.
    save_dir = os.path.join(
        cfg.LOG_DIR, time.strftime('%d%B%Yat%H:%M:%S%Z') + '_' + socket.gethostname() + '_' + cfg.TAG
    )
    tb_logger = pl.loggers.TensorBoardLogger(save_dir=save_dir)
    trainer = pl.Trainer(
        gpus=cfg.GPUS,
        accelerator='ddp',
        precision=cfg.PRECISION,
        sync_batchnorm=True,
        gradient_clip_val=cfg.GRAD_NORM_CLIP,
        max_epochs=cfg.EPOCHS,
        weights_summary='full',
        logger=tb_logger,
        log_every_n_steps=cfg.LOGGING_INTERVAL,
        plugins=DDPPlugin(find_unused_parameters=True),
        profiler='simple',
    )
    trainer.fit(model, trainloader, valloader)
def test_ddp_fp16_compress_wrap_sgd_comm_hook(tmpdir):
    """Test for DDP FP16 compress wrapper for SGD hook."""
    model = BoringModel()
    plugin = DDPPlugin(
        ddp_comm_state=powerSGD.PowerSGDState(process_group=None),
        ddp_comm_hook=powerSGD.powerSGD_hook,
        ddp_comm_wrapper=default.fp16_compress_wrapper,
        sync_batchnorm=True,
    )
    trainer = Trainer(
        max_epochs=1,
        gpus=2,
        plugins=[plugin],
        default_root_dir=tmpdir,
        sync_batchnorm=True,
        fast_dev_run=True,
    )
    trainer.fit(model)
    # The registered hook must be the FP16-wrapped PowerSGD hook.
    ddp_model = trainer.accelerator.training_type_plugin._model
    registered_hook = ddp_model.get_ddp_logging_data().comm_hook
    wrapped_hook = default.fp16_compress_wrapper(powerSGD.powerSGD_hook)
    assert registered_hook == wrapped_hook.__qualname__
    assert (trainer.state == TrainerState.FINISHED
            ), f"Training failed with {trainer.state}"
# Example 5
def cpc_v2_example():
    """Train a pl_bolts CPC v2 model on CIFAR-10 with DDP, then load a
    pretrained checkpoint and freeze it."""
    from pl_bolts.models.self_supervised import CPC_v2
    from pl_bolts.datamodules import CIFAR10DataModule
    from pl_bolts.models.self_supervised.cpc import CPCTrainTransformsCIFAR10, CPCEvalTransformsCIFAR10
    from pytorch_lightning.plugins import DDPPlugin

    # Data module with CPC-specific train/val transforms.
    datamodule = CIFAR10DataModule(num_workers=12, batch_size=32)
    datamodule.train_transforms = CPCTrainTransformsCIFAR10()
    datamodule.val_transforms = CPCEvalTransformsCIFAR10()

    # CPC v2 with the reference CPC encoder.
    cpc_model = CPC_v2(encoder="cpc_encoder")

    # Fit on two GPUs with DDP.
    trainer = pl.Trainer(gpus=2, accelerator="ddp", plugins=DDPPlugin(find_unused_parameters=False))
    trainer.fit(cpc_model, datamodule=datamodule)

    #--------------------
    # CIFAR-10 pretrained model:
    weight_path = "https://pl-bolts-weights.s3.us-east-2.amazonaws.com/cpc/cpc-cifar10-v4-exp3/epoch%3D474.ckpt"
    # STL-10 pretrained model:
    #weight_path = "https://pl-bolts-weights.s3.us-east-2.amazonaws.com/cpc/cpc-stl10-v0-exp3/epoch%3D624.ckpt"
    cpc_v2 = CPC_v2.load_from_checkpoint(weight_path, strict=False)

    cpc_v2.freeze()
# Example 6
def main(cfg):
    """Train a MelGAN vocoder from the given config, logging epoch times."""
    ddp = DDPPlugin(find_unused_parameters=True)
    trainer = pl.Trainer(plugins=[ddp], **cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    model = MelGanModel(cfg=cfg.model, trainer=trainer)
    # Record wall-clock time per epoch alongside the usual callbacks.
    trainer.callbacks.extend([LogEpochTimeCallback()])
    trainer.fit(model)
# Example 7
def get_trainer(wandb_logger, callbacks, config):
    """Build a ``pl.Trainer`` from the GPU flags and settings in ``config``.

    Args:
        wandb_logger: W&B logger instance attached to the trainer.
        callbacks: list of Lightning callbacks.
        config: object exposing ``gpu0``/``gpu1`` flags, ``NUM_EPOCHS``,
            ``precision_compute`` and ``AUTO_LR``.

    Returns:
        A configured ``pl.Trainer``.
    """
    gpus = []
    if config.gpu0:
        gpus.append(0)
    if config.gpu1:
        gpus.append(1)
    # Fix: logging.info takes a %-format string; the original passed ``gpus``
    # as an extra argument with no placeholder, which raises a (swallowed)
    # formatting error at log time instead of logging the list.
    logging.info("gpus active: %s", gpus)
    if len(gpus) >= 2:
        distributed_backend = "ddp"
        accelerator = "ddp"  # fix: was the typo "dpp"
        plugins = DDPPlugin(find_unused_parameters=False)
    else:
        distributed_backend = None
        accelerator = None
        plugins = None

    # NOTE(review): the distributed settings above are computed but unused —
    # the corresponding Trainer arguments are commented out below.
    trainer = pl.Trainer(
        logger=wandb_logger,
        gpus=gpus,
        max_epochs=config.NUM_EPOCHS,
        precision=config.precision_compute,
        #    limit_train_batches=0.1, #only to debug
        #    limit_val_batches=0.05, #only to debug
        #    val_check_interval=1,
        auto_lr_find=config.AUTO_LR,
        log_gpu_memory=True,
        #    distributed_backend=distributed_backend,
        #    accelerator=accelerator,
        #    plugins=plugins,
        callbacks=callbacks,
        progress_bar_refresh_rate=5,
    )

    return trainer
# Example 8
def train_model(model, model_dir):
    """Fit ``model``, checkpointing the best and last epochs under ``model_dir``.

    GPU runs use fp16 DDP-spawn with a DDP plugin; otherwise a plain CPU
    trainer is used.  NOTE(review): ``n_epochs`` is not defined in this
    function — it is read from an enclosing/module scope; confirm it exists.
    """
    # Setup trainer

    # Best checkpoint by minimum mean validation loss, plus a rolling "last".
    cb1 = callbacks.ModelCheckpoint(filename='best-{epoch}',
                                    monitor='val_loss_mean',
                                    save_top_k=1,
                                    mode='min')
    cb2 = callbacks.ModelCheckpoint(filename='last-{epoch}', save_last=True)

    tb_logger = pl_loggers.TensorBoardLogger('{}/logs/'.format(model_dir))
    if Constants.n_gpus != 0:
        #trainer = Trainer(gpus=Constants.n_gpus, distributed_backend='ddp', logger = tb_logger, precision=16, default_root_dir=model_dir, max_epochs=n_epochs)
        trainer = Trainer(gpus=Constants.n_gpus,
                          callbacks=[cb1, cb2],
                          plugins=DDPPlugin(find_unused_parameters=False),
                          accelerator='ddp_spawn',
                          precision=16,
                          logger=tb_logger,
                          default_root_dir=model_dir,
                          max_epochs=n_epochs)
    else:
        trainer = Trainer(gpus=0,
                          default_root_dir=model_dir,
                          callbacks=[cb1, cb2],
                          logger=tb_logger,
                          distributed_backend='ddp_spawn',
                          max_epochs=n_epochs)

    trainer.fit(model)
    trainer.test()
# Example 9
def main(args):
    """Train a TFNet TF-binding model on the regions in ``args.region``.

    Args:
        args: parsed CLI namespace with ``region`` (BED-like region file),
            ``LR`` (learning rate) and ``output`` (output directory).
    """
    # read args
    # args = parse_args()
    region_path = args.region
    # NOTE(review): ``lr`` is currently unused — the TF_train() call that
    # consumed it is commented out below.
    lr = args.LR
    output_folder = args.output
    # Hard-coded GRCh37 reference FASTA; the CLI override is commented out.
    ref_path = "/mnt/research/compbio/wanglab/jiaxin/proj1_3d_eqtl/data/sequence/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
    #ref_path = full_ref_path if args.ref is None else args.ref
    batch = 100
    epoch = 200

    # Fix: create the output directory portably instead of shelling out to
    # ``mkdir -p`` via os.system() (no subprocess, works on any OS).
    os.makedirs(output_folder, exist_ok=True)

    # create data sets
    seq_data = TFData(region_path, ref_path)
    nTF = seq_data.nTF
    frag_len = seq_data.frag_len

    # split the data to training & validation sets (10% held out)
    train_loader, val_loader = dataset_split_loader(seq_data, batch, 0.1)

    # train the model
    # TF_train(epoch, lr, nTF, frag_len, train_loader, val_loader, output_folder)
    model = TFNet(n_TF=nTF, frag_length=frag_len)
    trainer = pl.Trainer(max_epochs=epoch,
                         profiler="simple",
                         plugins=DDPPlugin(find_unused_parameters=False),
                         accelerator="ddp",
                         replace_sampler_ddp=False,
                         gpus=-1,
                         num_nodes=1)

    trainer.fit(model, train_loader, val_loader)
# Example 10
def main(dataset, gpus):
    """Train TadGAN on the NAB traffic dataset using DDP across ``gpus``."""
    batch_size = 64

    datamodule = NABTraf(batch_size=batch_size, data_path=dataset)
    net = TadGAN(in_size=1,
                 weight_decay=1e-6,
                 iterations_critic=5,
                 lr=0.0005,
                 use_gru=True)
    # An example input enables graph logging in TensorBoard.
    net.example_input_array = torch.ones(batch_size, 100, 1, dtype=torch.float)
    tb_logger = TensorBoardLogger('logs', name='tadgan', log_graph=True)

    #     early_stop_callback = EarlyStopping(
    #        monitor='F1',
    #        min_delta=0.00,
    #        patience=3,
    #        verbose=True,
    #        mode='max'
    #     )

    trainer = pl.Trainer(
        plugins=[DDPPlugin(find_unused_parameters=True)],
        fast_dev_run=False,
        weights_summary='full',
        log_gpu_memory=True,
        gpus=gpus,
        accelerator='ddp',
        logger=tb_logger,
        check_val_every_n_epoch=5,
        max_epochs=100,
        callbacks=[
            GPUStatsMonitor(),
            # early_stop_callback
        ])
    trainer.fit(net, datamodule=datamodule)
# Example 11
def main(cfg: DictConfig) -> None:
    """Pretrain a BERT language model from the given config, then optionally
    export it as a .nemo archive."""
    logging.info(f'Config:\n {cfg.pretty()}')
    ddp = DDPPlugin(find_unused_parameters=True)
    trainer = pl.Trainer(plugins=[ddp], **cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    bert_model = BERTLMModel(cfg.model, trainer=trainer)
    trainer.fit(bert_model)
    # Save the trained model only when an export path is configured.
    if cfg.model.nemo_path:
        bert_model.save_to(cfg.model.nemo_path)
# Example 12
def cli_main():
    """CLI entry point: parse args, seed, build data/model, train with DDP."""
    # ------------
    # args
    # ------------
    parser = cfg.get_parser(add_help=True)
    parser = pl.Trainer.add_argparse_args(parser)
    cfgs = cfg.parse_args(parser)

    # ------------
    # seed
    # ------------
    pl.utilities.seed.seed_everything(cfgs.seed)

    # ------------
    # data
    # ------------
    dm = DataModule(cfgs)

    # ------------
    # model
    # ------------
    model = LitProject(cfgs)
    if cfgs.load_from_checkpoint is not None:
        # Load checkpoint
        # strict=False tolerates partial checkpoints; report what differed.
        ckpt = torch.load(cfgs.load_from_checkpoint)
        missing_keys, unexpected_keys = model.load_state_dict(
            ckpt["state_dict"], strict=False)
        print(
            f"[ckpt] Missing keys: {missing_keys}, Unexpected keys: {unexpected_keys}."
        )
        print(f"[ckpt] Load checkpoint from {cfgs.load_from_checkpoint}.")

    # ------------
    # trainer
    # ------------
    logger = pl.loggers.TestTubeLogger("logs",
                                       name=cfgs.name,
                                       create_git_tag=True)
    trainer = pl.Trainer.from_argparse_args(
        cfgs,
        logger=logger,
        callbacks=callbacks(cfgs),
        # === Training Setting ===
        gpus=-1,
        accelerator="ddp",
        sync_batchnorm=True,
        plugins=DDPPlugin(find_unused_parameters=False, sync_batchnorm=True),
        # === Debug Setting ===
        # profiler="simple",
        # overfit_batches=10,
        # track_grad_norm=2,
    )

    # ------------
    # training
    # ------------
    trainer.fit(model, dm)
# Example 13
def main(cfg):
    """Train the end-to-end FastPitch + HiFi-GAN TTS model from ``cfg``."""
    trainer = pl.Trainer(plugins=[DDPPlugin(find_unused_parameters=True)],
                         **cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    model = FastPitchHifiGanE2EModel(cfg=cfg.model, trainer=trainer)
    # Track the learning rate and per-epoch wall time during training.
    extra_callbacks = [pl.callbacks.LearningRateMonitor(), LogEpochTimeCallback()]
    trainer.callbacks.extend(extra_callbacks)
    trainer.fit(model)
# Example 14
def main(args):
    """Continual-learning experiment: train GCL task-by-task, then test.

    Builds per-task train/valid loaders, fits on all GPUs with DDP
    (one "epoch" per task, dataloaders reloaded between epochs), and
    finally evaluates on the validation sets of all tasks seen so far.
    """
    # Setup experiment
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported.")

    # Deterministic cuDNN for reproducibility (disables autotuning).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    args.experiment = args.experiment or f"{args.model.replace('_', '-')}"
    args.experiment = "-".join([args.experiment])
    args.experiment_dir = os.path.join(
        args.output_dir, args.dataset,
        args.experiment + ("-cumulative" if args.all else ""))
    os.makedirs(args.experiment_dir, exist_ok=True)

    tb_writer = TensorBoardLogger(save_dir=args.experiment_dir)
    seed_everything(42, workers=True)

    train_loaders, valid_loaders = data.build_dataset(
        args.dataset,
        args.data_path,
        blurry=args.blurry,
        batch_size=args.batch_size)

    model = GCL(args, train_loaders)
    # max_epochs = number of tasks; loaders are reloaded every epoch.
    trainer = Trainer(gpus=-1,
                      distributed_backend='ddp',
                      max_epochs=len(valid_loaders),
                      reload_dataloaders_every_epoch=True,
                      plugins=DDPPlugin(find_unused_parameters=False),
                      logger=tb_writer)

    # for task_id in range(num_tasks):
    # print(f"task:    {task_id}")
    # train_loader = DataLoader(train_loaders[task_id], batch_size=10)

    metrics = MetricCollection(
        [Accuracy(), F1(args.num_classes, average='macro')])
    model.train_metrics = metrics.clone(prefix=f'train{model.curr_index}_')
    trainer.fit(model)
    # Debugging aid: force a dataloader reset and print the labels of the
    # reloaded train loader to verify reloading actually happened.
    trainer.train_loop.reset_train_val_dataloaders(model)
    if model.curr_index:
        temp = trainer.train_dataloader
        labels = []
        for batch in temp:
            labels.append(batch[1])
        print("Check if train loader if reloaded", labels)

    #! TEST
    # Evaluate on the validation sets of all tasks seen so far, each as a
    # single full-dataset batch.
    test_loaders = [
        DataLoader(ds, batch_size=len(ds))
        for ds in valid_loaders[:model.curr_index + 1]
    ]
    model.test_metrics = nn.ModuleList(
        [metrics.clone(prefix=f'valid{i}_') for i in range(len(test_loaders))])

    trainer.test(model, test_dataloaders=test_loaders)
# Example 15
def main(args):
    """Fit a LitModule on video data, using argparse-derived trainer args."""
    # lit_module = LitModule.load_from_checkpoint(LAST_CKP, **vars(args))
    hparams = vars(args)
    lit_module = LitModule(**hparams)

    trainer = pl.Trainer.from_argparse_args(
        args, plugins=DDPPlugin(find_unused_parameters=False))
    # trainer = pl.Trainer(resume_from_checkpoint=LAST_CKP)
    video_dm = VideoDataModule(**hparams)
    trainer.fit(lit_module, video_dm)
# Example 16
def train_vposer_once(_config):
    """Train a VPoser model once, resuming from the newest checkpoint if any.

    Sets up TensorBoard logging, LR monitoring, best-val-loss checkpointing
    and early stopping, then fits on a single GPU with DDP.

    Args:
        _config: experiment configuration passed to ``VPoserTrainer``.
    """
    resume_training_if_possible = True

    model = VPoserTrainer(_config)
    model.vp_ps.logging.expr_msg = create_expr_message(model.vp_ps)
    # model.text_logger(model.vp_ps.logging.expr_msg.replace(". ", '.\n'))
    # Dump the experiment config next to the run for reproducibility.
    dump_config(model.vp_ps,
                osp.join(model.work_dir, '{}.yaml'.format(model.expr_id)))

    logger = TensorBoardLogger(model.work_dir, name='tensorboard')
    lr_monitor = LearningRateMonitor()

    snapshots_dir = osp.join(model.work_dir, 'snapshots')
    # Keep only the single best checkpoint by validation loss.
    checkpoint_callback = ModelCheckpoint(
        dirpath=makepath(snapshots_dir, isfile=True),
        filename="%s_{epoch:02d}_{val_loss:.2f}" % model.expr_id,
        save_top_k=1,
        verbose=True,
        monitor='val_loss',
        mode='min',
    )
    early_stop_callback = EarlyStopping(
        **model.vp_ps.train_parms.early_stopping)

    resume_from_checkpoint = None
    if resume_training_if_possible:
        available_ckpts = glob.glob(osp.join(snapshots_dir, '*.ckpt'))
        if available_ckpts:
            # Resume from the most recently modified checkpoint.
            # (max() with a key replaces the original sort-then-take-last.)
            resume_from_checkpoint = max(available_ckpts, key=os.path.getmtime)
            model.text_logger(
                'Resuming the training from {}'.format(resume_from_checkpoint))

    trainer = pl.Trainer(
        gpus=1,
        weights_summary='top',
        distributed_backend='ddp',
        # replace_sampler_ddp=False,
        # accumulate_grad_batches=4,
        # profiler=False,
        # overfit_batches=0.05,
        # fast_dev_run = True,
        # limit_train_batches=0.02,
        # limit_val_batches=0.02,
        # num_sanity_val_steps=2,
        plugins=[DDPPlugin(find_unused_parameters=False)],
        callbacks=[lr_monitor, early_stop_callback, checkpoint_callback],
        max_epochs=model.vp_ps.train_parms.num_epochs,
        logger=logger,
        resume_from_checkpoint=resume_from_checkpoint)

    trainer.fit(model)
# Example 17
def main(args):
    """Train VanillaStarGAN on CelebA with trainer args parsed from argparse."""
    # logger = WandbLogger(project=args.project_name, save_dir=None, log_model=True)
    # model_checkpointer = ModelCheckpoint(dirpath=logger.save_dir, monitor=args.monitor, save_weights_only=True)
    ddp_plugin = DDPPlugin(find_unused_parameters=True)
    trainer = Trainer.from_argparse_args(
        args,
        # logger=logger,
        # callbacks=[model_checkpointer],
        plugins=ddp_plugin,
    )
    datamodule = CelebaDataModule.from_argparse_args(args)
    # Image shape and attribute names come from the data module.
    gan = VanillaStarGAN.from_argparse_args(args,
                                            image_shape=datamodule.image_shape,
                                            label_names=datamodule.attributes)
    trainer.fit(gan, datamodule=datamodule)
# Example 18
def simple_vae_example():
    """Train a pl_bolts VAE on CIFAR-10 with DDP, then load a pretrained one."""
    from pl_bolts.models.autoencoders import VAE
    from pytorch_lightning.plugins import DDPPlugin

    # Data.
    train_dataset = torchvision.datasets.CIFAR10(
        "",
        train=True,
        download=True,
        transform=torchvision.transforms.ToTensor())
    val_dataset = torchvision.datasets.CIFAR10(
        "",
        train=False,
        download=True,
        transform=torchvision.transforms.ToTensor())

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=32,
                                               num_workers=12)
    #train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, num_workers=12, persistent_workers=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=32,
                                             num_workers=12)
    #val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, num_workers=12, persistent_workers=True)

    # Model.
    model = VAE(input_height=32)
    """
	# Override any part of this AE to build your own variation.
	class MyVAEFlavor(VAE):
		def get_posterior(self, mu, std):
			# Do something other than the default.
			P = self.get_distribution(self.prior, loc=torch.zeros_like(mu), scale=torch.ones_like(std))
			return P

	model = MyVAEFlavor(...)
	"""

    # Fit on two GPUs with DDP.
    trainer = pl.Trainer(gpus=2,
                         accelerator="ddp",
                         plugins=DDPPlugin(find_unused_parameters=False))
    trainer.fit(model, train_loader, val_loader)

    #--------------------
    # CIFAR-10 pretrained model:
    vae = VAE(input_height=32)
    print(VAE.pretrained_weights_available())
    vae = vae.from_pretrained("cifar10-resnet18")

    # Freeze weights for downstream/inference use.
    vae.freeze()
# Example 19
def basic_ae_example():
    """Train a pl_bolts AE on CIFAR-10 with DDP, then load a pretrained one."""
    from pl_bolts.models.autoencoders import AE
    from pytorch_lightning.plugins import DDPPlugin

    # Data.
    train_dataset = torchvision.datasets.CIFAR10(
        "",
        train=True,
        download=True,
        transform=torchvision.transforms.ToTensor())
    val_dataset = torchvision.datasets.CIFAR10(
        "",
        train=False,
        download=True,
        transform=torchvision.transforms.ToTensor())

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=32,
                                               num_workers=12)
    #train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, num_workers=12, persistent_workers=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=32,
                                             num_workers=12)
    #val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, num_workers=12, persistent_workers=True)

    # Model.
    model = AE(input_height=32)
    """
	# Override any part of this AE to build your own variation.
	class MyAEFlavor(AE):
		def init_encoder(self, hidden_dim, latent_dim, input_width, input_height):
			encoder = YourSuperFancyEncoder(...)
			return encoder

	model = MyAEFlavor(...)
	"""

    # Fit on two GPUs with DDP.
    trainer = pl.Trainer(gpus=2,
                         accelerator="ddp",
                         plugins=DDPPlugin(find_unused_parameters=False))
    trainer.fit(model, train_loader, val_loader)

    #--------------------
    # CIFAR-10 pretrained model:
    ae = AE(input_height=32)
    print(AE.pretrained_weights_available())
    ae = ae.from_pretrained("cifar10-resnet18")

    # Freeze weights for downstream/inference use.
    ae.freeze()
def main(cfg: DictConfig):
    """Instantiate task + datamodule from a Hydra config and train with DDP."""
    seed_everything(42)
    wandb_logger = WandbLogger(**cfg.logger)
    wandb_logger.log_hyperparams(OmegaConf.to_container(cfg, resolve=True))
    # Checkpoints go next to the W&B run artifacts.
    ckpt_callback = ModelCheckpoint(**cfg.checkpoint, dirpath=wandb_logger.save_dir)
    trainer = Trainer(
        **cfg.trainer,
        logger=wandb_logger,
        callbacks=ckpt_callback,
        plugins=DDPPlugin(find_unused_parameters=True),
    )
    task = instantiate(cfg.task)
    dm = instantiate(cfg.data)
    trainer.fit(model=task, datamodule=dm)
# Example 21
def main(args):
    """Train or test a PretrainedGAN scene-graph-to-image model.

    With ``args.test`` set: load a checkpoint, run inference on the scene
    graphs JSON, and return.  Otherwise: build the data module and model
    (optionally from a checkpoint) and train, using DDP when gpus > 1.
    """
    backbone = "bert-base-uncased-itokens"
    tokenizer = BertTokenizerFast.from_pretrained(backbone)

    if args.test:
        model = PretrainedGAN.load_from_checkpoint(args.load_checkpoint,
                                                   args=args,
                                                   tokenizer=tokenizer,
                                                   backbone=backbone)
        model.cuda()
        model.eval()

        model.inference(args.scene_graphs_json)

        return

    # train
    # Multi-GPU runs pass an extra argument to the data module —
    # presumably the number of GPUs/workers; confirm against VGDataModule.
    if args.gpus > 1:
        dm = VGDataModule(args, tokenizer, 2)
    else:
        dm = VGDataModule(args, tokenizer)

    if args.load_checkpoint != "":
        model = PretrainedGAN.load_from_checkpoint(args.load_checkpoint,
                                                   args=args,
                                                   tokenizer=tokenizer,
                                                   backbone=backbone)
    else:
        model = PretrainedGAN(args, tokenizer, backbone)

    training_args = {
        "gpus": args.gpus,
        "fast_dev_run": False,
        "max_steps": args.num_iterations,
        "precision": 32,
        "gradient_clip_val": 1,
    }

    # Multi-GPU: switch to DDP with unused-parameter detection enabled.
    if args.gpus > 1:
        additional_args = {
            "accelerator": "ddp",
            "plugins": [DDPPlugin(find_unused_parameters=True)]
            # "plugins": [my_ddp]
        }

        training_args.update(additional_args)

    trainer = pl.Trainer(**training_args)
    trainer.fit(model, dm)
# Example 22
def moco_v2_example():
    """Train a pl_bolts Moco v2 self-supervised model on CIFAR-10 with DDP."""
    from pl_bolts.models.self_supervised import Moco_v2
    from pl_bolts.datamodules import CIFAR10DataModule
    from pl_bolts.models.self_supervised.moco import Moco2TrainCIFAR10Transforms, Moco2EvalCIFAR10Transforms
    from pytorch_lightning.plugins import DDPPlugin

    # Data module with MoCo v2 augmentations.
    datamodule = CIFAR10DataModule(num_workers=12, batch_size=32)
    datamodule.train_transforms = Moco2TrainCIFAR10Transforms()
    datamodule.val_transforms = Moco2EvalCIFAR10Transforms()

    moco_model = Moco_v2()

    # Fit on two GPUs with DDP.
    trainer = pl.Trainer(gpus=2, accelerator="ddp", plugins=DDPPlugin(find_unused_parameters=False))
    trainer.fit(moco_model, datamodule=datamodule)
def test_ddp_fp16_compress_comm_hook(tmpdir):
    """Test for DDP FP16 compress hook."""
    model = BoringModel()
    plugin = DDPPlugin(ddp_comm_hook=default.fp16_compress_hook, sync_batchnorm=True)
    trainer = Trainer(
        max_epochs=1,
        gpus=2,
        strategy=plugin,
        default_root_dir=tmpdir,
        sync_batchnorm=True,
        fast_dev_run=True,
    )
    trainer.fit(model)
    # The DDP-wrapped model must report the FP16 compress hook as registered.
    ddp_model = trainer.accelerator.training_type_plugin._model
    registered_hook = ddp_model.get_ddp_logging_data().comm_hook
    assert registered_hook == default.fp16_compress_hook.__qualname__
    assert trainer.state.finished, f"Training failed with {trainer.state}"
# Example 24
def init_trainer(cfg):
    """Build a ``pl.Trainer`` from ``cfg.pl_trainer``, wiring in any
    configured loggers and callbacks, plus a DDP plugin for sub-fp32 runs."""
    trainer_kwargs = dict(cfg.pl_trainer)
    if 'logging' in cfg:
        trainer_kwargs['logger'] = [
            instantiate(cfg_log) for _, cfg_log in cfg.logging.items()
        ]
    if cfg.callbacks:
        trainer_kwargs['callbacks'] = [
            instantiate(cfg_callback) for _, cfg_callback in cfg.callbacks.items()
        ]
    # For DDP below fp32 the config pins find_unused_parameters=False.
    if trainer_kwargs['accelerator'] == 'ddp' and trainer_kwargs['precision'] < 32:
        trainer_kwargs['plugins'] = DDPPlugin(find_unused_parameters=False)

    return pl.Trainer(**trainer_kwargs)
# Example 25
def test_incorrect_ddp_script_spawning(tmpdir):
    """Test an error message when user accidentally instructs Lightning to spawn children processes on rank > 0."""

    class WronglyImplementedEnvironment(LightningEnvironment):
        def creates_children(self):
            # returning false no matter what means Lightning would spawn also on ranks > 0 new processes
            return False

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        accelerator="ddp",
        num_processes=2,
        plugins=[DDPPlugin(), WronglyImplementedEnvironment()],
    )
    # Lightning must detect the misconfigured environment and refuse to
    # launch rather than spawning duplicate processes.
    with pytest.raises(
        RuntimeError, match="Lightning attempted to launch new distributed processes with `local_rank > 0`."
    ):
        trainer.fit(model)
# Example 26
def mix_and_match_any_part_or_subclass_example():
    """Train CPC v2 with a custom feature-map contrastive task on CIFAR-10."""
    from pl_bolts.models.self_supervised import CPC_v2
    from pl_bolts.losses.self_supervised_learning import FeatureMapContrastiveTask
    from pl_bolts.datamodules import CIFAR10DataModule
    from pl_bolts.models.self_supervised.cpc import CPCTrainTransformsCIFAR10, CPCEvalTransformsCIFAR10
    from pytorch_lightning.plugins import DDPPlugin

    # Data module with CPC-specific transforms.
    datamodule = CIFAR10DataModule(num_workers=12, batch_size=32)
    datamodule.train_transforms = CPCTrainTransformsCIFAR10()
    datamodule.val_transforms = CPCEvalTransformsCIFAR10()

    # Swap in an AMDIM-style contrastive task over feature-map pairs.
    amdim_task = FeatureMapContrastiveTask(comparisons="01, 11, 02", bidirectional=True)
    cpc_model = CPC_v2(encoder="cpc_encoder", contrastive_task=amdim_task)

    # Fit on two GPUs with DDP.
    trainer = pl.Trainer(gpus=2, accelerator="ddp", plugins=DDPPlugin(find_unused_parameters=False))
    trainer.fit(cpc_model, datamodule=datamodule)
def train(args):
    """Train a torchvision model with Lightning using DDP or Horovod, then
    report elapsed time and images/second."""
    print('Using PyTorch version:', torch.__version__)
    print(torch.__config__.show())

    model = TorchvisionModel(args.model)

    train_dataset = dataset_from_datadir(args.datadir)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batchsize,
                              num_workers=args.workers,
                              pin_memory=True)

    precision = 16 if args.fp16 else 32

    if args.strategy == 'ddp':
        # strategy = 'ddp'
        # strategy = DDPStrategy(find_unused_parameters=False))
        trainer = pl.Trainer(gpus=args.gpus,
                             num_nodes=args.nodes,
                             max_epochs=args.epochs,
                             accelerator='gpu',
                             strategy=DDPPlugin(find_unused_parameters=False),
                             precision=precision)
    elif args.strategy == 'horovod':
        # Horovod: one GPU per process, processes launched externally.
        trainer = pl.Trainer(max_epochs=args.epochs,
                             gpus=1,
                             strategy="horovod")  #,
        #precision=precision)
    else:
        print("ERROR: Unsupported strategy '{}'".format(args.strategy))
        return

    start = datetime.now()
    print("Starting training at", start)
    trainer.fit(model, train_loader)

    elapsed = datetime.now() - start
    n_images = len(train_loader) * args.batchsize
    print("Training completed in: " + str(elapsed))
    print("Images/sec: {:.4f}".format(n_images / elapsed.total_seconds()))
# Example 28
def main():
    """Build the configured segmentation system and train/test it on 2 GPUs.

    The TensorBoard run name encodes system, model class, criterion and
    scheduler.  Early-stops and checkpoints on validation IoU (maximized).
    """
    system = configure_system(
        hyperparameter_defaults["system"])(hyperparameter_defaults)
    logger = TensorBoardLogger(
        'experiments_logs',
        name=str(hyperparameter_defaults['system']) + "_" +
        str(system.model.__class__.__name__) + "_" +
        str(hyperparameter_defaults['criterion']) + "_" +
        str(hyperparameter_defaults['scheduler']))

    early_stop = EarlyStopping(monitor="valid_iou",
                               mode="max",
                               verbose=True,
                               patience=hyperparameter_defaults["patience"])
    # Keep the 3 best checkpoints by validation IoU, plus the last epoch.
    model_checkpoint = ModelCheckpoint(
        monitor="valid_iou",
        mode="max",
        verbose=True,
        filename='Model-{epoch:02d}-{valid_iou:.5f}',
        save_top_k=3,
        save_last=True)
    trainer = pl.Trainer(
        gpus=[0, 1],
        plugins=DDPPlugin(find_unused_parameters=True),
        max_epochs=hyperparameter_defaults['epochs'],
        logger=logger,
        check_val_every_n_epoch=1,
        accelerator='ddp',
        callbacks=[early_stop, model_checkpoint],
        num_sanity_val_steps=0,
        limit_train_batches=1.0,
        deterministic=True,
    )

    trainer.fit(system)
    trainer.test(system)
# Example 29
def run(cfg: DictConfig) -> None:
    """Train (or only evaluate) a quantizer-injected torchvision model.

    Creates a fresh versioned output directory under
    ``${cfg.output_dir}/${cfg.experiment_name}``, configures per-rank file
    logging, injects quantizers into the model, prepares the datamodule, and
    either evaluates a resumed checkpoint (``cfg.eval``) or runs a full
    train + test cycle with checkpointing and TensorBoard logging.
    """
    # LOCAL_RANK is set by the DDP launcher; defaults to 0 for single-process runs.
    local_rank = int(os.environ.get('LOCAL_RANK', 0))

    # The logs & checkpoints are dumped in: ${cfg.output_dir}/${cfg.experiment_name}/vN, where vN
    # is v0, v1, .... The version number increases automatically.
    script_dir = Path.cwd()
    experiment_dir = script_dir / cfg.output_dir / cfg.experiment_name
    experiment_dir.mkdir(parents=True, exist_ok=True)
    # Collect existing version numbers (directories named v0, v1, ...).
    existing_ver = list()
    for d in experiment_dir.iterdir():
        if d.name.startswith('v') and d.name[1:].isdecimal() and d.is_dir():
            existing_ver.append(int(d.name[1:]))
    if local_rank == 0:
        # Rank 0 creates the next version directory.
        current_ver = max(existing_ver) + 1 if existing_ver else 0
        output_dir = experiment_dir / f'v{current_ver}'
        output_dir.mkdir()
    else:
        # Use the same directory for output with the main process.
        # NOTE(review): this assumes rank 0 has already created the directory
        # when non-zero ranks scan it; if the directory listing is empty here,
        # max() raises ValueError — verify launcher ordering guarantees this.
        current_ver = max(existing_ver)
        output_dir = experiment_dir / f'v{current_ver}'

    pl_logger = logging.getLogger('lightning')
    # Each rank writes its own log file; the config template consumes
    # 'log_filename'. NOTE(review): a Path object is passed where fileConfig
    # defaults are typically strings — confirm logging.conf interpolation
    # handles this.
    logging.config.fileConfig(
        script_dir / 'logging.conf',
        disable_existing_loggers=False,
        defaults={'log_filename': output_dir / f'run_rank{local_rank}.log'})
    # Only the process with LOCAL_RANK = 0 will print logs on the console.
    # And all the processes will print logs in their own log files.
    if local_rank != 0:
        root_logger = logging.getLogger()
        root_logger.removeHandler(root_logger.handlers[0])

    pl_logger.info(f'Output logs & checkpoints in: {output_dir}')
    # Dump experiment configurations for reproducibility
    if local_rank == 0:
        with open(output_dir / 'cfg.yaml', 'w') as yaml_file:
            yaml_file.write(OmegaConf.to_yaml(cfg))
    pl_logger.info('The final experiment setup is dumped as: ./cfg.yaml')

    # Seed all RNGs (including dataloader workers) for reproducibility.
    pl.seed_everything(cfg.seed, workers=True)

    # Create model — the class is looked up by name in torchvision.models.
    net = load_obj(cfg.model.class_name,
                   'torchvision.models')(**cfg.model.params)
    pl_logger.info(
        f'Create model "{type(net)}". You can view its graph using TensorBoard.'
    )

    # Inject quantizers into the model and report per-type counts.
    net = nz.quantizer_inject(net, cfg.quan)
    quan_cnt, quan_dict = nz.quantizer_stat(net)
    msg = f'Inject {quan_cnt} quantizers into the model:'
    for k, v in quan_dict.items():
        msg += f'\n                {k} = {len(v)}'
    yaml.safe_dump(quan_dict, open(output_dir / 'quan_stat.yaml', 'w'))
    pl_logger.info(msg)
    pl_logger.info(
        'A complete list of injected quantizers is dumped as: ./quan_stat.yaml'
    )

    # Prepare the dataset
    dm = apputil.get_datamodule(cfg)
    pl_logger.info(
        f'Prepare the "{cfg.dataset.name}" dataset from: {cfg.dataset.data_dir}'
    )
    msg = f'The dataset samples are split into three sets:' \
          f'\n         Train = {len(dm.train_dataloader())} batches (batch size = {dm.train_dataloader().batch_size})' \
          f'\n           Val = {len(dm.val_dataloader())} batches (batch size = {dm.val_dataloader().batch_size})' \
          f'\n          Test = {len(dm.test_dataloader())} batches (batch size = {dm.test_dataloader().batch_size})'
    pl_logger.info(msg)

    progressbar_cb = apputil.ProgressBar(pl_logger)
    # gpu_stats_cb = pl.callbacks.GPUStatsMonitor()

    # Validate the resume checkpoint path up front, before any heavy work.
    if cfg.checkpoint.path:
        assert Path(cfg.checkpoint.path).is_file(
        ), f'Checkpoint path is not a file: {cfg.checkpoint.path}'
        pl_logger.info(
            f'Resume training checkpoint from: {cfg.checkpoint.path}')

    if cfg.eval:
        # Evaluation-only path: a checkpoint is mandatory.
        pl_logger.info('Training process skipped. Evaluate the resumed model.')
        assert cfg.checkpoint.path is not None, 'Try to evaluate the model resumed from the checkpoint, but got None'

        # Initialize the Trainer
        trainer = pl.Trainer(callbacks=[progressbar_cb], **cfg.trainer)
        pl_logger.info(
            f'The model is distributed to {trainer.num_gpus} GPUs with {cfg.trainer.accelerator} backend.'
        )

        pretrained_lit = LitModuleWrapper.load_from_checkpoint(
            checkpoint_path=cfg.checkpoint.path, model=net, cfg=cfg)
        trainer.test(pretrained_lit, datamodule=dm, verbose=False)
    else:  # train + eval
        tb_logger = TensorBoardLogger(output_dir / 'tb_runs',
                                      name=cfg.experiment_name,
                                      log_graph=True)
        pl_logger.info('Tensorboard logger initialized in: ./tb_runs')

        lr_monitor_cb = pl.callbacks.LearningRateMonitor()
        # Keep the 3 best checkpoints by epoch-level validation loss, plus the last.
        checkpoint_cb = pl.callbacks.ModelCheckpoint(
            dirpath=output_dir / 'checkpoints',
            filename='{epoch}-{val_loss_epoch:.4f}-{val_acc_epoch:.4f}',
            monitor='val_loss_epoch',
            mode='min',
            save_top_k=3,
            save_last=True)
        pl_logger.info(
            'Checkpoints of the best 3 models as well as the last one will be saved to: ./checkpoints'
        )

        # Wrap model with LightningModule
        lit = LitModuleWrapper(net, cfg)
        # A fake input array for TensorBoard to generate graph
        lit.example_input_array = t.rand(dm.size()).unsqueeze(dim=0)

        # Initialize the Trainer
        trainer = pl.Trainer(
            logger=[tb_logger],
            callbacks=[checkpoint_cb, lr_monitor_cb, progressbar_cb],
            resume_from_checkpoint=cfg.checkpoint.path,
            plugins=DDPPlugin(find_unused_parameters=False),
            **cfg.trainer)
        pl_logger.info(
            f'The model is distributed to {trainer.num_gpus} GPUs with {cfg.trainer.accelerator} backend.'
        )

        pl_logger.info('Training process begins.')
        trainer.fit(model=lit, datamodule=dm)

        pl_logger.info('Evaluate the best trained model.')
        # ckpt_path='best' reloads the top checkpoint tracked by checkpoint_cb.
        trainer.test(datamodule=dm, ckpt_path='best', verbose=False)

    pl_logger.info('Program completed successfully. Exiting...')
    pl_logger.info(
        'If you have any questions or suggestions, please visit: github.com/zhutmost/neuralzip'
    )
Example #30
0
    checkpoint_callback = ModelCheckpoint(
        dirpath=args.save_dir,
        filename='{epoch}-{val_loss:.3f}-{train_loss:.3f}',
        save_top_k=-1)
    logger = CometLogger(
        api_key="YOUR-API-KEY",
        project_name=proj_name,
    )

    model = lit_gazetrack_model(args.dataset_dir, args.save_dir,
                                args.batch_size, logger)
    if (args.checkpoint):
        if (args.gpus == 0):
            w = torch.load(args.checkpoint,
                           map_location=torch.device('cpu'))['state_dict']
        else:
            w = torch.load(args.checkpoint)['state_dict']
        model.load_state_dict(w)
        print("Loaded checkpoint")

    trainer = pl.Trainer(gpus=args.gpus,
                         logger=logger,
                         accelerator="ddp",
                         max_epochs=args.epochs,
                         default_root_dir=args.save_dir,
                         progress_bar_refresh_rate=1,
                         callbacks=[checkpoint_callback],
                         plugins=DDPPlugin(find_unused_parameters=False))
    trainer.fit(model)
    print("DONE")