def test_check_val_every_n_epoch_with_max_steps(tmpdir):
    data_samples_train = 2
    check_val_every_n_epoch = 3
    max_epochs = 4

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.validation_called_at_step = set()

        def validation_step(self, *args):
            self.validation_called_at_step.add(self.global_step)
            return super().validation_step(*args)

        def train_dataloader(self):
            return DataLoader(RandomDataset(32, data_samples_train))

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_steps=data_samples_train * max_epochs,
        check_val_every_n_epoch=check_val_every_n_epoch,
        num_sanity_val_steps=0,
    )
    trainer.fit(model)

    assert trainer.current_epoch == max_epochs
    assert trainer.global_step == max_epochs * data_samples_train
    assert list(model.validation_called_at_step) == [data_samples_train * check_val_every_n_epoch]
def test_check_val_every_n_epoch(tmpdir, max_epochs, expected_val_loop_calls, expected_val_batches):
    class TestModel(BoringModel):
        val_epoch_calls = 0
        val_batches = []

        def on_train_epoch_end(self, *args, **kwargs):
            self.val_batches.append(self.trainer.progress_bar_callback.total_val_batches)

        def on_validation_epoch_start(self) -> None:
            self.val_epoch_calls += 1

    model = TestModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=max_epochs,
        num_sanity_val_steps=0,
        limit_val_batches=2,
        check_val_every_n_epoch=2,
        logger=False,
    )
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    assert model.val_epoch_calls == expected_val_loop_calls
    assert model.val_batches == expected_val_batches
def test():
    saved_model_path = './model.pth'  # symlink to the best checkpoint under lightning_logs/version_N/checkpoints/*
    model = BertClassifier.load_from_checkpoint(saved_model_path)
    model.eval()
    print(model)

    trainer = Trainer(gpus=1)
    result = trainer.test(model)
    print(result)
def main():
    parser = ArgumentParser()
    parser.add_argument("--model_file", type=str, default="", required=True, help="Pass path to model's .nemo file")
    parser.add_argument("--prompt", type=str, default="", required=True, help="Prompt for the model (a text to complete)")
    parser.add_argument("--tokens_to_generate", type=int, default=16, required=False, help="How many tokens to add to prompt")
    parser.add_argument("--tensor_model_parallel_size", type=int, default=1, required=True)
    args = parser.parse_args()

    torch.set_grad_enabled(False)

    # trainer required for restoring model parallel models
    trainer = Trainer(
        plugins=NLPDDPPlugin(),
        devices=args.tensor_model_parallel_size,
        precision=16,
        accelerator='gpu',
    )

    app_state = AppState()
    if args.tensor_model_parallel_size > 1:
        app_state.model_parallel_size = args.tensor_model_parallel_size
        app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

    model = MegatronT5Model.restore_from(restore_path=args.model_file, trainer=trainer)
    model.freeze()

    request = {
        "prompt": args.prompt,
        "tokens_to_generate": args.tokens_to_generate,
    }

    dataset = T5RequestDataset(request, model.tokenizer)
    request_dl = DataLoader(dataset)
    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response)
    print("***************************")
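# A hedged invocation sketch for main() above. The script name and the .nemo
# path are hypothetical placeholders; only the flag names come from the parser:
#
#   python t5_eval.py \
#       --model_file /path/to/model.nemo \
#       --prompt "some text to complete" \
#       --tokens_to_generate 16 \
#       --tensor_model_parallel_size 1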
def convert(local_rank, rank, world_size, args):
    app_state = AppState()
    app_state.data_parallel_rank = 0
    num_nodes = world_size // args.gpus_per_node
    if args.bcp:
        trainer = Trainer(
            devices=args.gpus_per_node, num_nodes=num_nodes, accelerator='gpu', plugins=[TorchElasticEnvironment()]
        )
    else:
        trainer = Trainer(devices=args.gpus_per_node, num_nodes=num_nodes, accelerator='gpu')

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )

    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank()

    # inject model parallel rank
    checkpoint_path = inject_model_parallel_rank(os.path.join(args.checkpoint_folder, args.checkpoint_name))

    logging.info(
        f'rank: {rank}, local_rank: {local_rank}, is loading checkpoint: {checkpoint_path} '
        f'for tp_rank: {app_state.tensor_model_parallel_rank} and pp_rank: {app_state.pipeline_model_parallel_rank}'
    )

    if args.model_type == 'gpt':
        model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'bert':
        model = MegatronBertModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 't5':
        model = MegatronT5Model.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'nmt':
        model = MegatronNMTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    model._save_restore_connector = NLPSaveRestoreConnector()

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)

    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
def setup_method(self, test_method):
    trainer_config = {
        "devices": 1,
        "num_nodes": 1,
        "accelerator": "gpu",
        "logger": False,
        "precision": 16,
    }
    tensor_model_parallel_size = 1
    pipeline_model_parallel_size = 1
    model_file = '/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo'

    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(), **trainer_config)
    assert (
        trainer_config["devices"] * trainer_config['num_nodes']
        == tensor_model_parallel_size * pipeline_model_parallel_size
    ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

    model = MegatronGPTModel.restore_from(restore_path=model_file, trainer=trainer)
    model.freeze()

    # has to turn off activations_checkpoint_method for inference
    try:
        model.model.language_model.encoder.activations_checkpoint_method = None
    except AttributeError:
        pass

    self.model = model
def convert(rank, world_size, args):
    app_state = AppState()
    app_state.data_parallel_rank = 0
    trainer = Trainer(gpus=args.tensor_model_parallel_size)
    # TODO: reach out to PTL for an API-safe local rank override
    trainer.accelerator.training_type_plugin._local_rank = rank

    if args.tensor_model_parallel_size is not None and args.tensor_model_parallel_size > 1:
        # inject model parallel rank
        checkpoint_path = os.path.join(args.checkpoint_folder, f'mp_rank_{rank:02d}', args.checkpoint_name)
    else:
        checkpoint_path = os.path.join(args.checkpoint_folder, args.checkpoint_name)

    if args.model_type == 'gpt':
        model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 'bert':
        model = MegatronBertModel.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    elif args.model_type == 't5':
        model = MegatronT5Model.load_from_checkpoint(checkpoint_path, hparams_file=args.hparams_file, trainer=trainer)
    model._save_restore_connector = NLPSaveRestoreConnector()

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    model.save_to(args.nemo_file_path)

    logging.info(f'NeMo model saved to: {args.nemo_file_path}')
def setup_class(cls):
    if not torch.cuda.is_available():
        return

    GPUS = 1
    plugins = [NLPDDPPlugin()]
    TP_SIZE = GPUS
    PP_SIZE = 1
    MB_SIZE = 4
    GB_SIZE = 8
    SEED = 1234
    trainer = Trainer(
        plugins=plugins, devices=GPUS, accelerator='gpu', num_nodes=1, logger=None, log_gpu_memory=None
    )

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=TP_SIZE,
        pipeline_model_parallel_size=PP_SIZE,
        micro_batch_size=MB_SIZE,
        global_batch_size=GB_SIZE,
        seed=SEED,
        apex_transformer_log_level=30,
    )

    def dummy():
        return

    if trainer.strategy.launcher is not None:
        trainer.strategy.launcher.launch(dummy, trainer=trainer)
    trainer.strategy.setup_environment()

    torch.distributed.barrier()
def test_loops_state_dict():
    trainer = Trainer()
    trainer.train_dataloader = Mock()

    fit_loop = FitLoop()
    with pytest.raises(MisconfigurationException, match="Loop FitLoop should be connected to a"):
        fit_loop.trainer = object()

    fit_loop.trainer = trainer
    fit_loop.connect(Mock())
    state_dict = fit_loop.state_dict()

    new_fit_loop = FitLoop()
    new_fit_loop.trainer = trainer

    new_fit_loop.load_state_dict(state_dict)
    assert fit_loop.state_dict() == new_fit_loop.state_dict()
def main(hparams):
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True

    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=2000)
    checkpoint = ModelCheckpoint(
        filepath='exp/lightning_logs/version_2000/checkpoints/',
        monitor='val_mer',
        verbose=1,
        save_top_k=-1,
    )
    trainer = Trainer(
        logger=logger,
        early_stop_callback=False,
        accumulate_grad_batches=4,
        checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        default_save_path='exp/',
        val_check_interval=0.3,
        log_save_interval=50000,
        row_log_interval=50000,
        gpus=1,
        val_percent_check=1,
        # distributed_backend='dp',
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        use_amp=True,
        precision=16,
        nb_sanity_val_steps=0,
        progress_bar_refresh_rate=1,
        resume_from_checkpoint='exp/lightning_logs/version_2000/checkpoints/epoch=114_v1.ckpt',
    )

    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
def train():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser = BertClassifier.add_model_specific_args(parser)
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    model = BertClassifier(
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        early_stop=args.early_stop,
    )
    print(model)

    early_stopping = EarlyStopping('val_loss')
    trainer = Trainer.from_argparse_args(
        args,
        callbacks=[early_stopping],
        precision=16,
        gpus=1,
        max_epochs=30,
    )
    trainer.fit(model)
def main(cfg) -> None:
    # trainer required for restoring model parallel models
    trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer)
    assert (
        cfg.trainer.devices * cfg.trainer.num_nodes
        == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

    app_state = AppState()
    app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    (
        app_state.tensor_model_parallel_rank,
        app_state.pipeline_model_parallel_rank,
        app_state.model_parallel_size,
        app_state.data_parallel_size,
        app_state.pipeline_model_parallel_split_rank,
    ) = fake_initialize_model_parallel(
        world_size=app_state.model_parallel_size,
        rank=trainer.global_rank,
        tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
        pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
        pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
    )

    if cfg.model_file is not None:
        if not os.path.exists(cfg.model_file):
            raise ValueError(f"Model file {cfg.model_file} does not exist")
        model = MegatronNMTModel.restore_from(
            restore_path=cfg.model_file,
            trainer=trainer,
            save_restore_connector=NLPSaveRestoreConnector(),
        )
    elif cfg.checkpoint_dir is not None:
        checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name))
        model = MegatronNMTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer)
    else:
        raise ValueError("need at least a nemo file or checkpoint dir")

    model.freeze()

    logging.info(f"Translating: {cfg.srctext}")
    src_text = []
    translations = []
    with open(cfg.srctext, 'r') as src_f, open(cfg.tgtout, 'w') as tgt_f:
        for line in src_f:
            src_text.append(line.strip())
            if len(src_text) == cfg.batch_size:
                translations = model.translate(
                    text=src_text,
                    source_lang=cfg.source_lang,
                    target_lang=cfg.target_lang,
                )
                for translation in translations:
                    tgt_f.write(translation + "\n")
                src_text = []
        if len(src_text) > 0:
            translations = model.translate(
                text=src_text,
                source_lang=cfg.source_lang,
                target_lang=cfg.target_lang,
            )
            for translation in translations:
                tgt_f.write(translation + "\n")
def score(self, src: List[str], cand: List[str], ref: List[str]) -> COMETResult:
    # convert the column-wise dict into a list of row-wise sample dicts
    data = {"src": src, "mt": cand, "ref": ref}
    data = [dict(zip(data, t)) for t in zip(*data.values())]
    dataloader = DataLoader(
        dataset=data,
        batch_size=16,
        collate_fn=lambda x: self.model.prepare_sample(x, inference=True),
        num_workers=4,
    )
    cuda = 1 if torch.cuda.is_available() else 0
    trainer = Trainer(gpus=cuda, deterministic=True, logger=False)
    predictions = trainer.predict(self.model, dataloaders=dataloader, return_predictions=True)
    scores = torch.cat(predictions, dim=0).tolist()
    return COMETResult(
        sum(scores) / len(scores), scores, src, cand, ref, self.name, self.modelname
    )
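# A minimal usage sketch for score() above. `CometScorer` is a hypothetical
# wrapper class assumed to define score(); it is not part of the original snippet.
scorer = CometScorer()
result = scorer.score(
    src=["Der Hund bellt."],
    cand=["The dog barks."],
    ref=["The dog is barking."],
)
# COMETResult's first positional argument is the corpus-level mean of the
# per-segment scores, so both a single summary number and the per-segment
# list are available on the result.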
def main(hparams):
    data_path = os.environ['HOME'] + '/data/asr_data/'
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True

    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=1020)
    checkpoint = ModelCheckpoint(
        filepath=data_path + '/checkpoints/',
        monitor='val_mer',
        verbose=1,
        save_top_k=-1,
    )
    trainer = Trainer(
        logger=logger,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        default_save_path=data_path,
        val_check_interval=1.0,
        log_save_interval=100,
        row_log_interval=10,
        gpus=1,
        precision=16,
        distributed_backend='dp',
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        use_amp=True,
        amp_level='O1',
        nb_sanity_val_steps=0,
        log_gpu_memory='all',
    )

    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
def test_loops_state_dict():
    fit_loop = FitLoop()
    with pytest.raises(MisconfigurationException, match="Loop FitLoop should be connected to a"):
        fit_loop.connect(object())  # noqa

    fit_loop.connect(Trainer())
    state_dict = fit_loop.state_dict()
    new_fit_loop = FitLoop()
    new_fit_loop.load_state_dict(state_dict)
    assert fit_loop.state_dict() == new_fit_loop.state_dict()
def main(hparams):
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True

    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=4000)
    checkpoint = ModelCheckpoint(
        filepath='exp/lightning_logs/version_4000/checkpoints/',
        monitor='val_mer',
        verbose=1,
        save_top_k=-1,
    )
    trainer = Trainer(
        logger=logger,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        accumulate_grad_batches=8,
        # fast_dev_run=True,
        # overfit_pct=0.03,
        # profiler=True,
        default_save_path='exp/',
        val_check_interval=1.0,
        log_save_interval=50000,
        row_log_interval=50000,
        gpus=1,
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        use_amp=True,
        amp_level='O1',
        nb_sanity_val_steps=0,
    )

    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
def main(hparams):
    model = LightningModel(hparams)
    if hparams.seed is not None:
        random.seed(hparams.seed)
        t.manual_seed(hparams.seed)
        cudnn.deterministic = True

    exp_root = 'exp'
    log_folder = 'lightning_logs'
    log_root = os.path.join(exp_root, log_folder)
    logger = TestTubeLogger(exp_root, name=log_folder, version=5005)
    checkpoint = ModelCheckpoint(
        filepath='exp/lightning_logs/version_5005/checkpoints/',
        monitor='val_loss',
        verbose=True,
        save_top_k=-1,
        mode='min',
    )
    trainer = Trainer(
        logger=logger,
        nb_sanity_val_steps=5,
        early_stop_callback=False,
        checkpoint_callback=checkpoint,
        accumulate_grad_batches=8,
        progress_bar_refresh_rate=10,
        default_save_path='exp/',
        val_check_interval=1.0,
        log_save_interval=50000,
        row_log_interval=50000,
        # gpus=1,
        nb_gpu_nodes=hparams.nb_gpu_nodes,
        max_nb_epochs=hparams.epochs,
        gradient_clip_val=5.0,
        min_nb_epochs=3000,
        gpus=1,
        # num_nodes=1,
        # distributed_backend='dp',
        use_amp=False,
        precision=32,
        # amp_level='O1',
        resume_from_checkpoint='exp/lightning_logs/version_5005/checkpoints/epoch=108.ckpt',
    )

    # if hparams.evaluate:
    #     trainer.run_evaluation()
    # else:
    trainer.fit(model)
def test_loops_state_dict():
    trainer = Trainer()
    fit_loop = FitLoop()
    fit_loop.trainer = trainer

    state_dict = fit_loop.state_dict()
    new_fit_loop = FitLoop()
    new_fit_loop.trainer = trainer

    new_fit_loop.load_state_dict(state_dict)
    assert fit_loop.state_dict() == new_fit_loop.state_dict()
def tune_model(config, ptl_model, dset, train_inds, n_workers, n_val=None, val_inds=None,
               tune_metrics=None, mode='tune', **trainer_kwargs):
    '''A generic function for hyperparameter tuning and model training with Ray Tune and PyTorch Lightning.'''
    model = ptl_model(config=config)

    # with no explicit validation indices, hold out the first n_val shuffled training indices
    if val_inds is None:
        shuffle(train_inds)

    train_dl = DataLoader(
        torch.utils.data.Subset(dset, train_inds[n_val:] if val_inds is None else train_inds),
        batch_size=config['batch_size'],
        num_workers=n_workers,
        drop_last=True,
        shuffle=True,
    )
    val_dl = DataLoader(
        torch.utils.data.Subset(dset, train_inds[:n_val] if val_inds is None else val_inds),
        num_workers=n_workers,
        batch_size=config['batch_size'],
        drop_last=True,
        shuffle=False,
    )

    callbacks = model.callbacks
    if mode == 'tune':
        callbacks += [TuneReportCallback(tune_metrics, on='validation_end')]

    trainer = PLTrainer(callbacks=callbacks, **trainer_kwargs)
    trainer.fit(model, train_dl, val_dl)

    return trainer
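# A hedged sketch of driving tune_model() with Ray Tune. `MyLitModel`,
# `my_dataset`, and the "val_loss" metric key are hypothetical placeholders;
# the tune_metrics mapping feeds the TuneReportCallback used inside tune_model.
from ray import tune

search_space = {
    "batch_size": tune.choice([32, 64]),
    "lr": tune.loguniform(1e-4, 1e-2),
}

analysis = tune.run(
    # with_parameters partially applies everything except `config`, which
    # Ray Tune supplies per trial as the first positional argument
    tune.with_parameters(
        tune_model,
        ptl_model=MyLitModel,
        dset=my_dataset,
        train_inds=list(range(len(my_dataset))),
        n_workers=4,
        n_val=1000,
        tune_metrics={"loss": "val_loss"},
        mode="tune",
        max_epochs=10,  # forwarded to PLTrainer via **trainer_kwargs
    ),
    config=search_space,
    num_samples=20,
)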
def test_loops_state_dict_structure():
    trainer = Trainer()
    # structure saved by the checkpoint connector
    state_dict = {
        "fit_loop": trainer.fit_loop.state_dict(),
        "validate_loop": trainer.validate_loop.state_dict(),
        "test_loop": trainer.test_loop.state_dict(),
        "predict_loop": trainer.predict_loop.state_dict(),
    }
    expected = {
        "fit_loop": {
            'epoch_loop': {
                'batch_loop': {},
                'val_loop': {},
            }
        },
        "validate_loop": {},
        "test_loop": {},
        "predict_loop": {},
    }
    assert state_dict == expected