def run(
    dataset,
    pl_model: pl.LightningModule,
    name: str,
    path: Union[Path, str],
    test_path: Union[Path, str],
    seed: int,
    args=args,
) -> None:
    seed_everything(seed, workers=True)
    datamodule: pl.LightningDataModule = DataModule(
        dataset=dataset,
        path=path,
        test_path=test_path,
        num_workers=8,
        batch_size=args.batch_size,
        seed=seed,
    )
    model: pl.LightningModule = pl_model()
    callbacks: list[Callback] = build_callbacks()

    csv_logger = CSVLogger(
        save_dir="csv_logs",
        name=f"seed_{seed}",
        version=name,
    )

    if args.fast_dev_run:
        trainer_kwargs = {"gpus": None, "auto_select_gpus": False}
    else:
        trainer_kwargs = {"gpus": -1, "auto_select_gpus": True, "precision": 16}

    trainer: pl.Trainer = pl.Trainer.from_argparse_args(
        args,
        **trainer_kwargs,
        deterministic=True,  # ensure reproducible results
        default_root_dir="ckpts",
        logger=[csv_logger],
        log_every_n_steps=10,
        callbacks=callbacks,
        max_epochs=35,
    )
    trainer.tune(model=model, datamodule=datamodule)
    trainer.fit(model=model, datamodule=datamodule)

    if not args.fast_dev_run:
        test = trainer.test(model=model, ckpt_path="best", datamodule=datamodule)
        pd.DataFrame(test).to_csv(f"csv_logs/seed_{seed}_{name}_test.csv")
        csv_logger.save()
        if args.save_to_hub:
            model.model.push_to_hub(f"cjber/{args.save_to_hub}")  # type: ignore

def test_v1_8_0_deprecated_agg_and_log_metrics_override(tmpdir):
    class AggregationOverrideLogger(CSVLogger):
        @rank_zero_only
        def agg_and_log_metrics(self, metrics, step):
            self.log_metrics(metrics=metrics, step=step)

    logger = AggregationOverrideLogger(tmpdir)
    logger2 = CSVLogger(tmpdir)
    logger3 = CSVLogger(tmpdir)

    # Test single loggers
    with pytest.deprecated_call(
        match="`Logger.agg_and_log_metrics` is deprecated in v1.6 and will be removed"
        " in v1.8. `Trainer` will directly call `Logger.log_metrics` so custom"
        " loggers should not implement `Logger.agg_and_log_metrics`."
    ):
        Trainer(logger=logger)
    # Should have no deprecation warning
    Trainer(logger=logger2)

    # Test multiple loggers
    with pytest.deprecated_call(
        match="`Logger.agg_and_log_metrics` is deprecated in v1.6 and will be removed"
        " in v1.8. `Trainer` will directly call `Logger.log_metrics` so custom"
        " loggers should not implement `Logger.agg_and_log_metrics`."
    ):
        Trainer(logger=[logger, logger3])
    # Should have no deprecation warning
    Trainer(logger=[logger2, logger3])

@pytest.mark.parametrize("name", ["", None])  # decorator restored so pytest supplies `name`; values assumed
def test_file_logger_no_name(tmpdir, name):
    """Verify that None or empty name works."""
    logger = CSVLogger(save_dir=tmpdir, name=name)
    logger.save()
    assert os.path.normpath(logger.root_dir) == tmpdir  # use os.path.normpath to handle trailing /
    assert os.listdir(tmpdir / "version_0")

def test_version(tmpdir):
    """Verify versions of loggers are concatenated properly."""
    logger1 = CSVLogger(tmpdir, version=0)
    logger2 = CSVLogger(tmpdir, version=2)
    logger3 = CSVLogger(tmpdir, version=1)
    logger4 = CSVLogger(tmpdir, version=0)
    loggers = [logger1, logger2, logger3, logger4]

    version = _version([])
    assert version == ""
    version = _version([logger3])
    assert version == 1
    version = _version(loggers)
    assert version == "0_2_1"
    version = _version(loggers, "-")
    assert version == "0-2-1"

def test_name(tmpdir):
    """Verify names of loggers are concatenated properly."""
    logger1 = CSVLogger(tmpdir, name="foo")
    logger2 = CSVLogger(tmpdir, name="bar")
    logger3 = CSVLogger(tmpdir, name="foo")
    logger4 = CSVLogger(tmpdir, name="baz")
    loggers = [logger1, logger2, logger3, logger4]

    name = _name([])
    assert name == ""
    name = _name([logger3])
    assert name == "foo"
    name = _name(loggers)
    assert name == "foo_bar_baz"
    name = _name(loggers, "-")
    assert name == "foo-bar-baz"

def test_gpu_stats_monitor(tmpdir):
    """Test GPU stats are logged using a logger."""
    model = BoringModel()
    with pytest.deprecated_call(match="GPUStatsMonitor` callback was deprecated in v1.5"):
        gpu_stats = GPUStatsMonitor(intra_step_time=True)
    logger = CSVLogger(tmpdir)
    log_every_n_steps = 2

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=7,
        log_every_n_steps=log_every_n_steps,
        gpus=1,
        callbacks=[gpu_stats],
        logger=logger,
    )
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    met_data = np.genfromtxt(path_csv, delimiter=",", names=True, deletechars="", replace_space=" ")

    batch_time_data = met_data["batch_time/intra_step (ms)"]
    batch_time_data = batch_time_data[~np.isnan(batch_time_data)]
    assert batch_time_data.shape[0] == trainer.global_step // log_every_n_steps

    fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"]
    for f in fields:
        assert any(f in h for h in met_data.dtype.names)

def train(config_file="pipeline_config.yaml"):
    logging.info(headline("Step 1: Running metric learning training"))

    with open(config_file) as file:
        all_configs = yaml.load(file, Loader=yaml.FullLoader)
    common_configs = all_configs["common_configs"]
    metric_learning_configs = all_configs["metric_learning_configs"]

    logging.info(headline("a) Initialising model"))
    model = LayerlessEmbedding(metric_learning_configs)

    logging.info(headline("b) Running training"))
    save_directory = os.path.join(common_configs["artifact_directory"], "metric_learning")
    logger = CSVLogger(save_directory, name=common_configs["experiment_name"])
    trainer = Trainer(
        accelerator="gpu" if torch.cuda.is_available() else None,
        gpus=common_configs["gpus"],
        max_epochs=metric_learning_configs["max_epochs"],
        logger=logger,
    )
    trainer.fit(model)

    logging.info(headline("c) Saving model"))
    os.makedirs(save_directory, exist_ok=True)
    trainer.save_checkpoint(os.path.join(save_directory, common_configs["experiment_name"] + ".ckpt"))

    return trainer, model

def train(config_file="pipeline_config.yaml"):
    logging.info(headline(" Step 3: Running GNN training "))

    with open(config_file) as file:
        all_configs = yaml.load(file, Loader=yaml.FullLoader)
    common_configs = all_configs["common_configs"]
    gnn_configs = all_configs["gnn_configs"]

    logging.info(headline("a) Initialising model"))
    model = InteractionGNN(gnn_configs)

    logging.info(headline("b) Running training"))
    save_directory = os.path.join(common_configs["artifact_directory"], "gnn")
    logger = CSVLogger(save_directory, name=common_configs["experiment_name"])
    trainer = Trainer(
        gpus=common_configs["gpus"],
        max_epochs=gnn_configs["max_epochs"],
        logger=logger,
    )
    trainer.fit(model)

    logging.info(headline("c) Saving model"))
    os.makedirs(save_directory, exist_ok=True)
    trainer.save_checkpoint(os.path.join(save_directory, common_configs["experiment_name"] + ".ckpt"))

    return trainer, model

def test_gpu_stats_monitor(tmpdir):
    """Test GPU stats are logged using a logger."""
    model = EvalModelTemplate()
    gpu_stats = GPUStatsMonitor()
    logger = CSVLogger(tmpdir)

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=1,
        callbacks=[gpu_stats],
        logger=logger,
    )
    results = trainer.fit(model)
    assert results

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    with open(path_csv) as fp:
        lines = fp.readlines()

    header = lines[0].split()
    fields = ['utilization.gpu', 'memory.used', 'memory.free', 'utilization.memory']
    for f in fields:
        assert any(f in h for h in header)

def test_v1_8_0_logger_collection(tmpdir):
    logger1 = CSVLogger(tmpdir)
    logger2 = CSVLogger(tmpdir)

    trainer1 = Trainer(logger=logger1)
    trainer2 = Trainer(logger=[logger1, logger2])

    # Should have no deprecation warning
    trainer1.logger
    trainer1.loggers
    trainer2.loggers

    with pytest.deprecated_call(match="logger` will return the first logger"):
        _ = trainer2.logger

    with pytest.deprecated_call(match="`LoggerCollection` is deprecated in v1.6"):
        _ = LoggerCollection([logger1, logger2])

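# A minimal sketch of the list-based API that replaces LoggerCollection in v1.6+:
# pass a plain list of loggers to the Trainer and read them back via `trainer.loggers`.
# The save_dir/name values here are assumptions for illustration.
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger

trainer = Trainer(logger=[CSVLogger("logs"), CSVLogger("logs", name="aux")])
for logger in trainer.loggers:  # the list of loggers passed above
    logger.log_metrics({"demo": 1.0}, step=0)
    logger.save()
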
def test_xla_stats_monitor(tmpdir):
    """Test XLA stats are logged using a logger."""
    model = BoringModel()
    xla_stats = XLAStatsMonitor()
    logger = CSVLogger(tmpdir)

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=5,
        tpu_cores=8,
        callbacks=[xla_stats],
        logger=logger,
    )
    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    met_data = np.genfromtxt(path_csv, delimiter=',', names=True, deletechars='', replace_space=' ')

    fields = ['avg. free memory (MB)', 'avg. peak memory (MB)']
    for f in fields:
        assert any(f in h for h in met_data.dtype.names)

def main(params):
    exp_name = f'{params.model}-{params.img_size}-fold-{params.fold}'
    checkpoint = ModelCheckpoint(
        dirpath=f'logs/{exp_name}',
        filename='{epoch}-{valid_loss_epoch:.3f}',
        save_top_k=-1,
        verbose=False,
    )
    printer = MyPrintingCallback()
    logger = CSVLogger(save_dir=f"logs/{exp_name}", name="text_logs")
    wandb_logger = WandbLogger(name=exp_name, project='all-data')
    bar = LitProgressBar()
    lr_monitor = LearningRateMonitor(logging_interval='step')

    model = HPALit(params)
    trainer = pl.Trainer(
        progress_bar_refresh_rate=1,
        max_epochs=params.epochs,
        callbacks=[checkpoint, printer, bar, lr_monitor],
        logger=[logger, wandb_logger],
        gpus=1,
        num_sanity_val_steps=0,
        auto_lr_find=True,
    )
    trainer.tune(model)
    trainer.fit(model)

def main(args, model=None) -> SummarizationModule:
    Path(args.output_dir).mkdir(exist_ok=True)
    check_output_dir(args, expected_items=3)

    if model is None:
        if "summarization" in args.task:
            model: SummarizationModule = SummarizationModule(args)
        else:
            model: SummarizationModule = TranslationModule(args)

    dataset = Path(args.data_dir).name
    if (
        args.logger_name == "default"
        or args.fast_dev_run
        or str(args.output_dir).startswith("/tmp")
        or str(args.output_dir).startswith("/var")
    ):
        from pytorch_lightning.loggers import CSVLogger

        logger = CSVLogger("chen_logs", name="SCHWEIGEN")  # don't pollute wandb logs unnecessarily
    elif args.logger_name == "wandb":
        from pytorch_lightning.loggers import WandbLogger

        project = os.environ.get("WANDB_PROJECT", dataset)
        logger = WandbLogger(name=model.output_dir.name, project=project)
    elif args.logger_name == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger

        logger = WandbLogger(name=model.output_dir.name, project=f"hf_{dataset}")

    if args.early_stopping_patience >= 0:
        es_callback = get_early_stopping_callback(model.val_metric, args.early_stopping_patience)
    else:
        es_callback = False

    lower_is_better = args.val_metric == "loss"
    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(
            args.output_dir, model.val_metric, args.save_top_k, lower_is_better
        ),
        early_stopping_callback=es_callback,
        logger=logger,
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")
    if not args.do_predict:
        return model

    model.hparams.test_checkpoint = ""
    checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True)))
    if checkpoints:
        model.hparams.test_checkpoint = checkpoints[-1]
        trainer.resume_from_checkpoint = checkpoints[-1]
    trainer.logger.log_hyperparams(model.hparams)

    # test() without a model tests using the best checkpoint automatically
    trainer.test()
    return model

def train_mult(config, checkpoint_dir=None):
    hyp_params.attn_dropout = config["attn_dropout"]
    hyp_params.attn_dropout_a = config["attn_dropout_a"]
    hyp_params.attn_dropout_v = config["attn_dropout_v"]
    hyp_params.embed_dropout = config["embed_dropout"]
    hyp_params.out_dropout = config["out_dropout"]
    hyp_params.relu_dropout = config["relu_dropout"]
    hyp_params.res_dropout = config["res_dropout"]
    # hyp_params.layers = int(config["layers"])
    # hyp_params.num_heads = int(config["num_heads"])
    # hyp_params.project_dim = int(config["num_heads"]) * int(config["head_dim"])
    hyp_params.lr = config["lr"]
    hyp_params.weight_decay = config["weight_decay"]

    comet_logger = CometLogger(
        api_key="cgss7piePhyFPXRw1J2uUEjkQ",
        workspace="transformer",
        project_name=hyp_params.project_name,
        save_dir="logs/comet_ml",
    )
    experiment_key = comet_logger.experiment.get_key()
    csv_logger = CSVLogger("logs/csv", name=experiment_key)
    early_stopping = EarlyStopping(monitor="valid_1mae", patience=10, verbose=True, mode="max")
    checkpoint = ModelCheckpoint(save_top_k=1, monitor="valid_1mae", mode="max")
    # tune_reporter = TuneReportCallback(["valid_loss", "valid_1mae"])
    tune_checkpoint_reporter = TuneReportCheckpointCallback(metrics=["valid_loss", "valid_1mae"])

    model = MULTModelWarpedAll(hyp_params, early_stopping=early_stopping)
    trainer = pl.Trainer(
        gpus=1,
        max_epochs=hyp_params.num_epochs,
        log_every_n_steps=1,
        callbacks=[early_stopping, checkpoint, tune_checkpoint_reporter],
        logger=[csv_logger, comet_logger],
        limit_train_batches=hyp_params.limit,
        limit_val_batches=hyp_params.limit,
        weights_summary="full",
        weights_save_path="logs/weights",
        progress_bar_refresh_rate=0,
    )
    if checkpoint_dir is not None:
        ck = th.load(os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(ck["state_dict"])
        trainer.current_epoch = ck["epoch"]

    trainer.fit(model)

    ck = th.load(checkpoint.best_model_path)
    model.load_state_dict(ck["state_dict"])
    trainer.test(model)

def test_file_logger_automatic_versioning(tmpdir):
    """Verify that automatic versioning works."""
    root_dir = tmpdir.mkdir("exp")
    root_dir.mkdir("version_0")
    root_dir.mkdir("version_1")

    logger = CSVLogger(save_dir=tmpdir, name="exp")
    assert logger.version == 2

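# A minimal sketch (not part of the test suite) of how CSVLogger resolves versions:
# with version=None it scans <save_dir>/<name> for existing version_<n> directories
# and uses the next integer; the paths below assume a fresh /tmp/logs directory.
from pytorch_lightning.loggers import CSVLogger

logger = CSVLogger(save_dir="/tmp/logs", name="exp")
print(logger.version)  # 0 on a fresh directory; n + 1 if version_n already exists
print(logger.log_dir)  # /tmp/logs/exp/version_0
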
def train_model(model, train, test, exp_name, epochs=100):
    pl.seed_everything(42)
    logger = CSVLogger("logs", name=exp_name)
    trainer = pl.Trainer(gpus=1, max_epochs=epochs, deterministic=True, logger=logger)
    trainer.fit(model, train, test)  # note: the `test` loader is passed as the validation dataloader
    return trainer

def main():
    args = get_args()
    pl.seed_everything(args.seed, workers=True)

    # initialize data module
    data = DataModule(args)
    data.prepare_data()
    data.setup("fit")

    prior = None
    if args.prior_model:
        assert hasattr(priors, args.prior_model), (
            f"Unknown prior model {args.prior_model}. "
            f"Available models are {', '.join(priors.__all__)}"
        )
        # initialize the prior model
        prior = getattr(priors, args.prior_model)(dataset=data.dataset)
        args.prior_args = prior.get_init_args()

    # initialize lightning module
    model = LNNP(args, prior_model=prior, mean=data.mean, std=data.std)

    checkpoint_callback = ModelCheckpoint(
        dirpath=args.log_dir,
        monitor="val_loss",
        save_top_k=10,  # -1 to save all
        every_n_epochs=args.save_interval,
        filename="{epoch}-{val_loss:.4f}-{test_loss:.4f}",
    )
    early_stopping = EarlyStopping("val_loss", patience=args.early_stopping_patience)

    tb_logger = pl.loggers.TensorBoardLogger(
        args.log_dir, name="tensorboard", version="", default_hp_metric=False
    )
    csv_logger = CSVLogger(args.log_dir, name="", version="")

    trainer = pl.Trainer(
        strategy=DDPStrategy(find_unused_parameters=False),
        max_epochs=args.num_epochs,
        gpus=args.ngpus,
        num_nodes=args.num_nodes,
        default_root_dir=args.log_dir,
        auto_lr_find=False,
        resume_from_checkpoint=None if args.reset_trainer else args.load_model,
        callbacks=[early_stopping, checkpoint_callback],
        logger=[tb_logger, csv_logger],
        precision=args.precision,
    )
    trainer.fit(model, data)

    # run test set after completing the fit
    model = LNNP.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
    trainer = pl.Trainer(logger=[tb_logger, csv_logger])
    trainer.test(model, data)

def test_file_logger_manual_versioning(tmpdir):
    """Verify that manual versioning works."""
    root_dir = tmpdir.mkdir("exp")
    root_dir.mkdir("version_0")
    root_dir.mkdir("version_1")
    root_dir.mkdir("version_2")

    logger = CSVLogger(save_dir=tmpdir, name="exp", version=1)
    assert logger.version == 1

def _add_default_loggers(self) -> List[LightningLoggerBase]:
    """Adds optional default loggers and returns the extended list.

    Added loggers: CSV, TensorBoard, WandB
    """
    loggers = self._trainer_config.logger
    if loggers is True:
        loggers = []
    elif isinstance(loggers, LightningLoggerBase):
        loggers = [loggers]

    def get_loggers_of_type(logger_type) -> List[LightningLoggerBase]:
        return [logger for logger in loggers if isinstance(logger, logger_type)]

    # csv
    if self._trainer_config.add_csv_logger and not get_loggers_of_type(CSVLogger):
        loggers.append(
            CSVLogger(
                save_dir=self._trainer_config.default_root_dir or os.getcwd(),
                name="csv",
            )
        )

    # tensorboard
    if self._trainer_config.add_tensorboard_logger and not get_loggers_of_type(TensorBoardLogger):
        loggers.append(
            TensorBoardLogger(
                save_dir=self._trainer_config.default_root_dir,
                name="tensorboard",
            )
        )

    # wandb
    if (
        self._trainer_config.add_wandb_logger
        and _HAS_WANDB
        and not get_loggers_of_type(WandbLogger)
    ):
        self._wandb_logger = WandbLogger(
            save_dir=self._trainer_config.default_root_dir,
            project=os.environ.get("WANDB_PROJECT", "biome"),
        )
        loggers.append(self._wandb_logger)
    elif get_loggers_of_type(WandbLogger):
        self._wandb_logger = get_loggers_of_type(WandbLogger)[0]

    # Somehow the wandb dir does not get created; this may be a bug on the PL side.
    if self._wandb_logger is not None and not os.path.isdir(
        os.path.join(self._wandb_logger.save_dir, "wandb")
    ):
        os.makedirs(os.path.join(self._wandb_logger.save_dir, "wandb"))

    return loggers

def test_fit_csv_logger(tmpdir):
    dm = ClassifDataModule()
    model = ClassificationModel()
    logger = CSVLogger(save_dir=tmpdir)
    trainer = Trainer(default_root_dir=tmpdir, max_steps=10, logger=logger, log_every_n_steps=1)
    trainer.fit(model, datamodule=dm)

    metrics_file = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    assert os.path.isfile(metrics_file)

def test_logdir_multiple_loggers(tmpdir):
    """Tests that the logdir equals the default_root_dir when trainer has multiple loggers."""
    default_root_dir = tmpdir / "default_root_dir"
    save_dir = tmpdir / "save_dir"
    model = TestModel(default_root_dir)
    trainer = Trainer(
        default_root_dir=default_root_dir,
        max_steps=2,
        logger=[TensorBoardLogger(save_dir=save_dir, name="custom_logs"), CSVLogger(tmpdir)],
    )
    assert trainer.log_dir == default_root_dir

    trainer.fit(model)

    assert trainer.log_dir == default_root_dir

def test_flush_n_steps(tmpdir):
    logger = CSVLogger(tmpdir, flush_logs_every_n_steps=2)
    metrics = {
        "float": 0.3,
        "int": 1,
        "FloatTensor": torch.tensor(0.1),
        "IntTensor": torch.tensor(1),
    }
    logger.save = MagicMock()
    logger.log_metrics(metrics, step=0)

    logger.save.assert_not_called()
    logger.log_metrics(metrics, step=1)
    logger.save.assert_called_once()

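# A minimal sketch (assumed usage, not from the original tests): metrics are buffered
# in memory and flushed to <log_dir>/metrics.csv every `flush_logs_every_n_steps`
# steps; reading the file back with pandas is an assumption, not part of the API.
import pandas as pd
from pytorch_lightning.loggers import CSVLogger

logger = CSVLogger("/tmp/logs", name="demo", flush_logs_every_n_steps=2)
logger.log_metrics({"loss": 0.5}, step=0)  # buffered, nothing on disk yet
logger.log_metrics({"loss": 0.4}, step=1)  # second step triggers the flush
df = pd.read_csv(f"{logger.log_dir}/metrics.csv")
print(df[["step", "loss"]])
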
def train(cfg):
    """Trains the classifier."""
    if cfg.name == "auto":
        cfg.name = Haikunator().haikunate()
    train_csv = Path(cfg.dataroot) / cfg.train_csv

    logger.info(f"Starting run {cfg.name}")
    model = HiggsClassifier(hp=cfg.hparams.model)
    data = HiggsDataModule(
        trainfile=train_csv,
        trainset_prop=cfg.train_val_split_frac,
        hp=cfg.hparams.trainer,
    )
    data.prepare()
    logger.info(f"Train set size: {data.trainsize}, Validation set size: {data.valsize}")

    os.makedirs(cfg.runroot, exist_ok=True)
    if cfg.logger == "wandb":
        ml_logger = WandbLogger(
            project="higgs",
            name=cfg.name,
            save_dir=cfg.runroot,
            log_model="all",
            id=cfg.name,
        )
        ml_logger.watch(model, log="all")
    elif cfg.logger == "csv":
        ml_logger = CSVLogger(save_dir=cfg.runroot, name="higgs", version=cfg.name)

    checkpoint = ModelCheckpoint(monitor="val_loss", mode="min")
    start = datetime.now()
    trainer = Trainer(
        default_root_dir=cfg.runroot,
        max_epochs=cfg.hparams.trainer.n_epochs,
        logger=ml_logger,
        callbacks=[checkpoint],
    )
    trainer.fit(model, data)
    end = datetime.now()
    logger.info(f"Took {end - start} to finish training.")

def main(seed=None) -> None:
    """Entry-point for CLI tool."""
    args = _get_args()
    print(str(args).replace("Namespace", "Arguments used = "))

    if seed is not None:
        seed_everything(seed)

    data = TreeDataModule(
        args.trees,
        batch_size=args.batch_size,
        split_part=args.split_part,
        split_seed=args.split_seed,
    )
    kwargs = {
        "fp_size": args.fp_size,
        "lstm_size": args.lstm_size,
        "dropout_prob": args.dropout,
        "learning_rate": args.lr,
        "weight_decay": args.weight_decay,
    }
    model = RouteDistanceModel(**kwargs)

    gpus = int(torch.cuda.is_available())
    tb_logger = TensorBoardLogger("tb_logs", name="route-dist")
    csv_logger = CSVLogger("csv_logs", name="route-dist")
    checkpoint = ModelCheckpoint(monitor="val_monitor", save_last=True)
    trainer = Trainer(
        gpus=gpus,
        logger=[tb_logger, csv_logger],
        callbacks=[checkpoint],
        max_epochs=args.epochs,
        deterministic=seed is not None,
    )

    trainer.fit(model, datamodule=data)

    ret = trainer.test(datamodule=data)
    print("=== Test results ===")
    accum = accumulate_stats(ret)
    for key, value in accum.items():
        print(f"{key}: {value:0.4f}")

def main(args):
    train_students = json.load(open(os.path.join(DATA_DIR, "train_students_ids.json")))
    test_students = json.load(open(os.path.join(DATA_DIR, "test_students_ids.json")))
    students_ids = train_students + test_students

    for student_id in students_ids:
        config, train_ds, val_ds = setup(args, student_id)
        train_dl = create_dataloader(train_ds, config.dataloader)
        val_dl = create_dataloader(val_ds, config.dataloader, shuffle=False)

        tags = tags_from_args(args)
        tags = [str(t) for t in tags]
        name = ':'.join(tags)
        config.pprint()

        AgentClass = globals()[config.agent]
        agent = AgentClass(config)

        save_dir = os.path.join(OUT_DIR, get_date())
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)

        csv_logger = CSVLogger(save_dir=os.path.join(OUT_DIR, "leave_one_out_cv", student_id))
        experiment = csv_logger.experiment
        experiment.log = experiment.log_metrics  # create a function alias, so we don't have to change other things

        trainer = pl.Trainer(
            logger=csv_logger,
            weights_summary='full',
            max_epochs=config.epochs,
            gpus=[int(args.gpu_device)],
        )
        trainer.fit(
            agent,
            train_dataloader=train_dl,
            val_dataloaders=val_dl,
        )

def test_pytorch_profiler_multiple_loggers(tmpdir):
    """Tests whether the PyTorch profiler is able to write its trace locally when the Trainer is configured with
    multiple loggers. See issue #8157."""

    def look_for_trace(trace_dir):
        """Determines if a directory contains a PyTorch trace."""
        return any("trace.json" in filename for filename in os.listdir(trace_dir))

    # Sanity check
    assert not look_for_trace(tmpdir)

    model = BoringModel()
    loggers = [TensorBoardLogger(save_dir=tmpdir), CSVLogger(tmpdir)]
    trainer = Trainer(default_root_dir=tmpdir, profiler="pytorch", logger=loggers, limit_train_batches=5, max_epochs=1)
    assert len(trainer.loggers) == 2
    trainer.fit(model)
    assert look_for_trace(tmpdir)

def test_file_logger_named_version(tmpdir):
    """Verify that manual versioning works for string versions, e.g. '2020-02-05-162402'."""
    exp_name = "exp"
    tmpdir.mkdir(exp_name)
    expected_version = "2020-02-05-162402"

    logger = CSVLogger(save_dir=tmpdir, name=exp_name, version=expected_version)
    logger.log_hyperparams({"a": 1, "b": 2})
    logger.save()

    assert logger.version == expected_version
    assert os.listdir(tmpdir / exp_name) == [expected_version]
    assert os.listdir(tmpdir / exp_name / expected_version)

def test_logger_default_name(tmpdir):
    """Test that the default logger name is lightning_logs."""
    # CSV
    logger = CSVLogger(save_dir=tmpdir)
    assert logger.name == "lightning_logs"

    # TensorBoard
    with mock.patch("pytorch_lightning.loggers.tensorboard.SummaryWriter"):
        logger = _instantiate_logger(TensorBoardLogger, save_dir=tmpdir)
    assert logger.name == "lightning_logs"

    # MLflow
    with mock.patch("pytorch_lightning.loggers.mlflow.mlflow"), mock.patch(
        "pytorch_lightning.loggers.mlflow.MlflowClient"
    ) as mlflow_client:
        mlflow_client().get_experiment_by_name.return_value = None
        logger = _instantiate_logger(MLFlowLogger, save_dir=tmpdir)

        _ = logger.experiment
        logger._mlflow_client.create_experiment.assert_called_with(name="lightning_logs", artifact_location=ANY)

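# A minimal sketch (assumed usage, not from the test suite): with no `name` given,
# CSVLogger falls back to "lightning_logs" and, once saved, writes hyperparameters
# to <save_dir>/lightning_logs/version_<n>/hparams.yaml.
from pytorch_lightning.loggers import CSVLogger

logger = CSVLogger(save_dir="/tmp/logs")
logger.log_hyperparams({"lr": 1e-3, "batch_size": 32})
logger.save()
print(logger.log_dir)  # e.g. /tmp/logs/lightning_logs/version_0
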
@pytest.mark.parametrize("step_idx", [10, None])  # decorator restored so pytest supplies `step_idx`; values assumed
def test_file_logger_log_metrics(tmpdir, step_idx):
    logger = CSVLogger(tmpdir)
    metrics = {
        "float": 0.3,
        "int": 1,
        "FloatTensor": torch.tensor(0.1),
        "IntTensor": torch.tensor(1),
    }
    logger.log_metrics(metrics, step_idx)
    logger.save()

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    with open(path_csv) as fp:
        lines = fp.readlines()
    assert len(lines) == 2
    assert all(n in lines[0] for n in metrics)

def test_gpu_stats_monitor(tmpdir):
    """Test GPU stats are logged using a logger."""
    model = BoringModel()
    gpu_stats = GPUStatsMonitor(intra_step_time=True)
    logger = CSVLogger(tmpdir)
    log_every_n_steps = 2

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=7,
        log_every_n_steps=log_every_n_steps,
        gpus=1,
        callbacks=[gpu_stats],
        logger=logger,
    )
    trainer.fit(model)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    met_data = np.genfromtxt(path_csv, delimiter=',', names=True, deletechars='', replace_space=' ')

    batch_time_data = met_data['batch_time/intra_step (ms)']
    batch_time_data = batch_time_data[~np.isnan(batch_time_data)]
    assert batch_time_data.shape[0] == trainer.global_step // log_every_n_steps

    fields = ['utilization.gpu', 'memory.used', 'memory.free', 'utilization.memory']
    for f in fields:
        assert any(f in h for h in met_data.dtype.names)