def test_checkpoint_upload_failure(tmp_path: pathlib.Path) -> None:
    """Check that a failing checkpoint upload surfaces as an "upload error".

    The checkpoint workload is paired with a response function that raises a
    ValueError carrying a *different* message, so the ``match`` below would
    not be satisfied if that response function were ever invoked.
    """
    env = utils.make_default_env_context({"global_batch_size": 64})
    rendezvous = utils.make_default_rendezvous_info()
    # Storage manager rooted at the pytest-provided temporary directory.
    failing_storage = FailOnUploadStorageManager(str(tmp_path))
    tensorboard = NoopTensorboardManager()
    batch_metric_writer = NoopBatchMetricWriter()

    def checkpoint_response_func(metrics: workload.Response) -> None:
        raise ValueError("response_func should not be called if the upload fails")

    def make_workloads() -> workload.Stream:
        # One training workload followed by the checkpoint under test.
        yield (
            workload.train_workload(1, num_batches=100),
            [],
            workload.ignore_workload_response,
        )
        yield workload.checkpoint_workload(), [], checkpoint_response_func

    manager = layers.build_workload_manager(
        env,
        make_workloads(),
        rendezvous,
        failing_storage,
        tensorboard,
        batch_metric_writer,
    )
    controller = NoopTrialController(iter(manager))

    # Drive the workload manager's events the way a TrialController would.
    with pytest.raises(ValueError, match="upload error"):
        controller.run()
def test_reject_nonscalar_searcher_metric() -> None:
    """Check which searcher-metric values are accepted on validation.

    Plain Python numbers and NumPy scalars must run cleanly; any other value
    must fail with an AssertionError whose message matches "non-scalar".
    """
    metric_name = "validation_error"
    hparams = {"global_batch_size": 64}

    exp_config = utils.make_default_exp_config(hparams, 1)
    exp_config["searcher"] = {"metric": metric_name}
    env = utils.make_default_env_context(hparams=hparams, experiment_config=exp_config)

    rendezvous = utils.make_default_rendezvous_info()
    storage = NoopStorageManager(os.devnull)
    tensorboard = NoopTensorboardManager()
    batch_metric_writer = NoopBatchMetricWriter()

    def make_workloads() -> workload.Stream:
        yield (
            workload.train_workload(1, num_batches=100),
            [],
            workload.ignore_workload_response,
        )
        yield workload.validation_workload(), [], workload.ignore_workload_response

    def build_controller(metric_value: object) -> NoopTrialController:
        # Each case gets a fresh workload stream and manager; only the
        # reported validation metric differs between cases.
        manager = layers.build_workload_manager(
            env,
            make_workloads(),
            rendezvous,
            storage,
            tensorboard,
            batch_metric_writer,
        )
        return NoopTrialController(
            iter(manager), validation_metrics={metric_name: metric_value}
        )

    # Normal Python numbers and NumPy scalars are acceptable...
    accepted_values = (17, 0.17, np.float64(0.17), np.float32(0.17))
    # ...while strings and containers are not.
    rejected_values = ("foo", [0.17], {})

    for value in accepted_values:
        controller = build_controller(value)
        controller.run()

    for value in rejected_values:
        controller = build_controller(value)
        with pytest.raises(AssertionError, match="non-scalar"):
            controller.run()
def test_subprocess_launcher_receiver() -> None:
    """Run a SubprocessLauncher against the fake subprocess receiver fixture.

    Every workload from the fixture's generator is sent through a response
    interceptor, and the intercepted metrics must equal ``{"count": i}`` for
    the i-th workload (zero-based).
    """
    env_context = utils.make_default_env_context(hparams={"global_batch_size": 1})
    rendezvous = utils.make_default_rendezvous_info()
    horovod_config = utils.make_default_hvd_config()

    def make_workloads() -> workload.Stream:
        interceptor = workload.WorkloadResponseInterceptor()
        sent_so_far = 0
        for wkld in fake_subprocess_receiver.fake_workload_gen():
            yield from interceptor.send(wkld, [])
            # The response is checked only after the consumer has answered
            # the workload that was just yielded.
            assert interceptor.metrics_result() == {"count": sent_so_far}
            sent_so_far += 1

    launcher = layers.SubprocessLauncher(
        env=env_context,
        workloads=make_workloads(),
        load_path=None,
        rendezvous_info=rendezvous,
        hvd_config=horovod_config,
        python_subprocess_entrypoint="tests.fixtures.fake_subprocess_receiver",
    )
    launcher.run()