def test_end_to_end_adaptive() -> None:
    """Run the short adaptive MNIST experiment and verify its checkpoints.

    The test proceeds in three stages:

    1. Run the ``mnist_pytorch/adaptive_short.yaml`` fixture and assert that
       the best final-step validation accuracy across trials exceeds 0.93.
    2. Assert that ``top_n_checkpoints`` returns a consistent, sorted ordering
       and that flipping ``smaller_is_better`` reverses it.
    3. Add, override and remove checkpoint metadata, asserting after each
       change that the local object matches a freshly fetched copy.

    NOTE(review): the visible source contains another definition of
    ``test_end_to_end_adaptive`` after this one. If both live in the same
    module, the later definition rebinds the name and this one is never
    collected -- confirm whether this variant should be kept.
    """
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )

    # Check that validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial["steps"]) > 0
        last_step = trial["steps"][-1]
        accuracy = last_step["validation"]["metrics"]["validation_metrics"]["accuracy"]
        # Compare against None explicitly: an accuracy of 0.0 is falsy and
        # must not be mistaken for "no value recorded yet".
        if best is None or accuracy > best:
            best = accuracy

    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    d = Determined(conf.make_master_url())
    exp_ref = d.get_experiment(exp_id)

    # Both calls rely on the default ordering (no sort_by given).
    # NOTE(review): presumably that default is ascending validation_loss, since
    # the assertions below compare it against validation_loss -- confirm
    # against the fixture's searcher configuration.
    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(len(trials))
    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    metrics = [c.validation["metrics"]["validationMetrics"]["validation_loss"] for c in top_k]

    assert metrics == sorted(metrics)

    # Check that changing smaller is better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]

    checkpoint = top_k[0]
    checkpoint.add_metadata({"testing": "metadata"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    # Make sure the checkpoint metadata is correct and correctly saved to the db.
    assert checkpoint.metadata == {"testing": "metadata"}
    assert checkpoint.metadata == db_check.metadata

    # Adding a new key must keep the existing one.
    checkpoint.add_metadata({"some_key": "some_value"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata == {"testing": "metadata", "some_key": "some_value"}
    assert checkpoint.metadata == db_check.metadata

    # Re-adding an existing key must overwrite its value.
    checkpoint.add_metadata({"testing": "override"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata == {"testing": "override", "some_key": "some_value"}
    assert checkpoint.metadata == db_check.metadata

    # Removing a key must leave the remaining entries untouched.
    checkpoint.remove_metadata(["some_key"])
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata == {"testing": "override"}
    assert checkpoint.metadata == db_check.metadata
def test_end_to_end_adaptive() -> None:
    """Run the short adaptive MNIST experiment and verify its checkpoints.

    The test proceeds in three stages:

    1. Run the ``mnist_pytorch/adaptive_short.yaml`` fixture and assert that
       the best last-validation accuracy across trials exceeds 0.93.
    2. Assert that ``top_n_checkpoints`` returns a consistent ordering sorted
       by ``validation_loss`` and that flipping ``smaller_is_better`` reverses
       it.
    3. Add, override and remove checkpoint metadata, asserting after each
       change that the local object matches a freshly fetched copy.

    NOTE(review): the visible source contains an earlier definition of
    ``test_end_to_end_adaptive``. If both live in the same module, this later
    definition is the one that takes effect -- confirm the earlier variant is
    not meant to run as well.
    """
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )

    # Check that validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial.workloads) > 0
        last_validation = exp.workloads_with_validation(trial.workloads)[-1]
        accuracy = last_validation.metrics["accuracy"]
        # Compare against None explicitly: an accuracy of 0.0 is falsy and
        # must not be mistaken for "no value recorded yet".
        if best is None or accuracy > best:
            best = accuracy

    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    d = Determined(conf.make_master_url())
    exp_ref = d.get_experiment(exp_id)

    # top_2 uses the default ordering while top_k sorts explicitly.
    # NOTE(review): the prefix assertion below presumably relies on the default
    # being ascending validation_loss -- confirm against the fixture's
    # searcher configuration.
    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=True
    )
    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    assert all(c.training is not None for c in top_k)
    # The filter repeats the check asserted above, so no checkpoint is dropped.
    metrics = [
        c.training.validation_metrics["avgMetrics"]["validation_loss"]
        for c in top_k
        if c.training is not None
    ]

    assert metrics == sorted(metrics)

    # Check that changing smaller is better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]

    checkpoint = top_k[0]
    checkpoint.add_metadata({"testing": "metadata"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    # Make sure the checkpoint metadata is correct and correctly saved to the db.
    # Beginning with 0.18 the system contributes a few items to the dict
    assert checkpoint.metadata.get("testing") == "metadata"
    assert checkpoint.metadata.keys() == {"format", "framework", "steps_completed", "testing"}
    assert checkpoint.metadata == db_check.metadata

    # Adding a new key must keep the existing one. The strict-superset
    # comparison allows for the extra system-contributed entries.
    checkpoint.add_metadata({"some_key": "some_value"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata.items() > {"testing": "metadata", "some_key": "some_value"}.items()
    assert checkpoint.metadata.keys() == {
        "format",
        "framework",
        "steps_completed",
        "testing",
        "some_key",
    }
    assert checkpoint.metadata == db_check.metadata

    # Re-adding an existing key must overwrite its value.
    checkpoint.add_metadata({"testing": "override"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata.items() > {"testing": "override", "some_key": "some_value"}.items()
    assert checkpoint.metadata == db_check.metadata

    # Removing a key must leave the remaining entries untouched.
    checkpoint.remove_metadata(["some_key"])
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert "some_key" not in checkpoint.metadata
    assert checkpoint.metadata["testing"] == "override"
    assert checkpoint.metadata == db_check.metadata