def test_pytorch_load(collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    config = conf.set_profiling_enabled(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("mnist_pytorch"), 1
    )
    (
        Determined(conf.make_master_url())
        .get_experiment(experiment_id)
        .top_checkpoint()
        .load(map_location="cpu")
    )
    trial_id = exp.experiment_trials(experiment_id)[0].trial.id
    collect_trial_profiles(trial_id)

def test_pytorch_gan_parallel() -> None:
    config = conf.load_config(conf.gan_examples_path("gan_mnist_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 8)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.gan_examples_path("gan_mnist_pytorch"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    (
        Determined(conf.make_master_url())
        .get_trial(trials[0]["id"])
        .select_checkpoint(latest=True)
        .load(map_location="cpu")
    )

def test_pytorch_cifar10_const() -> None:
    config = conf.load_config(conf.official_examples_path("cifar10_cnn_pytorch/const.yaml"))
    config = conf.set_max_steps(config, 2)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_pytorch"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    nn = (
        Determined(conf.make_master_url())
        .get_trial(trials[0].id)
        .select_checkpoint(latest=True)
        .load(map_location=torch.device("cpu"))
    )
    assert isinstance(nn, torch.nn.Module)

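# A minimal sketch of running inference with the module loaded above, assuming
# the CIFAR-10 input shape (batch x 3 x 32 x 32). The dummy batch and the
# eval/no_grad pattern are illustrative only and not part of the test suite.
def run_inference_sketch(model: "torch.nn.Module") -> "torch.Tensor":
    import torch

    model.eval()
    with torch.no_grad():
        dummy_batch = torch.zeros(1, 3, 32, 32)  # hypothetical all-zero image batch
        return model(dummy_batch)
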
def test_pytorch_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_pytorch"), 1
    )
    (
        Determined(conf.make_master_url())
        .get_experiment(experiment_id)
        .top_checkpoint()
        .load(map_location="cpu")
    )

def test_launch_layer_exit(collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_entrypoint(
        config, "python3 -m nonexistent_launch_module model_def:CIFARTrial"
    )
    experiment_id = exp.run_failure_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch")
    )
    trials = exp.experiment_trials(experiment_id)
    Determined(conf.make_master_url()).get_trial(trials[0].trial.id)
    collect_trial_profiles(trials[0].trial.id)
    assert exp.check_if_string_present_in_trial_logs(
        trials[0].trial.id, "container failed with non-zero exit code: 1"
    )

def test_pytorch_cifar10_parallel(collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_profiling_enabled(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    (
        Determined(conf.make_master_url())
        .get_trial(trials[0].trial.id)
        .select_checkpoint(latest=True)
        .load(map_location="cpu")
    )
    collect_trial_profiles(trials[0].trial.id)

def test_model_registry() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )
    d = Determined(conf.make_master_url())

    mnist = d.create_model("mnist", "simple computer vision model")
    assert mnist.metadata == {}

    mnist.add_metadata({"testing": "metadata"})
    assert mnist.metadata == {"testing": "metadata"}

    mnist.add_metadata({"some_key": "some_value"})
    assert mnist.metadata == {"testing": "metadata", "some_key": "some_value"}

    mnist.add_metadata({"testing": "override"})
    assert mnist.metadata == {"testing": "override", "some_key": "some_value"}

    mnist.remove_metadata(["some_key"])
    assert mnist.metadata == {"testing": "override"}

    checkpoint = d.get_experiment(exp_id).top_checkpoint()
    model_version = mnist.register_version(checkpoint.uuid)
    assert model_version.model_version == 1

    latest_version = mnist.get_version()
    assert latest_version is not None
    assert latest_version.uuid == checkpoint.uuid

    d.create_model("transformer", "all you need is attention")
    d.create_model("object-detection", "a bounding box model")

    models = d.get_models(sort_by=ModelSortBy.NAME)
    assert [m.name for m in models] == ["mnist", "object-detection", "transformer"]

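# Shallow-merge semantics of add_metadata, demonstrated with a plain dict.
# This mirrors the asserts above (new keys win on conflict) and is a sketch,
# not SDK code.
def merge_metadata_sketch(existing: dict, new: dict) -> dict:
    merged = dict(existing)
    merged.update(new)  # later values override, matching the "override" assert above
    return merged


assert merge_metadata_sketch({"testing": "metadata"}, {"testing": "override"}) == {
    "testing": "override"
}
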
def test_launch_layer_cifar(collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_profiling_enabled(config)
    config = conf.set_entrypoint(
        config, "python3 -m determined.launch.horovod --autohorovod --trial model_def:CIFARTrial"
    )
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    (
        Determined(conf.make_master_url())
        .get_trial(trials[0].trial.id)
        .select_checkpoint(latest=True)
        .load(map_location="cpu")
    )
    collect_trial_profiles(trials[0].trial.id)
    assert exp.check_if_string_present_in_trial_logs(
        trials[0].trial.id, "container exited successfully with a zero exit code"
    )

def test_model_registry() -> None:
    d = Determined(conf.make_master_url())

    mnist = d.create_model("mnist", "simple computer vision model")
    assert mnist.metadata == {}

    mnist.add_metadata({"testing": "metadata"})
    assert mnist.metadata == {"testing": "metadata"}

    mnist.add_metadata({"some_key": "some_value"})
    assert mnist.metadata == {"testing": "metadata", "some_key": "some_value"}

    mnist.add_metadata({"testing": "override"})
    assert mnist.metadata == {"testing": "override", "some_key": "some_value"}

    mnist.remove_metadata(["some_key"])
    assert mnist.metadata == {"testing": "override"}

    d.create_model("transformer", "all you need is attention")
    d.create_model("object-detection", "a bounding box model")

    models = d.get_models(sort_by=ModelSortBy.NAME)
    assert [m.name for m in models] == ["mnist", "object-detection", "transformer"]

def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )

    # Check that validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial["steps"])
        last_step = trial["steps"][-1]
        accuracy = last_step["validation"]["metrics"]["validation_metrics"]["accuracy"]
        if not best or accuracy > best:
            best = accuracy
    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    d = Determined(conf.make_master_url())
    exp_ref = d.get_experiment(exp_id)

    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(len(trials))

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    metrics = [c.validation["metrics"]["validationMetrics"]["validation_loss"] for c in top_k]
    assert metrics == sorted(metrics)

    # Check that changing smaller is better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]

    checkpoint = top_k[0]
    checkpoint.add_metadata({"testing": "metadata"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    # Make sure the checkpoint metadata is correct and correctly saved to the db.
    assert checkpoint.metadata == {"testing": "metadata"}
    assert checkpoint.metadata == db_check.metadata

    checkpoint.add_metadata({"some_key": "some_value"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata == {"testing": "metadata", "some_key": "some_value"}
    assert checkpoint.metadata == db_check.metadata

    checkpoint.add_metadata({"testing": "override"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata == {"testing": "override", "some_key": "some_value"}
    assert checkpoint.metadata == db_check.metadata

    checkpoint.remove_metadata(["some_key"])
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata == {"testing": "override"}
    assert checkpoint.metadata == db_check.metadata

def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )

    # Check that validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial.workloads) > 0
        last_validation = exp.workloads_with_validation(trial.workloads)[-1]
        accuracy = last_validation.metrics["accuracy"]
        if not best or accuracy > best:
            best = accuracy
    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    d = Determined(conf.make_master_url())
    exp_ref = d.get_experiment(exp_id)

    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=True
    )

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    assert all(c.training is not None for c in top_k)
    metrics = [
        c.training.validation_metrics["avgMetrics"]["validation_loss"]
        for c in top_k
        if c.training is not None
    ]
    assert metrics == sorted(metrics)

    # Check that changing smaller is better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]

    checkpoint = top_k[0]
    checkpoint.add_metadata({"testing": "metadata"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    # Make sure the checkpoint metadata is correct and correctly saved to the db.
    # Beginning with 0.18 the system contributes a few items to the dict.
    assert checkpoint.metadata.get("testing") == "metadata"
    assert checkpoint.metadata.keys() == {"format", "framework", "steps_completed", "testing"}
    assert checkpoint.metadata == db_check.metadata

    checkpoint.add_metadata({"some_key": "some_value"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata.items() > {"testing": "metadata", "some_key": "some_value"}.items()
    assert checkpoint.metadata.keys() == {
        "format",
        "framework",
        "steps_completed",
        "testing",
        "some_key",
    }
    assert checkpoint.metadata == db_check.metadata

    checkpoint.add_metadata({"testing": "override"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata.items() > {"testing": "override", "some_key": "some_value"}.items()
    assert checkpoint.metadata == db_check.metadata

    checkpoint.remove_metadata(["some_key"])
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert "some_key" not in checkpoint.metadata
    assert checkpoint.metadata["testing"] == "override"
    assert checkpoint.metadata == db_check.metadata

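# A short sketch of acting on the sorted checkpoint list from above, e.g.
# fetching the checkpoint with the lowest validation_loss and downloading it.
# Checkpoint.download() is part of the SDK; calling it without arguments
# stores the files under a default local path and returns that path.
def download_best_checkpoint_sketch(exp_ref) -> str:
    best = exp_ref.top_n_checkpoints(1, sort_by="validation_loss", smaller_is_better=True)[0]
    return best.download()
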
def test_model_registry() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )
    d = Determined(conf.make_master_url())

    mnist = None
    objectdetect = None
    tform = None

    try:
        # Create a model and validate twiddling the metadata.
        mnist = d.create_model("mnist", "simple computer vision model", labels=["a", "b"])
        assert mnist.metadata == {}

        mnist.add_metadata({"testing": "metadata"})
        db_model = d.get_model(mnist.name)
        # Make sure the model metadata is correct and correctly saved to the db.
        assert mnist.metadata == db_model.metadata
        assert mnist.metadata == {"testing": "metadata"}

        # Confirm we can look up a model by its ID.
        db_model = d.get_model_by_id(mnist.model_id)
        assert db_model.name == "mnist"
        db_model = d.get_model(mnist.model_id)
        assert db_model.name == "mnist"

        # Confirm the DB-assigned username.
        assert db_model.username == "determined"

        mnist.add_metadata({"some_key": "some_value"})
        db_model = d.get_model(mnist.name)
        assert mnist.metadata == db_model.metadata
        assert mnist.metadata == {"testing": "metadata", "some_key": "some_value"}

        mnist.add_metadata({"testing": "override"})
        db_model = d.get_model(mnist.name)
        assert mnist.metadata == db_model.metadata
        assert mnist.metadata == {"testing": "override", "some_key": "some_value"}

        mnist.remove_metadata(["some_key"])
        db_model = d.get_model(mnist.name)
        assert mnist.metadata == db_model.metadata
        assert mnist.metadata == {"testing": "override"}

        mnist.set_labels(["hello", "world"])
        db_model = d.get_model(mnist.name)
        assert mnist.labels == db_model.labels
        assert db_model.labels == ["hello", "world"]

        # Confirm a patch does not overwrite other fields.
        mnist.set_description("abcde")
        db_model = d.get_model(mnist.name)
        assert db_model.metadata == {"testing": "override"}
        assert db_model.labels == ["hello", "world"]

        # Overwrite the labels with an empty list.
        mnist.set_labels([])
        db_model = d.get_model(mnist.name)
        assert db_model.labels == []

        # Archive and unarchive.
        assert mnist.archived is False
        mnist.archive()
        db_model = d.get_model(mnist.name)
        assert db_model.archived is True
        mnist.unarchive()
        db_model = d.get_model(mnist.name)
        assert db_model.archived is False

        # Register a version for the model and validate the latest.
        checkpoint = d.get_experiment(exp_id).top_checkpoint()
        model_version = mnist.register_version(checkpoint.uuid)
        assert model_version.model_version == 1

        latest_version = mnist.get_version()
        assert latest_version is not None
        assert latest_version.checkpoint.uuid == checkpoint.uuid

        latest_version.set_name("Test 2021")
        db_version = mnist.get_version()
        assert db_version is not None
        assert db_version.name == "Test 2021"

        latest_version.set_notes("# Hello Markdown")
        db_version = mnist.get_version()
        assert db_version is not None
        assert db_version.notes == "# Hello Markdown"

        # Run another basic test and register its checkpoint as a version as well.
        # Validate the latest has been updated.
        exp_id = exp.run_basic_test(
            conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
            conf.tutorials_path("mnist_pytorch"),
            None,
        )
        checkpoint = d.get_experiment(exp_id).top_checkpoint()
        model_version = mnist.register_version(checkpoint.uuid)
        assert model_version.model_version == 2

        latest_version = mnist.get_version()
        assert latest_version is not None
        assert latest_version.checkpoint.uuid == checkpoint.uuid

        # Ensure the correct number of versions are present.
        all_versions = mnist.get_versions()
        assert len(all_versions) == 2

        # Test deletion of a model version.
        latest_version.delete()
        all_versions = mnist.get_versions()
        assert len(all_versions) == 1

        # Create some more models and validate listing models.
        tform = d.create_model("transformer", "all you need is attention")
        objectdetect = d.create_model("ac - Dc", "a test name model")

        models = d.get_models(sort_by=ModelSortBy.NAME)
        assert [m.name for m in models] == ["ac - Dc", "mnist", "transformer"]

        # Test the combined model labels.
        mnist.set_labels(["hello", "world"])
        tform.set_labels(["world", "test", "zebra"])
        labels = d.get_model_labels()
        assert labels == ["world", "hello", "test", "zebra"]

        # Test deletion of a model.
        tform.delete()
        tform = None
        models = d.get_models(sort_by=ModelSortBy.NAME)
        assert [m.name for m in models] == ["ac - Dc", "mnist"]
    finally:
        # Clean the model registry of test models.
        for model in [mnist, objectdetect, tform]:
            if model is not None:
                model.delete()

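# A minimal sketch of walking a model's registered versions, using only the
# attributes exercised above (get_versions, .name, .checkpoint.uuid); the
# model handle `m` would come from d.get_model(...).
def list_versions_sketch(m) -> None:
    for v in m.get_versions():
        print(v.name, v.checkpoint.uuid)
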
def test_model_registry() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )
    d = Determined(conf.make_master_url())

    # Create a model and validate twiddling the metadata.
    mnist = d.create_model("mnist", "simple computer vision model")
    assert mnist.metadata == {}

    mnist.add_metadata({"testing": "metadata"})
    db_model = d.get_model("mnist")
    # Make sure the model metadata is correct and correctly saved to the db.
    assert mnist.metadata == db_model.metadata
    assert mnist.metadata == {"testing": "metadata"}

    mnist.add_metadata({"some_key": "some_value"})
    db_model = d.get_model("mnist")
    assert mnist.metadata == db_model.metadata
    assert mnist.metadata == {"testing": "metadata", "some_key": "some_value"}

    mnist.add_metadata({"testing": "override"})
    db_model = d.get_model("mnist")
    assert mnist.metadata == db_model.metadata
    assert mnist.metadata == {"testing": "override", "some_key": "some_value"}

    mnist.remove_metadata(["some_key"])
    db_model = d.get_model("mnist")
    assert mnist.metadata == db_model.metadata
    assert mnist.metadata == {"testing": "override"}

    # Register a version for the model and validate the latest.
    checkpoint = d.get_experiment(exp_id).top_checkpoint()
    model_version = mnist.register_version(checkpoint.uuid)
    assert model_version.model_version == 1

    latest_version = mnist.get_version()
    assert latest_version is not None
    assert latest_version.uuid == checkpoint.uuid

    # Run another basic test and register its checkpoint as a version as well.
    # Validate the latest has been updated.
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )
    checkpoint = d.get_experiment(exp_id).top_checkpoint()
    model_version = mnist.register_version(checkpoint.uuid)
    assert model_version.model_version == 2

    latest_version = mnist.get_version()
    assert latest_version is not None
    assert latest_version.uuid == checkpoint.uuid

    # Ensure the correct number of versions are present.
    all_versions = mnist.get_versions()
    assert len(all_versions) == 2

    # Create some more models and validate listing models.
    d.create_model("transformer", "all you need is attention")
    d.create_model("object-detection", "a bounding box model")

    models = d.get_models(sort_by=ModelSortBy.NAME)
    assert [m.name for m in models] == ["mnist", "object-detection", "transformer"]

def main():
    parser = argparse.ArgumentParser(description='Run Determined Example')
    parser.add_argument('experiment_id', type=str,
                        help='ID of the experiment to pull the top checkpoint from')
    parser.add_argument('model_name', type=str,
                        help='name of the model in the registry')
    args = parser.parse_args()

    checkpoint = Determined().get_experiment(args.experiment_id).top_checkpoint()
    metric, smaller_is_better = get_validation_metric(checkpoint)

    models = Determined().get_models(name=args.model_name)
    model = None
    for m in models:
        if m.name == args.model_name:
            model = m
            break

    if not model:
        print(f'Registering new Model: {args.model_name}')
        model = Determined().create_model(args.model_name)
        model.register_version(checkpoint.uuid)
        better = True
    else:
        latest_version = model.get_version()
        if latest_version is None:
            print(f'Registering new version: {args.model_name}')
            model.register_version(checkpoint.uuid)
            better = True
        else:
            old_metric, _ = get_validation_metric(latest_version)
            if smaller_is_better:
                better = metric < old_metric
            else:
                better = metric > old_metric
            if better:
                print(f'Registering new version: {args.model_name}')
                model.register_version(checkpoint.uuid)

    if not better:
        print('Previous model version was better, logging...')

    # Write the decision to the output file.
    with open('/tmp/decision.txt', 'w') as f:
        f.write('yes' if better else 'no')

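# Example invocation (the script name, experiment ID, and model name here are
# hypothetical); the script writes "yes" or "no" to /tmp/decision.txt so a
# downstream pipeline step can read the decision:
#
#   python register_model.py 42 mnist
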
def decide(detmaster: str, experiment_id: int, model_name: str) -> bool:
    # Point the Determined SDK at the given master, then decide whether the
    # experiment's top checkpoint beats the latest registered model version.
    from determined.experimental import Determined
    import os

    os.environ['DET_MASTER'] = detmaster

    def get_validation_metric(checkpoint):
        metrics = checkpoint.validation['metrics']
        config = checkpoint.experiment_config
        searcher = config['searcher']
        smaller_is_better = bool(searcher['smaller_is_better'])
        metric_name = searcher['metric']
        # The metrics field name differs across API versions.
        if 'validation_metrics' in metrics:
            metric = metrics['validation_metrics'][metric_name]
        else:
            metric = metrics['validationMetrics'][metric_name]
        return (metric, smaller_is_better)

    d = Determined()
    checkpoint = d.get_experiment(experiment_id).top_checkpoint()
    metric, smaller_is_better = get_validation_metric(checkpoint)

    models = d.get_models(name=model_name)
    model = None
    for m in models:
        if m.name == model_name:
            model = m
            break

    if not model:
        print(f'Registering new Model: {model_name}')
        model = d.create_model(model_name)
        model.register_version(checkpoint.uuid)
        better = True
    else:
        latest_version = model.get_version()
        if latest_version is None:
            print(f'Registering new version: {model_name}')
            model.register_version(checkpoint.uuid)
            better = True
        else:
            old_metric, _ = get_validation_metric(latest_version)
            if smaller_is_better:
                better = metric < old_metric
            else:
                better = metric > old_metric
            if better:
                print(f'Registering new version: {model_name}')
                model.register_version(checkpoint.uuid)

    if not better:
        print('Previous model version was better, logging...')
    return better

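# A minimal usage sketch for decide(); the master URL, experiment ID, and
# model name below are hypothetical placeholders.
if __name__ == '__main__':
    promoted = decide('http://localhost:8080', 42, 'mnist')
    print('registered new version' if promoted else 'kept previous version')
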