def test_model_registry() -> None:
    """Exercise the model registry: metadata twiddling, version registration,
    and sorted listing of models."""
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )

    client = Determined(conf.make_master_url())

    # A freshly created model starts with empty metadata.
    vision_model = client.create_model("mnist", "simple computer vision model")
    assert vision_model.metadata == {}

    # Adding, merging, overriding, and removing metadata keys.
    vision_model.add_metadata({"testing": "metadata"})
    assert vision_model.metadata == {"testing": "metadata"}

    vision_model.add_metadata({"some_key": "some_value"})
    assert vision_model.metadata == {"testing": "metadata", "some_key": "some_value"}

    vision_model.add_metadata({"testing": "override"})
    assert vision_model.metadata == {"testing": "override", "some_key": "some_value"}

    vision_model.remove_metadata(["some_key"])
    assert vision_model.metadata == {"testing": "override"}

    # Register the experiment's best checkpoint as version 1 and verify lookup.
    best_ckpt = client.get_experiment(exp_id).top_checkpoint()
    version_number = vision_model.register_version(best_ckpt)
    assert version_number == 1
    assert vision_model.get_version().uuid == best_ckpt.uuid

    # Create more models and confirm name-sorted listing.
    client.create_model("transformer", "all you need is attention")
    client.create_model("object-detection", "a bounding box model")
    registered = client.get_models(sort_by=ModelSortBy.NAME)
    assert [m.name for m in registered] == ["mnist", "object-detection", "transformer"]
def main(experiment_id, master=None, input_path=None, cpu=False):
    """Run inference with a Determined checkpoint against an image stored in S3.

    Downloads the image at ``input_path`` (an ``s3://bucket/key`` URI), loads the
    top checkpoint of ``experiment_id`` from the Determined master, runs object
    detection on the image, and uploads the annotated result back to the bucket.

    Args:
        experiment_id: Determined experiment whose top checkpoint is loaded.
        master: Optional Determined master URL (defaults to environment config).
        input_path: Full ``s3://`` URI of the input image.
        cpu: If True, load the model onto CPU instead of GPU.
    """
    # Download data
    print('downloading data from', input_path)  # fixed typo: "downlading"
    # Split only once so keys containing "s3://" (unlikely but legal) survive.
    path = input_path.split('s3://', 1)[1]
    bucket = path.split('/')[0]
    key = '/'.join(path.split('/')[1:])
    download_path = pathlib.Path('.') / 'test.jpg'
    s3.download_file(bucket, key, str(download_path))

    # Retrieve model checkpoint
    print('fetching model from experiment', experiment_id)
    d = Determined(master=master)
    checkpoint = d.get_experiment(experiment_id).top_checkpoint()
    if cpu:
        print('loading cpu model')
        model = checkpoint.load(map_location=torch.device('cpu'))
    else:
        print('loading gpu model')
        model = checkpoint.load()

    # Make predictions; no_grad avoids building an autograd graph for inference.
    print('making predictions')
    test_image = load_and_transform_image(download_path, cpu)
    with torch.no_grad():
        outputs = model(test_image.unsqueeze(0))[0]

    # Upload results under a timestamped key so runs don't overwrite each other.
    output_key = f'output/{time.time()}/output.png'
    print('uploading result to', f's3://{bucket}/{output_key}')
    boxes = filter_boxes(outputs['boxes'], outputs['scores'])
    output = draw_example(
        test_image.permute(1, 2, 0).cpu().numpy(),
        {'boxes': boxes},
        'output.png',
        title="Predictions",
    )
    s3.upload_file(output, bucket, output_key)
def decide(detmaster: str, experiment_id: int, model_name: str) -> bool:
    """Register the experiment's top checkpoint if it beats the current model version.

    Looks up ``model_name`` in the Determined registry. If the model (or any
    version of it) does not exist yet, the checkpoint is registered
    unconditionally. Otherwise the checkpoint's validation metric is compared
    against the latest registered version's, honoring the experiment's
    ``smaller_is_better`` searcher setting.

    Returns:
        True if a new version was registered, False if the existing version was better.
    """
    from determined.experimental import Determined
    import os
    os.environ['DET_MASTER'] = detmaster

    def get_validation_metric(checkpoint):
        # Pull the searcher metric out of the checkpoint's validation record.
        # Key casing differs between API versions, so try both spellings.
        metrics = checkpoint.validation['metrics']
        config = checkpoint.experiment_config
        searcher = config['searcher']
        smaller_is_better = bool(searcher['smaller_is_better'])
        metric_name = searcher['metric']
        if 'validation_metrics' in metrics:
            metric = metrics['validation_metrics'][metric_name]
        else:
            metric = metrics['validationMetrics'][metric_name]
        return (metric, smaller_is_better)

    d = Determined()
    checkpoint = d.get_experiment(experiment_id).top_checkpoint()
    metric, smaller_is_better = get_validation_metric(checkpoint)

    # get_models(name=...) may return partial-name matches; require exact match.
    model = None
    for m in d.get_models(name=model_name):
        if m.name == model_name:
            model = m
            break

    if not model:
        print(f'Registering new Model: {model_name}')
        # Reuse the existing client instead of constructing a second Determined().
        model = d.create_model(model_name)
        model.register_version(checkpoint.uuid)
        return True

    latest_version = model.get_version()
    if latest_version is None:
        print(f'Registering new version: {model_name}')
        model.register_version(checkpoint.uuid)
        return True

    old_metric, _ = get_validation_metric(latest_version)
    # Single comparison replaces the four duplicated branches of the original.
    better = metric < old_metric if smaller_is_better else metric > old_metric
    if better:
        print(f'Registering new version: {model_name}')
        model.register_version(checkpoint.uuid)
    else:
        print('Previous model version was better, logging...')
    return better
def register(detmaster: str, experiment_id: int, model_name: str) -> bool:
    """Unconditionally register the experiment's top checkpoint as a new model version.

    Creates the model in the Determined registry if it does not exist yet.
    Unlike ``decide``, no metric comparison is performed; the original's
    ``get_validation_metric`` / ``is_better`` helpers were dead code (never
    called) and have been removed.

    Returns:
        Always True (a version is always registered).
    """
    from determined.experimental import Determined
    import os
    os.environ['DET_MASTER'] = detmaster

    d = Determined()
    checkpoint = d.get_experiment(experiment_id).top_checkpoint()
    try:
        model = d.get_model(model_name)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not swallowed. Model not yet in registry — create it.
        print(f'Registering new Model: {model_name}')
        model = d.create_model(model_name)

    print(f'Registering new version: {model_name}')
    model.register_version(checkpoint.uuid)
    return True
def test_end_to_end_adaptive() -> None:
    """Run a short adaptive MNIST experiment and validate checkpoint ordering."""
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )

    # Check that validation accuracy look sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for t in trials:
        assert len(t["steps"])
        final_step = t["steps"][-1]
        acc = final_step["validation"]["metrics"]["validation_metrics"]["accuracy"]
        if not best or acc > best:
            best = acc
    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    client = Determined(conf.make_master_url())
    experiment = client.get_experiment(exp_id)
    top_two = experiment.top_n_checkpoints(2)
    full_ranking = experiment.top_n_checkpoints(len(trials))
    top_two_uuids = [ckpt.uuid for ckpt in top_two]
    full_ranking_uuids = [ckpt.uuid for ckpt in full_ranking]
    assert top_two_uuids == full_ranking_uuids[:2]

    # Check that metrics are truly in sorted order.
    losses = [
        ckpt.validation["metrics"]["validation_metrics"]["validation_loss"]
        for ckpt in full_ranking
    ]
    assert losses == sorted(losses)

    # Check that changing smaller is better reverses the checkpoint ordering.
    reversed_ranking = experiment.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    reversed_uuids = [ckpt.uuid for ckpt in reversed_ranking]
    assert full_ranking_uuids == reversed_uuids[::-1]

    # Checkpoint metadata: add, merge, override, and remove keys.
    target = full_ranking[0]
    target.add_metadata({"testing": "metadata"})
    assert target.metadata == {"testing": "metadata"}

    target.add_metadata({"some_key": "some_value"})
    assert target.metadata == {"testing": "metadata", "some_key": "some_value"}

    target.add_metadata({"testing": "override"})
    assert target.metadata == {"testing": "override", "some_key": "some_value"}

    target.remove_metadata(["some_key"])
    assert target.metadata == {"testing": "override"}
def test_end_to_end_adaptive() -> None:
    """Run a short adaptive MNIST experiment, then validate checkpoint ordering
    and checkpoint metadata persistence in the database."""
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )

    # Check that validation accuracy look sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial.workloads) > 0
        final_validation = exp.workloads_with_validation(trial.workloads)[-1]
        acc = final_validation.metrics["accuracy"]
        if not best or acc > best:
            best = acc
    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    client = Determined(conf.make_master_url())
    experiment = client.get_experiment(exp_id)
    first_two = experiment.top_n_checkpoints(2)
    ranked = experiment.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=True
    )
    first_two_uuids = [ckpt.uuid for ckpt in first_two]
    ranked_uuids = [ckpt.uuid for ckpt in ranked]
    assert first_two_uuids == ranked_uuids[:2]

    # Check that metrics are truly in sorted order.
    assert all(ckpt.training is not None for ckpt in ranked)
    losses = [
        ckpt.training.validation_metrics["avgMetrics"]["validation_loss"]
        for ckpt in ranked
        if ckpt.training is not None
    ]
    assert losses == sorted(losses)

    # Check that changing smaller is better reverses the checkpoint ordering.
    reversed_ranking = experiment.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    reversed_uuids = [ckpt.uuid for ckpt in reversed_ranking]
    assert ranked_uuids == reversed_uuids[::-1]

    target = ranked[0]
    target.add_metadata({"testing": "metadata"})
    persisted = client.get_checkpoint(target.uuid)
    # Make sure the checkpoint metadata is correct and correctly saved to the db.
    # Beginning with 0.18 the system contributes a few items to the dict
    assert target.metadata.get("testing") == "metadata"
    assert target.metadata.keys() == {"format", "framework", "steps_completed", "testing"}
    assert target.metadata == persisted.metadata

    target.add_metadata({"some_key": "some_value"})
    persisted = client.get_checkpoint(target.uuid)
    assert target.metadata.items() > {"testing": "metadata", "some_key": "some_value"}.items()
    assert target.metadata.keys() == {
        "format",
        "framework",
        "steps_completed",
        "testing",
        "some_key",
    }
    assert target.metadata == persisted.metadata

    target.add_metadata({"testing": "override"})
    persisted = client.get_checkpoint(target.uuid)
    assert target.metadata.items() > {"testing": "override", "some_key": "some_value"}.items()
    assert target.metadata == persisted.metadata

    target.remove_metadata(["some_key"])
    persisted = client.get_checkpoint(target.uuid)
    assert "some_key" not in target.metadata
    assert target.metadata["testing"] == "override"
    assert target.metadata == persisted.metadata
def test_model_registry() -> None:
    """End-to-end model registry test: metadata, labels, archival, version
    registration/naming/notes/deletion, listing, and label aggregation.

    All registry mutations happen inside a try/finally so that any models
    created by the test are deleted from the registry even on failure.
    """
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )
    d = Determined(conf.make_master_url())
    # Track created models so the finally-block can clean up whatever exists.
    mnist = None
    objectdetect = None
    tform = None
    try:
        # Create a model and validate twiddling the metadata.
        mnist = d.create_model("mnist", "simple computer vision model", labels=["a", "b"])
        assert mnist.metadata == {}

        mnist.add_metadata({"testing": "metadata"})
        db_model = d.get_model(mnist.name)
        # Make sure the model metadata is correct and correctly saved to the db.
        assert mnist.metadata == db_model.metadata
        assert mnist.metadata == {"testing": "metadata"}

        # Confirm we can look up a model by its ID
        db_model = d.get_model_by_id(mnist.model_id)
        assert db_model.name == "mnist"
        # get_model also accepts an ID, not just a name.
        db_model = d.get_model(mnist.model_id)
        assert db_model.name == "mnist"

        # Confirm DB assigned username
        assert db_model.username == "determined"

        # New keys merge into existing metadata.
        mnist.add_metadata({"some_key": "some_value"})
        db_model = d.get_model(mnist.name)
        assert mnist.metadata == db_model.metadata
        assert mnist.metadata == {
            "testing": "metadata",
            "some_key": "some_value"
        }

        # Re-adding an existing key overrides its value.
        mnist.add_metadata({"testing": "override"})
        db_model = d.get_model(mnist.name)
        assert mnist.metadata == db_model.metadata
        assert mnist.metadata == {
            "testing": "override",
            "some_key": "some_value"
        }

        mnist.remove_metadata(["some_key"])
        db_model = d.get_model(mnist.name)
        assert mnist.metadata == db_model.metadata
        assert mnist.metadata == {"testing": "override"}

        # set_labels replaces the label list wholesale.
        mnist.set_labels(["hello", "world"])
        db_model = d.get_model(mnist.name)
        assert mnist.labels == db_model.labels
        assert db_model.labels == ["hello", "world"]

        # confirm patch does not overwrite other fields
        mnist.set_description("abcde")
        db_model = d.get_model(mnist.name)
        assert db_model.metadata == {"testing": "override"}
        assert db_model.labels == ["hello", "world"]

        # overwrite labels to empty list
        mnist.set_labels([])
        db_model = d.get_model(mnist.name)
        assert db_model.labels == []

        # archive and unarchive
        assert mnist.archived is False
        mnist.archive()
        db_model = d.get_model(mnist.name)
        assert db_model.archived is True
        mnist.unarchive()
        db_model = d.get_model(mnist.name)
        assert db_model.archived is False

        # Register a version for the model and validate the latest.
        checkpoint = d.get_experiment(exp_id).top_checkpoint()
        model_version = mnist.register_version(checkpoint.uuid)
        assert model_version.model_version == 1

        latest_version = mnist.get_version()
        assert latest_version is not None
        assert latest_version.checkpoint.uuid == checkpoint.uuid

        # Version name and notes round-trip through the DB.
        latest_version.set_name("Test 2021")
        db_version = mnist.get_version()
        assert db_version is not None
        assert db_version.name == "Test 2021"

        latest_version.set_notes("# Hello Markdown")
        db_version = mnist.get_version()
        assert db_version is not None
        assert db_version.notes == "# Hello Markdown"

        # Run another basic test and register its checkpoint as a version as well.
        # Validate the latest has been updated.
        exp_id = exp.run_basic_test(
            conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
            conf.tutorials_path("mnist_pytorch"),
            None,
        )
        checkpoint = d.get_experiment(exp_id).top_checkpoint()
        model_version = mnist.register_version(checkpoint.uuid)
        assert model_version.model_version == 2

        latest_version = mnist.get_version()
        assert latest_version is not None
        assert latest_version.checkpoint.uuid == checkpoint.uuid

        # Ensure the correct number of versions are present.
        all_versions = mnist.get_versions()
        assert len(all_versions) == 2

        # Test deletion of model version
        latest_version.delete()
        all_versions = mnist.get_versions()
        assert len(all_versions) == 1

        # Create some more models and validate listing models.
        tform = d.create_model("transformer", "all you need is attention")
        objectdetect = d.create_model("ac - Dc", "a test name model")

        models = d.get_models(sort_by=ModelSortBy.NAME)
        assert [m.name for m in models] == ["ac - Dc", "mnist", "transformer"]

        # Test model labels combined
        mnist.set_labels(["hello", "world"])
        tform.set_labels(["world", "test", "zebra"])
        labels = d.get_model_labels()
        assert labels == ["world", "hello", "test", "zebra"]

        # Test deletion of model
        tform.delete()
        # Clear the local reference so the finally-block skips the deleted model.
        tform = None
        models = d.get_models(sort_by=ModelSortBy.NAME)
        assert [m.name for m in models] == ["ac - Dc", "mnist"]
    finally:
        # Clean model registry of test models
        for model in [mnist, objectdetect, tform]:
            if model is not None:
                model.delete()
def test_model_registry() -> None:
    """Exercise model metadata round-trips through the DB, version registration
    across two experiments, and sorted model listing."""
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )

    client = Determined(conf.make_master_url())

    # Create a model and validate twiddling the metadata.
    model = client.create_model("mnist", "simple computer vision model")
    assert model.metadata == {}

    model.add_metadata({"testing": "metadata"})
    stored = client.get_model("mnist")
    # Make sure the model metadata is correct and correctly saved to the db.
    assert model.metadata == stored.metadata
    assert model.metadata == {"testing": "metadata"}

    model.add_metadata({"some_key": "some_value"})
    stored = client.get_model("mnist")
    assert model.metadata == stored.metadata
    assert model.metadata == {"testing": "metadata", "some_key": "some_value"}

    model.add_metadata({"testing": "override"})
    stored = client.get_model("mnist")
    assert model.metadata == stored.metadata
    assert model.metadata == {"testing": "override", "some_key": "some_value"}

    model.remove_metadata(["some_key"])
    stored = client.get_model("mnist")
    assert model.metadata == stored.metadata
    assert model.metadata == {"testing": "override"}

    # Register a version for the model and validate the latest.
    ckpt = client.get_experiment(exp_id).top_checkpoint()
    version = model.register_version(ckpt.uuid)
    assert version.model_version == 1

    newest = model.get_version()
    assert newest is not None
    assert newest.uuid == ckpt.uuid

    # Run another basic test and register its checkpoint as a version as well.
    # Validate the latest has been updated.
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )
    ckpt = client.get_experiment(exp_id).top_checkpoint()
    version = model.register_version(ckpt.uuid)
    assert version.model_version == 2

    newest = model.get_version()
    assert newest is not None
    assert newest.uuid == ckpt.uuid

    # Ensure the correct number of versions are present.
    assert len(model.get_versions()) == 2

    # Create some more models and validate listing models.
    client.create_model("transformer", "all you need is attention")
    client.create_model("object-detection", "a bounding box model")
    listing = client.get_models(sort_by=ModelSortBy.NAME)
    assert [m.name for m in listing] == ["mnist", "object-detection", "transformer"]