def test_master_success_influences_other_experiment_workers_status(self):
    """A succeeded master job moves the workers and the experiment to SUCCEEDED."""
    # Create the experiment with the build task and `set_status` patched out.
    build_task = 'scheduler.tasks.experiments.experiments_build.apply_async'
    with patch(build_task), patch.object(Experiment, 'set_status'):
        experiment = ExperimentFactory()
    assert ExperimentLifeCycle.is_done(experiment.last_status) is False

    # Attach one master and two workers; none of them may be done yet.
    master = ExperimentJobFactory(experiment=experiment, role=TaskType.MASTER)
    assert JobLifeCycle.is_done(master.last_status) is False
    workers = []
    for _ in range(2):
        workers.append(ExperimentJobFactory(experiment=experiment, role=TaskType.WORKER))
    for worker in workers:
        worker.refresh_from_db()
        assert JobLifeCycle.is_done(worker.last_status) is False

    # Record a SUCCEEDED status on the master only.
    ExperimentJobStatusFactory(job=master, status=JobLifeCycle.SUCCEEDED)

    # Every worker is expected to end up SUCCEEDED as well ...
    for worker in workers:
        worker.refresh_from_db()
        assert worker.last_status == JobLifeCycle.SUCCEEDED

    # ... and so is the experiment itself.
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.SUCCEEDED
def test_independent_experiment_creation_with_run_triggers_experiment_building_scheduling(self):
    """Follow an exec experiment from creation through the build step to FAILED."""
    spec = ExperimentSpecification.read(exec_experiment_spec_content)
    # The project gets a repo before the experiment is created.
    project_repo = RepoFactory()

    with patch('scheduler.tasks.experiments.experiments_build.apply_async') as build_task_mock:
        experiment = ExperimentFactory(config=spec.parsed_data, project=project_repo.project)

    def status_rows():
        # Build a fresh queryset on every call so each check hits the database.
        return ExperimentStatus.objects.filter(experiment=experiment)

    assert build_task_mock.call_count == 1
    assert experiment.project.repo is not None
    assert experiment.is_independent is True

    # Only the creation status has been recorded at this point.
    assert status_rows().count() == 1
    assert list(status_rows().values_list('status', flat=True)) == [
        ExperimentLifeCycle.CREATED,
    ]

    # Run the build task directly, with the dockerizer builder mocked.
    with patch('dockerizer.builders.experiments.build_experiment') as docker_build_mock:
        build_experiment(experiment_id=experiment.id)

    assert docker_build_mock.call_count == 1
    assert status_rows().count() == 4
    assert list(status_rows().values_list('status', flat=True)) == [
        ExperimentLifeCycle.CREATED,
        ExperimentLifeCycle.BUILDING,
        ExperimentLifeCycle.SCHEDULED,
        ExperimentLifeCycle.FAILED,
    ]
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.FAILED
def test_create_experiment_with_valid_spec(self, spawner_mock):
    """A valid spec yields a STARTING experiment with a single CREATED job."""
    spec = ExperimentSpecification.read(experiment_spec_content)

    # Configure the object returned by the mocked spawner.
    spawner = spawner_mock.return_value
    spawner.start_experiment.return_value = start_experiment_value
    spawner.spec = spec

    experiment = ExperimentFactory(config=spec.parsed_data)
    assert experiment.is_independent is True

    status_filter = {'experiment': experiment}
    assert ExperimentStatus.objects.filter(**status_filter).count() == 3
    recorded = ExperimentStatus.objects.filter(**status_filter).values_list('status', flat=True)
    assert list(recorded) == [
        ExperimentLifeCycle.CREATED,
        ExperimentLifeCycle.SCHEDULED,
        ExperimentLifeCycle.STARTING,
    ]
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.STARTING

    # Exactly one job and no resources rows are expected.
    assert ExperimentJob.objects.filter(experiment=experiment).count() == 1
    assert JobResources.objects.count() == 0
    jobs_statuses = ExperimentJob.objects.values_list('statuses__status', flat=True)
    assert set(jobs_statuses) == {JobLifeCycle.CREATED}
    jobs = ExperimentJob.objects.filter(experiment=experiment)
    assert experiment.calculated_status == ExperimentLifeCycle.STARTING

    # Each job must still report CREATED as its last status.
    for job in jobs:
        assert job.last_status == JobLifeCycle.CREATED
def test_independent_experiment_creation_with_run_triggers_experiment_scheduling(self):
    """With a succeeded build job, the build task records SCHEDULED and then FAILED."""
    spec = ExperimentSpecification.read(exec_experiment_spec_content)
    # The project gets a repo before the experiment is created.
    project_repo = RepoFactory()

    with patch('scheduler.tasks.experiments.experiments_build.apply_async') as build_task_mock:
        experiment = ExperimentFactory(content=spec.raw_data, project=project_repo.project)

    def status_rows():
        # Build a fresh queryset on every call so each check hits the database.
        return ExperimentStatus.objects.filter(experiment=experiment)

    assert build_task_mock.call_count == 1
    assert experiment.project.repo is not None
    assert experiment.is_independent is True

    assert status_rows().count() == 1
    assert list(status_rows().values_list('status', flat=True)) == [
        ExperimentLifeCycle.CREATED,
    ]

    # Have the mocked `create_build_job` hand back a build job that already succeeded.
    with patch('scheduler.dockerizer_scheduler.create_build_job') as mock_start:
        build = BuildJobFactory()
        BuildJobStatus.objects.create(status=JobLifeCycle.SUCCEEDED, job=build)
        mock_start.return_value = (build, True, True)
        experiments_build(experiment_id=experiment.id)

    assert mock_start.call_count == 1
    assert status_rows().count() == 3
    assert list(status_rows().values_list('status', flat=True)) == [
        ExperimentLifeCycle.CREATED,
        ExperimentLifeCycle.SCHEDULED,
        ExperimentLifeCycle.FAILED,
    ]
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.FAILED
def test_create_experiment_with_resources_spec(self, spawner_mock):
    """A spec with resources yields three CREATED jobs and three resources rows."""
    spec = ExperimentSpecification.read(exec_experiment_resources_content)

    # Configure the object returned by the mocked spawner.
    spawner = spawner_mock.return_value
    spawner.start_experiment.return_value = start_experiment_value
    spawner.job_uuids = {
        'master': ['fa6203c189a855dd977019854a7ffcc3'],
        'worker': ['3a9c9b0bd56b5e9fbdbd1a3d43d57960'],
        'ps': ['59e3601232b85a3d8be2511f23a62945'],
    }
    spawner.spec = spec

    experiment = ExperimentFactory(content=spec.raw_data)
    assert experiment.is_independent is True

    status_filter = {'experiment': experiment}
    assert ExperimentStatus.objects.filter(**status_filter).count() == 3
    recorded = ExperimentStatus.objects.filter(**status_filter).values_list('status', flat=True)
    assert list(recorded) == [
        ExperimentLifeCycle.CREATED,
        ExperimentLifeCycle.SCHEDULED,
        ExperimentLifeCycle.STARTING,
    ]
    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.STARTING

    # Three jobs, each with a resources row, are expected.
    assert ExperimentJob.objects.filter(experiment=experiment).count() == 3
    assert JobResources.objects.count() == 3
    jobs_statuses = ExperimentJob.objects.values_list('statuses__status', flat=True)
    assert set(jobs_statuses) == {JobLifeCycle.CREATED}
    jobs = ExperimentJob.objects.filter(experiment=experiment)
    assert experiment.calculated_status == ExperimentLifeCycle.STARTING

    # Each job must still report CREATED as its last status.
    for job in jobs:
        assert job.last_status == JobLifeCycle.CREATED
def test_resume(self):
    """Resuming keeps the same experiment row and moves it to RESUMING."""
    stopped = ExperimentLifeCycle.STOPPED
    resuming = ExperimentLifeCycle.RESUMING

    experiment = ExperimentFactory()
    count_experiment = Experiment.objects.count()
    ExperimentStatus.objects.create(experiment=experiment, status=stopped)
    assert experiment.last_status == stopped

    original_config = experiment.config
    original_declarations = experiment.declarations

    # First resume: no overrides, so config and declarations must be untouched.
    experiment.resume()
    experiment.refresh_from_db()
    assert experiment.last_status == resuming
    assert experiment.config == original_config
    assert experiment.declarations == original_declarations
    assert Experiment.objects.count() == count_experiment

    # Stop it again before the second resume.
    ExperimentStatus.objects.create(experiment=experiment, status=stopped)
    assert experiment.last_status == stopped

    # Second resume: override the declarations only.
    new_declarations = dict(lr=0.1, dropout=0.5)
    experiment.resume(declarations=new_declarations)
    experiment.refresh_from_db()
    assert experiment.last_status == resuming
    assert Experiment.objects.count() == count_experiment
    assert experiment.config == original_config
    assert experiment.declarations != original_declarations
    assert experiment.declarations == new_declarations
def test_independent_experiment_creation_triggers_experiment_scheduling(self):
    """Creating an independent experiment records CREATED, SCHEDULED and FAILED."""
    spec = ExperimentSpecification.read(experiment_spec_content)
    experiment = ExperimentFactory(config=spec.parsed_data)
    assert experiment.is_independent is True

    expected_statuses = [
        ExperimentLifeCycle.CREATED,
        ExperimentLifeCycle.SCHEDULED,
        ExperimentLifeCycle.FAILED,
    ]
    assert ExperimentStatus.objects.filter(experiment=experiment).count() == 3
    recorded = ExperimentStatus.objects.filter(experiment=experiment).values_list(
        'status', flat=True)
    assert list(recorded) == expected_statuses

    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.FAILED
def test_status_update_results_in_new_updated_at_datetime_experiment(self):
    """Each new status must push the experiment's `updated_at` forward."""
    experiment = ExperimentFactory()
    at_creation = experiment.updated_at

    # Path 1: create the status row directly.
    ExperimentStatus.objects.create(experiment=experiment,
                                    status=ExperimentLifeCycle.STARTING)
    experiment.refresh_from_db()
    assert at_creation < experiment.updated_at

    after_first_status = experiment.updated_at

    # Path 2: go through `set_status`.
    experiment.set_status(ExperimentLifeCycle.FAILED)
    experiment.refresh_from_db()
    assert after_first_status < experiment.updated_at
def test_independent_experiment_creation_triggers_experiment_scheduling(self):
    """With `create_build_job` mocked, creation records CREATED, SCHEDULED, FAILED."""
    spec = ExperimentSpecification.read(exec_experiment_spec_content)

    # The mocked build-job creation returns a fresh build job plus two True flags.
    with patch('scheduler.dockerizer_scheduler.create_build_job') as mock_start:
        mock_start.return_value = (BuildJobFactory(), True, True)
        experiment = ExperimentFactory(content=spec.raw_data)

    assert experiment.is_independent is True

    expected_statuses = [
        ExperimentLifeCycle.CREATED,
        ExperimentLifeCycle.SCHEDULED,
        ExperimentLifeCycle.FAILED,
    ]
    assert ExperimentStatus.objects.filter(experiment=experiment).count() == 3
    recorded = ExperimentStatus.objects.filter(experiment=experiment).values_list(
        'status', flat=True)
    assert list(recorded) == expected_statuses

    experiment.refresh_from_db()
    assert experiment.last_status == ExperimentLifeCycle.FAILED
def test_experiments_check_heartbeat(self):
    """A pinged experiment stays RUNNING; one without a ping ends up FAILED."""
    running = ExperimentLifeCycle.RUNNING

    # First experiment: running, and a heartbeat ping is registered for it.
    pinged = ExperimentFactory()
    ExperimentStatusFactory(experiment=pinged, status=running)
    RedisHeartBeat.experiment_ping(experiment_id=pinged.id)

    # Second experiment: running, but no ping is sent.
    silent = ExperimentFactory()
    ExperimentStatusFactory(experiment=silent, status=running)

    experiments_check_heartbeat(pinged.id)
    pinged.refresh_from_db()
    self.assertEqual(pinged.last_status, running)

    experiments_check_heartbeat(silent.id)
    silent.refresh_from_db()
    self.assertEqual(silent.last_status, ExperimentLifeCycle.FAILED)
def test_set_metrics(self):
    """Metrics can be set from a single dict or a list, and `last_metric` is updated."""
    spec = ExperimentSpecification.read(exec_experiment_spec_content)
    experiment = ExperimentFactory(content=spec.raw_data)
    assert experiment.metrics.count() == 0

    created_at = timezone.now()

    # One payload passed as a plain dict.
    single_payload = {
        'created_at': created_at,
        'values': {'accuracy': 0.9, 'precision': 0.9},
    }
    experiments_set_metrics(experiment_id=experiment.id, data=single_payload)
    assert experiment.metrics.count() == 1
    experiment.refresh_from_db()
    assert experiment.last_metric == {'accuracy': 0.9, 'precision': 0.9}

    # Two payloads passed as a list.
    batch_payload = [
        {
            'created_at': created_at,
            'values': {'accuracy': 0.92, 'precision': 0.93, 'foo': 1},
        },
        {
            'created_at': created_at,
            'values': {'accuracy': 0.95, 'precision': 0.96, 'bar': 8},
        },
    ]
    experiments_set_metrics(experiment_id=experiment.id, data=batch_payload)
    assert experiment.metrics.count() == 3
    experiment.refresh_from_db()
    # `last_metric` is expected to hold the keys of both batch entries.
    assert experiment.last_metric == {
        'accuracy': 0.95,
        'precision': 0.96,
        'bar': 8,
        'foo': 1,
    }
def test_resume(self):
    """Resuming creates RESUME clones of the original; deleting the original removes all."""
    stopped = ExperimentLifeCycle.STOPPED

    experiment = ExperimentFactory()
    count_experiment = Experiment.objects.count()
    ExperimentStatus.objects.create(experiment=experiment, status=stopped)
    assert experiment.last_status == stopped

    config = experiment.config
    declarations = experiment.declarations

    def newest_resume_clone():
        # Last clone of the original experiment created with the RESUME strategy.
        return experiment.clones.filter(cloning_strategy=CloningStrategy.RESUME).last()

    # --- Resume with the same config ---
    experiment.resume()
    experiment.refresh_from_db()
    assert experiment.last_status == stopped
    last_resumed_experiment = newest_resume_clone()
    assert last_resumed_experiment.config == config
    assert last_resumed_experiment.declarations == declarations
    assert Experiment.objects.count() == count_experiment + 1
    assert experiment.clones.count() == 1

    # --- Resume with different declarations ---
    new_declarations = dict(lr=0.1, dropout=0.5)
    new_experiment = experiment.resume(declarations=new_declarations)
    experiment.refresh_from_db()
    assert experiment.last_status == stopped
    last_resumed_experiment = newest_resume_clone()
    assert last_resumed_experiment.config == config
    assert last_resumed_experiment.declarations != declarations
    assert last_resumed_experiment.declarations == new_declarations
    assert Experiment.objects.count() == count_experiment + 2
    assert experiment.clones.count() == 2

    # --- Resume an experiment that is itself a resume ---
    new_experiment.resume()
    experiment.refresh_from_db()
    assert experiment.last_status == stopped
    last_resumed_experiment_new = newest_resume_clone()
    # The newest clone points at the original experiment, not at the clone it came from.
    assert last_resumed_experiment_new.original_experiment.pk != last_resumed_experiment.pk
    assert (last_resumed_experiment_new.original_experiment.pk ==
            last_resumed_experiment.original_experiment.pk)
    # NOTE(review): the next three checks look at the previous clone again, not at
    # `last_resumed_experiment_new` -- confirm whether that is intended.
    assert last_resumed_experiment.config == config
    assert last_resumed_experiment.declarations != declarations
    assert last_resumed_experiment.declarations == new_declarations
    assert Experiment.objects.count() == count_experiment + 3
    assert experiment.clones.count() == 3

    # Deleting one resumed experiment leaves the other clones in place.
    last_resumed_experiment_new.delete()
    assert experiment.clones.count() == 2

    # Deleting the original experiment removes everything.
    experiment.delete()
    assert Experiment.objects.count() == 0
class TestBuildJobStatuses(BaseTest):
    """Checks how a build job's status reaches the entities created with that build job."""

    def setUp(self):
        """Create one project, one build job, and four entities attached to both."""
        super().setUp()
        self.project = ProjectFactory()
        self.build_job = BuildJobFactory(project=self.project)
        shared = dict(project=self.project, build_job=self.build_job)
        self.notebook = NotebookJobFactory(**shared)
        self.tensorboard = TensorboardJobFactory(**shared)
        self.job = JobFactory(**shared)
        self.experiment = ExperimentFactory(**shared)

    def test_build_job_failed_sets_dependency_to_failed(self):
        """A FAILED build job marks every dependent entity as FAILED."""
        job_failed = JobLifeCycle.FAILED
        dependents = (
            (self.notebook, job_failed),
            (self.tensorboard, job_failed),
            (self.job, job_failed),
            (self.experiment, ExperimentLifeCycle.FAILED),
        )

        # Nothing is failed before the build job changes status.
        assert self.build_job.last_status != job_failed
        for entity, failed in dependents:
            assert entity.last_status != failed

        self.build_job.set_status(job_failed)

        assert self.build_job.last_status == job_failed
        for entity, failed in dependents:
            entity.refresh_from_db()
            assert entity.last_status == failed

    def test_build_job_stopped_sets_dependency_to_stopped(self):
        """A STOPPED build job marks every dependent entity as STOPPED."""
        job_stopped = JobLifeCycle.STOPPED
        dependents = (
            (self.notebook, job_stopped),
            (self.tensorboard, job_stopped),
            (self.job, job_stopped),
            (self.experiment, ExperimentLifeCycle.STOPPED),
        )

        # Nothing is stopped before the build job changes status.
        assert self.build_job.last_status != job_stopped
        for entity, stopped in dependents:
            assert entity.last_status != stopped

        self.build_job.set_status(job_stopped)

        assert self.build_job.last_status == job_stopped
        for entity, stopped in dependents:
            entity.refresh_from_db()
            assert entity.last_status == stopped

    def test_build_job_succeeded_starts_dependency(self):
        """A SUCCEEDED build job calls each scheduler's start function exactly once."""
        job_succeeded = JobLifeCycle.SUCCEEDED

        assert self.build_job.last_status != job_succeeded
        for entity in (self.notebook, self.tensorboard, self.job):
            assert entity.last_status != job_succeeded
        assert self.experiment.last_status != ExperimentLifeCycle.SUCCEEDED

        # Mock all four start functions while the build job is marked as succeeded.
        with patch('scheduler.notebook_scheduler.start_notebook') as mock_notebook, \
                patch('scheduler.tensorboard_scheduler.start_tensorboard') as mock_tensorboard, \
                patch('scheduler.experiment_scheduler.start_experiment') as mock_experiment, \
                patch('scheduler.job_scheduler.start_job') as mock_job:
            self.build_job.set_status(job_succeeded)

        assert self.build_job.last_status == job_succeeded
        for start_mock in (mock_notebook, mock_tensorboard, mock_experiment, mock_job):
            assert start_mock.call_count == 1
def test_resume(self):
    """Resume clones of the original, and `stop_experiment` calls made on deletion."""
    stopped = ExperimentLifeCycle.STOPPED

    experiment = ExperimentFactory()
    count_experiment = Experiment.objects.count()
    ExperimentStatus.objects.create(experiment=experiment, status=stopped)
    assert experiment.last_status == stopped

    config = experiment.config
    declarations = experiment.declarations

    def newest_resume_clone():
        # Last clone of the original experiment created with the RESUME strategy.
        return experiment.clones.filter(cloning_strategy=CloningStrategy.RESUME).last()

    # --- Resume with the same config ---
    experiment.resume()
    experiment.refresh_from_db()
    assert experiment.last_status == stopped
    last_resumed_experiment = newest_resume_clone()
    assert last_resumed_experiment.config == config
    assert last_resumed_experiment.declarations == declarations
    assert Experiment.objects.count() == count_experiment + 1
    assert experiment.clones.count() == 1

    # --- Resume with different declarations ---
    new_declarations = dict(lr=0.1, dropout=0.5)
    new_experiment = experiment.resume(declarations=new_declarations)
    experiment.refresh_from_db()
    assert experiment.last_status == stopped
    last_resumed_experiment = newest_resume_clone()
    assert last_resumed_experiment.config == config
    assert last_resumed_experiment.declarations != declarations
    assert last_resumed_experiment.declarations == new_declarations
    assert Experiment.objects.count() == count_experiment + 2
    assert experiment.clones.count() == 2

    # --- Resume an experiment that is itself a resume, with the build task mocked ---
    with patch('scheduler.tasks.experiments.experiments_build.apply_async'):
        resumed = new_experiment.resume()
        ExperimentStatusFactory(experiment=resumed, status=ExperimentLifeCycle.CREATED)
    experiment.refresh_from_db()
    assert experiment.last_status == stopped
    last_resumed_experiment_new = newest_resume_clone()
    # The newest clone points at the original experiment, not at the clone it came from.
    assert last_resumed_experiment_new.original_experiment.pk != last_resumed_experiment.pk
    assert (last_resumed_experiment_new.original_experiment.pk ==
            last_resumed_experiment.original_experiment.pk)
    # NOTE(review): the next three checks look at the previous clone again, not at
    # `last_resumed_experiment_new` -- confirm whether that is intended.
    assert last_resumed_experiment.config == config
    assert last_resumed_experiment.declarations != declarations
    assert last_resumed_experiment.declarations == new_declarations
    assert Experiment.objects.count() == count_experiment + 3
    assert experiment.clones.count() == 3

    # Deleting one resumed experiment leaves the other clones in place. The clone is
    # first moved to SCHEDULED and given a job, and one stop call is expected.
    last_resumed_experiment_new.set_status(ExperimentLifeCycle.SCHEDULED)
    ExperimentJobFactory(experiment=last_resumed_experiment_new)
    with patch('scheduler.experiment_scheduler.stop_experiment') as mock_stop:
        last_resumed_experiment_new.delete()
    assert experiment.clones.count() == 2
    assert mock_stop.call_count == 1

    # Deleting the original experiment removes everything.
    with patch('scheduler.experiment_scheduler.stop_experiment') as mock_stop:
        experiment.delete()
    assert Experiment.objects.count() == 0
    assert mock_stop.call_count == 0  # No running experiment