Пример #1
0
    def test_master_success_influences_other_experiment_workers_status(self):
        # Checks that once the master job is given a SUCCEEDED status, the
        # worker jobs and the experiment itself report SUCCEEDED as well.

        # Build scheduling and Experiment.set_status are patched while the
        # experiment is being created.
        build_patch = patch('scheduler.tasks.experiments.experiments_build.apply_async')
        status_patch = patch.object(Experiment, 'set_status')
        with build_patch, status_patch:
            experiment = ExperimentFactory()

        assert ExperimentLifeCycle.is_done(experiment.last_status) is False

        # Attach one master job and two worker jobs
        master_job = ExperimentJobFactory(experiment=experiment, role=TaskType.MASTER)
        assert JobLifeCycle.is_done(master_job.last_status) is False
        worker_jobs = []
        for _ in range(2):
            worker_jobs.append(
                ExperimentJobFactory(experiment=experiment, role=TaskType.WORKER))
        for worker_job in worker_jobs:
            worker_job.refresh_from_db()
            assert JobLifeCycle.is_done(worker_job.last_status) is False

        # Mark the master as succeeded
        ExperimentJobStatusFactory(job=master_job, status=JobLifeCycle.SUCCEEDED)

        # Every worker is expected to report the success status
        for worker_job in worker_jobs:
            worker_job.refresh_from_db()
            assert worker_job.last_status == JobLifeCycle.SUCCEEDED

        # The experiment's last status is expected to be success too
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.SUCCEEDED
Пример #2
0
    def test_independent_experiment_creation_with_run_triggers_experiment_building_scheduling(self):
        # Creating an independent experiment from the exec spec is expected to
        # schedule exactly one build; running build_experiment with the docker
        # build mocked is then expected to end with a FAILED status.
        def status_rows():
            # Fresh queryset of the statuses recorded for the experiment
            return ExperimentStatus.objects.filter(experiment=experiment)

        spec = ExperimentSpecification.read(exec_experiment_spec_content)
        # The project needs a repo
        project_repo = RepoFactory()

        with patch('scheduler.tasks.experiments.experiments_build.apply_async') as mock_schedule:
            experiment = ExperimentFactory(config=spec.parsed_data, project=project_repo.project)

        assert mock_schedule.call_count == 1
        assert experiment.project.repo is not None
        assert experiment.is_independent is True

        assert status_rows().count() == 1
        assert list(status_rows().values_list('status', flat=True)) == [
            ExperimentLifeCycle.CREATED,
        ]

        with patch('dockerizer.builders.experiments.build_experiment') as mock_docker_build:
            build_experiment(experiment_id=experiment.id)

        assert mock_docker_build.call_count == 1
        expected_statuses = [
            ExperimentLifeCycle.CREATED,
            ExperimentLifeCycle.BUILDING,
            ExperimentLifeCycle.SCHEDULED,
            ExperimentLifeCycle.FAILED,
        ]
        assert status_rows().count() == 4
        assert list(status_rows().values_list('status', flat=True)) == expected_statuses
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.FAILED
0
    def test_create_experiment_with_valid_spec(self, spawner_mock):
        # With the spawner mocked, creating an experiment from a valid spec is
        # expected to reach STARTING and to create a single job in CREATED
        # state without any JobResources rows.
        spec = ExperimentSpecification.read(experiment_spec_content)

        # Configure the object returned by the mocked spawner
        spawner = spawner_mock.return_value
        spawner.start_experiment.return_value = start_experiment_value
        spawner.spec = spec

        experiment = ExperimentFactory(config=spec.parsed_data)
        assert experiment.is_independent is True

        def status_rows():
            # Fresh queryset of the statuses recorded for the experiment
            return ExperimentStatus.objects.filter(experiment=experiment)

        assert status_rows().count() == 3
        expected_statuses = [
            ExperimentLifeCycle.CREATED,
            ExperimentLifeCycle.SCHEDULED,
            ExperimentLifeCycle.STARTING,
        ]
        assert list(status_rows().values_list('status', flat=True)) == expected_statuses
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.STARTING

        # Exactly one job, and no resources
        assert ExperimentJob.objects.filter(experiment=experiment).count() == 1
        assert JobResources.objects.count() == 0
        # Note: this status lookup spans all ExperimentJob rows, not only
        # those of this experiment.
        all_job_statuses = ExperimentJob.objects.values_list('statuses__status', flat=True)
        assert set(all_job_statuses) == {JobLifeCycle.CREATED}
        assert experiment.calculated_status == ExperimentLifeCycle.STARTING

        # Each job of the experiment should still be in the created state
        for experiment_job in ExperimentJob.objects.filter(experiment=experiment):
            assert experiment_job.last_status == JobLifeCycle.CREATED
Пример #4
0
    def test_independent_experiment_creation_with_run_triggers_experiment_scheduling(self):
        # Creating an independent experiment from the exec spec is expected to
        # schedule one build; running experiments_build with create_build_job
        # mocked to return a succeeded build is then expected to end FAILED.
        def status_rows():
            # Fresh queryset of the statuses recorded for the experiment
            return ExperimentStatus.objects.filter(experiment=experiment)

        spec = ExperimentSpecification.read(exec_experiment_spec_content)
        # The project needs a repo
        project_repo = RepoFactory()

        with patch('scheduler.tasks.experiments.experiments_build.apply_async') as mock_build:
            experiment = ExperimentFactory(content=spec.raw_data, project=project_repo.project)

        assert mock_build.call_count == 1
        assert experiment.project.repo is not None
        assert experiment.is_independent is True

        assert status_rows().count() == 1
        assert list(status_rows().values_list('status', flat=True)) == [
            ExperimentLifeCycle.CREATED,
        ]

        with patch('scheduler.dockerizer_scheduler.create_build_job') as mock_start:
            # The mocked scheduler hands back a build job that already succeeded
            succeeded_build = BuildJobFactory()
            BuildJobStatus.objects.create(status=JobLifeCycle.SUCCEEDED, job=succeeded_build)
            mock_start.return_value = (succeeded_build, True, True)
            experiments_build(experiment_id=experiment.id)

        assert mock_start.call_count == 1
        expected_statuses = [
            ExperimentLifeCycle.CREATED,
            ExperimentLifeCycle.SCHEDULED,
            ExperimentLifeCycle.FAILED,
        ]
        assert status_rows().count() == 3
        assert list(status_rows().values_list('status', flat=True)) == expected_statuses
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.FAILED
Пример #5
0
    def test_create_experiment_with_resources_spec(self, spawner_mock):
        # With the spawner mocked (including job uuids for master, worker and
        # ps), creating an experiment from the resources spec is expected to
        # reach STARTING and to create three jobs, each with resources.
        spec = ExperimentSpecification.read(exec_experiment_resources_content)

        # Configure the object returned by the mocked spawner
        spawner = spawner_mock.return_value
        spawner.start_experiment.return_value = start_experiment_value
        spawner.job_uuids = {
            'master': ['fa6203c189a855dd977019854a7ffcc3'],
            'worker': ['3a9c9b0bd56b5e9fbdbd1a3d43d57960'],
            'ps': ['59e3601232b85a3d8be2511f23a62945'],
        }
        spawner.spec = spec

        experiment = ExperimentFactory(content=spec.raw_data)
        assert experiment.is_independent is True

        def status_rows():
            # Fresh queryset of the statuses recorded for the experiment
            return ExperimentStatus.objects.filter(experiment=experiment)

        assert status_rows().count() == 3
        expected_statuses = [
            ExperimentLifeCycle.CREATED,
            ExperimentLifeCycle.SCHEDULED,
            ExperimentLifeCycle.STARTING,
        ]
        assert list(status_rows().values_list('status', flat=True)) == expected_statuses

        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.STARTING

        # Three jobs are expected, each with a resources row
        assert ExperimentJob.objects.filter(experiment=experiment).count() == 3
        assert JobResources.objects.count() == 3
        # Note: this status lookup spans all ExperimentJob rows, not only
        # those of this experiment.
        all_job_statuses = ExperimentJob.objects.values_list('statuses__status', flat=True)
        assert set(all_job_statuses) == {JobLifeCycle.CREATED}
        assert experiment.calculated_status == ExperimentLifeCycle.STARTING

        # Each job of the experiment should still be in the created state
        for experiment_job in ExperimentJob.objects.filter(experiment=experiment):
            assert experiment_job.last_status == JobLifeCycle.CREATED
Пример #6
0
    def test_resume(self):
        # Resuming a stopped experiment is expected to move the same
        # experiment to RESUMING without creating a new Experiment row; the
        # declarations change only when new ones are passed to resume().
        def mark_stopped():
            # Record a STOPPED status and check it is the latest one
            ExperimentStatus.objects.create(experiment=experiment,
                                            status=ExperimentLifeCycle.STOPPED)
            assert experiment.last_status == ExperimentLifeCycle.STOPPED

        experiment = ExperimentFactory()
        count_experiment = Experiment.objects.count()
        mark_stopped()

        original_config = experiment.config
        original_declarations = experiment.declarations

        # First resume: nothing overridden
        experiment.resume()
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.RESUMING
        assert experiment.config == original_config
        assert experiment.declarations == original_declarations
        assert Experiment.objects.count() == count_experiment

        mark_stopped()
        # Second resume: override the declarations
        new_declarations = {'lr': 0.1, 'dropout': 0.5}
        experiment.resume(declarations=new_declarations)
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.RESUMING
        assert Experiment.objects.count() == count_experiment
        assert experiment.config == original_config
        assert experiment.declarations != original_declarations
        assert experiment.declarations == new_declarations
Пример #7
0
    def test_independent_experiment_creation_triggers_experiment_scheduling(self):
        # Creating an independent experiment with nothing mocked is expected
        # to record CREATED, SCHEDULED and finally FAILED.
        spec = ExperimentSpecification.read(experiment_spec_content)
        experiment = ExperimentFactory(config=spec.parsed_data)
        assert experiment.is_independent is True

        def status_rows():
            # Fresh queryset of the statuses recorded for the experiment
            return ExperimentStatus.objects.filter(experiment=experiment)

        expected_statuses = [
            ExperimentLifeCycle.CREATED,
            ExperimentLifeCycle.SCHEDULED,
            ExperimentLifeCycle.FAILED,
        ]
        assert status_rows().count() == 3
        assert list(status_rows().values_list('status', flat=True)) == expected_statuses
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.FAILED
Пример #8
0
 def test_status_update_results_in_new_updated_at_datetime_experiment(self):
     # A new status, whether created directly or through set_status, is
     # expected to move the experiment's updated_at forward.
     experiment = ExperimentFactory()
     previous_update = experiment.updated_at

     # Status row created directly through the manager
     ExperimentStatus.objects.create(status=ExperimentLifeCycle.STARTING,
                                     experiment=experiment)
     experiment.refresh_from_db()
     assert previous_update < experiment.updated_at

     previous_update = experiment.updated_at
     # Status created through the model's set_status
     experiment.set_status(ExperimentLifeCycle.FAILED)
     experiment.refresh_from_db()
     assert previous_update < experiment.updated_at
Пример #9
0
    def test_independent_experiment_creation_triggers_experiment_scheduling(self):
        # With create_build_job mocked, creating an independent experiment
        # from the exec spec is expected to record CREATED, SCHEDULED and
        # finally FAILED.
        spec = ExperimentSpecification.read(exec_experiment_spec_content)
        with patch('scheduler.dockerizer_scheduler.create_build_job') as mock_start:
            build_result = (BuildJobFactory(), True, True)
            mock_start.return_value = build_result
            experiment = ExperimentFactory(content=spec.raw_data)
        assert experiment.is_independent is True

        def status_rows():
            # Fresh queryset of the statuses recorded for the experiment
            return ExperimentStatus.objects.filter(experiment=experiment)

        expected_statuses = [
            ExperimentLifeCycle.CREATED,
            ExperimentLifeCycle.SCHEDULED,
            ExperimentLifeCycle.FAILED,
        ]
        assert status_rows().count() == 3
        assert list(status_rows().values_list('status', flat=True)) == expected_statuses
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.FAILED
Пример #10
0
    def test_experiments_check_heartbeat(self):
        # Two running experiments: only the first one gets a heartbeat ping.
        # After the heartbeat check the pinged one is expected to stay
        # RUNNING and the other one to be FAILED.
        pinged = ExperimentFactory()
        ExperimentStatusFactory(status=ExperimentLifeCycle.RUNNING, experiment=pinged)
        RedisHeartBeat.experiment_ping(experiment_id=pinged.id)

        silent = ExperimentFactory()
        ExperimentStatusFactory(status=ExperimentLifeCycle.RUNNING, experiment=silent)

        experiments_check_heartbeat(pinged.id)
        pinged.refresh_from_db()
        self.assertEqual(pinged.last_status, ExperimentLifeCycle.RUNNING)

        experiments_check_heartbeat(silent.id)
        silent.refresh_from_db()
        self.assertEqual(silent.last_status, ExperimentLifeCycle.FAILED)
Пример #11
0
    def test_set_metrics(self):
        # experiments_set_metrics is called first with a single metric dict
        # and then with a list of two; the test checks the number of stored
        # metrics and the experiment's last_metric after each call.
        spec = ExperimentSpecification.read(exec_experiment_spec_content)
        experiment = ExperimentFactory(content=spec.raw_data)
        assert experiment.metrics.count() == 0

        created_at = timezone.now()

        # Single metric payload
        single_metric = {
            'created_at': created_at,
            'values': {'accuracy': 0.9, 'precision': 0.9},
        }
        experiments_set_metrics(experiment_id=experiment.id, data=single_metric)

        assert experiment.metrics.count() == 1
        experiment.refresh_from_db()
        assert experiment.last_metric == {'accuracy': 0.9, 'precision': 0.9}

        # Batch payload: two metrics, each carrying one extra key
        first_metric = {
            'created_at': created_at,
            'values': {'accuracy': 0.92, 'precision': 0.93, 'foo': 1},
        }
        second_metric = {
            'created_at': created_at,
            'values': {'accuracy': 0.95, 'precision': 0.96, 'bar': 8},
        }
        experiments_set_metrics(experiment_id=experiment.id,
                                data=[first_metric, second_metric])

        assert experiment.metrics.count() == 3
        experiment.refresh_from_db()
        # last_metric is expected to carry the latest values plus both extra keys
        expected_last_metric = {
            'accuracy': 0.95,
            'precision': 0.96,
            'bar': 8,
            'foo': 1,
        }
        assert experiment.last_metric == expected_last_metric
Пример #12
0
    def test_resume(self):
        # Each resume() call is expected to leave the original experiment
        # STOPPED and to add one new Experiment row that is counted among the
        # original's clones, including when a resumed experiment is resumed.
        def resumed_clones():
            # Clones of the original experiment created with the RESUME strategy
            return experiment.clones.filter(cloning_strategy=CloningStrategy.RESUME)

        experiment = ExperimentFactory()
        count_experiment = Experiment.objects.count()
        ExperimentStatus.objects.create(status=ExperimentLifeCycle.STOPPED,
                                        experiment=experiment)
        assert experiment.last_status == ExperimentLifeCycle.STOPPED

        config = experiment.config
        declarations = experiment.declarations

        # First resume: nothing overridden
        experiment.resume()
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.STOPPED
        last_resumed_experiment = resumed_clones().last()
        assert last_resumed_experiment.config == config
        assert last_resumed_experiment.declarations == declarations
        assert Experiment.objects.count() == count_experiment + 1
        assert experiment.clones.count() == 1

        # Second resume: override the declarations
        new_declarations = {'lr': 0.1, 'dropout': 0.5}
        new_experiment = experiment.resume(declarations=new_declarations)
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.STOPPED
        last_resumed_experiment = resumed_clones().last()
        assert last_resumed_experiment.config == config
        assert last_resumed_experiment.declarations != declarations
        assert last_resumed_experiment.declarations == new_declarations
        assert Experiment.objects.count() == count_experiment + 2
        assert experiment.clones.count() == 2

        # Third resume: resume the experiment returned by the previous resume
        new_experiment.resume()
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.STOPPED
        last_resumed_experiment_new = resumed_clones().last()
        newest_original_pk = last_resumed_experiment_new.original_experiment.pk
        assert newest_original_pk != last_resumed_experiment.pk
        assert newest_original_pk == last_resumed_experiment.original_experiment.pk
        # NOTE(review): the next three assertions re-check the previous clone,
        # not the newest one -- confirm whether that is intentional.
        assert last_resumed_experiment.config == config
        assert last_resumed_experiment.declarations != declarations
        assert last_resumed_experiment.declarations == new_declarations
        assert Experiment.objects.count() == count_experiment + 3
        assert experiment.clones.count() == 3

        # Deleting one resumed experiment should leave the other clones in place
        last_resumed_experiment_new.delete()
        assert experiment.clones.count() == 2

        # Deleting the original experiment should remove everything
        experiment.delete()
        assert Experiment.objects.count() == 0
Пример #13
0
class TestBuildJobStatuses(BaseTest):
    # Checks how a status set on a build job shows up on the notebook,
    # tensorboard, job and experiment that were created with that build job.

    def setUp(self):
        super().setUp()
        self.project = ProjectFactory()
        self.build_job = BuildJobFactory(project=self.project)
        # Every dependent object shares the same project and build job
        shared = dict(project=self.project, build_job=self.build_job)
        self.notebook = NotebookJobFactory(**shared)
        self.tensorboard = TensorboardJobFactory(**shared)
        self.job = JobFactory(**shared)
        self.experiment = ExperimentFactory(**shared)

    def test_build_job_failed_sets_dependency_to_failed(self):
        failed = JobLifeCycle.FAILED

        # Nothing starts out as failed
        for item in (self.build_job, self.notebook, self.tensorboard, self.job):
            assert item.last_status != failed
        assert self.experiment.last_status != ExperimentLifeCycle.FAILED

        self.build_job.set_status(failed)

        # The build job and everything depending on it should now be failed
        assert self.build_job.last_status == failed
        for dependent in (self.notebook, self.tensorboard, self.job):
            dependent.refresh_from_db()
            assert dependent.last_status == failed
        self.experiment.refresh_from_db()
        assert self.experiment.last_status == ExperimentLifeCycle.FAILED

    def test_build_job_stopped_sets_dependency_to_stopped(self):
        stopped = JobLifeCycle.STOPPED

        # Nothing starts out as stopped
        for item in (self.build_job, self.notebook, self.tensorboard, self.job):
            assert item.last_status != stopped
        assert self.experiment.last_status != ExperimentLifeCycle.STOPPED

        self.build_job.set_status(stopped)

        # The build job and everything depending on it should now be stopped
        assert self.build_job.last_status == stopped
        for dependent in (self.notebook, self.tensorboard, self.job):
            dependent.refresh_from_db()
            assert dependent.last_status == stopped
        self.experiment.refresh_from_db()
        assert self.experiment.last_status == ExperimentLifeCycle.STOPPED

    def test_build_job_succeeded_starts_dependency(self):
        succeeded = JobLifeCycle.SUCCEEDED

        # Nothing starts out as succeeded
        for item in (self.build_job, self.notebook, self.tensorboard, self.job):
            assert item.last_status != succeeded
        assert self.experiment.last_status != ExperimentLifeCycle.SUCCEEDED

        # All four scheduler entry points are mocked while the status is set
        with patch('scheduler.notebook_scheduler.start_notebook') as mock_notebook, \
                patch('scheduler.tensorboard_scheduler.start_tensorboard') as mock_tensorboard, \
                patch('scheduler.experiment_scheduler.start_experiment') as mock_experiment, \
                patch('scheduler.job_scheduler.start_job') as mock_job:
            self.build_job.set_status(succeeded)

        # Each dependency should have been started exactly once
        assert self.build_job.last_status == succeeded
        assert mock_notebook.call_count == 1
        assert mock_tensorboard.call_count == 1
        assert mock_experiment.call_count == 1
        assert mock_job.call_count == 1
Пример #14
0
    def test_resume(self):
        # Each resume() call is expected to leave the original experiment
        # STOPPED and to add one new Experiment row that is counted among the
        # original's clones. The test also checks how often stop_experiment
        # is called when experiments are deleted.
        def resumed_clones():
            # Clones of the original experiment created with the RESUME strategy
            return experiment.clones.filter(cloning_strategy=CloningStrategy.RESUME)

        experiment = ExperimentFactory()
        count_experiment = Experiment.objects.count()
        ExperimentStatus.objects.create(status=ExperimentLifeCycle.STOPPED,
                                        experiment=experiment)
        assert experiment.last_status == ExperimentLifeCycle.STOPPED

        config = experiment.config
        declarations = experiment.declarations

        # First resume: nothing overridden
        experiment.resume()
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.STOPPED
        last_resumed_experiment = resumed_clones().last()
        assert last_resumed_experiment.config == config
        assert last_resumed_experiment.declarations == declarations
        assert Experiment.objects.count() == count_experiment + 1
        assert experiment.clones.count() == 1

        # Second resume: override the declarations
        new_declarations = {'lr': 0.1, 'dropout': 0.5}
        new_experiment = experiment.resume(declarations=new_declarations)
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.STOPPED
        last_resumed_experiment = resumed_clones().last()
        assert last_resumed_experiment.config == config
        assert last_resumed_experiment.declarations != declarations
        assert last_resumed_experiment.declarations == new_declarations
        assert Experiment.objects.count() == count_experiment + 2
        assert experiment.clones.count() == 2

        # Third resume: resume the experiment returned by the previous resume,
        # with build scheduling patched out
        with patch('scheduler.tasks.experiments.experiments_build.apply_async'):
            resumed = new_experiment.resume()
            ExperimentStatusFactory(status=ExperimentLifeCycle.CREATED, experiment=resumed)
        experiment.refresh_from_db()
        assert experiment.last_status == ExperimentLifeCycle.STOPPED
        last_resumed_experiment_new = resumed_clones().last()
        newest_original_pk = last_resumed_experiment_new.original_experiment.pk
        assert newest_original_pk != last_resumed_experiment.pk
        assert newest_original_pk == last_resumed_experiment.original_experiment.pk
        # NOTE(review): the next three assertions re-check the previous clone,
        # not the newest one -- confirm whether that is intentional.
        assert last_resumed_experiment.config == config
        assert last_resumed_experiment.declarations != declarations
        assert last_resumed_experiment.declarations == new_declarations
        assert Experiment.objects.count() == count_experiment + 3
        assert experiment.clones.count() == 3

        # Deleting one resumed experiment should leave the other clones in
        # place; it is first set to SCHEDULED and given a job, and the delete
        # is then expected to call stop_experiment once
        last_resumed_experiment_new.set_status(ExperimentLifeCycle.SCHEDULED)
        ExperimentJobFactory(experiment=last_resumed_experiment_new)
        with patch('scheduler.experiment_scheduler.stop_experiment') as mock_stop:
            last_resumed_experiment_new.delete()
        assert experiment.clones.count() == 2
        assert mock_stop.call_count == 1

        # Deleting the original experiment should remove everything
        with patch('scheduler.experiment_scheduler.stop_experiment') as mock_stop:
            experiment.delete()
        assert Experiment.objects.count() == 0
        assert mock_stop.call_count == 0  # No running experiment