Пример #1
0
def experiments_schedule_deletion(experiment_id, immediate=False):
    """Archive an experiment, stop it if stoppable, optionally queue deletion.

    The experiment is looked up with deleted records included. If the
    lookup yields nothing, the situation is logged and nothing else is done.

    Args:
        experiment_id: identifier handed to `get_valid_experiment`.
        immediate: when truthy, a `DELETE_ARCHIVED_EXPERIMENT` task is sent
            as well, using the configured
            `SCHEDULER_GLOBAL_COUNTDOWN_DELAYED` value as countdown.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id,
                                      include_deleted=True)
    if not experiment:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    experiment.archive()

    if experiment.is_stoppable:
        project = experiment.project
        # Payload for the stop task; group fields are explicitly empty.
        stop_kwargs = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'experiment_name': experiment.unique_name,
            'experiment_uuid': experiment.uuid.hex,
            'experiment_group_name': None,
            'experiment_group_uuid': None,
            'specification': experiment.content,
            'update_status': True,
            'collect_logs': False,
            'message': 'Experiment is scheduled for deletion.',
            'is_managed': experiment.is_managed,
        }
        workers.send(SchedulerCeleryTasks.EXPERIMENTS_STOP, kwargs=stop_kwargs)

    if immediate:
        delete_kwargs = {'experiment_id': experiment_id}
        workers.send(SchedulerCeleryTasks.DELETE_ARCHIVED_EXPERIMENT,
                     kwargs=delete_kwargs,
                     countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN_DELAYED))
Пример #2
0
def experiments_set_metrics(experiment_id, data):
    """Validate and persist metrics reported for an experiment.

    Args:
        experiment_id: identifier handed to `get_valid_experiment`.
        data: a single metric payload, or a list of payloads. A list is
            validated with `many=True`, bulk-created, and the merged
            `values` of all entries are pushed to the experiment via
            `set_metric`.

    Returns:
        None. Nothing is stored when the experiment cannot be found or
        when the payload fails validation.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    is_list = isinstance(data, list)
    kwargs = {'many': True} if is_list else {}
    serializer = ExperimentMetricSerializer(data=data, **kwargs)
    try:
        serializer.is_valid(raise_exception=True)
    except ValidationError:
        _logger.error(
            'Could not create metrics, a validation error was raised.')
        # An invalid serializer must not be saved or read from: bail out
        # instead of falling through to the persistence code below.
        return

    if is_list:
        merged_metrics = {}
        metrics_instances = []
        for metric_data in serializer.data:
            metrics_instances.append(
                ExperimentMetric(experiment=experiment, **metric_data))
            # Later entries override earlier ones for the same metric key.
            merged_metrics.update(metric_data['values'])
        ExperimentMetric.objects.bulk_create(metrics_instances)
        experiment.set_metric(merged_metrics)
    else:
        serializer.save(experiment=experiment)
Пример #3
0
def experiments_stop(project_name,
                     project_uuid,
                     experiment_name,
                     experiment_group_name,
                     experiment_group_uuid,
                     experiment_uuid,
                     specification,
                     update_status=True):
    """Stop an experiment through the scheduler and optionally mark it STOPPED.

    The raw `specification` is read with `ExperimentSpecification.read`
    before being handed to `experiment_scheduler.stop_experiment`. When
    `update_status` is truthy, the experiment is looked up by uuid and its
    status is set to `ExperimentLifeCycle.STOPPED`; a missing experiment is
    only logged.
    """
    parsed_specification = ExperimentSpecification.read(specification)
    experiment_scheduler.stop_experiment(
        project_name=project_name,
        project_uuid=project_uuid,
        experiment_name=experiment_name,
        experiment_group_name=experiment_group_name,
        experiment_group_uuid=experiment_group_uuid,
        experiment_uuid=experiment_uuid,
        specification=parsed_specification,
    )

    if update_status:
        experiment = get_valid_experiment(experiment_uuid=experiment_uuid)
        if experiment:
            # Reflect the stop in the experiment's status.
            experiment.set_status(ExperimentLifeCycle.STOPPED)
        else:
            _logger.info(
                'Something went wrong, '
                'the Experiment `%s` does not exist anymore.', experiment_uuid)
Пример #4
0
def experiments_schedule_deletion(experiment_id):
    """Archive an experiment and, if it is running, send a stop task for it.

    The lookup includes deleted records. A missing experiment is logged and
    ignored. The stop task is only sent when `experiment.is_running` is
    truthy.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id,
                                      include_deleted=True)
    if not experiment:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    experiment.archive()

    if experiment.is_running:
        project = experiment.project
        # Payload for the stop task; group fields are explicitly empty.
        stop_kwargs = {
            'project_name': project.unique_name,
            'project_uuid': project.uuid.hex,
            'experiment_name': experiment.unique_name,
            'experiment_uuid': experiment.uuid.hex,
            'experiment_group_name': None,
            'experiment_group_uuid': None,
            'specification': experiment.config,
            'update_status': True,
            'collect_logs': False,
            'message': 'Experiment is scheduled for deletion.'
        }
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_STOP,
                             kwargs=stop_kwargs)
Пример #5
0
def build_experiment(self, experiment_id):
    """Build the image for an experiment, then send the start task.

    Flow:
      * missing experiment: retry while `self.request.retries < 2`, then
        log and give up;
      * specification without both `build` and `run`: send the start task
        directly;
      * transition to BUILDING not allowed: log and stop;
      * otherwise set BUILDING, run the builder, and either send the start
        task or set the experiment to FAILED.
    """
    def send_start():
        # Hand over to the start task.
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_START,
                             kwargs={'experiment_id': experiment_id})

    def mark_failed(message):
        experiment.set_status(ExperimentLifeCycle.FAILED, message=message)

    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        if self.request.retries < 2:
            _logger.info('Trying again for Experiment `%s`.', experiment_id)
            # NOTE(review): presumably `retry` raises to reschedule the
            # task, so the lines below only run once retries are used up
            # - confirm.
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)

        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    # Nothing to build: start the experiment directly.
    if not experiment.specification.build or not experiment.specification.run:
        send_start()
        return

    if not ExperimentLifeCycle.can_transition(
            status_from=experiment.last_status,
            status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status,
                     ExperimentLifeCycle.BUILDING)
        return None

    # Flag the experiment as building before the build begins.
    experiment.set_status(ExperimentLifeCycle.BUILDING)

    # Build the docker image; any failure marks the experiment FAILED.
    try:
        status = experiments_builder.build_experiment(experiment)
    except DockerException as e:
        _logger.warning('Failed to build experiment %s', e)
        mark_failed('Failed to build image for experiment.')
        return
    except Repo.DoesNotExist:
        _logger.warning('No code was found for this project')
        mark_failed('No code was found for to build this experiment.')
        return
    except Exception:  # Other exceptions
        _logger.error(
            'Failed to build experiment, unexpected error occurred.\n%s',
            traceback.format_exc())
        mark_failed('Failed to build image for experiment.')
        return

    if status:
        # Build succeeded: the experiment can be started.
        send_start()
    else:
        mark_failed('Failed to build image for experiment.')
Пример #6
0
def experiments_stop(experiment_id, update_status=True):
    """Ask the scheduler to stop the experiment identified by `experiment_id`.

    `update_status` is forwarded unchanged to
    `experiment_scheduler.stop_experiment`. A missing experiment is logged
    and otherwise ignored.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if experiment:
        experiment_scheduler.stop_experiment(experiment, update_status=update_status)
        return

    _logger.info('Something went wrong, '
                 'the Experiment `%s` does not exist anymore.', experiment_id)
Пример #7
0
def experiments_set_metrics(experiment_uuid, metrics, created_at=None):
    """Create an `ExperimentMetric` row for the experiment with this uuid.

    Args:
        experiment_uuid: uuid handed to `get_valid_experiment`.
        metrics: stored as the metric's `values`.
        created_at: forwarded to `create` only when truthy.
    """
    experiment = get_valid_experiment(experiment_uuid=experiment_uuid)
    if not experiment:
        return

    # Only pass `created_at` along when the caller supplied one.
    extra_fields = {'created_at': created_at} if created_at else {}
    ExperimentMetric.objects.create(experiment=experiment, values=metrics, **extra_fields)
Пример #8
0
    def _run(task_bind, *args, **kwargs):
        """Send the build task for the experiment named in `kwargs`.

        Reads `kwargs['experiment_id']` and raises `OperationRunError` when
        `get_valid_experiment` returns nothing for it.
        """
        experiment_id = kwargs['experiment_id']
        if not get_valid_experiment(experiment_id=experiment_id):
            raise OperationRunError(
                'The Experiment `{}` does not exist anymore.'.format(
                    experiment_id))

        build_kwargs = {'experiment_id': experiment_id}
        celery_app.send_task(SchedulerCeleryTasks.EXPERIMENTS_BUILD,
                             kwargs=build_kwargs)
Пример #9
0
def experiments_stop(self,
                     project_name,
                     project_uuid,
                     experiment_name,
                     experiment_group_name,
                     experiment_group_uuid,
                     experiment_uuid,
                     specification,
                     update_status=True,
                     collect_logs=True,
                     is_managed=True,
                     message=None):
    """Stop an experiment: collect logs, stop its jobs, update its status.

    Steps, in order:
      1. If `collect_logs` and `is_managed`, try to collect the experiment
         job logs; the listed storage errors are logged as warnings only.
      2. If `specification` and `is_managed`, compile the specification and
         ask the scheduler to stop the experiment. Otherwise the stop is
         treated as already done.
      3. If the scheduler reported failure and fewer than 2 retries were
         made, retry the task.
      4. If `update_status`, set the experiment (deleted ones included) to
         STOPPED with `message`, or a default text when `message` is falsy.
    """
    if collect_logs and is_managed:
        try:
            collectors.logs_collect_experiment_jobs(
                experiment_uuid=experiment_uuid)
        except (OSError, StoreNotFoundError, PolyaxonStoresException):
            _logger.warning(
                'Scheduler could not collect '
                'the logs for experiment `%s`.', experiment_name)

    # Without a managed specification there is nothing to stop.
    deleted = True
    if specification and is_managed:
        compiled_specification = compiler.compile(kind=kinds.EXPERIMENT,
                                                  values=specification)
        deleted = experiment_scheduler.stop_experiment(
            project_name=project_name,
            project_uuid=project_uuid,
            experiment_name=experiment_name,
            experiment_group_name=experiment_group_name,
            experiment_group_uuid=experiment_group_uuid,
            experiment_uuid=experiment_uuid,
            specification=compiled_specification,
        )

    if not deleted and self.request.retries < 2:
        _logger.info('Trying again to delete job `%s` in experiment.',
                     experiment_name)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    if not update_status:
        return

    experiment = get_valid_experiment(experiment_uuid=experiment_uuid,
                                      include_deleted=True)
    if experiment:
        # Reflect the stop in the experiment's status.
        experiment.set_status(ExperimentLifeCycle.STOPPED,
                              message=message or 'Experiment was stopped')
    else:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_uuid)
Пример #10
0
def experiments_check_heartbeat(experiment_id):
    """Mark the experiment as FAILED when no heartbeat is recorded for it.

    Nothing happens if `RedisHeartBeat` reports the experiment alive, or
    if the experiment cannot be found.
    """
    if RedisHeartBeat.experiment_is_alive(experiment_id=experiment_id):
        return

    experiment = get_valid_experiment(experiment_id=experiment_id)
    if experiment:
        # No heartbeat: treat the experiment as a zombie.
        experiment.set_status(ExperimentLifeCycle.FAILED,
                              message='Experiment is in zombie state (no heartbeat was reported).')
Пример #11
0
def build_experiment(self, experiment_id):
    """Build the image for an experiment, then send the start task.

    Flow:
      * missing experiment: retry while `self.request.retries < 2`, then
        log and give up;
      * specification without both `build` and `run`: send the start task
        directly;
      * transition to BUILDING not allowed: log and stop;
      * otherwise set BUILDING and run the builder; an exception sets the
        experiment to FAILED, a truthy result sends the start task.
    """
    def send_start():
        # Hand over to the start task.
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id})

    def mark_failed(message):
        experiment.set_status(ExperimentLifeCycle.FAILED, message=message)

    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        if self.request.retries < 2:
            _logger.info('Trying again for Experiment `%s`.', experiment_id)
            # NOTE(review): presumably `retry` raises to reschedule the
            # task, so the lines below only run once retries are used up
            # - confirm.
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)

        _logger.info('Something went wrong, '
                     'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    # Nothing to build: start the experiment directly.
    if not experiment.specification.build or not experiment.specification.run:
        send_start()
        return

    if not ExperimentLifeCycle.can_transition(status_from=experiment.last_status,
                                              status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status, ExperimentLifeCycle.BUILDING)
        return None

    # Flag the experiment as building before the build begins.
    experiment.set_status(ExperimentLifeCycle.BUILDING)

    # Build the docker image; any exception marks the experiment FAILED.
    try:
        status = experiments_builder.build_experiment(experiment)
    except DockerException as e:
        _logger.warning('Failed to build experiment %s', e)
        mark_failed('Failed to build image for experiment.')
        return
    except Repo.DoesNotExist:
        _logger.warning('No code was found for this project')
        mark_failed('No code was found for to build this experiment.')
        return
    except Exception as e:  # Other exceptions
        _logger.warning('Failed to build experiment %s', e)
        mark_failed('Failed to build image for experiment.')
        return

    # NOTE(review): a falsy build result leaves the status untouched
    # (still BUILDING from above) - confirm this is intended.
    if status:
        # Build succeeded: the experiment can be started.
        send_start()
Пример #12
0
def experiments_start(experiment_id):
    """Start an experiment if it may transition to SCHEDULED.

    A missing experiment or a disallowed transition is logged and the
    scheduler is not called.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        _logger.info('Something went wrong, '
                     'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    can_schedule = ExperimentLifeCycle.can_transition(
        status_from=experiment.last_status,
        status_to=ExperimentLifeCycle.SCHEDULED)
    if can_schedule:
        experiment_scheduler.start_experiment(experiment)
        return

    _logger.info('Experiment `%s` cannot transition from `%s` to `%s`.',
                 experiment.unique_name, experiment.last_status, ExperimentLifeCycle.SCHEDULED)
    return None
Пример #13
0
def experiments_set_metrics(experiment_id, data):
    """Validate and save metrics reported for an experiment.

    Args:
        experiment_id: identifier handed to `get_valid_experiment`.
        data: a single metric payload, or a list of payloads (validated
            with `many=True`).

    Returns:
        None. Nothing is saved when the experiment cannot be found or when
        the payload fails validation.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    kwargs = {'many': True} if isinstance(data, list) else {}
    serializer = ExperimentMetricSerializer(data=data, **kwargs)
    try:
        serializer.is_valid(raise_exception=True)
    except ValidationError:
        _logger.error('Could not create metrics, a validation error was raised.')
        # An invalid serializer must not be saved: bail out instead of
        # falling through to `save()`.
        return

    serializer.save(experiment=experiment)
Пример #14
0
def experiments_build(experiment_id):
    """Create the build job for an experiment, or start it when none is needed.

    Outcomes:
      * experiment missing: nothing happens;
      * specification without both `build` and `run`: the start task is sent;
      * transition to BUILDING not allowed: logged, nothing else happens;
      * otherwise a build job is created and stored on the experiment, then
        the start task is sent if the image already exists, the status
        becomes BUILDING if the build started, or FAILED if it did not.
    """
    def send_start():
        # Start task is sent with the configured global countdown.
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id},
            countdown=conf.get(SCHEDULER_GLOBAL_COUNTDOWN))

    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    # Nothing to build: start the experiment directly.
    if not experiment.specification.build or not experiment.specification.run:
        send_start()
        return

    current_status = experiment.last_status
    if not ExperimentLifeCycle.can_transition(status_from=current_status,
                                              status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, current_status, ExperimentLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=experiment.user,
        project=experiment.project,
        config=experiment.specification.build,
        configmap_refs=experiment.specification.configmap_refs,
        secret_refs=experiment.specification.secret_refs,
        code_reference=experiment.code_reference)

    # Persist only the link to the build job.
    experiment.build_job = build_job
    experiment.save(update_fields=['build_job'])

    if image_exists:
        # Image is already available: no need to wait for a build.
        send_start()
    elif build_status:
        # Build is underway: reflect it in the experiment status.
        experiment.set_status(ExperimentLifeCycle.BUILDING)
    else:
        experiment.set_status(ExperimentLifeCycle.FAILED, message='Could not start build process.')
Пример #15
0
def experiments_stop(self,
                     project_name,
                     project_uuid,
                     experiment_name,
                     experiment_group_name,
                     experiment_group_uuid,
                     experiment_uuid,
                     specification,
                     update_status=True):
    """Stop an experiment's jobs and optionally mark the experiment STOPPED.

    When `specification` is truthy it is read with
    `ExperimentSpecification.read` and the scheduler is asked to stop the
    experiment; otherwise the stop is treated as already done. If the
    scheduler reported failure and fewer than 2 retries were made, the task
    is retried. With `update_status` truthy, the experiment is set to
    STOPPED; a missing experiment is only logged.
    """
    # Without a specification there is nothing to stop.
    deleted = True
    if specification:
        parsed_specification = ExperimentSpecification.read(specification)
        deleted = experiment_scheduler.stop_experiment(
            project_name=project_name,
            project_uuid=project_uuid,
            experiment_name=experiment_name,
            experiment_group_name=experiment_group_name,
            experiment_group_uuid=experiment_group_uuid,
            experiment_uuid=experiment_uuid,
            specification=parsed_specification,
        )

    if not deleted and self.request.retries < 2:
        _logger.info('Trying again to delete job `%s` in experiment.',
                     experiment_name)
        self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
        return

    if not update_status:
        return

    experiment = get_valid_experiment(experiment_uuid=experiment_uuid)
    if experiment:
        # Reflect the stop in the experiment's status.
        experiment.set_status(ExperimentLifeCycle.STOPPED,
                              message='Experiment was stopped')
    else:
        _logger.info(
            'Something went wrong, '
            'the Experiment `%s` does not exist anymore.', experiment_uuid)
Пример #16
0
def experiments_start(experiment_id):
    """Prepare and start an experiment if it may transition to SCHEDULED.

    A missing experiment or a disallowed transition is logged and nothing
    is started. Before the scheduler is called, a copy experiment goes
    through `copy_experiment`; any other experiment gets its outputs path
    created.
    """
    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        _logger.info('Something went wrong, '
                     'the Experiment `%s` does not exist anymore.', experiment_id)
        return

    can_schedule = ExperimentLifeCycle.can_transition(
        status_from=experiment.last_status,
        status_to=ExperimentLifeCycle.SCHEDULED)
    if not can_schedule:
        _logger.info('Experiment `%s` cannot transition from `%s` to `%s`.',
                     experiment.unique_name, experiment.last_status, ExperimentLifeCycle.SCHEDULED)
        return None

    # Copies are handled by `copy_experiment`; others need an outputs path.
    if not experiment.is_copy:
        create_experiment_outputs_path(experiment.unique_name)
    else:
        copy_experiment(experiment)

    experiment_scheduler.start_experiment(experiment)
Пример #17
0
def experiments_build(experiment_id):
    """Create the build job for an experiment, or start it when none is needed.

    Outcomes:
      * experiment missing: nothing happens;
      * specification without both `build` and `run`: the start task is sent;
      * transition to BUILDING not allowed: logged, nothing else happens;
      * otherwise a build job is created and stored on the experiment, then
        the start task is sent if the image already exists, the status
        becomes BUILDING if the build started, or FAILED if it did not.
    """
    def send_start():
        # Hand over to the start task.
        celery_app.send_task(
            SchedulerCeleryTasks.EXPERIMENTS_START,
            kwargs={'experiment_id': experiment_id})

    experiment = get_valid_experiment(experiment_id=experiment_id)
    if not experiment:
        return

    # Nothing to build: start the experiment directly.
    if not experiment.specification.build or not experiment.specification.run:
        send_start()
        return

    if not ExperimentLifeCycle.can_transition(status_from=experiment.last_status,
                                              status_to=ExperimentLifeCycle.BUILDING):
        _logger.info('Experiment id `%s` cannot transition from `%s` to `%s`.',
                     experiment_id, experiment.last_status, ExperimentLifeCycle.BUILDING)
        return

    build_job, image_exists, build_status = dockerizer_scheduler.create_build_job(
        user=experiment.user,
        project=experiment.project,
        config=experiment.specification.build,
        code_reference=experiment.code_reference)

    # Attach the build job to the experiment and persist it.
    experiment.build_job = build_job
    experiment.save()

    if image_exists:
        # Image is already available: no need to wait for a build.
        send_start()
    elif build_status:
        # Build is underway: reflect it in the experiment status.
        experiment.set_status(ExperimentLifeCycle.BUILDING)
    else:
        experiment.set_status(ExperimentLifeCycle.FAILED, message='Could not start build process.')
Пример #18
0
def experiments_check_status(experiment_uuid=None, experiment_id=None):
    """Call `update_status()` on the experiment found by id and/or uuid.

    Does nothing when `get_valid_experiment` returns no experiment.
    """
    experiment = get_valid_experiment(experiment_uuid=experiment_uuid,
                                      experiment_id=experiment_id)
    if experiment:
        experiment.update_status()