Пример #1
0
def screen_sessions(username: str,
                    hostname: str) -> Tuple[Content, HttpStatusCode]:
    """Return the pids of running `screen` sessions for a user on a host.

    This endpoint is for purely development purposes,
    currently there's no need to use it.

    Args:
        username: Account whose sessions are listed; must be non-empty.
        hostname: Host that is queried; must be non-empty.

    Returns:
        A ``(content, status)`` pair. ``content`` always carries a ``'msg'``
        entry and, on success, a ``'pids'`` entry holding whatever
        ``task_nursery.running`` returned. ``status`` is 200 on success,
        422 when a parameter is empty (or an ``AssertionError`` is raised
        while fetching), and 500 for connection problems or any other
        unexpected ``Exception``.
    """
    try:
        # Raised explicitly instead of using `assert`: assert statements are
        # removed when Python runs with -O, which would silently disable
        # the validation.
        if not (username and hostname):
            raise AssertionError('parameters must not be empty')
        pids = task_nursery.running(host=hostname, user=username)
    except AssertionError as e:
        return {'msg': S['failure']['assertions'].format(reason=e)}, 422
    except (ConnectionErrorException, AuthenticationException,
            UnknownHostException) as e:
        return {
            'msg':
            API.RESPONSES['ssh']['failure']['connection'].format(reason=e)
        }, 500
    except Exception as e:
        log.critical(e)
        return {'msg': G['internal_error']}, 500

    # Each branch returns on its own rather than from a `finally` clause:
    # a `return` inside `finally` discards any exception still in flight
    # (e.g. KeyboardInterrupt) or masks it with an UnboundLocalError.
    # FIXME (carried over from the original, which does not say what is
    # left to fix here)
    return {'msg': S['success'], 'pids': pids}, 200
Пример #2
0
def synchronize(task_id: TaskId) -> None:
    """Update the state of a Task object stored in the database.

    The current db record is compared with the list of active screen
    sessions (their pids) on the node described by that record
    (``task.host`` / ``task.user``).

    If the record does not exist, a warning is logged and nothing else
    happens. If the record could not be loaded for any other reason, the
    error is logged and nothing else happens, because there is no record
    to update.

    If task_nursery is unable to fetch active screen sessions (or the
    record has no host/user), the new state is always set to
    unsynchronized.

    If task.pid is not alive (db record is outdated), the pid is cleared
    and the task moves from its last known state to a new state:

    state before sync   => state applied after sync
    -----------------------------------------------
    running             => terminated
    unsynchronized      => not_running

    Args:
        task_id: Identifier passed to ``Task.get``.
    """
    log.debug('Syncing Task {}...'.format(task_id))
    # Bound up front so the generic handler below can tell "record never
    # loaded" apart from "record loaded but sync failed".
    task = None
    try:
        task = Task.get(task_id)
        # Explicit raises instead of `assert`, which is stripped under -O.
        if not task.host:
            raise ValueError('hostname is empty')
        if not task.user:
            raise ValueError('user does not exist')
        active_sessions_pids = task_nursery.running(host=task.host,
                                                    user=task.user.username)
    except NoResultFound:
        # This exception must be handled within try/except block when using Task.get()
        # In other words, methods decorated with @synchronize_task_record must handle this case by themselves!
        log.warning(
            'Task {} could not be found (also synchronized). Failing without taking any action...'
            .format(task_id))
    except Exception as e:
        # task_nursery.running pssh exceptions are also caught here
        log.error('Unable to synchronize Task {}, reason: {}'.format(
            task_id, e))
        if task is None:
            # Task.get itself failed with something other than
            # NoResultFound; there is no record whose status could be
            # changed, and touching `task` would only mask the real error.
            return
        log.debug('Task {} status was: {}'.format(task_id, task.status.name))
        task.status = TaskStatus.unsynchronized
        task.save()
        log.debug('Task {} is now: {}'.format(task_id, task.status.name))
    else:
        log.debug('[BEFORE SYNC] Task {} status was: {}'.format(
            task_id, task.status.name))
        change_status_msg = '[AFTER SYNC] Task {id} is now: {curr_status}'
        if task.pid not in active_sessions_pids:
            # The recorded pid is gone: apply the transition table from the
            # docstring. Other statuses keep their value and only lose the pid.
            if task.status is TaskStatus.running:
                task.status = TaskStatus.terminated
                log.debug(
                    change_status_msg.format(id=task_id,
                                             curr_status=task.status.name))
            elif task.status is TaskStatus.unsynchronized:
                task.status = TaskStatus.not_running
                log.debug(
                    change_status_msg.format(id=task_id,
                                             curr_status=task.status.name))
            task.pid = None
            task.save()
Пример #3
0
    def sync_running_from_queue(
            self, available_hosts_with_gpu_occupation: Dict[str, Dict[str,
                                                                      List]]):
        """Stop queue-started jobs that collide with other GPU users.

        Iterates over the jobs returned by ``Job.get_jobs_running_from_queue()``.
        For every task of a job:

        * if no GPU uid is assigned to it, or its pid is not among the
          sessions reported by ``task_nursery.running`` for the task's host
          and the job owner, its status is set to ``not_running`` and the
          task is skipped;
        * otherwise the job is marked for stopping when some other process
          occupies the task's GPU or when the job interferes with
          reservations within the configured look-ahead period.

        Jobs marked for stopping are stopped via ``self.stop_with_grace``.

        Args:
            available_hosts_with_gpu_occupation: Indexed here as
                ``[hostname][gpu_uid]``; each value is either ``None`` or an
                iterable of mappings that have a ``'pid'`` entry.
        """
        jobs_running_from_queue = Job.get_jobs_running_from_queue()

        # Loop-invariant: the look-ahead window does not depend on job or task.
        considered_future_period = timedelta(
            minutes=CONFIG.SCHEDULE_QUEUED_JOBS_WHEN_FREE_MINS)

        for job in jobs_running_from_queue:
            job_should_be_stopped = False
            for task in job.tasks:
                gpu_uid = Scheduler.get_assigned_gpu_uid(
                    task, available_hosts_with_gpu_occupation)

                if not gpu_uid or task.pid not in task_nursery.running(
                        task.hostname, job.user.username):
                    # NOTE(review): no explicit save() follows; presumably
                    # persistence happens elsewhere - confirm.
                    task.status = TaskStatus.not_running
                    continue

                current_processes_on_gpu = available_hosts_with_gpu_occupation[
                    task.hostname][gpu_uid]
                if current_processes_on_gpu is None:
                    other_process_pids = []
                else:
                    # Compare pids by value. Identity (`is not`) is only
                    # reliable for small cached ints, so a real-world pid
                    # would never match itself and the task's own process
                    # would be counted as a foreign one.
                    other_process_pids = [
                        process['pid'] for process in current_processes_on_gpu
                        if process['pid'] != task.pid
                    ]

                interferes = self.interferes_with_reservations(
                    job,
                    available_hosts_with_gpu_occupation,
                    considered_future_period=considered_future_period,
                    # Queued jobs should run only between reservations
                    allow_own=False)

                if other_process_pids or interferes:
                    job_should_be_stopped = True

            if job_should_be_stopped:
                log.info(
                    self._log_msg(now=datetime.utcnow(),
                                  action='Stopping queued job',
                                  id=job.id))
                self.stop_with_grace(job.id)