Example #1
File: job.py Project: Kitware/cumulus
    def _tail_output(self):
        job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl,
                                      self.job['_id'])
        log = get_post_logger(self.job['_id'], self.girder_token, job_url)

        # Do we need to tail any output files?
        for output in self.job.get('output', []):
            if 'tail' in output and output['tail']:
                path = output['path']
                offset = 0
                if 'content' in output:
                    offset = len(output['content'])
                else:
                    output['content'] = []
                tail_path = os.path.join(self.job['dir'], path)
                command = 'tail -n +%d %s' % (offset, tail_path)
                try:
                    # Only tail if file exists
                    if self.conn.isfile(tail_path):
                        stdout = self.conn.execute(command)
                        output['content'] = output['content'] + stdout
                    else:
                        log.info('Skipping tail of %s as file doesn\'t '
                                 'currently exist' % tail_path)
                except Exception as ex:
                    get_job_logger(self.job,
                                   self.girder_token).exception(str(ex))
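
All of these snippets obtain their logger the same way: get_post_logger(name, girder_token, url) returns a logger whose records end up POSTed to the job's Girder log endpoint. The helper itself is not shown anywhere on this page; below is a minimal sketch of the shape it implies, assuming the requests library. The PostHandler class and its JSON payload are hypothetical, not the real cumulus implementation.

import logging

import requests


class PostHandler(logging.Handler):
    # Hypothetical handler that ships each record to a REST endpoint.
    def __init__(self, url, girder_token):
        super(PostHandler, self).__init__()
        self.url = url
        self.headers = {'Girder-Token': girder_token}

    def emit(self, record):
        try:
            requests.post(self.url, headers=self.headers,
                          json={'level': record.levelname,
                                'msg': self.format(record)})
        except requests.RequestException:
            self.handleError(record)


def get_post_logger(name, girder_token, url):
    logger = logging.getLogger(str(name))
    if not logger.handlers:
        logger.addHandler(PostHandler(url, girder_token))
    logger.setLevel(logging.INFO)
    return logger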
Example #2
File: job.py Project: Kitware/cumulus
def upload_job_output(cluster, job, log_write_url=None, job_dir=None,
                      girder_token=None):

    job_name = job['name']
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)

    log.info('Uploading output for "%s"' % job_name)

    if parse('output.itemId').find(job):
        upload_job_output_to_item(cluster, job, log_write_url=log_write_url,
                                  job_dir=job_dir, girder_token=girder_token)
    else:
        upload_job_output_to_folder(cluster, job, log_write_url=log_write_url,
                                    job_dir=job_dir, girder_token=girder_token)
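
The parse('output.itemId').find(job) dispatch in Examples #2 and #3 reads like a jsonpath query, e.g. the jsonpath-rw package, where find() returns a list of matches, so the truthiness test simply asks whether the job document carries an itemId at all. A small illustration under that assumption:

from jsonpath_rw import parse

job = {'output': {'itemId': 'abc123'}}

matches = parse('output.itemId').find(job)  # list of DatumInContext objects
if matches:
    print(matches[0].value)   # 'abc123' -> upload to the item
else:
    print('no itemId')        # -> fall back to the folder upload path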
Example #3
File: job.py Project: Kitware/cumulus
def download_job_input(cluster, job, log_write_url=None, girder_token=None):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)

    # Create job directory
    with get_connection(girder_token, cluster) as conn:
        conn.mkdir(job_directory(cluster, job))

    log.info('Downloading input for "%s"' % job['name'])

    if parse('input.itemId').find(job):
        download_job_input_items(cluster, job, log_write_url=log_write_url,
                                 girder_token=girder_token)
    else:
        download_job_input_folders(cluster, job, log_write_url=log_write_url,
                                   girder_token=girder_token)
Example #4
File: job.py Project: Kitware/cumulus
def upload_job_output_to_folder(cluster, job, log_write_url=None, job_dir=None,
                                girder_token=None):
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job['_id'])
    headers = {'Girder-Token': girder_token}
    assetstore_base_url = get_assetstore_url_base(cluster)
    assetstore_id = get_assetstore_id(girder_token, cluster)
    if not job_dir:
        job_dir = job['dir']

    try:
        with get_connection(girder_token, cluster) as conn:
            for output in job['output']:
                if 'folderId' in output and 'path' in output:
                    folder_id = output['folderId']
                    path = os.path.join(job_dir, output['path'])
                    download_path(conn, girder_token, folder_id, path,
                                  assetstore_base_url, assetstore_id)
    except HttpError as e:
        job['status'] = JobState.ERROR
        url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
        logger = get_post_logger('job', girder_token, url)
        logger.exception(e.responseText)
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.ERROR})
        check_status(r)

    if _get_on_complete(job) == 'terminate':
        cluster_log_url = '%s/clusters/%s/log' % \
            (cumulus.config.girder.baseUrl, cluster['_id'])
        command.send_task(
            'cumulus.tasks.cluster.terminate_cluster',
            args=(cluster,), kwargs={'log_write_url': cluster_log_url,
                                     'girder_token': girder_token})

    # If we were uploading, move the job to the complete state
    if job['status'] == JobState.UPLOADING:
        job_status = from_string(job['status'], task=None,
                                 cluster=cluster, job=job,
                                 log_write_url=log_write_url,
                                 girder_token=girder_token,
                                 conn=conn)
        job_status = Complete(job_status)
        job_status = job_status.next(JobQueueState.COMPLETE)
        job_status.run()
        r = requests.patch(status_url, headers=headers,
                           json={'status': str(job_status)})
        check_status(r)
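
check_status(r) appears after every requests.patch call but is never defined in these snippets; presumably it raises when Girder answers with a non-2xx code. A one-line sketch of that assumption:

import requests


def check_status(response):
    # Fail fast on a 4xx/5xx reply instead of silently carrying on.
    response.raise_for_status()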
Example #5
File: job.py Project: Kitware/cumulus
    def next(self, job_queue_status):
        job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl,
                                      self.job['_id'])
        log = get_post_logger(self.job['_id'], self.girder_token, job_url)
        job_name = self.job['name']

        if 'runningTime' in self.job:
            running_time = time.time() - self.job['runningTime']
            # setdefault (not get) so the timing is stored back on the job
            self.job.setdefault('timings', {})['running'] \
                = int(round(running_time * 1000))
            del self.job['runningTime']

        # Fire off task to upload the output
        log.info('Job "%s" complete' % job_name)

        if 'output' in self.job and len(self.job['output']) == 0:
            return Complete(self)

        return self
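
This next() method, together with the UPLOADING blocks in Examples #4 and #7, hints at a small state machine: from_string() rebuilds a state object from the stored status string, next() returns the follow-on state, and run() performs that state's side effects before the new status is PATCHed back to Girder. A bare-bones sketch of the shape those calls imply; the names mirror the snippets, but the real cumulus classes carry task, cluster, and connection context:

class JobQueueState(object):
    COMPLETE = 'complete'


class JobStatus(object):
    def __init__(self, job):
        self.job = job

    def next(self, queue_state):
        return self   # default: no transition

    def run(self):
        pass          # default: no side effects

    def __str__(self):
        return self.__class__.__name__.lower()


class Complete(JobStatus):
    # Wraps the previous state, as in Complete(job_status) above.
    def __init__(self, previous):
        super(Complete, self).__init__(previous.job)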
Example #6
    def __init__(self):
        super(CallbackModule, self).__init__()
        self.current_task = None
        self.current_play = None
        self.logger = get_post_logger('cumulus_log', self.girder_token,
                                      self.log_write_url)
Example #7
File: job.py Project: Kitware/cumulus
def monitor_process(task, cluster, job, pid, nohup_out_path,
                    log_write_url=None, on_complete=None,
                    output_message='Job download/upload error: %s',
                    girder_token=None):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # See if the process is still running
            output = conn.execute('ps %s | grep %s' % (pid, pid),
                                  ignore_exit_status=True,
                                  source_profile=False)

            if len(output) > 0:
                # Process is still running so schedule self again in about 5
                # secs
                # N.B. throw=False to prevent Retry exception being raised
                task.retry(throw=False, countdown=5)
            else:
                try:
                    nohup_out_file_name = os.path.basename(nohup_out_path)

                    # Log the output
                    with conn.get(nohup_out_path) as fp:
                        output = fp.read()
                        if output.strip():
                            log.error(output_message % output)
                            # If we have output then set the error state on the
                            # job and return
                            r = requests.patch(status_url, headers=headers,
                                               json={'status': JobState.ERROR})
                            check_status(r)
                            return
                finally:
                    if nohup_out_file_name and \
                       os.path.exists(nohup_out_file_name):
                        os.remove(nohup_out_file_name)

                # Fire off the on_complete task if we have one
                if on_complete:
                    signature(on_complete).delay()

                # If we were uploading, move the job to the complete state
                if job['status'] == JobState.UPLOADING:
                    job_status = from_string(job['status'], task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = Complete(job_status)
                    job_status = job_status.next(JobQueueState.COMPLETE)
                    job_status.run()
                    r = requests.patch(status_url, headers=headers,
                                       json={'status': str(job_status)})
                    check_status(r)

    except EOFError:
        # Try again
        task.retry(throw=False, countdown=5)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
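
The polling in monitor_process rides on Celery: task.retry(throw=False, countdown=5) re-enqueues the same task about five seconds later, and throw=False suppresses the usual Retry exception so the current invocation simply returns. A self-contained sketch of that idiom; the broker URL and the is_still_running/finish helpers are made up:

from celery import Celery

app = Celery('sketch', broker='redis://localhost')


def is_still_running(pid):
    return False          # stand-in for the 'ps ... | grep ...' check


def finish(pid):
    print('done', pid)    # stand-in for the upload/complete handling


@app.task(bind=True, max_retries=None)
def poll(self, pid):
    if is_still_running(pid):
        # Reschedule this same task in ~5s without raising Retry.
        self.retry(throw=False, countdown=5)
    else:
        finish(pid)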
Example #8
File: job.py Project: Kitware/cumulus
def submit_job(cluster, job, log_write_url=None, girder_token=None,
               monitor=True):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)
    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception("Unable to fetch the user's home directory.")

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided we will get
                # the number of slots from the parallel environment
                if slots == -1:
                    slots = int(get_queue_adapter(cluster, conn)
                                .number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.mkdir(job_dir, ignore_failure=True)
            # Put the submission script on the master node
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id \
                = get_queue_adapter(cluster, conn).submit_job(job,
                                                              script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the jobs progress
            if monitor:
                monitor_job.s(
                    cluster, job, log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        headers = {'Girder-Token': girder_token}
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
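
submit_job stays scheduler-agnostic by routing submission through get_queue_adapter(cluster, conn), and AbstractQueueAdapter.QUEUE_JOB_ID names the key the queue's job id is stored under. A minimal sketch of the adapter shape those calls imply; the SGE subclass and its output parsing are illustrative only, and the real cumulus adapters are more careful:

class AbstractQueueAdapter(object):
    QUEUE_JOB_ID = 'queueJobId'

    def __init__(self, cluster, conn):
        self.cluster = cluster
        self.conn = conn

    def submit_job(self, job, script_name):
        raise NotImplementedError


class SgeQueueAdapter(AbstractQueueAdapter):
    def submit_job(self, job, script_name):
        # SGE's qsub prints: Your job <id> ("<name>") has been submitted
        output = self.conn.execute('cd %s && qsub ./%s'
                                   % (job['dir'], script_name))
        return output[0].split()[2]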