Example #1
def test_connection(cluster, log_write_url=None, girder_token=None):
    cluster_id = cluster['_id']
    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                      cluster_id)
    log = get_cluster_logger(cluster, girder_token)
    headers = {'Girder-Token': girder_token}

    try:
        # First fetch the cluster with this 'admin' token so we get the
        # passphrase filled out.
        r = requests.get(cluster_url, headers=headers)
        check_status(r)
        cluster = r.json()

        with get_connection(girder_token, cluster) as conn:
            status = 'running'
            # Test that we can connect to the cluster
            output = conn.execute('pwd')
        if len(output) < 1:
            log.error('Unable to connect to cluster')
            status = 'error'

        r = requests.patch(cluster_url,
                           headers=headers,
                           json={'status': status})
        check_status(r)
    except Exception as ex:
        r = requests.patch(cluster_url,
                           headers=headers,
                           json={'status': 'error'})
        # Log the error message
        log.exception(ex)
Example #2
 def test_put_get(self):
     stream = StringIO.StringIO(self.test_data)
     with httmock.HTTMock(self.me):
         with get_connection(self._girder_token, self._cluster) as conn:
             conn.put(stream, self.test_file_path)
             with conn.get(self.test_file_path) as get_stream:
                 self.assertEqual(get_stream.read(), self.test_data)
Example #3
 def test_is_file(self):
     stream = StringIO.StringIO(self.test_data)
     with httmock.HTTMock(self.me):
         with get_connection(self._girder_token, self._cluster) as conn:
             conn.put(stream, self.test_file_path)
             self.assertTrue(conn.isfile(self.test_file_path))
             self.assertFalse(conn.isfile(self.test_case_dir))
Example #4
    def wrapped(event, **kwargs):
        if 'params' in event.info and key in event.info['params']:
            id = event.info['params'][key]
        elif key in event.info:
            id = event.info[key]
        else:
            # Request is not well formed, delegate to core.
            return

        cluster_id = None
        try:
            decoded_id = urllib.parse.unquote_plus(id)
            (cluster_id, path) = _parse_id(decoded_id)
            # If we have successfully decoded the id, then prevent the default
            event.preventDefault()

        except ValueError:
            pass

        if cluster_id is not None:
            cluster = Cluster().load(cluster_id, user=getCurrentUser())

            token = getCurrentToken()
            with get_connection(token['_id'], cluster) as conn:
                response = func(conn, path, cluster=cluster, encoded_id=id)

            event.addResponse(response)
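The handler above expects a composite id of the form <cluster_id>/<path>, URL-encoded by the client. A rough sketch of the round trip under that assumption (hypothetical implementations; the real _parse_id and _generate_id live in the cumulus source and may differ):

import urllib.parse

def _generate_id(cluster_id, path):
    # Hypothetical: join the cluster id and path, then URL-encode.
    return urllib.parse.quote_plus('%s/%s' % (cluster_id, path.lstrip('/')))

def _parse_id(decoded_id):
    # Hypothetical: split off the cluster id; raise ValueError for a
    # malformed id, which the handler above catches and ignores.
    parts = decoded_id.split('/', 1)
    if len(parts) != 2:
        raise ValueError('Malformed id: %s' % decoded_id)
    return (parts[0], '/' + parts[1])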
Example #5
def download_path_from_cluster(cluster,
                               girder_token,
                               parent,
                               path,
                               upload=False,
                               include=None,
                               exclude=None):
    """
    Download a given path on a cluster into an assetstore.

    :params cluster: The cluster to to download the path from.
    :params girder_token: The Girder token to use to access Girder.
    :params parent: The target folder to import the path into.
    :params path: The path on the cluster to download.
    :params upload: Indicate if the import should upload the file data or just
                    the metadata, the default is False.
    :params include: List of include regexs
    :params exclude: List of exclude regexs,
    """
    assetstore_base_url = get_assetstore_url_base(cluster)
    assetstore_id = get_assetstore_id(girder_token, cluster)

    with get_connection(girder_token, cluster) as conn:
        download_path(conn,
                      girder_token,
                      parent,
                      path,
                      assetstore_base_url,
                      assetstore_id,
                      upload=upload,
                      include=include,
                      exclude=exclude)
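A minimal usage sketch for the helper above, assuming the function is in scope; the cluster document, token, and folder id are placeholders:

# Hypothetical invocation; every value below is a placeholder.
cluster = {'_id': '55c5a698f657102f151f2f41', 'type': 'trad',
           'config': {'host': 'login.example.com',
                      'ssh': {'user': 'demo'}}}

download_path_from_cluster(cluster,
                           girder_token='<girder-token>',
                           parent='55c5a698f657102f151f2f42',
                           path='/home/demo/results',
                           upload=False,
                           include=[r'.*\.vtk$'])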
Example #6
def _get_path(cluster, path):
    basename = os.path.basename(path)
    token = getCurrentToken()

    with get_connection(token['_id'], cluster) as conn:
        entry = conn.stat(path)

        entry_id = _generate_id(cluster['_id'], path)
        parent_id = _generate_id(cluster['_id'], os.path.dirname(path))
        model = {
            '_id': entry_id,
            'size': entry.st_size,
            'name': basename,
            'created': _mtime_isoformat(entry.st_mtime),
            'updated': _mtime_isoformat(entry.st_mtime)
        }
        if stat.S_ISDIR(entry.st_mode):
            model['_modelType'] = 'folder'
            model['description'] = ''
            model['parentCollection'] = 'folder'
            model['parentId'] = parent_id
            model['public'] = False

            return model
        elif stat.S_ISREG(entry.st_mode):
            model['_modelType'] = "file"
            model['assetstoreId'] = None
            model["exts"] = [os.path.splitext(basename)[1]]
            model['itemId'] = parent_id,
            model['mimeType'] = 'application/octet-stream'

            return model
Example #8
 def tearDown(self):
     try:
         with httmock.HTTMock(self.me):
             with get_connection(self._girder_token, self._cluster) as conn:
                 conn.execute('rm -rf %s' % self.test_case_dir)
     except Exception:
         pass
Example #11
    def setUp(self):
        status_url = '%s/login' % newt_base_url

        data = {
            'username': NewtClusterConnectionTestCase.USER,
            'password': NewtClusterConnectionTestCase.PASSWORD
        }

        r = requests.post(status_url, data=data)
        json_resp = r.json()

        self.assertTrue(json_resp['auth'])
        self.session_id = json_resp['newt_sessionid']

        self._cluster = {'type': 'newt', 'config': {'host': 'cori'}}
        self._girder_token = 'dummy'

        def session_id(url, request):
            return self._session_id(url, request)

        url = '/api/v1/newt/sessionId'
        self.me = httmock.urlmatch(path=r'^%s$' % url,
                                   method='GET')(session_id)

        self.scratch_dir = '/global/cscratch1/sd/%s' % NewtClusterConnectionTestCase.USER
        self.test_data = 'nothing to see here!'
        self.test_case_dir = '%s/cumulus' % self.scratch_dir
        self.test_file_path = '%s/test.txt' % self.test_case_dir
        self.test_dir = '%s/cumulus' % self.test_case_dir

        # Create directory for test case
        with httmock.HTTMock(self.me):
            with get_connection(self._girder_token, self._cluster) as conn:
                conn.mkdir(self.test_case_dir)
Example #13
def create_paraview_job(task, *args, **kwargs):
    _update_cluster_config(task, kwargs['cluster'])
    task.logger.info('Validating args passed to flow.')
    validate_args(kwargs)
    cluster = kwargs.pop('cluster')

    # Save the cluster in the taskflow for termination
    task.taskflow.set_metadata('cluster', cluster)

    client = create_girder_client(task.taskflow.girder_api_url,
                                  task.taskflow.girder_token)

    task.taskflow.logger.info('Creating ParaView job.')
    task.logger.info('Load ParaView submission script.')

    base_path = os.path.dirname(__file__)
    script_path = os.path.join(base_path, 'pvw.sh')

    if not os.path.exists(script_path):
        msg = 'Script path %s does not exist.' % script_path
        task.logger.info(msg)
        raise Exception(msg)

    with open(script_path, 'r') as fp:
        commands = fp.read().splitlines()

    body = {
        'name': 'paraview',
        'commands': commands,
        'input': [],
        'output': []
    }

    job = client.post('jobs', data=json.dumps(body))
    task.logger.info('ParaView job created: %s' % job['_id'])
    task.taskflow.logger.info('ParaView job created.')

    task.taskflow.set_metadata('jobs', [job])

    # Upload the visualizer code
    task.logger.info('Uploading visualizer')
    viz_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../../../../../',
                     'node_modules/pvw-visualizer/server/pvw-visualizer.py'))

    if not os.path.exists(viz_path):
        task.logger.error(
            'Unable to locate pvw-visualizer.py for upload. (%s)' % viz_path)
        return

    target_dir = job_directory(cluster, job)
    target_path = os.path.join(target_dir, 'pvw-visualizer.py')

    with get_connection(task.taskflow.girder_token, cluster) as conn:
        conn.makedirs(target_dir)
        with open(viz_path, 'r') as fp:
            conn.put(fp, target_path)

    submit_paraview_job.delay(cluster, job, *args, **kwargs)
Example #14
File: job.py Project: psavery/cumulus
def terminate_job(cluster, job, log_write_url=None, girder_token=None):
    script_filepath = None
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:

        with get_connection(girder_token, cluster) as conn:
            if AbstractQueueAdapter.QUEUE_JOB_ID in job:
                queue_adapter = get_queue_adapter(cluster, conn)
                output = queue_adapter.terminate_job(job)
            else:
                r = requests.patch(status_url, headers=headers,
                                   json={'status': JobState.TERMINATED})
                check_status(r)

            if 'onTerminate' in job:
                commands = '\n'.join(job['onTerminate']['commands']) + '\n'
                commands = Template(commands) \
                    .render(cluster=cluster,
                            job=job,
                            base_url=cumulus.config.girder.baseUrl)

                on_terminate = _put_script(conn, commands + '\n')

                terminate_output = '%s.terminate.out' % job_id
                terminate_cmd = 'nohup %s  &> %s  &\n' % (on_terminate,
                                                          terminate_output)
                terminate_cmd = _put_script(conn, terminate_cmd)
                output = conn.execute(terminate_cmd)

                conn.remove(on_terminate)
                conn.remove(terminate_cmd)

                if len(output) != 1:
                    raise Exception('PID not returned by execute command')

                try:
                    pid = int(output[0])
                except ValueError:
                    raise Exception('Unable to extract PID from: %s'
                                    % output)

                output_message = 'onTerminate error: %s'
                monitor_process.delay(cluster, job, pid, terminate_output,
                                      log_write_url=log_write_url,
                                      output_message=output_message,
                                      girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
    finally:
        if script_filepath and os.path.exists(script_filepath):
            os.remove(script_filepath)
Example #16
def remove_output(task, cluster, job, girder_token):
    try:
        with get_connection(girder_token, cluster) as conn:
            rm_cmd = 'rm -rf %s' % job['dir']
            conn.execute(rm_cmd)
    except EOFError:
        # Try again
        task.retry(countdown=5)
Example #18
def create_paraview_job(task, *args, **kwargs):
    _update_cluster_config(task, kwargs['cluster'])
    task.logger.info('Validating args passed to flow.')
    validate_args(kwargs)
    cluster = kwargs.pop('cluster')

    # Save the cluster in the taskflow for termination
    task.taskflow.set_metadata('cluster', cluster)

    client = create_girder_client(
                task.taskflow.girder_api_url, task.taskflow.girder_token)

    task.taskflow.logger.info('Creating ParaView job.')
    task.logger.info('Load ParaView submission script.')

    base_path = os.path.dirname(__file__)
    script_path = os.path.join(base_path, 'pvw.sh')

    if not os.path.exists(script_path):
        msg = 'Script path %s does not exist.' % script_path
        task.logger.info(msg)
        raise Exception(msg)

    with open(script_path, 'r') as fp:
        commands = fp.read().splitlines()

    body = {
        'name': 'paraview',
        'commands': commands,
        'input': [],
        'output': []
    }

    job = client.post('jobs', data=json.dumps(body))
    task.logger.info('ParaView job created: %s' % job['_id'])
    task.taskflow.logger.info('ParaView job created.')

    task.taskflow.set_metadata('jobs', [job])

    # Upload the visualizer code
    task.logger.info('Uploading visualizer')
    viz_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../', '../', '../', '../',
                     'node_modules/pvw-visualizer/server/pvw-visualizer.py'))

    if not os.path.exists(viz_path):
        task.logger.error('Unable to locate pvw-visualizer.py for upload.')
        return

    target_dir = job_directory(cluster, job)
    target_path = os.path.join(target_dir, 'pvw-visualizer.py')

    with get_connection(task.taskflow.girder_token, cluster) as conn:
        conn.makedirs(target_dir)
        with open(viz_path, 'r') as fp:
            conn.put(fp, target_path)

    submit_paraview_job.delay(cluster, job, *args, **kwargs)
Example #19
 def test_remove(self):
     stream = StringIO.StringIO(self.test_data)
     with httmock.HTTMock(self.me):
         with get_connection(self._girder_token, self._cluster) as conn:
             conn.put(stream, self.test_file_path)
             self.assertTrue(conn.isfile(self.test_file_path))
             conn.remove(self.test_file_path)
             with self.assertRaises(NewtException) as cm:
                 conn.stat(self.test_file_path)
Example #21
 def test_list(self):
     with get_connection(self._girder_token, self._cluster) as conn:
         for path in conn.list(self._test_case_dir):
             self.assertEqual(len(path.keys()), 6)
             self.assertTrue('name' in path)
             self.assertTrue('group' in path)
             self.assertTrue('user' in path)
             self.assertTrue('mode' in path)
             self.assertTrue('date' in path)
             self.assertTrue('size' in path)
Example #23
def create_openfoam_job(task, *args, **kwargs):
    # Girder client
    client = create_girder_client(
        task.taskflow.girder_api_url, task.taskflow.girder_token)

    # Save the cluster in the taskflow for termination
    cluster = kwargs.pop('cluster')
    task.taskflow.set_metadata('cluster', cluster)

    # Create job definition
    task.taskflow.logger.info('Creating OpenFoam job.')
    body = {
        'name': 'openfoam_run',
        'commands': [
            'python $PWD/simput-unpack.py $PWD/input-deck.json $PWD',
            'docker start of_v1612_plus',
            'docker exec -t of_v1612_plus $PWD/DockerRun $PWD'
        ],
        'input': [
            {
              'folderId': kwargs['input']['folder']['id'],
              'path': '.'
            },
            {
              'folderId': kwargs['input']['project']['folder']['id'],
              'path': '.'
            }
        ],
        'output': [
        ]
    }

    # Register job in girder + attach to taskflow
    job = client.post('jobs', data=json.dumps(body))
    task.logger.info('OpenFOAM job created: %s' % job['_id'])
    task.taskflow.logger.info('OpenFOAM job created.')
    task.taskflow.set_metadata('jobs', [job])

    # Capture job working directory
    target_dir = job_directory(cluster, job)
    task.taskflow.set_metadata('dataDir', target_dir)

    source_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '../../../../../',
            'node_modules/simput/bin/unpack/simput-unpack.py'))
    target_path = os.path.join(target_dir, 'simput-unpack.py')

    # Upload unpack script
    with get_connection(task.taskflow.girder_token, cluster) as conn:
        conn.makedirs(target_dir)
        with open(source_path, 'r') as fp:
            conn.put(fp, target_path)

    # Move to the next task
    submit_open_foam_job.delay(cluster, job, *args, **kwargs)
Example #24
File: job.py Project: psavery/cumulus
def download_job_input_items(cluster, job, log_write_url=None,
                             girder_token=None):
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(fp, os.path.basename(path))

            r = requests.patch(status_url, json={'status': 'downloading'},
                               headers=headers)
            check_status(r)

            download_cmd = 'python girderclient.py --token %s --url "%s" ' \
                           'download --dir %s  --job %s' \
                % (girder_token, cumulus.config.girder.baseUrl,
                   job_directory(cluster, job), job_id)

            download_output = '%s.download.out' % job_id
            download_cmd = 'nohup %s  &> %s  &\n' % (download_cmd,
                                                     download_output)

            download_cmd = _put_script(conn, download_cmd)
            output = conn.execute(download_cmd)

            # Remove download script
            conn.remove(download_cmd)

        if len(output) != 1:
            raise Exception('PID not returned by execute command')

        try:
            pid = int(output[0])
        except ValueError:
            raise Exception('Unable to extract PID from: %s' % output)

        # When the download is complete submit the job
        on_complete = submit_job.s(cluster, job, log_write_url=log_write_url,
                                   girder_token=girder_token)

        monitor_process.delay(cluster, job, pid, download_output,
                              log_write_url=log_write_url,
                              on_complete=on_complete,
                              girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
Example #26
def upload_file(cluster, girder_token, file, path):
    """
    Upload a file to a cluster

    :param cluster: The cluster to upload to.
    :param girder_tokebn: The Grider token for Girder access.
    :param file: The Girder file object.
    :param path: The path on the cluster to upload to.
    """
    girder_client = GirderClient(apiUrl=cumulus.config.girder.baseUrl)
    girder_client.token = girder_token
    with get_connection(girder_token, cluster) as conn:
        conn.makedirs(os.path.dirname(path))
        _upload_file(conn, girder_client, file, path)
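A hypothetical invocation of the helper above; the cluster document, token, file document, and destination path are all placeholders:

# Hypothetical invocation; every value below is a placeholder.
cluster = {'_id': '55c5a698f657102f151f2f41', 'type': 'trad',
           'config': {'host': 'login.example.com',
                      'ssh': {'user': 'demo'}}}
girder_file = {'_id': '55c5a698f657102f151f2f43',
               'name': 'input.dat', 'size': 1024}

upload_file(cluster,
            girder_token='<girder-token>',
            file=girder_file,
            path='/home/demo/jobs/input.dat')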
Example #28
File: job.py Project: Kitware/cumulus
def download_job_input_folders(cluster, job, log_write_url=None,
                               girder_token=None, submit=True):
    job_dir = job_directory(cluster, job)

    with get_connection(girder_token, cluster) as conn:
        for input in job['input']:
            if 'folderId' in input and 'path' in input:
                folder_id = input['folderId']
                path = input['path']
                upload_path(conn, girder_token, folder_id,
                            os.path.join(job_dir, path))

    if submit:
        submit_job.delay(cluster, job, log_write_url=log_write_url,
                         girder_token=girder_token)
Example #30
File: job.py Project: Kitware/cumulus
def download_job_input(cluster, job, log_write_url=None, girder_token=None):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)

    # Create job directory
    with get_connection(girder_token, cluster) as conn:
        conn.mkdir(job_directory(cluster, job))

    log.info('Downloading input for "%s"' % job['name'])

    if parse('input.itemId').find(job):
        download_job_input_items(cluster, job, log_write_url=log_write_url,
                                 girder_token=girder_token)
    else:
        download_job_input_folders(cluster, job, log_write_url=log_write_url,
                                   girder_token=girder_token)
Example #31
    def test_get_ssh_connection(self, connect, from_private_key_file):
        cluster = {
            '_id': self._cluster_id,
            'config': {
                'ssh': {
                    'user': '******',
                    'key': self._cluster_id,
                    'passphrase': 'test'
                },
                'host': 'localhost'
            },
            'type': 'trad'
        }

        with get_connection('girder_token', cluster) as ssh:
            self.assertTrue(isinstance(ssh, SshClusterConnection))
Example #32
File: job.py Project: psavery/cumulus
def download_job_input(cluster, job, log_write_url=None, girder_token=None):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)

    # Create job directory
    with get_connection(girder_token, cluster) as conn:
        conn.makedirs(job_directory(cluster, job))

    log.info('Downloading input for "%s"' % job['name'])

    if parse('input.itemId').find(job):
        download_job_input_items(cluster, job, log_write_url=log_write_url,
                                 girder_token=girder_token)
    else:
        download_job_input_folders(cluster, job, log_write_url=log_write_url,
                                   girder_token=girder_token)
Example #34
File: job.py Project: Kitware/cumulus
def upload_job_output_to_folder(cluster, job, log_write_url=None, job_dir=None,
                                girder_token=None):
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job['_id'])
    headers = {'Girder-Token':  girder_token}
    assetstore_base_url = get_assetstore_url_base(cluster)
    assetstore_id = get_assetstore_id(girder_token, cluster)
    if not job_dir:
        job_dir = job['dir']

    try:
        with get_connection(girder_token, cluster) as conn:
            for output in job['output']:
                if 'folderId' in output and 'path' in output:
                    folder_id = output['folderId']
                    path = os.path.join(job_dir, output['path'])
                    download_path(conn, girder_token, folder_id, path,
                                  assetstore_base_url, assetstore_id)
    except HttpError as e:
        job['status'] = JobState.ERROR
        url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
        logger = get_post_logger('job', girder_token, url)
        logger.exception(e.responseText)
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.ERROR})
        check_status(r)

    if _get_on_complete(job) == 'terminate':
        cluster_log_url = '%s/clusters/%s/log' % \
            (cumulus.config.girder.baseUrl, cluster['_id'])
        command.send_task(
            'cumulus.tasks.cluster.terminate_cluster',
            args=(cluster,), kwargs={'log_write_url': cluster_log_url,
                                     'girder_token': girder_token})

    # If we were uploading, move the job to the complete state
    if job['status'] == JobState.UPLOADING:
        job_status = from_string(job['status'], task=None,
                                 cluster=cluster, job=job,
                                 log_write_url=log_write_url,
                                 girder_token=girder_token,
                                 conn=conn)
        job_status = Complete(job_status)
        job_status = job_status.next(JobQueueState.COMPLETE)
        job_status.run()
        r = requests.patch(status_url, headers=headers,
                           json={'status': str(job_status)})
        check_status(r)
Example #38
 def test_execute(self):
     with httmock.HTTMock(self.me):
         with get_connection(self._girder_token, self._cluster) as conn:
             self.assertEqual(conn.execute('ls /bin/ls'), '/bin/ls')
Example #39
 def setUp(self):
     # Create directory for test case
     with get_connection(self._girder_token, self._cluster) as conn:
         conn.mkdir(self._test_case_dir)
         conn.put(StringIO.StringIO(), '%s/test.txt' % self._test_case_dir)
Example #40
 def test_mkdir(self):
     with httmock.HTTMock(self.me):
         with get_connection(self._girder_token, self._cluster) as conn:
             conn.mkdir(self.test_dir)
             self.assertFalse(conn.isfile(self.test_dir))
Example #41
 def test_stat(self):
     with httmock.HTTMock(self.me):
         with get_connection(self._girder_token, self._cluster) as conn:
             conn.stat(self.test_case_dir)
Example #43
 def tearDown(self):
     try:
         with get_connection(self._girder_token, self._cluster) as conn:
             conn.execute('rm -rf %s' % self._test_case_dir)
     except Exception:
         raise
Example #44
 def test_perms_to_mode(self):
     test_perms = 'drwxr-xr-x'
     with httmock.HTTMock(self.me):
         with get_connection(self._girder_token, self._cluster) as conn:
             self.assertEqual(conn._perms_to_mode(test_perms), 16877)
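The expected value decodes as follows: the leading 'd' contributes the directory type bit stat.S_IFDIR (0o40000) and 'rwxr-xr-x' the permission bits 0o755, so the mode is 0o40000 | 0o755 = 0o40755 = 16877. A quick check:

import stat

# 'drwxr-xr-x' -> directory type bit plus rwxr-xr-x permission bits
mode = stat.S_IFDIR | 0o755
assert mode == 0o40755 == 16877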
Example #45
def upload_job_output_to_item(cluster,
                              job,
                              log_write_url=None,
                              job_dir=None,
                              girder_token=None):
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(
                    fp,
                    os.path.normpath(
                        os.path.join(job_dir, '..', os.path.basename(path))))

            cmds = ['cd %s' % job_dir]
            upload_cmd = 'python ../girderclient.py --token %s --url "%s" ' \
                         'upload --job %s' \
                         % (girder_token,
                            cumulus.config.girder.baseUrl, job['_id'])

            upload_output = '%s.upload.out' % job_id
            upload_output_path = os.path.normpath(
                os.path.join(job_dir, '..', upload_output))
            cmds.append('nohup %s  &> ../%s  &\n' %
                        (upload_cmd, upload_output))

            upload_cmd = _put_script(conn, '\n'.join(cmds))
            output = conn.execute(upload_cmd)

            # Remove upload script
            conn.remove(upload_cmd)

        if len(output) != 1:
            raise Exception('PID not returned by execute command')

        try:
            pid = int(output[0])
        except ValueError:
            raise Exception('Unable to extract PID from: %s' % output)

        on_complete = None

        if _get_on_complete(job) == 'terminate':
            cluster_log_url = '%s/clusters/%s/log' % \
                (cumulus.config.girder.baseUrl, cluster['_id'])
            on_complete = signature('cumulus.tasks.cluster.terminate_cluster',
                                    args=(cluster, ),
                                    kwargs={
                                        'log_write_url': cluster_log_url,
                                        'girder_token': girder_token
                                    })

        monitor_process.delay(cluster,
                              job,
                              pid,
                              upload_output_path,
                              log_write_url=log_write_url,
                              on_complete=on_complete,
                              girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url,
                           headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
Example #46
def submit_job(cluster,
               job,
               log_write_url=None,
               girder_token=None,
               monitor=True):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)
    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception("Unable to fetch the user's home directory.")

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided we will get
                # the number of slots from the parallel environment
                if slots == -1:
                    slots = int(
                        get_queue_adapter(cluster,
                                          conn).number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.makedirs(job_dir)
            # put the script to master
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id \
                = get_queue_adapter(cluster, conn).submit_job(job,
                                                              script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the jobs progress
            if monitor:
                monitor_job.s(
                    cluster,
                    job,
                    log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        headers = {'Girder-Token': girder_token}
        r = requests.patch(status_url,
                           headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url,
                           headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
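The slot resolution above falls back through three sources in order. A distilled sketch of that precedence (a hypothetical helper for illustration, not part of the cumulus API):

def _resolve_slots(job_params, cluster_config, parallel_env_slots=None):
    # Precedence: job parameters first, then the cluster config, then the
    # parallel environment when one is configured.
    slots = int(job_params.get('numberOfSlots', -1))
    if slots == -1:
        slots = int(cluster_config.get('numberOfSlots', -1))
    if slots == -1 and parallel_env_slots is not None:
        slots = int(parallel_env_slots)
    return slots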
Example #47
File: job.py Project: Kitware/cumulus
def monitor_process(task, cluster, job, pid, nohup_out_path,
                    log_write_url=None, on_complete=None,
                    output_message='Job download/upload error: %s',
                    girder_token=None):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token':  girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # See if the process is still running
            output = conn.execute('ps %s | grep %s' % (pid, pid),
                                  ignore_exit_status=True,
                                  source_profile=False)

            if len(output) > 0:
                # Process is still running so schedule self again in about 5
                # secs
                # N.B. throw=False to prevent Retry exception being raised
                task.retry(throw=False, countdown=5)
            else:
                try:
                    nohup_out_file_name = os.path.basename(nohup_out_path)

                    # Log the output
                    with conn.get(nohup_out_path) as fp:
                        output = fp.read()
                        if output.strip():
                            log.error(output_message % output)
                            # If we have output then set the error state on the
                            # job and return
                            r = requests.patch(status_url, headers=headers,
                                               json={'status': JobState.ERROR})
                            check_status(r)
                            return
                finally:
                    if nohup_out_file_name and \
                       os.path.exists(nohup_out_file_name):
                        os.remove(nohup_out_file_name)

                # Fire off the on_complete task if we have one
                if on_complete:
                    signature(on_complete).delay()

                # If we were uploading, move the job to the complete state
                if job['status'] == JobState.UPLOADING:
                    job_status = from_string(job['status'], task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = Complete(job_status)
                    job_status = job_status.next(JobQueueState.COMPLETE)
                    job_status.run()
                    r = requests.patch(status_url, headers=headers,
                                       json={'status': str(job_status)})
                    check_status(r)

    except EOFError:
        # Try again
        task.retry(throw=False, countdown=5)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
Example #48
def create_paraview_job(task, *args, **kwargs):
    _update_cluster_config(task, kwargs['cluster'])
    task.logger.info('Validating args passed to flow.')
    validate_args(kwargs)
    cluster = kwargs.pop('cluster')

    # Save the cluster in the taskflow for termination
    task.taskflow.set_metadata('cluster', cluster)

    client = create_girder_client(task.taskflow.girder_api_url,
                                  task.taskflow.girder_token)

    task.taskflow.logger.info('Creating ParaView job.')
    task.logger.info('Load ParaView submission script.')

    base_path = os.path.dirname(__file__)
    script_path = os.path.join(base_path, 'pvw.sh')

    if not os.path.exists(script_path):
        msg = 'Script path %s does not exist.' % script_path
        task.logger.info(msg)
        raise Exception(msg)

    with open(script_path, 'r') as fp:
        commands = fp.read().splitlines()

    body = {
        'name': 'paraview',
        'commands': commands,
        'input': [],
        'output': []
    }

    job = client.post('jobs', data=json.dumps(body))
    task.logger.info('ParaView job created: %s' % job['_id'])
    task.taskflow.logger.info('ParaView job created.')

    task.taskflow.set_metadata('jobs', [job])

    # Upload the visualizer code
    task.logger.info('Uploading visualization application')
    target_dir = job_directory(cluster, job)

    # Gather files to copy
    filesToCopy = []
    for localFile in LOCAL_FILES:
        srcFile = os.path.abspath(os.path.join(LOCAL_DIRECTORY, localFile))
        dstFile = os.path.join(target_dir, localFile)

        if not os.path.exists(srcFile):
            task.logger.error('Unable to locate file for upload. (%s)' %
                              srcFile)
            return

        filesToCopy.append((srcFile, dstFile))

    # Copy the files to the server
    with get_connection(task.taskflow.girder_token, cluster) as conn:
        conn.makedirs(target_dir)
        for dstDir in DESTINATION_DIRECTORIES:
            conn.makedirs(os.path.join(target_dir, dstDir))

        for srcDst in filesToCopy:
            with open(srcDst[0], 'r') as fp:
                conn.put(fp, srcDst[1])

    submit_paraview_job.delay(cluster, job, *args, **kwargs)
Example #49
def _monitor_jobs(task,
                  cluster,
                  jobs,
                  log_write_url=None,
                  girder_token=None,
                  monitor_interval=5):
    headers = {'Girder-Token': girder_token}

    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                      cluster['_id'])
    try:
        with get_connection(girder_token, cluster) as conn:

            try:
                job_queue_states \
                    = get_queue_adapter(cluster, conn).job_statuses(jobs)

                new_states = set()
                for (job, state) in job_queue_states:
                    job_id = job['_id']
                    # First get the current status
                    status_url = '%s/jobs/%s/status' % (
                        cumulus.config.girder.baseUrl, job_id)
                    r = requests.get(status_url, headers=headers)
                    check_status(r)
                    current_status = r.json()['status']

                    if current_status == JobState.TERMINATED:
                        continue

                    job_status = from_string(current_status,
                                             task=task,
                                             cluster=cluster,
                                             job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = job_status.next(state)
                    job['status'] = str(job_status)
                    job_status.run()
                    json = {
                        'status': str(job_status),
                        'timings': job.get('timings', {}),
                        'output': job['output']
                    }
                    job_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl,
                                              job['_id'])
                    r = requests.patch(job_url, headers=headers, json=json)
                    check_status(r)

                    new_states.add(job['status'])

                # Now see if we still have jobs to monitor
                running_states = set([
                    JobState.CREATED, JobState.QUEUED, JobState.RUNNING,
                    JobState.TERMINATING
                ])

                # Do we have any job still in a running state?
                if new_states & running_states:
                    task.retry(countdown=monitor_interval)
            except EOFError:
                # Try again
                task.retry(countdown=5)
                return
            except paramiko.ssh_exception.NoValidConnectionsError:
                # Try again
                task.retry(countdown=5)
                return
    # Ensure that the Retry exception will get through
    except Retry:
        raise
    except paramiko.ssh_exception.NoValidConnectionsError as ex:
        r = requests.patch(cluster_url,
                           headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))

    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(cluster_url,
                           headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
        raise
Example #50
File: job.py Project: Kitware/cumulus
def submit_job(cluster, job, log_write_url=None, girder_token=None,
               monitor=True):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)
    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception("Unable to fetch user's home directory.")

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided we will get
                # the number of slots from the parallel environment
                if slots == -1:
                    slots = int(get_queue_adapter(cluster, conn)
                                .number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.mkdir(job_dir, ignore_failure=True)
            # put the script to master
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id \
                = get_queue_adapter(cluster, conn).submit_job(job,
                                                              script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the jobs progress
            if monitor:
                monitor_job.s(
                    cluster, job, log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
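
submit_job resolves the slot count from three sources in strict order: the
job's own parameters, then the cluster config, and finally the parallel
environment. A standalone sketch of that precedence (resolve_slots and its
arguments are illustrative, not part of cumulus):

def resolve_slots(job_params, cluster_config, pe_slots=None):
    # Job parameters win, then the cluster config, then the parallel
    # environment; -1 means "not yet resolved", as in submit_job above.
    slots = int(job_params.get('numberOfSlots', -1))
    if slots == -1:
        slots = int(cluster_config.get('numberOfSlots', -1))
    if slots == -1 and pe_slots is not None:
        slots = int(pe_slots)
    return slots

assert resolve_slots({'numberOfSlots': 4}, {'numberOfSlots': 8}) == 4
assert resolve_slots({}, {'numberOfSlots': 8}) == 8
assert resolve_slots({}, {}, pe_slots=16) == 16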
Example #51
 def test_perms_to_mode(self):
     test_perms = 'drwxr-xr-x'
     with httmock.HTTMock(self.me):
         with get_connection(self._girder_token, self._cluster) as conn:
             self.assertEqual(conn._perms_to_mode(test_perms), 16877)
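
For context, 16877 is 0o40755: the S_IFDIR type bit plus the rwxr-xr-x
permission bits. A minimal re-implementation of the conversion this test
exercises, purely for illustration (the real logic lives on the connection
class, and this version ignores setuid/sticky flags):

import stat

def perms_to_mode(perms):
    # 'd' selects the directory type bit; each of the nine permission
    # characters maps to one mode bit, most significant first.
    mode = stat.S_IFDIR if perms[0] == 'd' else stat.S_IFREG
    for i, flag in enumerate(perms[1:]):
        if flag != '-':
            mode |= 1 << (8 - i)
    return mode

assert perms_to_mode('drwxr-xr-x') == 0o40755 == 16877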
Example #52
 def tearDown(self):
     with get_connection(self._girder_token, self._cluster) as conn:
         conn.execute('rm -rf %s' % self._test_case_dir)
Example #53
File: job.py Project: Kitware/cumulus
def _monitor_jobs(task, cluster, jobs, log_write_url=None, girder_token=None,
                  monitor_interval=5):
    headers = {'Girder-Token': girder_token}

    cluster_url = '%s/clusters/%s' % (
        cumulus.config.girder.baseUrl, cluster['_id'])
    try:
        with get_connection(girder_token, cluster) as conn:

            try:
                job_queue_states \
                    = get_queue_adapter(cluster, conn).job_statuses(jobs)

                new_states = set()
                for (job, state) in job_queue_states:
                    job_id = job['_id']
                    # First get the current status
                    status_url = '%s/jobs/%s/status' % (
                        cumulus.config.girder.baseUrl, job_id)
                    r = requests.get(status_url, headers=headers)
                    check_status(r)
                    current_status = r.json()['status']

                    if current_status == JobState.TERMINATED:
                        continue

                    job_status = from_string(current_status, task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = job_status.next(state)
                    job['status'] = str(job_status)
                    job_status.run()
                    json = {
                        'status': str(job_status),
                        'timings': job.get('timings', {}),
                        'output': job['output']
                    }
                    job_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl,
                                              job['_id'])
                    r = requests.patch(job_url, headers=headers, json=json)
                    check_status(r)

                    new_states.add(job['status'])

                # Now see if we still have jobs to monitor
                running_states = set(
                    [JobState.CREATED, JobState.QUEUED,
                     JobState.RUNNING, JobState.TERMINATING]
                )

                # Do we have any job still in a running state?
                if new_states & running_states:
                    task.retry(countdown=monitor_interval)
            except EOFError:
                # Try again
                task.retry(countdown=5)
                return
            except paramiko.ssh_exception.NoValidConnectionsError:
                # Try again
                task.retry(countdown=5)
                return
    # Ensure that the Retry exception will get through
    except Retry:
        raise
    except paramiko.ssh_exception.NoValidConnectionsError as ex:
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))

    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
        raise
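
Whether another monitor pass gets scheduled comes down to a set intersection
between the states just observed and the "still active" states. In miniature,
with plain strings standing in for the JobState constants:

running_states = {'created', 'queued', 'running', 'terminating'}
new_states = {'complete', 'running'}
if new_states & running_states:  # a non-empty intersection is truthy
    print('at least one job is still active; schedule another pass')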
Example #54
def monitor_process(task,
                    cluster,
                    job,
                    pid,
                    nohup_out_path,
                    log_write_url=None,
                    on_complete=None,
                    output_message='Job download/upload error: %s',
                    girder_token=None):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # See if the process is still running
            output = conn.execute('ps %s | grep %s' % (pid, pid),
                                  ignore_exit_status=True,
                                  source_profile=False)

            if len(output) > 0:
                # Process is still running so schedule self again in about 5
                # secs
                # N.B. throw=False to prevent Retry exception being raised
                task.retry(throw=False, countdown=5)
            else:
                try:
                    nohup_out_file_name = os.path.basename(nohup_out_path)

                    # Log the output
                    with conn.get(nohup_out_path) as fp:
                        output = fp.read()
                        if output.strip():
                            log.error(output_message % output)
                            # If we have output then set the error state on the
                            # job and return
                            r = requests.patch(status_url,
                                               headers=headers,
                                               json={'status': JobState.ERROR})
                            check_status(r)
                            return
                finally:
                    if nohup_out_file_name and \
                       os.path.exists(nohup_out_file_name):
                        os.remove(nohup_out_file_name)

                # Fire off the on_complete task if we have one
                if on_complete:
                    signature(on_complete).delay()

                # If we were uploading, move the job to the complete state
                if job['status'] == JobState.UPLOADING:
                    job_status = from_string(job['status'],
                                             task=task,
                                             cluster=cluster,
                                             job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = Complete(job_status)
                    job_status = job_status.next(JobQueueState.COMPLETE)
                    job_status.run()
                    r = requests.patch(status_url,
                                       headers=headers,
                                       json={'status': str(job_status)})
                    check_status(r)

    except EOFError:
        # Try again
        task.retry(throw=False, countdown=5)
    except Exception as ex:
        r = requests.patch(status_url,
                           headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
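
The liveness check at the heart of monitor_process is a single remote ps
invocation. Condensed into a helper (is_running is illustrative; conn is
assumed to be a cumulus connection whose execute() returns output lines):

def is_running(conn, pid):
    # grep echoes the matching ps line while the process exists; an empty
    # result means the process has exited.
    output = conn.execute('ps %s | grep %s' % (pid, pid),
                          ignore_exit_status=True,
                          source_profile=False)
    return len(output) > 0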
Example #55
 def test_execute(self):
     with httmock.HTTMock(self.me):
         with get_connection(self._girder_token, self._cluster) as conn:
             self.assertEqual(conn.execute('ls /bin/ls'), '/bin/ls')
Example #56
File: job.py Project: Kitware/cumulus
def upload_job_output_to_item(cluster, job, log_write_url=None, job_dir=None,
                              girder_token=None):
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(fp,
                         os.path.normpath(os.path.join(job_dir, '..',
                                                       os.path.basename(path))))

            cmds = ['cd %s' % job_dir]
            upload_cmd = 'python ../girderclient.py --token %s --url "%s" ' \
                         'upload --job %s' \
                         % (girder_token,
                            cumulus.config.girder.baseUrl, job['_id'])

            upload_output = '%s.upload.out' % job_id
            upload_output_path = os.path.normpath(os.path.join(job_dir, '..',
                                                               upload_output))
            cmds.append('nohup %s  &> ../%s  &\n' % (upload_cmd, upload_output))

            # _put_script stages the command sequence on the cluster and
            # returns the remote path of the staged script
            upload_cmd = _put_script(conn, '\n'.join(cmds))
            output = conn.execute(upload_cmd)

            # Remove upload script
            conn.remove(upload_cmd)

        if len(output) != 1:
            raise Exception('PID not returned by execute command')

        try:
            pid = int(output[0])
        except ValueError:
            raise Exception('Unable to extract PID from: %s' % output)

        on_complete = None

        if _get_on_complete(job) == 'terminate':
            cluster_log_url = '%s/clusters/%s/log' % \
                (cumulus.config.girder.baseUrl, cluster['_id'])
            on_complete = signature(
                'cumulus.tasks.cluster.terminate_cluster',
                args=(cluster,), kwargs={'log_write_url': cluster_log_url,
                                         'girder_token': girder_token})

        monitor_process.delay(cluster, job, pid, upload_output_path,
                              log_write_url=log_write_url,
                              on_complete=on_complete,
                              girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
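
The PID that upload_job_output_to_item parses back presumably comes from the
staged script echoing $! after backgrounding the upload under nohup (the
exact wrapper lives in _put_script, which is not shown here). The underlying
shell idiom, with placeholder paths:

# Placeholder command and paths; in practice the script is staged with
# _put_script and run via conn.execute(), whose first output line is the PID.
detach_script = '\n'.join([
    'cd /tmp/job-dir',
    'nohup python upload.py &> ../upload.out &',
    'echo $!',  # the backgrounded child's PID
])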
Example #57
def create_geometry_symlink(task, job, cluster, fileName):
    job_dir = job_directory(cluster, job)
    filePath = '%s/%s/%s' % (job_dir, job['input'][0]['path'], fileName)
    linkPath = '%s/%s' % (job_dir, fileName)
    with get_connection(task.taskflow.girder_token, cluster) as conn:
        conn.execute('ln -s %s %s' % (filePath, linkPath))
Example #58
 def test_stat(self):
     with httmock.HTTMock(self.me):
         with get_connection(self._girder_token, self._cluster) as conn:
             conn.stat(self.test_case_dir)