def get_assetstore_id(girder_token, cluster):
    """Return the assetstore id for *cluster*, creating one on first use.

    The created assetstore id is cached on the cluster model, both locally
    and via a PATCH back to Girder.
    """
    if 'assetstoreId' in cluster:
        return cluster['assetstoreId']

    headers = {'Girder-Token': girder_token}
    create_url = '%s/%s' % (cumulus.config.girder.baseUrl,
                            get_assetstore_url_base(cluster))
    payload = {
        'name': cluster['_id'],
        'host': cluster['config']['host'],
        'machine': cluster['config']['host'],
        'authKey': cluster['_id']
    }
    ssh_user = parse('config.ssh.user').find(cluster)
    if ssh_user:
        payload['user'] = ssh_user[0].value

    response = requests.post(create_url, json=payload, headers=headers)
    check_status(response)
    cluster['assetstoreId'] = response.json()['_id']

    # Record the new assetstore id on the cluster model in Girder as well.
    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                      cluster['_id'])
    response = requests.patch(cluster_url,
                              json={'assetstoreId': cluster['assetstoreId']},
                              headers=headers)
    check_status(response)

    return cluster['assetstoreId']
def test_connection(cluster, log_write_url=None, girder_token=None):
    """Check that we can open a connection to *cluster* and update its status."""
    cluster_id = cluster['_id']
    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                      cluster_id)
    log = get_cluster_logger(cluster, girder_token)
    headers = {'Girder-Token': girder_token}

    try:
        # First fetch the cluster with this 'admin' token so we get the
        # passphrase filled out.
        response = requests.get(cluster_url, headers=headers)
        check_status(response)
        cluster = response.json()

        with get_connection(girder_token, cluster) as conn:
            new_status = 'running'
            # Run a trivial command to confirm the connection works.
            if len(conn.execute('pwd')) < 1:
                log.error('Unable connect to cluster')
                new_status = 'error'

            response = requests.patch(cluster_url, headers=headers,
                                      json={'status': new_status})
            check_status(response)
    except Exception as ex:
        requests.patch(cluster_url, headers=headers,
                       json={'status': 'error'})
        # Log the error message
        log.exception(ex)
def check_ansible_return_code(returncode, cluster, girder_token):
    """Mark *cluster* as errored in Girder when an ansible run failed."""
    if returncode == 0:
        return

    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                      cluster['_id'])
    response = requests.patch(cluster_url,
                              headers={'Girder-Token': girder_token},
                              json={'status': 'error'})
    check_status(response)
def get_assetstore_id(girder_token, cluster):
    """Look up (creating on demand) the assetstore id for a cluster."""
    if 'assetstoreId' not in cluster:
        auth_headers = {'Girder-Token': girder_token}
        base = cumulus.config.girder.baseUrl
        create_url = '%s/%s' % (base, get_assetstore_url_base(cluster))
        payload = {
            'name': cluster['_id'],
            'host': cluster['config']['host'],
            'machine': cluster['config']['host'],
            'authKey': cluster['_id']
        }
        match = parse('config.ssh.user').find(cluster)
        if match:
            payload['user'] = match[0].value

        create_response = requests.post(create_url, json=payload,
                                        headers=auth_headers)
        check_status(create_response)
        cluster['assetstoreId'] = create_response.json()['_id']

        # Persist the new assetstore id on the cluster model.
        patch_response = requests.patch(
            '%s/clusters/%s' % (base, cluster['_id']),
            json={'assetstoreId': cluster['assetstoreId']},
            headers=auth_headers)
        check_status(patch_response)

    return cluster['assetstoreId']
def check_ansible_return_code(returncode, cluster, girder_token):
    """If an ansible playbook exited non-zero, flag the cluster as errored."""
    if returncode != 0:
        url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                  cluster['_id'])
        check_status(requests.patch(url,
                                    headers={'Girder-Token': girder_token},
                                    json={'status': 'error'}))
def launch_cluster(playbook, cluster, profile, secret_key, extra_vars,
                   girder_token, log_write_url, post_status):
    """Run the launch playbook for *cluster* and record the master host."""
    playbook_path = get_playbook_path(playbook)
    playbook_variables = get_playbook_variables(cluster, profile, extra_vars)

    # Credentials and callback info are passed to ansible via the env.
    env = dict(os.environ)
    env['AWS_ACCESS_KEY_ID'] = profile['accessKeyId']
    env['AWS_SECRET_ACCESS_KEY'] = secret_key
    env['GIRDER_TOKEN'] = girder_token
    env['LOG_WRITE_URL'] = log_write_url
    env['CLUSTER_ID'] = cluster['_id']

    inventory = simple_inventory('localhost')
    with inventory.to_tempfile() as inventory_path:
        ansible = run_playbook(playbook_path, inventory_path,
                               playbook_variables, env=env, verbose=3)

    # Look up the master instance so its public address can be recorded on
    # the cluster model.
    provider = CloudProvider(dict(secretAccessKey=secret_key, **profile))
    master = provider.get_master_instance(cluster['_id'])
    status_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                     cluster['_id'])
    r = requests.patch(status_url, headers={'Girder-Token': girder_token},
                       json={'config': {'host': master['public_ip']}})
    check_status(r)

    check_ansible_return_code(ansible, cluster, girder_token)
    check_girder_cluster_status(cluster, girder_token, post_status)
def test_connection(cluster, log_write_url=None, girder_token=None):
    """Verify SSH connectivity to *cluster* and record 'running'/'error'.

    The cluster is refetched with the provided (admin) token so the key
    passphrase is populated, then a trivial command is run over the
    connection. The resulting status is patched back to Girder.
    """
    cluster_id = cluster['_id']
    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                      cluster_id)
    log = get_cluster_logger(cluster, girder_token)
    headers = {'Girder-Token': girder_token}

    try:
        # First fetch the cluster with this 'admin' token so we get the
        # passphrase filled out.
        r = requests.get(cluster_url, headers=headers)
        check_status(r)
        cluster = r.json()
        with get_connection(girder_token, cluster) as conn:
            status = 'running'
            # Test that we can connect to the cluster
            output = conn.execute('pwd')
            if len(output) < 1:
                log.error('Unable connect to cluster')
                status = 'error'

            r = requests.patch(
                cluster_url, headers=headers, json={'status': status})
            check_status(r)
    except Exception as ex:
        # Log the error message first so it is not lost if the status
        # update below also fails.
        log.exception(ex)
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        # Previously this response was silently dropped; check it like
        # every other request in this module (see generate_key_pair).
        check_status(r)
def terminate_job(cluster, job, log_write_url=None, girder_token=None):
    """Terminate a job running on a cluster.

    If the job is known to the queuing system it is terminated through the
    queue adapter; otherwise the job is simply marked TERMINATED in Girder.
    Any 'onTerminate' commands attached to the job are rendered with Jinja,
    started in the background on the cluster, and watched by
    monitor_process. Failures mark the job UNEXPECTEDERROR and re-raise.
    """
    # NOTE(review): script_filepath is never reassigned in this function,
    # so the cleanup in the finally block is currently a no-op — confirm
    # whether it is leftover from an earlier revision.
    script_filepath = None
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)
    try:
        with get_connection(girder_token, cluster) as conn:
            if AbstractQueueAdapter.QUEUE_JOB_ID in job:
                queue_adapter = get_queue_adapter(cluster, conn)
                output = queue_adapter.terminate_job(job)
            else:
                # Job never reached the queue; just mark it terminated.
                r = requests.patch(status_url, headers=headers,
                                   json={'status': JobState.TERMINATED})
                check_status(r)

            if 'onTerminate' in job:
                commands = '\n'.join(job['onTerminate']['commands']) + '\n'
                commands = Template(commands) \
                    .render(cluster=cluster,
                            job=job,
                            base_url=cumulus.config.girder.baseUrl)

                on_terminate = _put_script(conn, commands + '\n')

                # Run the onTerminate script in the background, capturing
                # its output so monitor_process can report errors.
                terminate_output = '%s.terminate.out' % job_id
                terminate_cmd = 'nohup %s &> %s &\n' % (on_terminate,
                                                        terminate_output)
                terminate_cmd = _put_script(conn, terminate_cmd)
                output = conn.execute(terminate_cmd)

                conn.remove(on_terminate)
                conn.remove(terminate_cmd)

                # The wrapper script is expected to echo exactly the PID of
                # the backgrounded process.
                if len(output) != 1:
                    raise Exception('PID not returned by execute command')

                try:
                    pid = int(output[0])
                except ValueError:
                    raise Exception('Unable to extract PID from: %s'
                                    % output)

                output_message = 'onTerminate error: %s'
                monitor_process.delay(cluster, job, pid, terminate_output,
                                      log_write_url=log_write_url,
                                      output_message=output_message,
                                      girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
    finally:
        if script_filepath and os.path.exists(script_filepath):
            os.remove(script_filepath)
def terminate_job(cluster, job, log_write_url=None, girder_token=None):
    """Terminate a job running on a cluster.

    Jobs known to the queuing system are terminated via the queue adapter;
    otherwise the job is marked TERMINATED directly. Any 'onTerminate'
    commands are rendered, started in the background on the cluster and
    watched by monitor_process. Failures mark the job UNEXPECTEDERROR and
    re-raise.
    """
    script_filepath = None
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)
    try:
        with get_connection(girder_token, cluster) as conn:
            if AbstractQueueAdapter.QUEUE_JOB_ID in job:
                queue_adapter = get_queue_adapter(cluster, conn)
                output = queue_adapter.terminate_job(job)
            else:
                r = requests.patch(status_url, headers=headers,
                                   json={'status': JobState.TERMINATED})
                check_status(r)

            if 'onTerminate' in job:
                commands = '\n'.join(job['onTerminate']['commands']) + '\n'
                commands = Template(commands) \
                    .render(cluster=cluster,
                            job=job,
                            base_url=cumulus.config.girder.baseUrl)

                on_terminate = _put_script(conn, commands + '\n')

                # Run the onTerminate script in the background, capturing
                # its output so monitor_process can report errors.
                terminate_output = '%s.terminate.out' % job_id
                terminate_cmd = 'nohup %s &> %s &\n' % (on_terminate,
                                                        terminate_output)
                terminate_cmd = _put_script(conn, terminate_cmd)
                output = conn.execute(terminate_cmd)

                conn.remove(on_terminate)
                conn.remove(terminate_cmd)

                if len(output) != 1:
                    raise Exception('PID not returned by execute command')

                try:
                    pid = int(output[0])
                except ValueError:
                    raise Exception('Unable to extract PID from: %s'
                                    % output)

                output_message = 'onTerminate error: %s'
                monitor_process.delay(cluster, job, pid, terminate_output,
                                      log_write_url=log_write_url,
                                      output_message=output_message,
                                      girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        # BUG FIX: ex.message only exists on Python 2 (and not even for all
        # exceptions); use str(ex) instead.
        get_job_logger(job, girder_token).exception(str(ex))
        raise
    finally:
        if script_filepath and os.path.exists(script_filepath):
            os.remove(script_filepath)
def _is_terminating(job, girder_token):
    """Return True when the job's current Girder status is TERMINATED or
    TERMINATING."""
    status_url = '%s/jobs/%s/status' % (cumulus.config.girder.baseUrl,
                                        job['_id'])
    response = requests.get(status_url,
                            headers={'Girder-Token': girder_token})
    check_status(response)

    return response.json()['status'] in [JobState.TERMINATED,
                                         JobState.TERMINATING]
def terminate_job(self, job):
    """Delete the job from the NEWT queue, raising on any reported error."""
    url = '%s/queue/%s/%s' % (NEWT_BASE_URL, self._machine,
                              job['queueJobId'])
    response = self._session.delete(url)
    check_status(response)
    payload = response.json()

    if payload['status'] != 'OK' or payload['error']:
        raise Exception(payload['error'])
def download_job_input_items(cluster, job, log_write_url=None,
                             girder_token=None):
    """Start an asynchronous download of a job's input items onto the cluster.

    Copies girderclient.py to the master node, marks the job 'downloading',
    launches the download under nohup and hands the resulting PID to
    monitor_process, which submits the job once the download completes.
    Any failure marks the job 'error' in Girder.
    """
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)
    try:
        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(fp, os.path.basename(path))

            r = requests.patch(status_url, json={'status': 'downloading'},
                               headers=headers)
            check_status(r)

            download_cmd = 'python girderclient.py --token %s --url "%s" ' \
                           'download --dir %s --job %s' \
                           % (girder_token, cumulus.config.girder.baseUrl,
                              job_directory(cluster, job), job_id)

            # Run the download in the background and capture its output.
            download_output = '%s.download.out' % job_id
            download_cmd = 'nohup %s &> %s &\n' % (download_cmd,
                                                   download_output)

            download_cmd = _put_script(conn, download_cmd)
            output = conn.execute(download_cmd)

            # Remove download script
            conn.remove(download_cmd)

            # The wrapper script is expected to echo exactly the PID of the
            # backgrounded download.
            if len(output) != 1:
                raise Exception('PID not returned by execute command')

            try:
                pid = int(output[0])
            except ValueError:
                raise Exception('Unable to extract PID from: %s' % output)

            # When the download is complete submit the job
            on_complete = submit_job.s(cluster, job,
                                       log_write_url=log_write_url,
                                       girder_token=girder_token)

            monitor_process.delay(cluster, job, pid, download_output,
                                  log_write_url=log_write_url,
                                  on_complete=on_complete,
                                  girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
def download_job_input_items(cluster, job, log_write_url=None,
                             girder_token=None):
    """Start an asynchronous download of a job's input items onto the cluster.

    Copies girderclient.py to the master node, marks the job 'downloading',
    launches the download under nohup and hands the resulting PID to
    monitor_process, which submits the job once the download completes.
    Any failure marks the job 'error' in Girder.
    """
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)
    try:
        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(fp, os.path.basename(path))

            r = requests.patch(status_url, json={'status': 'downloading'},
                               headers=headers)
            check_status(r)

            download_cmd = 'python girderclient.py --token %s --url "%s" ' \
                           'download --dir %s --job %s' \
                           % (girder_token, cumulus.config.girder.baseUrl,
                              job_directory(cluster, job), job_id)

            download_output = '%s.download.out' % job_id
            download_cmd = 'nohup %s &> %s &\n' % (download_cmd,
                                                   download_output)

            download_cmd = _put_script(conn, download_cmd)
            output = conn.execute(download_cmd)

            # Remove download script
            conn.remove(download_cmd)

            if len(output) != 1:
                raise Exception('PID not returned by execute command')

            try:
                pid = int(output[0])
            except ValueError:
                raise Exception('Unable to extract PID from: %s' % output)

            # When the download is complete submit the job
            on_complete = submit_job.s(cluster, job,
                                       log_write_url=log_write_url,
                                       girder_token=girder_token)

            monitor_process.delay(cluster, job, pid, download_output,
                                  log_write_url=log_write_url,
                                  on_complete=on_complete,
                                  girder_token=girder_token)

    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        # BUG FIX: ex.message only exists on Python 2; use str(ex).
        get_job_logger(job, girder_token).exception(str(ex))
def get(self, remote_path):
    """Stream a remote file via NEWT, yielding the raw response stream."""
    url = '%s/file/%s/%s' % (NEWT_BASE_URL, self._machine, remote_path)
    r = None
    try:
        r = self._session.get(url, params={'view': 'read'}, stream=True)
        check_status(r)
        yield r.raw
    finally:
        # Make sure the HTTP response is always released.
        if r:
            r.close()
def submit_job(self, job, job_script):
    """Submit *job_script* to the NEWT queue and return the queue job id."""
    url = '%s/queue/%s' % (NEWT_BASE_URL, self._machine)
    response = self._session.post(
        url, data={'jobfile': os.path.join(job['dir'], job_script)})
    check_status(response)
    payload = response.json()

    if payload['status'] != 'OK' or 'jobid' not in payload:
        raise Exception(payload['error'])

    return payload['jobid']
def put(self, stream, remote_path):
    """Upload *stream* to *remote_path* on the machine via NEWT.

    Relative paths are resolved against the user's home directory.
    """
    name = os.path.basename(remote_path)
    path = os.path.dirname(remote_path)

    # If not a full path then assume relative to the user's home.
    # BUG FIX: os.path.dirname() returns '' for a bare filename, so the
    # old path[0] check raised IndexError; startswith() handles that case
    # (and '' correctly falls through to the home-relative branch).
    if not path.startswith('/'):
        # Get the users home directory
        path = os.path.abspath(os.path.join(self._home_dir(), path))

    files = {'file': (name, stream)}
    url = '%s/file/%s%s' % (NEWT_BASE_URL, self._machine, path)

    r = self._session.post(url, files=files)
    check_status(r)
def get_assetstore_id(girder_token, cluster):
    """Return the id of the assetstore backed by *cluster*.

    Creates the assetstore (and records its id on the cluster model) on
    first use. Tolerates concurrent task flows: when another flow already
    created an assetstore with this cluster's name, the existing one is
    looked up and returned instead.
    """
    if 'assetstoreId' in cluster:
        return cluster['assetstoreId']

    headers = {'Girder-Token': girder_token}
    url_base = get_assetstore_url_base(cluster)
    create_url = '%s/%s' % (cumulus.config.girder.baseUrl, url_base)

    body = {
        'name': cluster['_id'],
        'host': cluster['config']['host'],
        'machine': cluster['config']['host'],
        'authKey': cluster['_id']
    }

    user = parse('config.ssh.user').find(cluster)
    if user:
        body['user'] = user[0].value

    r = requests.post(create_url, json=body, headers=headers)

    # If the assetstore has been created, patch the cluster to point to it
    if r.status_code == 200:
        assetstore_id = r.json()['_id']
        cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                          cluster['_id'])
        body = {
            'assetstoreId': assetstore_id
        }
        r = requests.patch(cluster_url, json=body, headers=headers)
        check_status(r)
    # The assetstore may have already been created by another concurrently
    # running task flow. If thats the case, just fetch it and return its id.
    elif r.status_code == 400:
        body = r.json()
        # A duplicate name is reported as a validation error on 'name'.
        if body.get('field') == 'name' and body.get('type') == 'validation':
            assetstores_url = '%s/assetstore/lookup' % (
                cumulus.config.girder.baseUrl)
            params = {'name': cluster['_id']}
            r = requests.get(assetstores_url, params=params, headers=headers)
            check_status(r)
            assetstores = r.json()
            if len(assetstores) == 0:
                raise Exception(
                    'Could not find assetstore with name "%s" even though '
                    'it should already exist.' % cluster['_id']
                )
            assetstore_id = assetstores[0]['_id']
        else:
            # Some other validation error: let check_status raise it.
            check_status(r)
    # Raise any other errors
    else:
        check_status(r)

    # Cache the id on the in-memory cluster model for subsequent calls.
    cluster['assetstoreId'] = assetstore_id
    return assetstore_id
def generate_key_pair(cluster, girder_token=None):
    """
    Task to generate a new key pair for a user.

    Writes a passphrase-protected 4096-bit RSA key into the key store and
    records the passphrase and public key on the cluster model in Girder.
    On failure the cluster status is set to 'error'.
    """
    cluster_id = cluster['_id']
    status_url = '%s/clusters/%s' \
        % (cumulus.config.girder.baseUrl, cluster_id)
    log = get_cluster_logger(cluster, girder_token)
    headers = {'Girder-Token': girder_token}

    try:
        key = RSAKey.generate(bits=4096)
        rng = random.SystemRandom()
        alphabet = string.ascii_uppercase + string.digits
        passphrase = ''.join(rng.choice(alphabet) for _ in range(64))

        key_path = os.path.join(cumulus.config.ssh.keyStore, cluster_id)
        key.write_private_key_file(key_path, password=passphrase)
        # Allow group read as well
        os.chmod(key_path, stat.S_IREAD | stat.S_IWRITE | stat.S_IRGRP)

        comment = 'cumulus generated access key'
        public_key = '%s %s %s' % (key.get_name(), key.get_base64(), comment)

        # Update passphrase and public key on cluster model
        config_update = {
            'config': {
                'ssh': {
                    'passphrase': passphrase,
                    'publicKey': public_key
                }
            },
            'status': 'created'
        }

        patch_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                        cluster_id)
        response = requests.patch(patch_url, json=config_update,
                                  headers=headers)
        check_status(response)
    except Exception as ex:
        response = requests.patch(status_url, headers=headers,
                                  json={'status': 'error'})
        check_status(response)
        # Log the error message
        log.error(ex)
def submit_job(self, job, job_script):
    """Queue *job_script* through NEWT and return the NEWT job id."""
    job_file_path = os.path.join(job['dir'], job_script)
    r = self._session.post('%s/queue/%s' % (NEWT_BASE_URL, self._machine),
                           data={'jobfile': job_file_path})
    check_status(r)
    result = r.json()

    if result['status'] != 'OK' or 'jobid' not in result:
        raise Exception(result['error'])

    return result['jobid']
def _upload_file(cluster_connection, girder_client, file, path):
    """
    Upload a file to a cluster

    :param cluster_connection: The connection to access the cluster by.
    :param girder_client: The Girder client for Girder access.
    :param file: The Girder file object.
    :param path: The path on the cluster to upload to.
    """
    url = '%s/file/%s/download' % (girder_client.urlBase, file['_id'])
    # Stream the download straight onto the cluster. Use the response as a
    # context manager so the underlying connection is always released even
    # if put() raises (previously the streamed response was never closed).
    with requests.get(url,
                      headers={'Girder-Token': girder_client.token},
                      stream=True) as r:
        check_status(r)
        cluster_connection.put(r.raw, os.path.join(path, file['name']))
def get(self, remote_path):
    """Open *remote_path* for streaming reads; yields the raw byte stream."""
    url = '%s/file/%s/%s' % (NEWT_BASE_URL, self._machine, remote_path)
    response = None
    try:
        response = self._session.get(url, params={'view': 'read'},
                                     stream=True)
        check_status(response)
        yield response.raw
    finally:
        # Always release the HTTP response, even on error.
        if response:
            response.close()
def terminate_cluster(playbook, cluster, profile, secret_key, extra_vars,
                      girder_token, log_write_url, post_status):
    """Tear down a cluster with the given ansible playbook.

    Any volumes attached to the cluster are detached from the master
    instance before the playbook runs. The cluster's status in Girder is
    then advanced to *post_status* (unless an error was recorded).
    """
    playbook = get_playbook_path(playbook)
    playbook_variables = get_playbook_variables(cluster, profile, extra_vars)

    # Credentials and callback info are passed to ansible via the env.
    env = os.environ.copy()
    env.update({
        'AWS_ACCESS_KEY_ID': profile['accessKeyId'],
        'AWS_SECRET_ACCESS_KEY': secret_key,
        'GIRDER_TOKEN': girder_token,
        'LOG_WRITE_URL': log_write_url,
        'CLUSTER_ID': cluster['_id']
    })

    # if there are any volumes, make sure to detach them first.
    if 'volumes' in cluster and len(cluster['volumes']):
        p = CloudProvider(dict(secretAccessKey=secret_key, **profile))
        master = p.get_master_instance(cluster['_id'])
        for volume_id in cluster['volumes']:
            r = requests.get('%s/volumes/%s' % (cumulus.config.girder.baseUrl,
                                                volume_id),
                             headers={'Girder-Token': girder_token})
            check_status(r)
            volume = r.json()
            girder_callback_info = {
                'girder_api_url': cumulus.config.girder.baseUrl,
                'girder_token': girder_token
            }
            # Each volume gets its own log endpoint for the detach task.
            vol_log_url = '%s/volumes/%s/log' % (cumulus.config.girder.baseUrl,
                                                 volume_id)
            detach_volume(profile, cluster, master, volume, secret_key,
                          vol_log_url, girder_callback_info)

    inventory = simple_inventory('localhost')

    with inventory.to_tempfile() as inventory_path:
        ansible = run_playbook(playbook, inventory_path, playbook_variables,
                               env=env, verbose=3)

    check_ansible_return_code(ansible, cluster, girder_token)
    check_girder_cluster_status(cluster, girder_token, post_status)
def _upload_file(cluster_connection, girder_client, file, path):
    """
    Upload a file to a cluster

    :param cluster_connection: The connection to access the cluster by.
    :param girder_client: The Girder client for Girder access.
    :param file: The Girder file object.
    :param path: The path on the cluster to upload to.
    """
    download_url = '%s/file/%s/download' % (girder_client.urlBase,
                                            file['_id'])
    response = requests.get(download_url,
                            headers={'Girder-Token': girder_client.token},
                            stream=True)
    check_status(response)

    target = os.path.join(path, file['name'])
    cluster_connection.put(response.raw, target)
def put(self, stream, remote_path):
    """Write *stream* to *remote_path* on the machine through NEWT.

    A relative path is interpreted relative to the user's home directory.
    """
    name = os.path.basename(remote_path)
    path = os.path.dirname(remote_path)

    # If not a full path then assume relative to users home.
    # BUG FIX: guard with startswith() — os.path.dirname() returns '' for
    # a bare filename, and ''[0] raised IndexError.
    if not path.startswith('/'):
        # Get the users home directory
        path = os.path.abspath(os.path.join(self._home_dir(), path))

    files = {
        'file': (name, stream)
    }
    url = '%s/file/%s%s' % (NEWT_BASE_URL, self._machine, path)

    r = self._session.post(url, files=files)
    check_status(r)
def upload_job_output_to_folder(cluster, job, log_write_url=None,
                                job_dir=None, girder_token=None):
    """Upload a job's output paths into their configured Girder folders.

    Each entry of job['output'] that has both 'folderId' and 'path' is
    pulled from the cluster into that folder through the cluster's
    assetstore. On an HttpError the job is marked ERROR. Afterwards the
    cluster may be terminated (onComplete == 'terminate') and a job still
    in the UPLOADING state is advanced to its complete state.
    """
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job['_id'])
    headers = {'Girder-Token': girder_token}
    assetstore_base_url = get_assetstore_url_base(cluster)
    assetstore_id = get_assetstore_id(girder_token, cluster)

    if not job_dir:
        job_dir = job['dir']

    try:
        with get_connection(girder_token, cluster) as conn:
            for output in job['output']:
                if 'folderId' in output and 'path' in output:
                    folder_id = output['folderId']
                    path = os.path.join(job_dir, output['path'])
                    download_path(conn, girder_token, folder_id, path,
                                  assetstore_base_url, assetstore_id)
    except HttpError as e:
        job['status'] = JobState.ERROR
        url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
        logger = get_post_logger('job', girder_token, url)
        logger.exception(e.responseText)
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.ERROR})
        check_status(r)

    if _get_on_complete(job) == 'terminate':
        cluster_log_url = '%s/clusters/%s/log' % \
            (cumulus.config.girder.baseUrl, cluster['_id'])
        command.send_task(
            'cumulus.tasks.cluster.terminate_cluster',
            args=(cluster,),
            kwargs={'log_write_url': cluster_log_url,
                    'girder_token': girder_token})

    # If we where uploading move job to the complete state
    # NOTE(review): after an HttpError above, job['status'] is ERROR so this
    # branch is skipped. When it does run, `conn` is referenced after its
    # `with` block has exited, so the connection has already been cleaned
    # up — confirm from_string()/job_status.run() do not perform I/O on it.
    if job['status'] == JobState.UPLOADING:
        job_status = from_string(job['status'], task=None, cluster=cluster,
                                 job=job, log_write_url=log_write_url,
                                 girder_token=girder_token, conn=conn)
        job_status = Complete(job_status)
        job_status = job_status.next(JobQueueState.COMPLETE)
        job_status.run()
        r = requests.patch(status_url, headers=headers,
                           json={'status': str(job_status)})
        check_status(r)
def execute(self, command, ignore_exit_status=False, source_profile=True):
    """Run *command* on the machine via NEWT; returns output lines.

    :raises NewtException: if NEWT reports an error for the command.
    """
    url = '%s/command/%s' % (NEWT_BASE_URL, self._machine)

    # NEWT requires all commands are issued using a full executable path
    for (name, full_path) in six.iteritems(commands):
        command = re.sub(r'^%s[ ]*' % name, '%s ' % full_path, command)

    response = self._session.post(url, data={'executable': command,
                                             'loginenv': source_profile})
    check_status(response)
    payload = response.json()

    if payload['error']:
        raise NewtException(payload['error'])

    return payload['output'].split('\n')
def check_girder_cluster_status(cluster, girder_token, post_status):
    """Advance the cluster's Girder status to *post_status* unless it is
    already marked 'error'.
    """
    # Check status from girder
    cluster_id = cluster['_id']
    headers = {'Girder-Token': girder_token}
    status_url = '%s/clusters/%s/status' % (cumulus.config.girder.baseUrl,
                                            cluster_id)
    r = requests.get(status_url, headers=headers)
    # BUG FIX: fail fast on a bad response instead of masking it with a
    # JSON decode error below — every other request in this module is
    # checked with check_status().
    check_status(r)
    status = r.json()['status']

    if status != 'error':
        # Update girder with the new status
        status_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                         cluster_id)
        updates = {'status': post_status}
        r = requests.patch(status_url, headers=headers, json=updates)
        check_status(r)
def generate_key_pair(cluster, girder_token=None):
    """
    Task to generate a new key pair for a user.

    Writes a passphrase-protected 4096-bit RSA key into the key store and
    records the passphrase and public key on the cluster model in Girder.
    On failure the cluster status is set to 'error'.
    """
    cluster_id = cluster['_id']
    status_url = '%s/clusters/%s' \
        % (cumulus.config.girder.baseUrl, cluster_id)
    log = get_cluster_logger(cluster, girder_token)
    headers = {'Girder-Token': girder_token}

    try:
        new_key = RSAKey.generate(bits=4096)
        # Cryptographically strong 64-character passphrase.
        passphrase = ''.join(random.SystemRandom()
                             .choice(string.ascii_uppercase + string.digits)
                             for _ in range(64))
        key_path = os.path.join(cumulus.config.ssh.keyStore, cluster_id)

        new_key.write_private_key_file(key_path, password=passphrase)
        # Allow group read as well
        os.chmod(key_path, stat.S_IREAD | stat.S_IWRITE | stat.S_IRGRP)

        comment = 'cumulus generated access key'
        public_key = '%s %s %s' % (new_key.get_name(), new_key.get_base64(),
                                   comment)

        # Update passphrase and public key on cluster model
        config_update = {
            'config': {
                'ssh': {
                    'passphrase': passphrase,
                    'publicKey': public_key
                }
            },
            'status': 'created'
        }

        patch_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                        cluster_id)
        request = requests.patch(patch_url, json=config_update,
                                 headers=headers)
        check_status(request)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        # Log the error message.
        # BUG FIX: ex.message only exists on Python 2; use str(ex).
        log.error(str(ex))
def check_girder_cluster_status(cluster, girder_token, post_status):
    """Move the cluster to *post_status* in Girder unless it has errored."""
    base = cumulus.config.girder.baseUrl
    headers = {'Girder-Token': girder_token}

    # Check status from girder
    r = requests.get('%s/clusters/%s/status' % (base, cluster['_id']),
                     headers=headers)
    if r.json()['status'] != 'error':
        # Update girder with the new status
        r = requests.patch('%s/clusters/%s' % (base, cluster['_id']),
                           headers=headers, json={'status': post_status})
        check_status(r)
def list(self, remote_path):
    """Yield directory entries for *remote_path*, adding a numeric 'mode'.

    Relative paths are resolved against the user's home directory.
    """
    # BUG FIX: startswith() avoids an IndexError on an empty path (the old
    # remote_path[0] check raised on ''); '' still resolves to home.
    if not remote_path.startswith('/'):
        # Get the users home directory
        remote_path = os.path.abspath(os.path.join(self._home_dir(),
                                                   remote_path))

    url = '%s/file/%s/%s' % (NEWT_BASE_URL, self._machine, remote_path)
    r = self._session.get(url)
    check_status(r)

    paths = r.json()
    for path in paths:
        # Replace the textual permission string with a numeric mode.
        perms = path['perms']
        del path['perms']
        del path['hardlinks']
        path['mode'] = self._perms_to_mode(perms)
        yield path
def __enter__(self):
    """Open a NEWT session, fetching the session id from Girder on first use."""
    # Do we need to get the session id for this user
    if not self._newt_session_id:
        url = '%s/newt/sessionId' % cumulus.config.girder.baseUrl
        r = requests.get(url, headers={'Girder-Token': self._girder_token})
        check_status(r)

        match = parse('sessionId').find(r.json())
        if not match:
            raise Exception('No NEWT session ID present')

        self._session = requests.Session()
        self._newt_session_id = match[0].value
        self._session.cookies.set('newt_sessionid', self._newt_session_id)

    return self
def list(self, remote_path):
    """List *remote_path*, yielding entries with numeric 'mode' and 'size'."""
    if remote_path[0] != '/':
        # Resolve relative paths against the user's home directory.
        remote_path = os.path.abspath(
            os.path.join(self._home_dir(), remote_path))

    response = self._session.get('%s/file/%s/%s' % (NEWT_BASE_URL,
                                                    self._machine,
                                                    remote_path))
    check_status(response)

    for entry in response.json():
        # Normalize the NEWT listing: numeric mode and integer size.
        perms = entry.pop('perms')
        del entry['hardlinks']
        entry['mode'] = self._perms_to_mode(perms)
        entry['size'] = int(entry['size'])
        yield entry
def job_statuses(self, jobs):
    """Return [(job, queue_state)] for *jobs* from the NEWT queue listing."""
    user = parse('config.user').find(self._cluster)
    if not user:
        raise Exception('Unable to extract user from cluster '
                        'configuration.')
    user = user[0].value

    url = '%s/queue/%s?user=%s' % (NEWT_BASE_URL, self._machine, user)
    r = self._session.get(url)
    check_status(r)
    queue = r.json()

    # Map each job's raw SLURM state to our queue state.
    return [(job,
             self.to_job_queue_state(self._extract_job_status(queue, job)))
            for job in jobs]
def execute(self, command, ignore_exit_status=False, source_profile=True):
    """Run *command* through the NEWT command endpoint.

    :returns: The command's output split into a list of lines.
    :raises NewtException: if NEWT reports an error for the command.
    """
    url = '%s/command/%s' % (NEWT_BASE_URL, self._machine)

    # NEWT requires all commands are issued using a full executable path
    # BUG FIX: dict.iteritems() only exists on Python 2; items() works on
    # both Python 2 and 3.
    for (name, full_path) in commands.items():
        command = re.sub(r'^%s[ ]*' % name, '%s ' % full_path, command)

    data = {
        'executable': command,
        'loginenv': source_profile
    }

    r = self._session.post(url, data=data)
    check_status(r)
    json_response = r.json()

    if json_response['error']:
        raise NewtException(json_response['error'])

    return json_response['output'].split('\n')
def terminate_cluster(playbook, cluster, profile, secret_key, extra_vars,
                      girder_token, log_write_url, post_status):
    """Tear down a cluster via the given ansible playbook.

    Detaches any attached volumes from the master instance first, then runs
    the playbook and updates the cluster's status in Girder.
    """
    playbook = get_playbook_path(playbook)
    playbook_variables = get_playbook_variables(cluster, profile, extra_vars)

    # Credentials and callback info are passed to ansible via the env.
    env = os.environ.copy()
    env.update({'AWS_ACCESS_KEY_ID': profile['accessKeyId'],
                'AWS_SECRET_ACCESS_KEY': secret_key,
                'GIRDER_TOKEN': girder_token,
                'LOG_WRITE_URL': log_write_url,
                'CLUSTER_ID': cluster['_id']})

    # if there are any volumes, make sure to detach them first.
    if 'volumes' in cluster and len(cluster['volumes']):
        p = CloudProvider(dict(secretAccessKey=secret_key, **profile))
        master = p.get_master_instance(cluster['_id'])
        for volume_id in cluster['volumes']:
            r = requests.get('%s/volumes/%s' % (cumulus.config.girder.baseUrl,
                                                volume_id),
                             headers={'Girder-Token': girder_token})
            check_status(r)
            volume = r.json()
            girder_callback_info = {
                'girder_api_url': cumulus.config.girder.baseUrl,
                'girder_token': girder_token}
            # NOTE(review): no per-volume log url is passed here — confirm
            # this matches the current detach_volume() signature.
            detach_volume(profile, cluster, master, volume, secret_key,
                          girder_callback_info)

    inventory = simple_inventory('localhost')

    with inventory.to_tempfile() as inventory_path:
        ansible = run_playbook(playbook, inventory_path, playbook_variables,
                               env=env, verbose=3)

    check_ansible_return_code(ansible, cluster, girder_token)
    check_girder_cluster_status(cluster, girder_token, post_status)
def launch_cluster(playbook, cluster, profile, secret_key, extra_vars,
                   girder_token, log_write_url, post_status):
    """Launch a cluster by running the given ansible playbook.

    After the playbook completes, the master instance's public address is
    recorded in the cluster's config in Girder, the ansible return code is
    checked, and the cluster status is advanced to *post_status*.
    """
    playbook = get_playbook_path(playbook)
    playbook_variables = get_playbook_variables(cluster, profile, extra_vars)

    # Credentials and callback info are passed to ansible via the env.
    env = os.environ.copy()
    env.update({
        'AWS_ACCESS_KEY_ID': profile['accessKeyId'],
        'AWS_SECRET_ACCESS_KEY': secret_key,
        'GIRDER_TOKEN': girder_token,
        'LOG_WRITE_URL': log_write_url,
        'CLUSTER_ID': cluster['_id']
    })

    inventory = simple_inventory('localhost')

    with inventory.to_tempfile() as inventory_path:
        ansible = run_playbook(playbook, inventory_path, playbook_variables,
                               env=env, verbose=3)

    # Record the master node's public address on the cluster model.
    p = CloudProvider(dict(secretAccessKey=secret_key, **profile))
    master = p.get_master_instance(cluster['_id'])
    status_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                     cluster['_id'])
    updates = {
        'config': {
            'host': master['public_ip']
        }
    }
    headers = {'Girder-Token': girder_token}
    r = requests.patch(status_url, headers=headers, json=updates)
    check_status(r)

    check_ansible_return_code(ansible, cluster, girder_token)
    check_girder_cluster_status(cluster, girder_token, post_status)
def generate_key_pair(aws_profile, girder_token):
    """Create an EC2 key pair for *aws_profile* and store it locally.

    The profile's status (and any error message) is patched back to Girder
    whether or not the key creation succeeded.
    """
    try:
        client = get_ec2_client(aws_profile)
        key_path = _key_path(aws_profile)
        key_pair = client.create_key_pair(KeyName=aws_profile['_id'])
        with open(key_path, 'wb') as fp:
            fp.write(key_pair['KeyMaterial'].encode('utf8'))
        # Private key: owner read-only.
        os.chmod(key_path, stat.S_IRUSR)
        aws_profile['status'] = 'available'
    except Exception as ex:
        aws_profile['status'] = 'error'
        aws_profile['errorMessage'] = '%s: %s' % (type(ex).__name__, ex)
        traceback.print_exc()

    update_url = '%s/user/%s/aws/profiles/%s' % (
        cumulus.config.girder.baseUrl, aws_profile['userId'],
        aws_profile['_id'])
    r = requests.patch(update_url, json=aws_profile,
                       headers={'Girder-Token': girder_token})
    check_status(r)
def monitor_process(task, cluster, job, pid, nohup_out_path,
                    log_write_url=None, on_complete=None,
                    output_message='Job download/upload error: %s',
                    girder_token=None):
    """Poll a background process on the cluster until it exits.

    While the process with *pid* is alive the task reschedules itself every
    ~5 seconds. Once it exits, the nohup output file is inspected: any
    output marks the job ERROR; otherwise the optional *on_complete*
    signature is fired and a job in the UPLOADING state is advanced to its
    complete state. Unexpected failures mark the job UNEXPECTEDERROR and
    re-raise.
    """
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # See if the process is still running
            output = conn.execute('ps %s | grep %s' % (pid, pid),
                                  ignore_exit_status=True,
                                  source_profile=False)

            if len(output) > 0:
                # Process is still running so schedule self again in about 5
                # secs
                # N.B. throw=False to prevent Retry exception being raised
                task.retry(throw=False, countdown=5)
            else:
                try:
                    nohup_out_file_name = os.path.basename(nohup_out_path)

                    # Log the output
                    with conn.get(nohup_out_path) as fp:
                        output = fp.read()
                        if output.strip():
                            log.error(output_message % output)
                            # If we have output then set the error state on
                            # the job and return
                            r = requests.patch(
                                status_url, headers=headers,
                                json={'status': JobState.ERROR})
                            check_status(r)
                            return
                finally:
                    if nohup_out_file_name and \
                            os.path.exists(nohup_out_file_name):
                        os.remove(nohup_out_file_name)

                # Fire off the on_compete task if we have one
                if on_complete:
                    signature(on_complete).delay()

                # If we where uploading move job to the complete state
                if job['status'] == JobState.UPLOADING:
                    job_status = from_string(job['status'], task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = Complete(job_status)
                    job_status = job_status.next(JobQueueState.COMPLETE)
                    job_status.run()
                    r = requests.patch(status_url, headers=headers,
                                       json={'status': str(job_status)})
                    check_status(r)
    except EOFError:
        # Try again
        task.retry(throw=False, countdown=5)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        # BUG FIX: ex.message only exists on Python 2; use str(ex).
        get_job_logger(job, girder_token).exception(str(ex))
        raise
def upload_job_output_to_item(cluster, job, log_write_url=None, job_dir=None, girder_token=None): headers = {'Girder-Token': girder_token} job_id = job['_id'] status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id) try: # if terminating break out if _is_terminating(job, girder_token): return with get_connection(girder_token, cluster) as conn: # First put girder client on master path = inspect.getsourcefile(cumulus.girderclient) with open(path, 'r') as fp: conn.put( fp, os.path.normpath( os.path.join(job_dir, '..', os.path.basename(path)))) cmds = ['cd %s' % job_dir] upload_cmd = 'python ../girderclient.py --token %s --url "%s" ' \ 'upload --job %s' \ % (girder_token, cumulus.config.girder.baseUrl, job['_id']) upload_output = '%s.upload.out' % job_id upload_output_path = os.path.normpath( os.path.join(job_dir, '..', upload_output)) cmds.append('nohup %s &> ../%s &\n' % (upload_cmd, upload_output)) upload_cmd = _put_script(conn, '\n'.join(cmds)) output = conn.execute(upload_cmd) # Remove upload script conn.remove(upload_cmd) if len(output) != 1: raise Exception('PID not returned by execute command') try: pid = int(output[0]) except ValueError: raise Exception('Unable to extract PID from: %s' % output) on_complete = None if _get_on_complete(job) == 'terminate': cluster_log_url = '%s/clusters/%s/log' % \ (cumulus.config.girder.baseUrl, cluster['_id']) on_complete = signature('cumulus.tasks.cluster.terminate_cluster', args=(cluster, ), kwargs={ 'log_write_url': cluster_log_url, 'girder_token': girder_token }) monitor_process.delay(cluster, job, pid, upload_output_path, log_write_url=log_write_url, on_complete=on_complete, girder_token=girder_token) except Exception as ex: r = requests.patch(status_url, headers=headers, json={'status': JobState.UNEXPECTEDERROR}) check_status(r) get_job_logger(job, girder_token).exception(str(ex))
def monitor_process(task, cluster, job, pid, nohup_out_path,
                    log_write_url=None, on_complete=None,
                    output_message='Job download/upload error: %s',
                    girder_token=None):
    """Poll a remote background process until it exits.

    While ``pid`` is alive on the cluster the task reschedules itself
    every ~5 seconds. After the process exits, any content in its nohup
    output file is treated as an error (job moved to ERROR). Otherwise
    the optional ``on_complete`` signature is fired and an UPLOADING job
    is advanced to COMPLETE through the job state machine.

    :param task: The bound Celery task (used for retry scheduling).
    :param cluster: The cluster model the process runs on.
    :param job: The job model being monitored.
    :param pid: Remote process id to watch.
    :param nohup_out_path: Remote path of the process's nohup output.
    :param on_complete: Optional Celery signature to fire on completion.
    :param output_message: %-format template used when logging nohup
        output as an error.
    :param girder_token: Token used to authenticate with Girder.
    """
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # See if the process is still running
            output = conn.execute('ps %s | grep %s' % (pid, pid),
                                  ignore_exit_status=True,
                                  source_profile=False)

            if len(output) > 0:
                # Process is still running so schedule self again in about 5
                # secs
                # N.B. throw=False to prevent Retry exception being raised
                task.retry(throw=False, countdown=5)
            else:
                try:
                    nohup_out_file_name = os.path.basename(nohup_out_path)

                    # Log the output
                    with conn.get(nohup_out_path) as fp:
                        output = fp.read()
                        if output.strip():
                            log.error(output_message % output)
                            # If we have output then set the error state on the
                            # job and return
                            r = requests.patch(
                                status_url, headers=headers,
                                json={'status': JobState.ERROR})
                            check_status(r)
                            return
                finally:
                    # Clean up the local copy of the nohup file if present.
                    if nohup_out_file_name and \
                            os.path.exists(nohup_out_file_name):
                        os.remove(nohup_out_file_name)

                # Fire off the on_compete task if we have one
                if on_complete:
                    signature(on_complete).delay()

                # If we where uploading move job to the complete state
                if job['status'] == JobState.UPLOADING:
                    job_status = from_string(job['status'], task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = Complete(job_status)
                    job_status = job_status.next(JobQueueState.COMPLETE)
                    job_status.run()
                    r = requests.patch(status_url, headers=headers,
                                       json={'status': str(job_status)})
                    check_status(r)
    except EOFError:
        # Try again
        task.retry(throw=False, countdown=5)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
def submit_job(cluster, job, log_write_url=None, girder_token=None,
               monitor=True):
    """Generate a submission script for a job and submit it to the queue.

    Resolves the number of slots (job params, then cluster config, then
    the parallel environment), writes the generated script into the
    job's directory on the cluster, submits it via the cluster's queue
    adapter and patches the job to QUEUED. Optionally schedules
    :func:`monitor_job` to track progress. Failures mark the job
    UNEXPECTEDERROR and re-raise.

    :param cluster: The cluster model to submit to.
    :param job: The job model to submit.
    :param girder_token: Token used to authenticate with Girder.
    :param monitor: Whether to schedule job monitoring after submission.
    """
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception('Unable to fetch users home directory.')

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided we will get
                # the number of slots from the parallel environment
                if slots == -1:
                    slots = int(get_queue_adapter(cluster, conn)
                                .number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.mkdir(job_dir, ignore_failure=True)
            # put the script to master
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id \
                = get_queue_adapter(cluster, conn).submit_job(job,
                                                              script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the jobs progress
            if monitor:
                monitor_job.s(
                    cluster, job, log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        headers = {'Girder-Token': girder_token}
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        # str(ex), not ex.message: exceptions have no .message attribute on
        # Python 3 (matches the other copy of this function in this file).
        get_job_logger(job, girder_token).exception(str(ex))
        raise
def _monitor_jobs(task, cluster, jobs, log_write_url=None, girder_token=None,
                  monitor_interval=5):
    """Poll the queue status of a batch of jobs and update Girder.

    For each job, the current Girder status is combined with the queue
    state through the job state machine; the resulting status, timings
    and output are patched back to Girder. If any job is still in a
    running state the task reschedules itself after ``monitor_interval``
    seconds. Connection problems retry; other failures mark the cluster
    as errored and re-raise.

    :param task: The bound Celery task (used for retry scheduling).
    :param cluster: The cluster model the jobs run on.
    :param jobs: The job models to monitor.
    :param girder_token: Token used to authenticate with Girder.
    :param monitor_interval: Seconds between polling rounds.
    """
    headers = {'Girder-Token': girder_token}
    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                      cluster['_id'])
    try:
        with get_connection(girder_token, cluster) as conn:
            # Inner try: transient failures while talking to the queue
            # simply retry; connection-setup failures are handled by the
            # outer handlers below.
            try:
                job_queue_states \
                    = get_queue_adapter(cluster, conn).job_statuses(jobs)

                new_states = set()
                for (job, state) in job_queue_states:
                    job_id = job['_id']
                    # First get the current status
                    status_url = '%s/jobs/%s/status' % (
                        cumulus.config.girder.baseUrl, job_id)
                    r = requests.get(status_url, headers=headers)
                    check_status(r)
                    current_status = r.json()['status']

                    # Terminated jobs are left alone.
                    if current_status == JobState.TERMINATED:
                        continue

                    job_status = from_string(current_status, task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = job_status.next(state)
                    job['status'] = str(job_status)
                    job_status.run()
                    json = {
                        'status': str(job_status),
                        'timings': job.get('timings', {}),
                        'output': job['output']
                    }

                    job_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl,
                                              job['_id'])
                    r = requests.patch(job_url, headers=headers, json=json)
                    check_status(r)

                    new_states.add(job['status'])

                # Now see if we still have jobs to monitor
                running_states = set([
                    JobState.CREATED,
                    JobState.QUEUED,
                    JobState.RUNNING,
                    JobState.TERMINATING
                ])

                # Do we have any job still in a running state?
                if new_states & running_states:
                    task.retry(countdown=monitor_interval)
            except EOFError:
                # Try again
                task.retry(countdown=5)
                return
            except paramiko.ssh_exception.NoValidConnectionsError:
                # Try again
                task.retry(countdown=5)
                return
    # Ensure that the Retry exception will get through
    except Retry:
        raise
    except paramiko.ssh_exception.NoValidConnectionsError as ex:
        # Connection could not even be established: flag the cluster.
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
        raise
def _monitor_jobs(task, cluster, jobs, log_write_url=None, girder_token=None,
                  monitor_interval=5):
    """Poll the queue status of a batch of jobs and update Girder.

    For each job, the current Girder status is combined with the queue
    state through the job state machine; the resulting status, timings
    and output are patched back to Girder. If any job is still in a
    running state the task reschedules itself after ``monitor_interval``
    seconds. Connection problems retry; other failures mark the cluster
    as errored and re-raise.

    :param task: The bound Celery task (used for retry scheduling).
    :param cluster: The cluster model the jobs run on.
    :param jobs: The job models to monitor.
    :param girder_token: Token used to authenticate with Girder.
    :param monitor_interval: Seconds between polling rounds.
    """
    headers = {'Girder-Token': girder_token}
    cluster_url = '%s/clusters/%s' % (
        cumulus.config.girder.baseUrl, cluster['_id'])
    try:
        with get_connection(girder_token, cluster) as conn:
            try:
                job_queue_states \
                    = get_queue_adapter(cluster, conn).job_statuses(jobs)

                new_states = set()
                for (job, state) in job_queue_states:
                    job_id = job['_id']
                    # First get the current status
                    status_url = '%s/jobs/%s/status' % (
                        cumulus.config.girder.baseUrl, job_id)
                    r = requests.get(status_url, headers=headers)
                    check_status(r)
                    current_status = r.json()['status']

                    if current_status == JobState.TERMINATED:
                        continue

                    job_status = from_string(current_status, task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = job_status.next(state)
                    job['status'] = str(job_status)
                    job_status.run()
                    json = {
                        'status': str(job_status),
                        'timings': job.get('timings', {}),
                        'output': job['output']
                    }

                    job_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl,
                                              job['_id'])
                    r = requests.patch(job_url, headers=headers, json=json)
                    check_status(r)

                    new_states.add(job['status'])

                # Now see if we still have jobs to monitor
                running_states = set(
                    [JobState.CREATED, JobState.QUEUED, JobState.RUNNING,
                     JobState.TERMINATING]
                )

                # Do we have any job still in a running state?
                if new_states & running_states:
                    task.retry(countdown=monitor_interval)
            except EOFError:
                # Try again
                task.retry(countdown=5)
                return
            except paramiko.ssh_exception.NoValidConnectionsError:
                # Try again (unused binding removed)
                task.retry(countdown=5)
                return
    # Ensure that the Retry exception will get through
    except Retry:
        raise
    except paramiko.ssh_exception.NoValidConnectionsError as ex:
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        # str(ex), not ex.message: exceptions have no .message attribute on
        # Python 3 (matches the other copy of this function in this file).
        get_cluster_logger(cluster, girder_token).exception(str(ex))
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
        raise
def upload_job_output_to_item(cluster, job, log_write_url=None, job_dir=None, girder_token=None): headers = {'Girder-Token': girder_token} job_id = job['_id'] status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id) try: # if terminating break out if _is_terminating(job, girder_token): return with get_connection(girder_token, cluster) as conn: # First put girder client on master path = inspect.getsourcefile(cumulus.girderclient) with open(path, 'r') as fp: conn.put(fp, os.path.normpath(os.path.join(job_dir, '..', os.path.basename(path)))) cmds = ['cd %s' % job_dir] upload_cmd = 'python ../girderclient.py --token %s --url "%s" ' \ 'upload --job %s' \ % (girder_token, cumulus.config.girder.baseUrl, job['_id']) upload_output = '%s.upload.out' % job_id upload_output_path = os.path.normpath(os.path.join(job_dir, '..', upload_output)) cmds.append('nohup %s &> ../%s &\n' % (upload_cmd, upload_output)) upload_cmd = _put_script(conn, '\n'.join(cmds)) output = conn.execute(upload_cmd) # Remove upload script conn.remove(upload_cmd) if len(output) != 1: raise Exception('PID not returned by execute command') try: pid = int(output[0]) except ValueError: raise Exception('Unable to extract PID from: %s' % output) on_complete = None if _get_on_complete(job) == 'terminate': cluster_log_url = '%s/clusters/%s/log' % \ (cumulus.config.girder.baseUrl, cluster['_id']) on_complete = signature( 'cumulus.tasks.cluster.terminate_cluster', args=(cluster,), kwargs={'log_write_url': cluster_log_url, 'girder_token': girder_token}) monitor_process.delay(cluster, job, pid, upload_output_path, log_write_url=log_write_url, on_complete=on_complete, girder_token=girder_token) except Exception as ex: r = requests.patch(status_url, headers=headers, json={'status': JobState.UNEXPECTEDERROR}) check_status(r) get_job_logger(job, girder_token).exception(ex.message)
def submit_job(cluster, job, log_write_url=None, girder_token=None,
               monitor=True):
    """Generate a submission script for a job and submit it to the queue.

    Resolves the number of slots (job params, then cluster config, then
    the parallel environment), writes the generated script into the
    job's directory on the cluster, submits it via the cluster's queue
    adapter and patches the job to QUEUED. Optionally schedules
    :func:`monitor_job` to track progress. Failures mark the job
    UNEXPECTEDERROR and re-raise.

    :param cluster: The cluster model to submit to.
    :param job: The job model to submit.
    :param girder_token: Token used to authenticate with Girder.
    :param monitor: Whether to schedule job monitoring after submission.
    """
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception('Unable to fetch users home directory.')

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided we will get
                # the number of slots from the parallel environment
                if slots == -1:
                    slots = int(
                        get_queue_adapter(cluster,
                                          conn).number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.makedirs(job_dir)

            # put the script to master
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id \
                = get_queue_adapter(cluster, conn).submit_job(job,
                                                              script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the jobs progress
            if monitor:
                monitor_job.s(
                    cluster, job, log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        headers = {'Girder-Token': girder_token}
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise