def __init__(self, cloud=None):
    """
    Set up the manager, optionally bound to one configured cloud.

    :param cloud: name of the entry under ``cloudmesh.cloud`` in the
                  configuration data. When falsy, no driver is created and
                  ``self.os_config`` holds the whole configuration data.
    :raises KeyError: if ``OS_KEY_PATH`` is neither in the cloud's
                      credentials nor in the process environment
    """
    config = Config().data
    self.cloud = cloud
    self.driver = None
    self.key = None

    if not cloud:
        self.os_config = config
        return

    # One lookup is enough; the same value used to be fetched and
    # assigned twice in a row.
    self.os_config = config.get('cloudmesh').get('cloud').get(cloud)
    self.driver = self.get_driver(cloud)
    self.key = self.os_config.get('credentials').get('OS_KEY_PATH')
    # if we don't find OS_KEY_PATH in yaml, go to os.environ instead
    # which can be set in .bashrc
    if self.key is None:
        self.key = os.environ['OS_KEY_PATH']
def test_set(self):
    """
    A value set on one Config instance must be visible from a freshly
    constructed Config instance.

    The original ``default.cloud`` value is written back even when the
    assertion fails, so a failing run does not leave "testcloud" behind.
    """
    before = self._conf.get("default.cloud")
    self._conf.set("default.cloud", "testcloud")
    new_config = Config()
    try:
        after = new_config.get("default.cloud")
        assert before != after
    finally:
        # always restore the value that was there before the test ran
        new_config.set("default.cloud", before)
def process_arguments(arguments):
    """
    Build a cloud manager for the requested provider and run one action.

    :param arguments: mapping with the keys ``--provider``, ``start``,
                      ``stop`` and ``status``
    :return: None
    """
    requested = arguments['--provider']
    config = Config()
    provider = requested if requested else config.get("default.cloud")

    # NOTE(review): every provider name currently resolves to AWSProvider,
    # exactly as before; confirm whether other providers were meant here.
    known_providers = {"aws": AWSProvider}
    cloud_manager = known_providers.get(provider, AWSProvider)(config)

    # the first truthy action wins; at most one is executed
    for action in ('start', 'stop', 'status'):
        if arguments[action]:
            getattr(cloud_manager, action)()
            break
class TestConfig:
    """
    Functional tests for the configuration Config class
    """

    # NOTE(review): pytest does not collect test classes that define
    # __init__; the nose-style `setup` hook suggests another runner is
    # used - confirm before relying on pytest discovery.
    def __init__(self):
        self._conf = None

    def setup(self):
        """Create a fresh Config before each test."""
        self._conf = Config()

    def test_get_notfound_defaults(self):
        """Missing keys yield None, or the caller-supplied default."""
        assert self._conf.get("nothere") is None
        assert self._conf.get("nothere", {}) == {}
        assert self._conf.get("default.nothere") is None
        custom_default = {"foo": "bar"}
        assert self._conf.get("default.nothere",
                              custom_default) == custom_default

    def test_get_shorthand(self):
        """Dotted keys return the same value as chained ``get`` calls."""
        raw_result = self._conf._cloudmesh.get("default").get("cloud")
        default_result = self._conf.get("default").get("cloud")
        short_result = self._conf.get("default.cloud")
        assert short_result == default_result == raw_result

        az_conf = self._conf.get("cloud.azure")
        az_id = az_conf.get('credentials.AZURE_SUBSCRIPTION_ID')
        assert az_id is not None

    def test_set(self):
        """
        A value set on one Config instance must be visible from a freshly
        constructed Config instance.

        The original ``default.cloud`` value is written back even when the
        assertion fails, so a failing run does not leave "testcloud" behind.
        """
        before = self._conf.get("default.cloud")
        self._conf.set("default.cloud", "testcloud")
        new_config = Config()
        try:
            after = new_config.get("default.cloud")
            assert before != after
        finally:
            # always restore the value that was there before the test ran
            new_config.set("default.cloud", before)
from flask import request, jsonify
from cm4.mongo.mongoDB import MongoDB
from cm4.configuration.config import Config
from cm4.vm.Vm import Vm

# Module-level database handle, connected at import time.
config = Config()
db = MongoDB(config.get('data.mongo.MONGO_DBNAME'),
             config.get('data.mongo.MONGO_USERNAME'),
             config.get('data.mongo.MONGO_PASSWORD'),
             config.get('data.mongo.MONGO_PORT'))
db.connect_db()


def vm_list():
    """
    List virtual machines as a JSON ``records`` array.

    With a ``cloud`` query argument the nodes are read from ``Vm(cloud)``;
    without it, every document of the ``cloud`` collection is returned.

    :return: a JSON response, or a plain message string when the requested
             cloud reports no nodes
    """
    cloud = request.args.get('cloud')

    if not cloud:
        stored = db.db['cloud'].find()
        return jsonify(records=[db.var_to_json(doc) for doc in stored])

    nodes = Vm(cloud).list()
    if not nodes:
        return 'No node is found on {}!\n'.format(cloud)
    return jsonify(records=[db.var_to_json(node.__dict__) for node in nodes])
class OpenstackCM(CloudManagerABC):
    """
    Cloud manager for an OpenStack cloud.

    All cloud operations are delegated to ``self.driver``, which is built
    from the credentials of the selected cloud in the configuration.
    """

    # common
    def __init__(self, cloud=None):
        """
        Set up the manager, optionally bound to one configured cloud.

        :param cloud: name of the entry under ``cloudmesh.cloud`` in the
                      configuration data. When falsy, no driver is created
                      and ``self.os_config`` holds the whole configuration.
        :raises KeyError: if ``OS_KEY_PATH`` is neither in the cloud's
                          credentials nor in the process environment
        """
        config = Config().data
        self.cloud = cloud
        self.driver = None
        self.key = None

        if cloud:
            self.os_config = config.get('cloudmesh').get('cloud').get(cloud)
            self.driver = self.get_driver(cloud)
            self.key = self.os_config.get('credentials').get('OS_KEY_PATH')
            # if we don't find OS_KEY_PATH in yaml, go to os.environ
            # instead which can be set in .bashrc
            if self.key is None:
                self.key = os.environ['OS_KEY_PATH']
        else:
            self.os_config = config

    def _get_obj_list(self, obj_type):
        """
        List driver objects of one kind.

        :param obj_type: one of 'node', 'image', 'size' or 'ip'
        :return: the driver's list, or None for any other ``obj_type``
        """
        if obj_type == 'node':
            return self.driver.list_nodes()
        if obj_type == 'image':
            return self.driver.list_images()
        if obj_type == 'size':
            return self.driver.list_sizes()
        if obj_type == 'ip':
            return self.driver.ex_list_floating_ips()
        return None

    def _get_obj_by_name(self, obj_type, obj_name):
        """
        :return: the first object of ``obj_type`` whose ``name`` equals
                 ``obj_name``, or None when there is no match
        """
        for obj in self._get_obj_list(obj_type):
            if obj.name == obj_name:
                return obj
        return None

    def _get_obj_by_id(self, obj_type, obj_id):
        """
        :return: the first object of ``obj_type`` whose ``id`` equals
                 ``obj_id``, or None when there is no match
        """
        for obj in self._get_obj_list(obj_type):
            if obj.id == obj_id:
                return obj
        return None

    def _get_node_by_id(self, node_id):
        """
        :return: the node with the given id, or None
        """
        return self._get_obj_by_id('node', node_id)

    @staticmethod
    def _node_metadata(node):
        """
        Collect the metadata reported by ``info`` and ``nodes_info``.

        :param node: a node object returned by the driver
        :return: dict describing the node
        """
        return dict(
            id=node.id,
            name=node.name,
            state=node.state,
            public_ips=node.public_ips,
            private_ips=node.private_ips,
            size=node.size,
            image=node.image,
            created_date=node.created_at.strftime("%Y-%m-%d %H:%M:%S"),
            extra=node.extra)

    def get_driver(self, cloud=None):
        """
        Return the cached driver, creating it on first use.

        :param cloud: name of the cloud; must be truthy
        :raises ValueError: when ``cloud`` is falsy
        """
        if not cloud:
            raise ValueError('Cloud arguement is not properly configured')
        if not self.driver:
            self.driver = self.get_driver_helper(cloud)
        return self.driver

    def get_driver_helper(self, cloud):
        """
        Build a new OpenStack driver from ``self.os_config`` credentials.

        The password is taken from the configuration and, when that is
        empty, from the ``OS_PASSWORD`` environment variable.

        :param cloud: unused here; the credentials come from
                      ``self.os_config``
        :return: the driver instance
        """
        credential = self.os_config.get("credentials")
        openstack = get_driver(Provider.OPENSTACK)
        driver = openstack(
            credential.get('OS_USERNAME'),
            credential.get('OS_PASSWORD') or os.environ['OS_PASSWORD'],
            ex_force_auth_url=credential.get("OS_AUTH_URL"),
            ex_force_auth_version='2.0_password',
            ex_tenant_name=credential.get("OS_TENANT_NAME"),
            ex_force_service_region=credential.get("OS_REGION_NAME"))
        return driver

    def set_cloud(self, cloud):
        """
        switch to another cloud provider

        Only ``self.cloud`` and ``self.os_config`` are replaced; an already
        created driver is left untouched.

        :param cloud: target provider
        :return:
        """
        self.cloud = cloud
        self.os_config = Config().get('cloud.{}'.format(cloud))

    def _get_public_ip(self):
        """
        :return: the first floating ip not attached to a node, or None
        """
        ips = [x for x in self._get_obj_list('ip') if not x.node_id]
        return ips[0] if ips else None

    # API hack for new VM class
    def ex_start_node(self, node):
        return self.driver.ex_start_node(node)

    def ex_stop_node(self, info, deallocate):
        # `deallocate` is accepted for signature compatibility only; it is
        # not forwarded to the driver.
        return self.driver.ex_stop_node(info)

    def destroy_node(self, node):
        return self.driver.destroy_node(node)

    def create_node(self, name):
        return self.create(name)

    def list_nodes(self):
        return self.driver.list_nodes()

    # APIs
    def execute(self, name, command):
        """
        execute arbitrary shell command on node through ssh
        ssh funcionality must available on the local machine

        :param name: name of the VM
        :param command: shell command
        :return: decoded output of the command; when running it fails, the
                 caught exception object is returned instead of raised
        """
        node = self._get_obj_by_name('node', name)
        template = 'ssh -i {key} -o StrictHostKeyChecking=no {user}@{host} "{command}"'
        # Nested lookup, as in `create`: a plain mapping (what __init__
        # stores) would return None for the dotted key 'default.username'.
        defaults = self.os_config.get('default') or {}
        kwargs = {
            'key': os.path.splitext(self.key)[0],
            'user': defaults.get('username'),
            'host': node.public_ips[0],
            'command': command
        }
        # NOTE(review): the command line is interpolated into a local shell
        # string (shell=True); only pass trusted `command` values.
        try:
            res = subprocess.check_output(template.format(**kwargs),
                                          shell=True,
                                          input=b'\n',
                                          stderr=subprocess.STDOUT)
            return res.decode('utf8')
        except Exception as e:
            return e

    def set_public_ip(self, name, ip_str):
        """
        Attach an unassigned floating ip to a node.

        :param name: name of the VM
        :param ip_str: ip string
        :raises ValueError: when the floating ip does not exist
        :raises EnvironmentError: when the ip is already attached to a node
        """
        node = self._get_obj_by_name('node', name)
        ip_obj = self.driver.ex_get_floating_ip(ip_str)
        if not ip_obj:
            raise ValueError('Public IP addresss does not exist!')
        if ip_obj.node_id:
            raise EnvironmentError(
                'Public IP has been assigned to another machine. Pick another ip'
            )
        self.driver.ex_attach_floating_ip_to_node(node, ip_obj)

    def remove_public_ip(self, name):
        """
        Detach every public ip from a node.

        :param name: name of the VM
        """
        node = self._get_obj_by_name('node', name)
        for ip in node.public_ips:
            self.driver.ex_detach_floating_ip_from_node(node, ip)

    # standard functions
    def ls(self):
        """
        list all nodes

        :return: list of id, name, state
        """
        nodes = self.driver.list_nodes()
        return [dict(id=i.id, name=i.name, state=i.state) for i in nodes]

    def list_available_ips(self):
        """
        Print every floating ip that is not attached to a node.
        """
        index = 0
        for x in self._get_obj_list('ip'):
            if not x.node_id:
                print("available ip_{}: {}".format(index, x))
                index += 1

    def nodes_info(self):
        """
        get organized meta information about all node

        :return: metadata of node, keyed by node id
        """
        return {
            node.id: self._node_metadata(node)
            for node in self.driver.list_nodes()
        }

    def info(self, node_id):
        """
        get meta information about one node

        :param node_id:
        :return: metadata of node
        """
        return self._node_metadata(self._get_node_by_id(node_id))

    def create(self, name, image=None, size=None, timeout=300, **kwargs):
        """
        Create a node and, when a free floating ip exists, attach it.

        :param name: name of the new node; also used as key pair name
        :param image: image name; defaults to ``default.image`` of the
                      cloud configuration
        :param size: size name; defaults to ``default.flavor`` of the
                     cloud configuration
        :param timeout: seconds to wait for the node to reach the
                        'running' state before giving up on the ip
        :param kwargs: further arguments passed to the driver
        :return: the created node
        """
        # get default if needed
        defaults = self.os_config.get('default')
        image_name = image if image else defaults.get('image')
        size_name = size if size else defaults.get('flavor')

        # add to kwargs
        kwargs['name'] = name
        kwargs['image'] = self._get_obj_by_name('image', image_name)
        kwargs['size'] = self._get_obj_by_name('size', size_name)

        if self.key:
            try:
                self.driver.import_key_pair_from_file(name, self.key)
            except Exception as e:
                # best effort: an already imported key pair is not fatal
                print(e)
                print(
                    "If exception code is 409 Conflict Key pair is already exists, we can still proceed without key importation"
                )
            kwargs['ex_keyname'] = name

        # create node
        node = self.driver.create_node(**kwargs)

        # attach ip if available
        # in case of error, need timeout to make sure we do attachment
        # after the node has been spawned
        ip = self._get_public_ip()
        if ip:
            timeout_counter = 0
            while self.info(node.id)['state'] != 'running':
                if timeout_counter > timeout:
                    print(
                        "Node is being spawned for too long, float ip association is failed"
                    )
                    return node
                sleep(3)
                timeout_counter += 3
            self.driver.ex_attach_floating_ip_to_node(node, ip)
        return node

    def start(self, node_id):
        """
        start the node

        :param node_id:
        :return: True/False
        """
        node = self._get_node_by_id(node_id)
        return self.driver.ex_start_node(node)

    def stop(self, node_id):
        """
        stop the node

        :param node_id:
        :return:
        """
        node = self._get_node_by_id(node_id)
        return self.driver.ex_stop_node(node)

    def suspend(self, node_id):
        """
        suspend the node

        :param node_id:
        :return: True/False
        """
        node = self._get_node_by_id(node_id)
        return self.driver.ex_suspend_node(node)

    def resume(self, node_id):
        """
        resume the node

        :param node_id:
        :return: True/False
        """
        node = self._get_node_by_id(node_id)
        return self.driver.ex_resume_node(node)

    def reboot(self, node_id):
        """
        reboot the node

        :param node_id:
        :return: True/False
        """
        node = self._get_node_by_id(node_id)
        return self.driver.reboot_node(node)

    def destroy(self, node_id):
        """
        delete the node

        :param node_id:
        :return: True/False
        """
        node = self._get_node_by_id(node_id)
        return self.driver.destroy_node(node)
class SlurmCluster(object):
    """
    Creates, submits, fetches and cleans batch jobs on a remote cluster
    reached through ``ssh``/``scp``; job and cluster settings are stored
    in a workspace configuration file next to this module.
    """

    def __init__(self, debug):
        """
        Initializes the SlurmCluster class

        :param debug: switch the debug information on and off
        """
        current_path = os.path.dirname(os.path.realpath(__file__))
        self.workspace = os.path.join(current_path,
                                      "batch_workspace/slurm_batch.yaml")
        if not os.path.exists(os.path.dirname(self.workspace)):
            os.makedirs(os.path.dirname(self.workspace))
        self.cm_config = Config()
        self.batch_config = GenericConfig(self.workspace)
        self.debug = debug
        self.all_jobIDs = []
        self.slurm_cluster = {}
        self.job_metadata = {}

    def create(self,
               job_name,
               slurm_cluster_name,
               slurm_script_path,
               input_type,
               job_script_path,
               argfile_path,
               remote_path,
               local_path,
               suffix,
               overwrite=False):
        """
        This method is used to create a job for running on remote slurm
        cluster

        :param job_name: name of the job to create
        :param slurm_cluster_name: slurm cluster on which the job is gonna
                run
        :param slurm_script_path: path of the slurm script
        :param input_type: type of the input for the script that is going
                to be run on remote cluster, possible values: params,
                params+file
        :param job_script_path: path of the file that is going to be run on
                the cluster via slurm script
        :param argfile_path: path of the file that has to be passed to the
                file as an argument if any
        :param remote_path: path in the remotes on which the scripts is
                gonna be copied to and ran from
        :param local_path: local path to which the results are gonna be
                copied
        :param suffix: suffix of the filenames in the job
        :param overwrite: if the job already exists, this flag overwrites
                the previous job with the same name
        :raises RuntimeError: when the job already exists and ``overwrite``
                is False
        :return:
        """
        existing_jobs = self.batch_config.get('job-metadata')
        if existing_jobs is not None and job_name in existing_jobs \
                and overwrite is False:
            # The message used to embed the indentation that followed a
            # backslash continuation inside the string literal.
            raise RuntimeError(
                "The job {} exists in the configuration file, if you want "
                "to overwrite the job, use --overwrite argument.".format(
                    job_name))

        # NOTE(review): other callers address Config with root-relative
        # keys such as 'cluster.<name>'; confirm that get('cloudmesh') is
        # valid for the Config version in use.
        slurm_cluster = self.cm_config.get('cloudmesh').get(
            'cluster')[slurm_cluster_name]
        tmp_cluster = {slurm_cluster_name: dict(slurm_cluster)}
        self.batch_config.deep_set(['slurm_cluster'], tmp_cluster)

        script_name = ntpath.basename(job_script_path)
        slurm_script_name = ntpath.basename(slurm_script_path)
        # trailing '' makes the job directory end with a path separator
        job_remote_path = os.path.join(remote_path, 'job' + suffix, '')
        job_metadata = {
            job_name: {
                'suffix': suffix,
                'slurm_cluster_name': slurm_cluster_name,
                'input_type': input_type,
                'raw_remote_path': remote_path,
                'slurm_script_path': os.path.abspath(slurm_script_path),
                'job_script_path': os.path.abspath(job_script_path),
                'argfile_path': os.path.abspath(argfile_path),
                'argfile_name': ntpath.basename(argfile_path),
                'script_name': script_name,
                'slurm_script_name': slurm_script_name,
                'remote_path': job_remote_path,
                'remote_script_path': os.path.join(job_remote_path,
                                                   script_name),
                'remote_slurm_script_path': os.path.join(
                    job_remote_path, slurm_script_name),
                'local_path': local_path,
            }
        }
        self.job_metadata = job_metadata[job_name]
        self.batch_config.deep_set(['job-metadata'], job_metadata)

    @staticmethod
    def _execute_in_parallel(func_args):
        """
        This is a method used for running methods in parallel

        :param func_args: sequence of (target object, method name,
                *positional arguments)
        :return: whatever the called method returns
        """
        target_class = func_args[0]
        method_to_call = getattr(target_class, func_args[1])
        args = list(func_args[2:])
        return method_to_call(*args)

    def _fetch_results_in_parallel(self, job_metadata, job_id, all_job_ids):
        """
        This method is used to fetch the results from remote nodes.

        The job directory is copied once ``qstat`` no longer lists the job
        or lists it with a ' c ' column; the copied scripts (and the
        argument file for 'params+file' jobs) are then deleted from the
        local copy.

        :param job_metadata: the dictionary containing the information
                about the previously submitted job
        :param job_id: id of the remote job, as stored by ``run``
        :param all_job_ids: shared list of pending job ids; ``job_id`` is
                removed from it once its results are collected
        :return:
        """
        dest_node_info = self.slurm_cluster
        dest_job_id = job_id
        ssh_caller = lambda *x: self._ssh(
            dest_node_info['name'],
            os.path.expanduser(dest_node_info['credentials']['sshconfigpath']),
            *x)
        scp_caller = lambda *x: self._scp(
            dest_node_info['name'],
            os.path.expanduser(dest_node_info['credentials']['sshconfigpath']),
            *x)
        ps_output = ssh_caller("qstat -u $USER | grep %s" % job_id)
        if len(ps_output) == 0 or ' c ' in ps_output.lower():
            if not os.path.exists(job_metadata['local_path']):
                os.makedirs(job_metadata['local_path'])
            scp_caller(
                '-r',
                '%s:%s' % (dest_node_info['name'],
                           job_metadata['remote_path']),
                os.path.join(job_metadata['local_path'], ''))
            # local directory the remote job directory was copied into
            fetched_dir = os.path.join(
                job_metadata['local_path'],
                os.path.basename(
                    os.path.normpath(job_metadata['remote_path'])))
            os.remove(os.path.join(fetched_dir, job_metadata['script_name']))
            os.remove(
                os.path.join(fetched_dir, job_metadata['slurm_script_name']))
            if job_metadata['input_type'] == 'params+file':
                os.remove(
                    os.path.join(fetched_dir, job_metadata['argfile_name']))
            all_job_ids.remove(dest_job_id)
            print("Results collected from %s for jobID %s" %
                  (dest_node_info['name'], dest_job_id))

    @staticmethod
    def _ssh(hostname, sshconfigpath, *args):
        """
        This method is used to create remote ssh connections

        :param hostname: hostname
        :param sshconfigpath: path to sshconfig for connecting to remote
                node
        :param args: the argument to be submitted via ssh; a trailing bool
                is not sent but controls whether errors are hidden
        :return: the first line of the remote stdout as a string, or an
                empty list when nothing was written to stdout
        """
        hide_errors_flag = False
        # Honour the value of the trailing flag (a trailing False used to
        # hide errors too) and tolerate an empty argument list.
        if args and isinstance(args[-1], bool):
            hide_errors_flag = args[-1]
            args = args[:-1]
        ssh = subprocess.Popen(["ssh", hostname, '-F', sshconfigpath, *args],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        result = ssh.stdout.readline()
        if not result:
            error = ssh.stderr.readlines()
            if len(error) > 0 and not hide_errors_flag:
                print("ERROR in host %s: %s" % (hostname, error))
            return []
        # NOTE(review): `result` is bytes here, so the fallback branch
        # looks unreachable on Python 3 - kept unchanged.
        try:
            return ''.join([chr(x) for x in result])
        except AttributeError:
            return [result.decode('utf-8').strip('\n')]

    @staticmethod
    def _scp(hostname, sshconfigpath, *args):
        """
        This method is used for scp from and to remote

        :param hostname: hostname, only used in the error message
        :param sshconfigpath: ssh config file
        :param args: arguments for using while copying
        :return:
        """
        ssh = subprocess.Popen(["scp", '-F', sshconfigpath, *args],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        middle_result = ssh.stdout.readlines()
        if not middle_result:
            error = ssh.stderr.readlines()
            if len(error) > 0:
                print("ERROR in host %s: %s" % (hostname, error))

    @staticmethod
    def add_suffix_to_path(path, suffix):
        """
        This method is used to add suffix to a path, in front of the file
        extension

        :param path: path
        :param suffix: suffix
        :return: the path with the suffix inserted
        """
        dir_path = os.path.dirname(path)
        full_filename = os.path.basename(path)
        filename, fileextention = os.path.splitext(full_filename)
        full_filename_new = filename + suffix + fileextention
        new_path = os.path.join(dir_path, full_filename_new)
        return new_path

    def clean_remote(self, job_name):
        """
        This method is used to remove the remote directory of a job

        :param job_name: name of previously submitted job for which the
                nodes are going to be cleaned
        :return:
        """
        job_metadata = self.batch_config.get('job-metadata')[job_name]
        target_cluster_info = self.batch_config.get('slurm_cluster')[
            job_metadata['slurm_cluster_name']]
        remote_path = job_metadata['remote_path']
        # trailing True: ssh errors are expected here and are not printed
        ssh_caller = lambda *x: self._ssh(
            target_cluster_info['name'],
            os.path.expanduser(
                target_cluster_info['credentials']['sshconfigpath']),
            *x, True)
        ssh_caller('rm -rf {}'.format(remote_path))
        if len(ssh_caller('ls {}'.format(remote_path))) == 0:
            print("Job {} cleaned successfully.".format(job_name))
        else:
            print("Error: Job {} could not be cleaned.".format(job_name))

    def connection_test(self, slurm_cluster_name):
        """
        This method is used for testing the connection to the slurm cluster
        connection node

        :param slurm_cluster_name: name of the slurm cluster which is going
                to be tested
        :return:
        """
        target_node_info = self.batch_config.get(
            'slurm_cluster')[slurm_cluster_name]
        ssh_caller = lambda *x: self._ssh(
            target_node_info['name'],
            os.path.expanduser(
                target_node_info['credentials']['sshconfigpath']),
            *x)
        if len(ssh_caller('uname -a')) > 0:
            print("Slurm Cluster {} is accessible.".format(
                target_node_info['name']))
        else:
            print("Error: Slurm Cluster {} cannot be accessed.".format(
                target_node_info['name']))

    def remove(self, target, key):
        """
        Used to remove slurm clusters and jobs from the workspace
        configuration

        :param target: type of entity to be removed, 'slurm-cluster' or
                'job'
        :param key: keyname of the entity to be removed
        :raises ValueError: for any other ``target``
        :return:
        """
        if target == 'slurm-cluster':
            self.batch_config.remove(['slurm_cluster'], key)
            print("Slurm-cluster {} removeed successfully.".format(key))
        elif target == 'job':
            self.batch_config.remove(['job-metadata'], key)
            print("Job {} removeed successfully.".format(key))
        else:
            raise ValueError("Target of removeing not found.")

    def fetch(self, job_name):
        """
        This method is used to fetch results from remote nodes

        Polls once per second until every job id of the job has been
        collected.

        :param job_name: the previously submitted job name
        :return:
        """
        job_metadata = self.batch_config.get('job-metadata')[job_name]
        self.slurm_cluster = self.batch_config.get('slurm_cluster')[
            job_metadata['slurm_cluster_name']]
        loaded_all_job_ids = list(job_metadata['jobIDs'])
        # The manager process and the worker pool are released when done,
        # also when collecting fails part-way.
        with Manager() as manager:
            all_job_ids = manager.list()
            all_job_ids.extend(loaded_all_job_ids)
            pool = Pool(processes=1)
            try:
                print("collecting results")
                while len(all_job_ids) > 0:
                    time.sleep(1)
                    all_running_jobs = [
                        (self, '_fetch_results_in_parallel', job_metadata,
                         jobID, all_job_ids)
                        for jobID in loaded_all_job_ids
                        if jobID in all_job_ids
                    ]
                    pool.map(self._execute_in_parallel, all_running_jobs)
                    print("waiting for other results if any...")
            finally:
                pool.close()
                pool.join()
        print("All of the remote results collected.")

    def list(self, target, max_depth, current_depth=1, input_dict=None):
        """
        listing the target slurm clusters or job-metadata

        :param target: 'slurm-clusters' or 'jobs'
        :param max_depth: depth of information to be shown
        :param current_depth: current depth of printing information
        :param input_dict: used for recursion for depth of higher than 1
        :raises ValueError: when no data could be selected for ``target``
        :return:
        """
        if target == 'slurm-clusters' and input_dict is None:
            input_dict = self.batch_config.get('slurm_cluster')
        if target == 'jobs' and input_dict is None:
            input_dict = self.batch_config.get('job-metadata')
        elif input_dict is None:
            raise ValueError("Target of listing not found.")

        if max_depth >= current_depth:
            if type(input_dict) == dict:
                for key in input_dict:
                    key_to_print = key + ':' if max_depth >= current_depth else key
                    # the first level is printed without indentation
                    indent = current_depth if current_depth > 1 else current_depth - 1
                    print('\t' * indent, key_to_print)
                    if type(input_dict.get(key)) != dict:
                        print('\t' * (indent + 1), input_dict.get(key))
                    else:
                        for value in input_dict.get(key):
                            value_to_print = value + ':' if max_depth > current_depth else value
                            print('\t' * (indent + 1), value_to_print)
                            self.list(target,
                                      max_depth,
                                      input_dict=input_dict[key][value],
                                      current_depth=current_depth + 1)
            else:
                indent = current_depth if current_depth > 1 else current_depth - 1
                print('\t' * indent, input_dict)

    def run(self, job_name):
        """
        This method copies the scripts of a previously created job to the
        cluster, submits the job and stores the resulting job id

        :param job_name: name of the job to run
        :raises RuntimeError: when no job id could be read back from the
                cluster
        :return:
        """
        job_metadata = self.batch_config.get('job-metadata')[job_name]
        cluster_name = job_metadata['slurm_cluster_name']
        slurm_cluster = self.batch_config.get('slurm_cluster').get(
            cluster_name)
        ssh_caller = lambda *x: self._ssh(
            slurm_cluster['name'],
            os.path.expanduser(slurm_cluster['credentials']['sshconfigpath']),
            *x)
        scp_caller = lambda *x: self._scp(
            slurm_cluster['name'],
            os.path.expanduser(slurm_cluster['credentials']['sshconfigpath']),
            *x)

        # errors are hidden: the job directory may already exist
        ssh_caller(
            'cd %s && mkdir job%s' %
            (job_metadata['raw_remote_path'], job_metadata['suffix']), True)
        scp_caller(
            job_metadata['slurm_script_path'],
            '%s:%s' % (slurm_cluster['name'],
                       job_metadata['remote_slurm_script_path']))
        scp_caller(
            job_metadata['job_script_path'],
            '%s:%s' % (slurm_cluster['name'],
                       job_metadata['remote_script_path']))
        ssh_caller('chmod +x', job_metadata['remote_script_path'])
        if job_metadata['input_type'].lower() == 'params+file':
            scp_caller(
                job_metadata['argfile_path'],
                '%s:%s' % (slurm_cluster['name'],
                           job_metadata['remote_path']))

        remote_job_id = ssh_caller(
            "cd %s && qsub %s && qstat -u $USER | tail -n 1 | awk '{print $1}'"
            % (job_metadata['remote_path'],
               job_metadata['remote_slurm_script_path']))
        if not remote_job_id:
            # _ssh returns an empty list when nothing was printed; fail
            # clearly instead of with an AttributeError on `.strip`.
            raise RuntimeError(
                "No job ID was returned by cluster {} for job {}.".format(
                    slurm_cluster['name'], job_name))
        remote_job_id = remote_job_id.strip('\n')
        print('Remote job ID: %s' % remote_job_id)
        # a plain list is enough; nothing is shared across processes here
        self.batch_config.deep_set(['job-metadata', job_name, 'jobIDs'],
                                   [remote_job_id])

    def set_param(self, target, name, parameter, value):
        """
        Used to set a specific parameter in the configuration

        :param target: the entity type on which the parameter is going to
                be set, 'slurm-cluster' or 'job-metadata'
        :param name: the entity name on which the parameter is going to be
                set
        :param parameter: name of the parameter to be set
        :param value: value of that parameter to be set
        :raises ValueError: for any other ``target``
        :return:
        """
        if target == 'slurm-cluster':
            self.batch_config.deep_set(['slurm_cluster', name, parameter],
                                       value)
            print("slurm-cluster parameter {} set to {} successfully.".format(
                parameter, value))
        elif target == 'job-metadata':
            self.batch_config.deep_set(['job-metadata', name, parameter],
                                       value)
            print("Job-metadata parameter {} set to {} successfully.".format(
                parameter, value))
        else:
            raise ValueError("Target of variable set not found.")
class VirtualCluster(object): def __init__(self, debug): """ Initializes the virtualcluster class :param debug: switch the debug information on and off """ current_path = os.path.dirname(os.path.realpath(__file__)) self.workspace = os.path.join(current_path, "vcluster_workspace/vcluster.yaml") if not os.path.exists(os.path.dirname(self.workspace)): os.makedirs(os.path.dirname(self.workspace)) self.cm_config = Config() self.vcluster_config = GenericConfig(self.workspace) self.debug = debug self.all_pids = [] self.virt_cluster = {} self.runtime_config = {} self.job_metadata = {} def _config_validator(self): """ validates the configuration of a run based on the information about its virtual cluster, runtime configuration and the job metadata :return: """ job_metadata = self.job_metadata virt_cluster = self.virt_cluster runtime_config = self.runtime_config for node in virt_cluster: if 'name' not in virt_cluster[node].keys(): raise ValueError(" node {}: 'name' keyword, indicating hostname is missing from".format(node)) if 'sshconfigpath' not in virt_cluster[node]['credentials'].keys(): raise ValueError("%s: 'sshconfigpath' keyword is missing" % node) if not os.path.isfile(os.path.expanduser(virt_cluster[node]['credentials']['sshconfigpath'])): raise ValueError("%s: The ssh config file %s does not exists" % (node, virt_cluster[node] \ ['credentials']['sshconfigpath'])) if not os.path.isfile(os.path.expanduser(job_metadata['script_path'])): raise ValueError("The script file %s does not exists" % (job_metadata['script_path'])) if runtime_config['input-type'] == 'params+file': if not os.path.isfile(os.path.expanduser(job_metadata['argfile_path'])): raise ValueError("The arg file %s does not exists" % (job_metadata['arg_file_path'])) def _clean_remote_in_parallel(self, target_node, remote_path): """ This method is used to spawn processes to clean the remotes of a particular job. 
:param target_node: the node on which the data is going to be removed :param remote_path: path of the data to be removed :return: """ target_node_info = self.virt_cluster[target_node] ssh_caller = lambda *x: self._ssh(target_node_info['name'], os.path.expanduser(target_node_info['credentials'] \ ['sshconfigpath']), *x) ssh_caller('rm -rf {}'.format(remote_path)) if len(ssh_caller('ls {}'.format(remote_path))) == 0: print("Node {} cleaned successfully.".format(target_node)) else: print("Error: Node {} could not be cleaned.".format(target_node)) def _connection_test_in_parallel(self, target_node): """ This method is used to test the connection to cluster nodes in parallel :param target_node: the node to which the connection is going to be tested :return: """ target_node_info = self.virt_cluster[target_node] ssh_caller = lambda *x: self._ssh(target_node_info['name'], os.path.expanduser(target_node_info['credentials'] \ ['sshconfigpath']), *x) if len(ssh_caller('uname -a')) > 0: print("Node {} is accessible.".format(target_node)) else: print("Error: Node {} cannot be accessed.".format(target_node)) def _create_config(self, config_name, proc_num, download_proc_num, download_later, input_type, output_type): """ This method is used to create a runtime-configuration. 
:param config_name: name of the runtime configuration :param proc_num: number of processes to be spawned in that runtime for submitting the jobs :param download_proc_num: number number of processes to be spawned in that runtime for fetching the results :param download_later: a flag indicating whether or not the script should wait for the results after the scripts are submitted :param input_type: type of the input of the script to be run remotely :param output_type: type of the output of the script to be run remotely :return: """ config_tosave = {config_name: {}} config_tosave[config_name].update({"proc_num": proc_num, "download_proc_num": download_proc_num, "download-later": download_later, "input-type": input_type, "output-type": output_type}) self.vcluster_config.deep_set(['runtime-config'], config_tosave) print("Runtime-configuration created/replaced successfully.") def _create_vcluster(self, vcluster_name, cluster_list=(), computer_list=()): """ This method is used to create a virutal cluster :param vcluster_name: name of the virtual cluster :param cluster_list: list of the clusters to be added to the virutal cluster :param computer_list: list of the computers to be used from the previous parameter (cluster_list). 
If the computer_list is left empty, all of the computers will be used :return: """ vcluster_tosave = {vcluster_name: {}} for cluster in cluster_list: for computer in self.cm_config.get('cluster.{}'.format(cluster)): if computer in computer_list or computer_list == '': vcluster_tosave[vcluster_name].update({computer: dict(self.cm_config.get('cluster.{}.{}'.format( cluster, computer)))}) self.vcluster_config.deep_set(['virtual-cluster'], vcluster_tosave) print("Virtual cluster created/replaced successfully.") @staticmethod def _execute_in_parallel(func_args): """ This is a method used for running methods in parallel :param func_args: :return: """ target_class = func_args[0] method_to_call = getattr(target_class, func_args[1]) args = list(func_args[2:]) return method_to_call(*args) def _fetch_results_in_parallel(self, job_metadata, node_pid_tuple, all_pids): """ This method is used to fetch the results from remote nodes. :param job_metadata: the dictionary containing the information about the previously submitted job :param node_pid_tuple: the tuple containing destination node, destination pid and destination node index when the job was submitted :param all_pids: :return: """ dest_node = node_pid_tuple[0] dest_pid = node_pid_tuple[1] node_idx = node_pid_tuple[2] dest_node_info = self.virt_cluster[dest_node] ssh_caller = lambda *x: self._ssh(dest_node_info['name'], os.path.expanduser(dest_node_info['credentials'] \ ['sshconfigpath']), *x) scp_caller = lambda *x: self._scp(dest_node_info['name'], os.path.expanduser(dest_node_info['credentials'] \ ['sshconfigpath']), *x) ps_output = ssh_caller('ps', '-ef', '|', 'grep', dest_pid.strip('\n'), '|', 'grep -v grep') if len(ps_output) == 0 and node_pid_tuple in [pid for pid in all_pids]: if not os.path.exists(job_metadata['local_path']): os.makedirs(job_metadata['local_path']) if self.runtime_config['output-type'] == 'stdout': scp_caller('%s:%s' % (dest_node_info['name'], os.path.join(job_metadata['remote_path'], 
self.add_suffix_to_path('outputfile_%d' % node_idx,job_metadata['suffix']))), os.path.join(job_metadata['local_path'], '')) elif self.runtime_config['output-type'] in ['file', 'stdout+file']: nested_remote_path = os.path.join(job_metadata['remote_path'], 'run{}'.format(node_idx)) scp_caller('-r', '%s:%s' % (dest_node_info['name'], nested_remote_path), os.path.join(job_metadata \ [ 'local_path'], '')) all_pids.remove((dest_node, dest_pid, node_idx)) print("Results collected from %s." % dest_node) def _run_remote_job_in_parallel(self, job_metadata, param_idx, params, all_pids): """ This method is used to spawn remote processes in parallel :param job_metadata: contains the information about the job :param param_idx: index of the parameters inputted as argument :param params: the parameters inputted as argument for this run :param all_pids: the manager used to take all pids of all submitted jobs :return: """ available_nodes_num = len(list(self.virt_cluster.keys())) target_node_idx = param_idx % available_nodes_num target_node_key = list(self.virt_cluster.keys())[target_node_idx] target_node = self.virt_cluster[target_node_key] remote_pid = [] ssh_caller = lambda *x: self._ssh(target_node['name'], os.path.expanduser(target_node['credentials'] \ ['sshconfigpath']), *x) scp_caller = lambda *x: self._scp(target_node['name'], os.path.expanduser(target_node['credentials'] \ ['sshconfigpath']), *x) # directory_check = ssh_caller('if test -d %s; then echo "exist"; fi' % job_metadata['remote_path']) # if len(directory_check) == 0: ssh_caller('cd %s && mkdir job%s' % (job_metadata['raw_remote_path'], job_metadata['suffix']) , True) if self.runtime_config['output-type'].lower() in ['file', 'stdout+file']: ssh_caller("cd {} && mkdir run{}".format(job_metadata['remote_path'], param_idx)) nested_remote_path = os.path.join(job_metadata['remote_path'], 'run{}'.format(param_idx), job_metadata['script_name_with_suffix']) scp_caller(job_metadata['script_path'], '%s:%s' % 
(target_node['name'], nested_remote_path)) ssh_caller('chmod +x', nested_remote_path) if self.runtime_config['input-type'].lower() == 'params+file': scp_caller(job_metadata['argfile_path'], '%s:%s' % (target_node['name'], os.path.join(job_metadata['remote_path'], 'run{}'.format(param_idx), job_metadata['argfile_name']))) else: scp_caller(job_metadata['script_path'], '%s:%s' % (target_node['name'], job_metadata['remote_script_path'])) ssh_caller('chmod +x', job_metadata['remote_script_path']) if self.runtime_config['input-type'].lower() == 'params+file': scp_caller(job_metadata['argfile_path'], '%s:%s' % (target_node['name'], job_metadata['remote_path'])) if self.runtime_config['output-type'].lower() == 'stdout': remote_pid = ssh_caller( 'cd %s && nohup %s %s > %s 2>&1 </dev/null& echo $!' % (job_metadata['remote_path'], job_metadata['remote_script_path'], params, os.path.join(job_metadata['remote_path'], self.add_suffix_to_path('outputfile_%d' % \ param_idx, job_metadata['suffix'])))) elif self.runtime_config['output-type'].lower() == 'stdout+file': remote_pid = ssh_caller('cd %s && nohup %s %s > %s 2>&1 </dev/null& echo $!' % \ (os.path.join(job_metadata['remote_path'], 'run{}'.format(param_idx)), os.path.join(job_metadata['remote_path'], 'run{}'.format(param_idx), job_metadata['script_name_with_suffix']), params, os.path.join( job_metadata['remote_path'], 'run{}'.format(param_idx), self.add_suffix_to_path('outputfile_%d' % param_idx, job_metadata['suffix'])))) elif self.runtime_config['output-type'].lower() == 'file': remote_pid = ssh_caller('cd %s && nohup ./%s %s >&- & echo $!' 
% (os.path.join(job_metadata['remote_path'], 'run{}'.format(param_idx)), job_metadata['script_name_with_suffix'], params)) all_pids.append((target_node_key, remote_pid, param_idx)) print('Remote Pid on %s: %s' % (target_node_key, remote_pid.strip('\n'))) @staticmethod def _ssh(hostname, sshconfigpath, *args): """ This method is used to create remove ssh connections :param hostname: hostname :param sshconfigpath: path to sshconfig for connecting to remote node :param args: the argument to be submitted via ssh :return: """ hide_errors_flag = False if type(args[-1]) == bool: hide_errors_flag = True args=args[:-1] ssh = subprocess.Popen(["ssh", hostname, '-F', sshconfigpath, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE) result = ssh.stdout.readline() if not result: error = ssh.stderr.readlines() if len(error) > 0 and hide_errors_flag == False: print("ERROR in host %s: %s" % (hostname, error)) return [] else: try: return ''.join([chr(x) for x in result]) except AttributeError: return [result.decode('utf-8').strip('\n')] @staticmethod def _scp(hostname, sshconfigpath, *args): """ This method is used for scp from and to remote :param hostname: hostname :param sshconfigpath: ssh config file :param args:arguments for using while copying :return: """ ssh = subprocess.Popen(["scp", '-F', sshconfigpath, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE) middle_result = ssh.stdout.readlines() if not middle_result: error = ssh.stderr.readlines() if len(error) > 0: print("ERROR in host %s: %s" % (hostname, error)) @staticmethod def add_suffix_to_path(path, suffix): """ This method is used to add suffix to a path :param path: path :param suffix: suffix :return: """ dir_path = os.path.dirname(path) full_filename = os.path.basename(path) filename, fileextention = os.path.splitext(full_filename) full_filename_new = filename + suffix + fileextention new_path = os.path.join(dir_path, full_filename_new) return new_path def clean_remote(self, job_name, proc_num): """ This 
method is used to spawn processes for cleaning the remote nodes :param job_name: name of previously submitted job for which the nodes are going to be cleaned :param proc_num: number of processes used for cleaning the remote nodes :return: """ job_metadata = self.vcluster_config.get('job-metadata')[job_name] self.virt_cluster = self.vcluster_config.get('virtual-cluster')[job_metadata['cluster_name']] remote_path = job_metadata['remote_path'] all_jobs = [(self, '_clean_remote_in_parallel', node, remote_path) for node in list(self.virt_cluster)] pool = Pool(processes=proc_num) pool.map(self._execute_in_parallel, all_jobs) def connection_test(self, vcluster_name, proc_num): """ This method is used for spawning processes for testing the connections to remote nodes of a vcluster :param vcluster_name: name of the virtual cluster the nodes of which are going to be tested :param proc_num: number of processes used for testing the remote nodes :return: """ self.virt_cluster = self.vcluster_config.get('virtual-cluster')[vcluster_name] all_jobs = [(self, '_connection_test_in_parallel', node) for node in list(self.virt_cluster)] pool = Pool(processes=proc_num) pool.map(self._execute_in_parallel, all_jobs) def create(self, *args, **kwargs): """ This is a caller for creator functions including config creator and vcluster creator :param args: :param kwargs: :return: """ if len(args) > 5: self._create_config(*args) else: self._create_vcluster(*args, **kwargs) def destroy(self, target, key): """ Used to remove virtual clusters and runtime configs :param target: type of entity to be removed :param key: keyname of the entity to be removed :return: """ if target == 'virtual-cluster': self.vcluster_config.remove(['virtual-cluster'], key) print("Virtual-cluster {} destroyed successfully.".format(key)) elif target == 'runtime-config': self.vcluster_config.remove(['runtime-config'], key) print("Runtime-configuration {} destroyed successfully.".format(key)) else: raise ValueError("Target of 
destroying not found.") def fetch(self, job_name): """ This method is used to fetch resutls from remote nodes :param job_name: the previously submitted job name :return: """ job_metadata = self.vcluster_config.get('job-metadata')[job_name] self.virt_cluster = self.vcluster_config.get('virtual-cluster')[job_metadata['cluster_name']] self.runtime_config = self.vcluster_config.get('runtime-config')[job_metadata['config_name']] loaded_all_pids = [tuple(x) for x in job_metadata['nodes-pids']] all_pids = Manager().list() all_pids.extend(loaded_all_pids) pool = Pool(processes=self.runtime_config['download_proc_num']) print("collecting results") while len(all_pids) > 0: time.sleep(1) all_running_jobs = [(self, '_fetch_results_in_parallel', job_metadata, node_pid_tuple, all_pids) for \ node_pid_tuple in loaded_all_pids if node_pid_tuple in all_pids] pool.map(self._execute_in_parallel, all_running_jobs) print("waiting for other results if any...") print("All of the remote results collected.") def list(self, target, max_depth, current_depth=1, input_dict=None): """ listing the current virtual clusters based on the vcluster_conf file. 
:param target: name of the virtual cluster to be listed :param max_depth: depth of information to be shown :param input_dict: used for recursion for depth of higher than 1 :param current_depth: current depth of printing information :return: """ if target == 'virtual-clusters' and input_dict is None: input_dict = self.vcluster_config.get('virtual-cluster') elif target == 'runtime-configs' and input_dict is None: input_dict = self.vcluster_config.get('runtime-config') elif input_dict is None: raise ValueError("Target of listing not found.") if max_depth >= current_depth: if type(input_dict) == dict: for key in input_dict: key_to_print = key + ':' if max_depth >= current_depth else key indent = current_depth if current_depth > 1 else current_depth - 1 print('\t' * indent, key_to_print) if type(input_dict.get(key)) != dict: print('\t' * (indent + 1), input_dict.get(key)) else: for value in input_dict.get(key): value_to_print = value + ':' if max_depth > current_depth else value print('\t' * (indent + 1), value_to_print) self.list(target, max_depth, input_dict=input_dict[key][value], current_depth=current_depth + 1) else: indent = current_depth if current_depth > 1 else current_depth - 1 print('\t' * indent, input_dict) def run(self, job_name, cluster_name, config_name, script_path, argfile_path, outfile_name, remote_path, local_path, params_list, suffix, overwrite): """ This method is used to create a job, validate it and run it on remote nodes :param job_name: name of the job to create :param cluster_name: cluster on which the job is gonna run :param config_name: name of the configuration based on which the job is going to run :param script_path: path of the script to be run remotely :param argfile_path: path of the file that has to be passed to the file as an argument if any :param outfile_name: ouput filename resulted from running the script , if any :param remote_path: path in the remotes on which the script is gonna be copied to and ran from :param local_path: 
local path to which the results are gonna be copied :param params_list: list of the parameters that are going to be passed to the script if any :param suffix: suffix of the filenames in the job :param overwrite: if the job already exists, this flag overwrites the previous job with the same name :return: """ if params_list is None: raise ValueError('param-list is not set. This value determines how many instance of the target application \ will run remotely. Therefore, even if the parameter is empty, add commas for every run you expect.') if self.vcluster_config.get('job-metadata') is not None and job_name in \ list(self.vcluster_config.get('job-metadata').keys()) and overwrite is False: raise RuntimeError("The job {} exists in the configuration file, if you want to overwrite the job, \ use --overwrite argument.".format(job_name)) self.virt_cluster = self.vcluster_config.get('virtual-cluster')[cluster_name] self.runtime_config = self.vcluster_config.get('runtime-config')[config_name] job_metadata = {job_name: {}} job_metadata[job_name]['suffix'] = suffix job_metadata[job_name]['cluster_name'] = cluster_name job_metadata[job_name]['config_name'] = config_name job_metadata[job_name]['raw_remote_path'] = remote_path job_metadata[job_name]['script_path'] = os.path.abspath(script_path) job_metadata[job_name]['argfile_path'] = argfile_path job_metadata[job_name]['argfile_name'] = ntpath.basename(argfile_path) if len(job_metadata[job_name]['argfile_name']) > 0: job_metadata[job_name]['params_list'] = ['{} {}'.format(job_metadata[job_name]['argfile_name'], x) \ for x in params_list] else: job_metadata[job_name]['params_list'] = params_list job_metadata[job_name]['outfile_name'] = outfile_name job_metadata[job_name]['script_name'] = ntpath.basename(script_path) job_metadata[job_name]['script_name_with_suffix'] = self.add_suffix_to_path(job_metadata[job_name] \ ['script_name'], suffix) job_metadata[job_name]['remote_path'] = os.path.join(remote_path, 'job' + suffix, '') 
job_metadata[job_name]['remote_script_path'] = os.path.join(job_metadata[job_name]['remote_path'], job_metadata[job_name]['script_name_with_suffix']) job_metadata[job_name]['local_path'] = local_path self.job_metadata = job_metadata[job_name] self._config_validator() self.vcluster_config.deep_set(['job-metadata'], job_metadata) all_pids = Manager().list() all_jobs = [(self, '_run_remote_job_in_parallel', job_metadata[job_name], param_idx, param, all_pids) for \ param_idx, param in enumerate(job_metadata[job_name]['params_list'])] pool = Pool(processes=self.runtime_config['proc_num']) pool.map(self._execute_in_parallel, all_jobs) self.all_pids = all_pids self.vcluster_config.deep_set(['job-metadata', job_name, 'nodes-pids'], [pid for pid in all_pids]) if not self.runtime_config['download-later']: pool = Pool(processes=self.runtime_config['download_proc_num']) print("collecting results") while len(all_pids) > 0: time.sleep(3) all_running_jobs = [(self, '_fetch_results_in_parallel', job_metadata[job_name], node_pid_tuple, all_pids) for node_pid_tuple in \ job_metadata[job_name]['nodes-pids'] if node_pid_tuple in all_pids] pool.map(self._execute_in_parallel, all_running_jobs) print("waiting for other results if any...") print("All of the remote results collected.") def set_param(self, target, name, parameter, value): """ Used to set a specific parameter in the configuration :param target: the entity type on which the parameter is going to be set, e.g. runtime-config :param name: the entity name on which the parameter is going to be set, e.g. 
test-config32 :param parameter: name of the parameter to be set :param value: value of that parameter to be set :return: """ if target == 'virtual-cluster': self.vcluster_config.deep_set(['virtual-cluster', name, parameter], value) print("Virtual-cluster parameter {} set to {} successfully.".format(parameter, value)) elif target == 'runtime-config': self.vcluster_config.deep_set(['runtime-config', name, parameter], value) print("Runtime-configuration parameter {} set to {} successfully.".format(parameter, value)) else: raise ValueError("Target of variable set not found.")
class Data(object):
    """
    Track files kept with storage providers in a file database.

    ``config()`` has to be called before any other operation: it sets the
    database provider and registers the storage providers.
    """

    def __init__(self):
        # Database provider holding the tracked files; set by config().
        self._db = None
        # The "data" section of the configuration; set by config().
        self._conf = {}
        # Storage providers keyed by service name, plus the 'default' alias.
        self._providers = {}

    def config(self, config_path='~/.cloudmesh/cloudmesh4.yaml'):
        """
        Use `cloudmesh4.yaml` file to configure.

        :param config_path: path of the configuration file passed to Config
        :raises KeyError: if the configured default storage service is not
            among the registered providers
        """
        self._conf = Config(config_path).get("data")

        # Set DB provider. There should only be one.
        db_provider = self._conf.get('default.db')
        if db_provider == 'local':
            db_path = self._conf.get('db.local.CMDATA_DB_FOLDER')
            self._db = LocalDBProvider(db_path)

        # Check for local storage provider.
        storage_path = self._conf.get('service.local.CMDATA_STORAGE_FOLDER')
        if storage_path:
            self._providers['local'] = LocalStorageProvider(storage_path)

        # Check for Azure provider.
        az_conf = self._conf.get('service.azure')
        if az_conf:
            az_act = az_conf.get('credentials.AZURE_STORAGE_ACCOUNT')
            az_key = az_conf.get('credentials.AZURE_STORAGE_KEY')
            az_container = az_conf.get('container')
            # Registered only when both account and key are present.
            if az_act and az_key:
                self._providers['azure'] = AzureStorageProvider(az_act, az_key, az_container)

        # Set a default storage provider.
        default_storage_provider = self._conf.get('default.service')
        if default_storage_provider not in self._providers:
            # Same exception type as the plain lookup, but with an explanation.
            raise KeyError(
                "Default storage service {!r} is not configured; available services: {}".format(
                    default_storage_provider, sorted(self._providers)))
        self._providers['default'] = self._providers[default_storage_provider]

    def ls(self):
        """
        List tracked files.

        Prints a header row followed by one row per file.

        :return: A list of CloudFiles
        """
        files = self._db.list_files()

        self._print_row("FILE", "SERVICE", "SIZE", "URL")

        for f in files:
            self._print_row(f.name, f.service, f.size, f.url)

        return files

    def add(self, provider, file_path):
        """
        Add a new file

        :param provider: The storage provider where the file should be stored;
            a falsy value selects the 'default' provider.
        :param file_path: The local path to the file.
        :return: the cloud file returned by the storage provider
        """
        new_cloud_file = self._providers[provider or 'default'].add(file_path)
        self._db.add(new_cloud_file)
        return new_cloud_file

    def get(self, file_name, dest_folder='.'):
        """
        Retrieve a file

        :param file_name: The name corresponding to the cloud file to be downloaded.
        :param dest_folder: folder the file is downloaded to; a falsy value means '.'
        :raises SystemExit: with status 1 if the file is not in the database
        :return:
        """
        # Get db entry for this file
        cloud_file = self._db.get(file_name)

        if not cloud_file:
            print("Requested file not found. Use `ls` to see a list of file names.")
            # Non-zero status so that a calling shell can detect the failure.
            raise SystemExit(1)

        # Todo: docopt default for this?
        dest_folder = dest_folder or '.'
        self._providers[cloud_file.service].get(cloud_file, dest_folder)

    def delete(self, file_name):
        """
        Remove a file

        The file is deleted from its storage provider and then from the database.

        :param file_name: The name of the file to remove.
        :raises Exception: if the file is not in the database
        """
        cloud_file = self._db.get(file_name)

        if cloud_file is None:
            raise Exception(f"{file_name} not found in the database.")

        self._providers[cloud_file.service].delete(cloud_file)
        self._db.delete(cloud_file)

    @staticmethod
    def _print_row(file_name, service, size, url):
        """
        Print a formatted row
        """
        print(" %-35s %-10s %-10s %-50s" % (file_name, service, size, url))