示例#1
0
class Driver(object):
    """
    Builds libcloud compute-driver connections from the entries found under
    ``cloudmesh.cloud`` in the cloudmesh yaml configuration.
    """

    def __init__(self, name="~/.cloudmesh/cloudmesh.yaml"):
        """
        Load the cloudmesh configuration file.

        :param name: path of the yaml configuration file; a leading ``~`` is
                     expanded to the user's home directory
        """
        name = os.path.expanduser(name)
        self.config = Config(name=name)

    # noinspection PyPep8Naming
    def get(self, name=None):
        """
        Create a driver connection for a configured cloud.

        :param name: name of the cloud entry under ``cloudmesh.cloud``. When
                     ``None``, the ``cloud`` entry of ``Variables()`` is used.
        :return: the connection object, or ``None`` when the configured kind
                 is neither ``azure`` nor ``aws``
        """
        connection = None

        if name is None:
            # The default cloud used to be read into an unused local, which
            # left ``name`` as None and produced the lookup key
            # "cloudmesh.cloud.None...". Use the stored default instead.
            name = Variables()['cloud']

        prefix = "cloudmesh.cloud.{name}".format(name=name)
        kind = self.config.get(prefix + ".cm.kind")
        credentials = self.config.get(prefix + ".credentials")

        # NOTE(review): the original author marked everything from here on
        # as wrong ("BUG FROM HERE ON WRONG"). The driver construction is
        # kept unchanged -- TODO confirm the arguments each driver expects.

        if kind == 'azure':
            AZDriver = get_driver(Provider.AZURE)
            connection = AZDriver(
                subscription_id=credentials['AZURE_SUBSCRIPTION_ID'],
                key_file=credentials['AZURE_MANAGEMENT_CERT_PATH'])
        elif kind == 'aws':
            EC2Driver = get_driver(Provider.EC2)
            connection = EC2Driver(credentials['EC2_ACCESS_ID'],
                                   credentials['EC2_SECRET_KEY'])

        return connection
    def test_doesnotexist_get(self):
        """
        Check that ``Config.get`` hands back the supplied default when the
        requested key is missing, timing the lookup with ``StopWatch``.
        """
        missing_key = "cloudmesh.doesnotexist"
        fallback = "Hallo"
        timer_label = f"not exists get({missing_key})"

        cfg = Config()

        StopWatch.start(timer_label)
        result = cfg.get(missing_key, default=fallback)
        StopWatch.stop(timer_label)

        assert result == fallback
    def __init__(self, cloud):
        """
        Initialize the provider from the yaml file.

        Reads the ``cloud.azure.credentials`` and ``cloud.azure.default``
        sections of the configuration and creates the libcloud
        ``AZURE_ARM`` driver from the credentials.

        :param cloud: name of the cloud; not used by this initializer, the
                      ``cloud.azure`` section is always read
        """
        config = Config()
        cred = config.get("cloud.azure.credentials")

        self.defaults = config.get("cloud.azure.default")
        self.resource_group = self.defaults["resource_group"]
        # This attribute used to be filled from AZURE_TENANT_ID; it now holds
        # the same subscription id that is handed to the driver below.
        self.subscription_id = cred["AZURE_SUBSCRIPTION_ID"]

        cls = get_driver(LibCloudProvider.AZURE_ARM)
        self.api_version = {"api-version": "2018-08-01"}

        self.provider = cls(
            tenant_id=cred["AZURE_TENANT_ID"],
            subscription_id=cred["AZURE_SUBSCRIPTION_ID"],
            key=cred["AZURE_APPLICATION_ID"],
            secret=cred["AZURE_SECRET_KEY"],
            region=cred["AZURE_REGION"]
        )
示例#4
0
class Data(object):
    """
    Tracks files in a database provider and stores their contents through
    one of the configured storage providers.
    """

    def __init__(self):
        # Nothing is configured until config() is called.
        self._providers = {}
        self._conf = {}
        self._db = None

    def config(self, config_path='~/.cloudmesh/cloudmesh.yaml'):
        """
        Set up the database and storage providers from the ``data`` section
        of a `cloudmesh.yaml` file.

        :param config_path: path of the yaml file to read
        """
        conf = Config(config_path).get("data")
        self._conf = conf

        # Database provider: there should only be one, and only 'local'
        # is handled here.
        if conf.get('default.db') == 'local':
            self._db = LocalDBProvider(conf.get('db.local.CMDATA_DB_FOLDER'))

        # Local storage provider, registered when a folder is configured.
        local_folder = conf.get('service.local.CMDATA_STORAGE_FOLDER')
        if local_folder:
            self._providers['local'] = LocalStorageProvider(local_folder)

        # Azure storage provider, registered when account and key are set.
        azure_section = conf.get('service.azure')
        if azure_section:
            account = azure_section.get('credentials.AZURE_STORAGE_ACCOUNT')
            secret_key = azure_section.get('credentials.AZURE_STORAGE_KEY')
            container = azure_section.get('container')
            if account and secret_key:
                self._providers['azure'] = AzureStorageProvider(
                    account, secret_key, container)

        # Alias the configured default storage provider as 'default'.
        default_name = conf.get('default.service')
        self._providers['default'] = self._providers[default_name]

    def ls(self):
        """
        Print a table of the tracked files and return them.

        :return: the files reported by the database provider
        """
        tracked = self._db.list_files()

        self._print_row("FILE", "SERVICE", "SIZE", "URL")
        for entry in tracked:
            self._print_row(entry.name, entry.service, entry.size, entry.url)

        return tracked

    def add(self, provider, file_path):
        """
        Store a new file and record it in the database.

        :param provider: name of the storage provider to use; a falsy value
                         selects the 'default' provider
        :param file_path: the local path to the file
        :return: the object returned by the storage provider's ``put``
        """
        storage = self._providers[provider or 'default']
        stored = storage.put(file_path)
        self._db.put(stored)
        return stored

    def get(self, file_name, dest_folder='.'):
        """
        Download a tracked file.

        Prints a hint and raises ``SystemExit`` when the database has no
        entry for ``file_name``.

        :param file_name: the name corresponding to the cloud file to be
                          downloaded
        :param dest_folder: destination folder; a falsy value means '.'
        :return:
        """
        entry = self._db.get(file_name)

        if entry:
            # Todo: docopt default for this?
            target_folder = dest_folder or '.'
            self._providers[entry.service].get(entry, target_folder)
            return

        print(
            "Requested file not found. Use `ls` to see a list of file names."
        )
        raise SystemExit

    def delete(self, file_name):
        """
        Remove a file from its storage provider and from the database.

        :param file_name: the name of the file to remove
        :raises Exception: when the database has no entry for ``file_name``
        """
        entry = self._db.get(file_name)

        if entry is None:
            raise Exception(f"{file_name} not found in the database.")

        storage = self._providers[entry.service]
        storage.delete(entry)
        self._db.delete(entry)

    @staticmethod
    def _print_row(file_name, service, size, url):
        """
        Print one left-aligned, fixed-width table row.
        """
        row_format = " %-35s %-10s %-10s %-50s"
        print(row_format % (file_name, service, size, url))
示例#5
0
class Queue(object):
    """
    A named batch queue of jobs for one cloud/cluster.

    The queue state is kept in ``self.info`` (a munch object). Methods
    decorated with ``DatabaseUpdate`` return that record; the decorator is
    presumably what writes it to the database -- TODO confirm.
    """

    def __init__(self):
        """
        Initializes the Queue class

        """

        self.cm_config = Config()
        self.info = munch.munchify({
            'uid': None,
            "cloud": None,
            "kind": "batch-queue",
            "name": None,
            "cm": {},
            "queue": {
                'policy': None,
                'status': None,
                'active': False,
                'charge': None,
                'unit': None,
                "numJobs": 0,
                "numRunningJobs": 0,
                'joblist': []
            }
        })
        # list of parameters that can be set
        self.settable_params = ['policy', 'charge', 'unit']
        self.database = CmDatabase()
        # Maps a policy name to the method that removes the next job.
        # Built here (it used to be built only in create()) so that pop()
        # also works on a queue that was loaded with findQueue().
        self.policyFunctionMap = munch.munchify({
            'FIFO': self.popFIFO,
            'FILO': self.popFILO
        })

    @DatabaseUpdate()
    def create(self, queue_name, cloud_name, policy, charge=None, unit=None):
        """
        This method is used to create a queue

        :param queue_name: name of the queue to create
        :param cloud_name: slurm cluster on which the job is gonna run
        :param policy: policy of the queue
        :param charge: charge of the queue
        :param unit: unit of the charge for the queue
        :return: a one-element list holding the new queue record, or None
                 when a queue with the same record already exists
        """
        name = Name(order=["cloud", "name"], cloud=cloud_name, name=queue_name)
        uid = name.id(cloud=cloud_name, name=queue_name)

        self.info = munch.munchify({
            'uid': uid,
            "cloud": cloud_name,
            "kind": "batch-queue",
            "name": queue_name,
            "cm": {
                "cloud": cloud_name,
                "kind": "batch-queue",
                "name": queue_name,
                "cluster":
                self.cm_config.get('cloudmesh').get('cluster')[cloud_name]
            },
            "queue": {
                'policy': policy,
                'status': 'EMPTY',
                'active': False,
                'charge': charge,
                'unit': unit,
                "numJobs": 0,
                "numRunningJobs": 0,
                # The record used to be created without a job list, so the
                # first push() after create() failed on the missing entry.
                'joblist': []
            }
        })
        if self.database.exists(self.info)[0]:
            Console.error("Queue already exists")
            return
        return [self.info]

    def findQueue(self, cloud_name, queue_name):
        '''
        finds a queue in the database based on its cloud and name and, when
        found, loads it into ``self.info``

        :param cloud_name: name of the cloud the queue belongs to
        :param queue_name: name of the queue
        :return: True when the lookup returned a cursor (queue found), False
                 when it returned an empty list, None for any other result
        '''
        name = Name(order=["cloud", "name"], cloud=cloud_name, name=queue_name)
        uid = name.id(cloud=cloud_name, name=queue_name)
        queue = self.database.find_by_KeyValue(
            collection_name="{cloud}-{kind}".format(cloud=cloud_name,
                                                    kind='batch-queue'),
            KeyValue={'uid': uid})
        if isinstance(queue, cursor.Cursor):
            self.info = munch.munchify(queue[0])
            return True  # queue found
        elif isinstance(queue, list) and len(queue) == 0:
            return False  # queue not found

    def findClouds(self):
        '''
        prints the name of every database collection that contains
        'batch-queue'
        :return:
        '''
        for collection in self.database.collections():
            if 'batch-queue' in collection:
                print(collection)

    def findQueues(self, cloud_name):
        '''
        prints the uid of every record in the given collection

        :param cloud_name: name of the collection to search
        :return:
        '''
        # TODO: find all queues info from the DB based on the ['cm']
        all_queues = self.database.find_by_KeyValue(collection_name=cloud_name)
        all_queues = [munch.munchify(queue) for queue in all_queues]
        for queue in all_queues:
            print(queue.uid)

    def listJobs(self):
        '''
        list the jobs in the current queue (not implemented yet)
        :return:
        '''
        return

    def removeQueue(self):
        '''
        remove the queue from the database (not implemented yet)
        :return:
        '''
        # TODO: remove the queues info from the DB based on the ['cm']
        return

    @DatabaseUpdate()  # this should update the record not create a new one
    def push(self, job):
        '''
        append a job to the queue

        :param job: the job to add
        :return: the queue record
        '''
        self.info.queue.joblist.append(job)
        self.info.queue.numJobs += 1
        self.updateStatus()
        return self.info

    @DatabaseUpdate()  # this should update the record not create a new one
    def pop(self):
        '''
        remove and return the next job according to the queue policy

        :return: the removed job
        :raises IndexError: when the job list is empty
        :raises KeyError: when the policy has no pop function
        '''
        policy = self.info.queue.policy
        # Take the job first: if the queue is empty or the policy is unknown
        # the exception is raised before the counters are touched, so
        # numJobs can no longer drift below the real number of jobs.
        job = self.policyFunctionMap[policy]()
        self.info.queue.numJobs -= 1
        self.updateStatus()
        return job

    def popFIFO(self):
        '''
        pop job from the queue based on FIFO policy (oldest job first)
        :return: the removed job
        '''
        return self.info['queue']['joblist'].pop(0)

    def popFILO(self):
        '''
        pop job from the queue based on FILO policy (newest job first)
        :return: the removed job
        '''
        return self.info['queue']['joblist'].pop()

    def isEmpty(self):
        '''
        checks if the queue is empty
        :return: True when numJobs is not greater than zero
        '''
        return not self.info.queue.numJobs > 0

    @DatabaseUpdate()  # this should update the record not create a new one
    def activate(self):
        '''
        activates the queue

        :return: the queue record
        '''
        # TODO: activating a queue should start submitting jobs
        self.info.queue.active = True
        return self.info

    @DatabaseUpdate()  # this should update the record not create a new one
    def deactivate(self):
        '''
        deactivates the queue
        :return: the queue record
        '''
        # TODO: stop all jobs
        self.info.queue.active = False
        return self.info

    @DatabaseUpdate()  # this should update the record not create a new one
    def updateStatus(self):
        '''
        checks number of jobs and updates queue status
        :return: the queue record
        '''
        if self.info.queue.numJobs > 0:
            self.info.queue.status = 'FULL'
        else:
            # Go back to the status create() assigns to a queue without
            # jobs; the status used to stay 'FULL' after the last pop().
            self.info.queue.status = 'EMPTY'
        return self.info

    @DatabaseUpdate()
    def setParam(self, param, val):
        '''
        set a particular parameter in the queue

        :param param: the parameter; must be one of ``settable_params``
        :param val:  value of the parameter
        :return: the queue record
        '''
        if param in self.settable_params:
            self.info.queue[param] = val
        else:
            Console.error("Only the following parameters could be set in a "
                          "queue: \n" + ', '.join(self.settable_params))
        return self.info
""" 
config_get.py
Demonstrate that the config.get() does not work in all cases
"""
from cloudmesh.common.console import Console
from cloudmesh.configuration.Config import Config
from cloudmesh.common.util import banner
from pprint import pprint
from cloudmesh.common.debug import VERBOSE

config = Config()

# Read the same entry through both access styles so the results can be
# compared side by side.
profile_key = "cloudmesh.profile"
p = config[profile_key]
q = config.get(profile_key)

for profile in (p, q):
    pprint(profile)

#########################################################################
banner("Demonstration Using get()", color="BLUE")

# Paths that are expected to exist (default.group was not in my yaml).
existing_paths = ("cloudmesh.version", profile_key)

for path in existing_paths:
    Console.ok(f"Test Path: {path}")
    via_get = config.get(path)
    via_index = config[path]
    for value in (via_get, via_index):
        Console.ok(f"res type: {type(value)}")
        VERBOSE(value)

for path in ["cloudmesh.doesnotexist"]:
    Console.ok(f"Test Path: {path}")
    try:
示例#7
0
class VirtualCluster(object):
    def __init__(self, debug):
        """
        Set up the virtual cluster helper: make sure the workspace folder
        next to this source file exists, load the configurations and reset
        the per-run state.

        :param debug: switch the debug information on and off
        """
        here = os.path.dirname(os.path.realpath(__file__))
        self.workspace = os.path.join(here, "vcluster_workspace/now.yaml")

        workspace_dir = os.path.dirname(self.workspace)
        if not os.path.exists(workspace_dir):
            os.makedirs(workspace_dir)

        self.cm_config = Config()
        self.vcluster_config = GenericConfig(self.workspace)
        self.debug = debug

        # State filled in by the individual operations.
        self.all_pids = []
        self.virt_cluster, self.runtime_config, self.job_metadata = {}, {}, {}

    def _config_validator(self):
        """
        validates the configuration of a run based on the information about its virtual cluster, runtime configuration
        and the job metadata

        :return:
        """
        job_metadata = self.job_metadata
        virt_cluster = self.virt_cluster
        runtime_config = self.runtime_config
        for node in virt_cluster:
            if 'name' not in virt_cluster[node].keys():
                raise ValueError(
                    " node {}: 'name' keyword, indicating hostname is missing from"
                    .format(node))
            if 'sshconfigpath' not in virt_cluster[node]['credentials'].keys():
                raise ValueError("%s: 'sshconfigpath' keyword is missing" %
                                 node)
            if not os.path.isfile(
                    os.path.expanduser(
                        virt_cluster[node]['credentials']['sshconfigpath'])):
                raise ValueError(
                    "%s: The ssh config file %s does not exists" % (
                        node, virt_cluster[node] \
                            ['credentials']['sshconfigpath']))
        if not os.path.isfile(os.path.expanduser(job_metadata['script_path'])):
            raise ValueError("The script file %s does not exists" %
                             (job_metadata['script_path']))
        if runtime_config['input-type'] == 'params+file':
            if not os.path.isfile(
                    os.path.expanduser(job_metadata['argfile_path'])):
                raise ValueError("The arg file %s does not exists" %
                                 (job_metadata['arg_file_path']))

    def _clean_remote_in_parallel(self, target_node, remote_path):
        """
        Remove a job's data from one node and report whether anything is
        still listed at that path afterwards. Meant to be run in a worker
        process.

        :param target_node: the node on which the data is going to be removed
        :param remote_path: path of the data to be removed
        :return:
        """
        node_info = self.virt_cluster[target_node]

        def run_remote(*command):
            # Host name and ssh config path are looked up on every call.
            host = node_info['name']
            config_path = os.path.expanduser(
                node_info['credentials']['sshconfigpath'])
            return self._ssh(host, config_path, *command)

        run_remote('rm -rf {}'.format(remote_path))
        leftovers = run_remote('ls {}'.format(remote_path))

        if len(leftovers) != 0:
            print("Error: Node {} could not be cleaned.".format(target_node))
        else:
            print("Node {} cleaned successfully.".format(target_node))

    def _connection_test_in_parallel(self, target_node):
        """
        Check that one cluster node answers over ssh by running ``uname -a``
        on it and print the outcome. Meant to be run in a worker process.

        :param target_node: the node to which the connection is going to be
               tested
        :return:
        """
        node_info = self.virt_cluster[target_node]

        def run_remote(*command):
            # Host name and ssh config path are looked up on every call.
            host = node_info['name']
            config_path = os.path.expanduser(
                node_info['credentials']['sshconfigpath'])
            return self._ssh(host, config_path, *command)

        answer = run_remote('uname -a')

        if not len(answer) > 0:
            print("Error: Node {} cannot be accessed.".format(target_node))
        else:
            print("Node {} is accessible.".format(target_node))

    def _create_config(self, config_name, proc_num, download_proc_num,
                       download_later, input_type, output_type):
        """
        This method is used to create a runtime-configuration.

        :param config_name: name of the runtime configuration
        :param proc_num: number of processes to be spawned in that runtime for submitting the jobs
        :param download_proc_num: number number of processes to be spawned in that runtime for fetching the results
        :param download_later: a flag indicating whether or not the script should wait for the results after the scripts are submitted
        :param input_type: type of the input of the script to be run remotely
        :param output_type: type of the output of the script to be run remotely
        :return:
        """
        config_tosave = {config_name: {}}
        config_tosave[config_name].update({
            "proc_num": proc_num,
            "download_proc_num": download_proc_num,
            "download-later": download_later,
            "input-type": input_type,
            "output-type": output_type
        })
        self.vcluster_config.deep_set(['runtime-config'], config_tosave)
        print("Runtime-configuration created/replaced successfully.")

    def _create_vcluster(self,
                         vcluster_name,
                         cluster_list=(),
                         computer_list=()):
        """
        This method is used to create a virtual cluster

        :param vcluster_name: name of the virtual cluster
        :param cluster_list: list of the clusters to be added to the
               virtual cluster
        :param computer_list: list of the computers to be used from the
               previous parameter (cluster_list). If the computer_list is left
               empty, all of the computers will be used
        :return:
        """
        vcluster_tosave = {vcluster_name: {}}
        for cluster in cluster_list:
            for computer in self.cm_config.get('cluster.{}'.format(cluster)):
                if computer in computer_list or computer_list == '':
                    vcluster_tosave[vcluster_name].update({
                        computer:
                        dict(
                            self.cm_config.get('cluster.{}.{}'.format(
                                cluster, computer)))
                    })
        self.vcluster_config.deep_set(['virtual-cluster'], vcluster_tosave)
        print("Virtual cluster created/replaced successfully.")

    @staticmethod
    def _execute_in_parallel(func_args):
        """
        This is a method used for running methods in parallel

        :param func_args:
        :return:
        """
        target_class = func_args[0]
        method_to_call = getattr(target_class, func_args[1])
        args = list(func_args[2:])
        return method_to_call(*args)

    def _fetch_results_in_parallel(self, job_metadata, node_pid_tuple,
                                   all_pids):
        """
        Copy the results of one submitted run back from its node, provided
        the remote process no longer shows up in ``ps`` and the run is still
        listed in ``all_pids``. The run is removed from ``all_pids`` once its
        results have been collected.

        :param job_metadata: the dictionary containing the information about
               the previously submitted job
        :param node_pid_tuple: the tuple containing destination node,
               destination pid and destination node index when the job was
               submitted
        :param all_pids: shared list of the runs that are still outstanding
        :return:
        """
        dest_node, dest_pid, node_idx = (node_pid_tuple[0], node_pid_tuple[1],
                                         node_pid_tuple[2])
        node_info = self.virt_cluster[dest_node]

        def run_remote(*command):
            host = node_info['name']
            config_path = os.path.expanduser(
                node_info['credentials']['sshconfigpath'])
            return self._ssh(host, config_path, *command)

        def copy(*scp_args):
            host = node_info['name']
            config_path = os.path.expanduser(
                node_info['credentials']['sshconfigpath'])
            return self._scp(host, config_path, *scp_args)

        # Look for the remote pid in the process table of the node.
        still_running = run_remote('ps', '-ef', '|', 'grep',
                                   dest_pid.strip('\n'), '|', 'grep -v grep')

        # Nothing to collect while the process runs or once the run has
        # already been taken off the list.
        if len(still_running) != 0 or node_pid_tuple not in list(all_pids):
            return

        if not os.path.exists(job_metadata['local_path']):
            os.makedirs(job_metadata['local_path'])

        output_type = self.runtime_config['output-type']
        if output_type == 'stdout':
            # A single output file sits directly in the remote job folder.
            remote_file = os.path.join(
                job_metadata['remote_path'],
                self.add_suffix_to_path('outputfile_%d' % node_idx,
                                        job_metadata['suffix']))
            source = '%s:%s' % (node_info['name'], remote_file)
            copy(source, os.path.join(job_metadata['local_path'], ''))
        elif output_type in ['file', 'stdout+file']:
            # The whole per-run folder is copied recursively.
            nested_remote_path = os.path.join(job_metadata['remote_path'],
                                              'run{}'.format(node_idx))
            source = '%s:%s' % (node_info['name'], nested_remote_path)
            copy('-r', source, os.path.join(job_metadata['local_path'], ''))

        all_pids.remove((dest_node, dest_pid, node_idx))
        print("Results collected from %s." % dest_node)

    def _run_remote_job_in_parallel(self, job_metadata, param_idx, params,
                                    all_pids):
        """
        Submit one run of the job to a node of the virtual cluster: create
        the remote folders, copy the script (and the argument file when the
        input type is 'params+file'), start the script in the background
        with ``nohup`` and record the reported pid in ``all_pids``.

        The node is chosen round-robin as ``param_idx`` modulo the number of
        nodes in ``self.virt_cluster``.

        :param job_metadata: contains the information about the job
        :param param_idx: index of the parameters inputted as argument
        :param params: the parameters inputted as argument for this run
        :param all_pids: the manager used to take all pids of all submitted jobs
        :return:
        """
        # Round-robin node selection.
        available_nodes_num = len(list(self.virt_cluster.keys()))
        target_node_idx = param_idx % available_nodes_num
        target_node_key = list(self.virt_cluster.keys())[target_node_idx]
        target_node = self.virt_cluster[target_node_key]
        # NOTE(review): remote_pid stays a list when no output-type branch
        # below matches, and _ssh returns [] when the command prints
        # nothing; the final print calls .strip() on it, which would raise
        # AttributeError in those cases -- TODO confirm and handle.
        remote_pid = []
        # Helpers that run ssh/scp against the chosen node.
        ssh_caller = lambda *x: self._ssh(target_node['name'],
                                          os.path.expanduser(
                                              target_node['credentials'] \
                                                  ['sshconfigpath']), *x)
        scp_caller = lambda *x: self._scp(target_node['name'],
                                          os.path.expanduser(
                                              target_node['credentials'] \
                                                  ['sshconfigpath']), *x)

        # Create the job folder. The trailing True makes _ssh hide errors
        # (presumably because the folder may already exist -- TODO confirm).
        ssh_caller(
            'cd %s && mkdir job%s' %
            (job_metadata['raw_remote_path'], job_metadata['suffix']), True)
        if self.runtime_config['output-type'].lower() in [
                'file', 'stdout+file'
        ]:
            # File outputs: each run gets its own run<param_idx> folder that
            # holds a copy of the script (and of the argument file).
            ssh_caller("cd {} && mkdir run{}".format(
                job_metadata['remote_path'], param_idx))
            nested_remote_path = os.path.join(
                job_metadata['remote_path'], 'run{}'.format(param_idx),
                job_metadata['script_name_with_suffix'])
            scp_caller(job_metadata['script_path'],
                       '%s:%s' % (target_node['name'], nested_remote_path))
            ssh_caller('chmod +x', nested_remote_path)
            if self.runtime_config['input-type'].lower() == 'params+file':
                scp_caller(
                    job_metadata['argfile_path'],
                    '%s:%s' % (target_node['name'],
                               os.path.join(job_metadata['remote_path'],
                                            'run{}'.format(param_idx),
                                            job_metadata['argfile_name'])))
        else:
            # Otherwise the script is copied to remote_script_path and the
            # argument file into the remote job folder.
            scp_caller(
                job_metadata['script_path'], '%s:%s' %
                (target_node['name'], job_metadata['remote_script_path']))
            ssh_caller('chmod +x', job_metadata['remote_script_path'])
            if self.runtime_config['input-type'].lower() == 'params+file':
                scp_caller(
                    job_metadata['argfile_path'], '%s:%s' %
                    (target_node['name'], job_metadata['remote_path']))

        # Start the script in the background; `echo $!` prints the pid of
        # the background process, which is what ssh_caller returns.
        if self.runtime_config['output-type'].lower() == 'stdout':
            # stdout and stderr go to outputfile_<param_idx><suffix> in the
            # remote job folder.
            remote_pid = ssh_caller(
                'cd %s && nohup %s %s > %s 2>&1 </dev/null& echo $!' % (
                    job_metadata['remote_path'],
                    job_metadata['remote_script_path'],
                    params,
                    os.path.join(job_metadata['remote_path'],
                                 self.add_suffix_to_path(
                                     'outputfile_%d' % \
                                     param_idx,
                                     job_metadata['suffix']))))
        elif self.runtime_config['output-type'].lower() == 'stdout+file':
            # Same redirection, but run from and written into the per-run
            # folder.
            remote_pid = ssh_caller(
                'cd %s && nohup %s %s > %s 2>&1 </dev/null& echo $!' % \
                (os.path.join(job_metadata['remote_path'],
                              'run{}'.format(param_idx)),
                 os.path.join(job_metadata['remote_path'],
                              'run{}'.format(param_idx),
                              job_metadata['script_name_with_suffix']), params,
                 os.path.join(
                     job_metadata['remote_path'], 'run{}'.format(param_idx),
                     self.add_suffix_to_path('outputfile_%d' % param_idx,
                                             job_metadata['suffix']))))
        elif self.runtime_config['output-type'].lower() == 'file':
            # stdout is closed (`>&-`); the script runs inside the per-run
            # folder.
            remote_pid = ssh_caller(
                'cd %s && nohup ./%s %s >&- & echo $!' %
                (os.path.join(job_metadata['remote_path'],
                              'run{}'.format(param_idx)),
                 job_metadata['script_name_with_suffix'], params))
        # Record (node key, pid, run index) for this run.
        all_pids.append((target_node_key, remote_pid, param_idx))
        print('Remote Pid on %s: %s' %
              (target_node_key, remote_pid.strip('\n')))

    @staticmethod
    def _ssh(hostname, sshconfigpath, *args):
        """
        Run a command on a remote host through ``ssh`` and return the first
        line it prints.

        :param hostname: hostname
        :param sshconfigpath: path to sshconfig for connecting to remote node
        :param args: the command parts to be submitted via ssh. A trailing
               boolean is not sent to the remote host: it is the
               "hide errors" switch, and ``True`` suppresses the error
               message printed when the command produces no output.
        :return: the first line of the command's standard output as a string
                 (including its line ending), or an empty list when nothing
                 was printed
        """
        hide_errors_flag = False
        # Guard against an empty argument list (indexing args[-1] used to
        # raise IndexError) and honour the value of the switch; a trailing
        # False used to hide errors as well.
        if args and isinstance(args[-1], bool):
            hide_errors_flag = args[-1]
            args = args[:-1]

        ssh = subprocess.Popen(["ssh", hostname, '-F', sshconfigpath, *args],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        # Only the first output line is read and the process is not waited
        # for. The callers in this class start background jobs and only
        # need the pid line.
        result = ssh.stdout.readline()
        if not result:
            error = ssh.stderr.readlines()
            if error and not hide_errors_flag:
                print("ERROR in host %s: %s" % (hostname, error))
            return []

        # One character per byte, exactly what the previous chr()-per-byte
        # join produced. The old `except AttributeError` fallback could
        # never run and has been removed.
        return result.decode('latin-1')

    @staticmethod
    def _scp(hostname, sshconfigpath, *args):
        """
        Copy files from or to a remote host with ``scp`` and print whatever
        it wrote to standard error when it produced no standard output.

        :param hostname: hostname, only used in the error message
        :param sshconfigpath: ssh config file
        :param args: arguments for using while copying
        :return:
        """
        command = ["scp", '-F', sshconfigpath]
        command.extend(args)
        process = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

        if process.stdout.readlines():
            return

        stderr_lines = process.stderr.readlines()
        if len(stderr_lines) > 0:
            print("ERROR in host %s: %s" % (hostname, stderr_lines))

    @staticmethod
    def add_suffix_to_path(path, suffix):
        """
        This method is used to add suffix to a path

        :param path: path
        :param suffix: suffix
        :return:
        """
        dir_path = os.path.dirname(path)
        full_filename = os.path.basename(path)
        filename, fileextention = os.path.splitext(full_filename)
        full_filename_new = filename + suffix + fileextention
        new_path = os.path.join(dir_path, full_filename_new)
        return new_path

    def clean_remote(self, job_name, proc_num):
        """
        This method is used to spawn processes for cleaning the remote nodes

        :param job_name: name of previously submitted job for which the nodes
               are going to be cleaned
        :param proc_num: number of processes used for cleaning the remote
               nodes
        :return:
        """
        job_metadata = self.vcluster_config.get('job-metadata')[job_name]
        self.virt_cluster = self.vcluster_config.get('virtual-cluster')[
            job_metadata['cluster_name']]
        remote_path = job_metadata['remote_path']
        # One task per node of the job's virtual cluster.
        all_jobs = [(self, '_clean_remote_in_parallel', node, remote_path)
                    for node in list(self.virt_cluster)]
        pool = Pool(processes=proc_num)
        try:
            pool.map(self._execute_in_parallel, all_jobs)
        finally:
            # The pool used to be left open, leaking its workers. Assumes
            # Pool follows the multiprocessing.Pool API (close/join).
            pool.close()
            pool.join()

    def connection_test(self, vcluster_name, proc_num):
        """
        This method is used for spawning processes for testing the
        connections to remote nodes of a vcluster

        :param vcluster_name: name of the virtual cluster the nodes of which
               are going to be tested
        :param proc_num: number of processes used for testing the remote
               nodes
        :return:
        """
        self.virt_cluster = self.vcluster_config.get(
            'virtual-cluster')[vcluster_name]
        # One task per node of the virtual cluster.
        all_jobs = [(self, '_connection_test_in_parallel', node)
                    for node in list(self.virt_cluster)]
        pool = Pool(processes=proc_num)
        try:
            pool.map(self._execute_in_parallel, all_jobs)
        finally:
            # The pool used to be left open, leaking its workers. Assumes
            # Pool follows the multiprocessing.Pool API (close/join).
            pool.close()
            pool.join()

    def create(self, *args, **kwargs):
        """
        Dispatch to the config creator or the vcluster creator.

        The choice depends only on the number of positional arguments: more
        than five go to ``_create_config`` (keyword arguments are not
        forwarded in that case), anything else goes to ``_create_vcluster``
        together with the keyword arguments.

        :param args: positional arguments forwarded to the chosen creator
        :param kwargs: keyword arguments forwarded to ``_create_vcluster`` only
        :return: None
        """
        if len(args) <= 5:
            self._create_vcluster(*args, **kwargs)
            return
        self._create_config(*args)

    def destroy(self, target, key):
        """
        Remove a virtual cluster or a runtime configuration from
        ``self.vcluster_config`` and print a confirmation.

        :param target: type of entity to be removed, either
                       ``'virtual-cluster'`` or ``'runtime-config'``
        :param key: keyname of the entity to be removed
        :return: None
        :raises ValueError: if ``target`` is neither of the two known types
        """
        if target not in ('virtual-cluster', 'runtime-config'):
            raise ValueError("Target of destroying not found.")

        if target == 'virtual-cluster':
            section = 'virtual-cluster'
            confirmation = "Virtual-cluster {} destroyed successfully."
        else:
            section = 'runtime-config'
            confirmation = "Runtime-configuration {} destroyed successfully."

        self.vcluster_config.remove([section], key)
        print(confirmation.format(key))

    def fetch(self, job_name):
        """
        Fetch the results of a previously submitted job from the remote nodes.

        The job metadata, its virtual cluster and its runtime configuration
        are read from ``self.vcluster_config`` (the latter two are stored on
        ``self``). The recorded ``nodes-pids`` entries are polled once per
        second through ``_fetch_results_in_parallel`` until the shared list
        of outstanding entries is empty.

        :param job_name: the previously submitted job name
        :return: None
        """
        job_metadata = self.vcluster_config.get('job-metadata')[job_name]
        self.virt_cluster = self.vcluster_config.get('virtual-cluster')[
            job_metadata['cluster_name']]
        self.runtime_config = self.vcluster_config.get('runtime-config')[
            job_metadata['config_name']]
        loaded_all_pids = [tuple(x) for x in job_metadata['nodes-pids']]
        # Shared list handed to the workers together with each entry.
        all_pids = Manager().list()
        all_pids.extend(loaded_all_pids)
        pool = Pool(processes=self.runtime_config['download_proc_num'])
        print("collecting results")
        try:
            while len(all_pids) > 0:
                time.sleep(1)
                all_running_jobs = [
                    (self, '_fetch_results_in_parallel', job_metadata,
                     node_pid_tuple, all_pids)
                    for node_pid_tuple in loaded_all_pids
                    if node_pid_tuple in all_pids]
                pool.map(self._execute_in_parallel, all_running_jobs)
                print("waiting for other results if any...")
        finally:
            # Always release the download workers, also when polling fails.
            pool.close()
            pool.join()
        print("All of the remote results collected.")

    def list(self, target, max_depth, current_depth=1, input_dict=None):
        """
        Print virtual clusters or runtime configs as a tab-indented tree.

        On the initial call (``input_dict`` is None) the data is read from
        ``self.vcluster_config``; recursive calls pass the sub-tree to print.
        Nothing is printed once ``current_depth`` exceeds ``max_depth``.

        :param target: ``'virtual-clusters'`` or ``'runtime-configs'``
        :param max_depth: depth of information to be shown
        :param current_depth: current depth of printing information
        :param input_dict: used for recursion for depth of higher than 1
        :return: None
        :raises ValueError: if ``target`` is unknown and no ``input_dict``
                            was supplied
        """
        if input_dict is None:
            if target == 'virtual-clusters':
                input_dict = self.vcluster_config.get('virtual-cluster')
            elif target == 'runtime-configs':
                input_dict = self.vcluster_config.get('runtime-config')
            else:
                raise ValueError("Target of listing not found.")

        if max_depth < current_depth:
            return

        # Depth 1 is printed flush left, deeper levels get one tab per level.
        indent = current_depth if current_depth > 1 else current_depth - 1

        # isinstance() instead of an exact type comparison so that dict
        # subclasses (e.g. OrderedDict) are walked rather than dumped as a
        # single value.
        if not isinstance(input_dict, dict):
            print('\t' * indent, input_dict)
            return

        for key, entry in input_dict.items():
            print('\t' * indent, key + ':')
            if not isinstance(entry, dict):
                print('\t' * (indent + 1), entry)
                continue
            for sub_key in entry:
                # Only add the trailing colon when the next level is shown.
                label = sub_key + ':' if max_depth > current_depth else sub_key
                print('\t' * (indent + 1), label)
                self.list(target,
                          max_depth,
                          input_dict=entry[sub_key],
                          current_depth=current_depth + 1)

    def run(self, job_name, cluster_name, config_name, script_path,
            argfile_path, outfile_name, remote_path, local_path, params_list,
            suffix, overwrite):
        """
        Create a job, validate it and run it on the remote nodes.

        The job metadata is assembled, stored on ``self.job_metadata``,
        validated with ``self._config_validator()`` and persisted under
        ``job-metadata`` in ``self.vcluster_config``. One
        ``_run_remote_job_in_parallel`` task is dispatched per entry of the
        parameter list. Unless the runtime config sets ``download-later``,
        the results are then polled every three seconds until all recorded
        node/pid entries have been collected.

        :param job_name: name of the job to create
        :param cluster_name: cluster on which the job is going to run
        :param config_name: name of the configuration based on which the job is going to run
        :param script_path: path of the script to be run remotely
        :param argfile_path: path of the file that has to be passed to the script as an argument, if any
        :param outfile_name: output filename resulting from running the script, if any
        :param remote_path: path on the remotes to which the script is copied and from which it is run
        :param local_path: local path to which the results are copied
        :param params_list: list of the parameters passed to the script; one remote run per entry
        :param suffix: suffix of the filenames in the job
        :param overwrite: if the job already exists, this flag overwrites the previous job with the same name
        :return: None
        :raises ValueError: if ``params_list`` is None
        :raises RuntimeError: if the job already exists and ``overwrite`` is False
        """
        if params_list is None:
            # Adjacent literals instead of a backslash inside the string, so
            # the message no longer contains the source indentation.
            raise ValueError(
                'param-list is not set. This value determines how many '
                'instance of the target application will run remotely. '
                'Therefore, even if the parameter is empty, add commas for '
                'every run you expect.')

        existing_jobs = self.vcluster_config.get('job-metadata')
        if existing_jobs is not None and job_name in existing_jobs.keys() \
                and overwrite is False:
            message = ("The job {} exists in the configuration file, if you "
                       "want to overwrite the job, use --overwrite argument.")
            raise RuntimeError(message.format(job_name))

        self.virt_cluster = self.vcluster_config.get(
            'virtual-cluster')[cluster_name]
        self.runtime_config = self.vcluster_config.get(
            'runtime-config')[config_name]

        absolute_script_path = os.path.abspath(script_path)
        argfile_name = ntpath.basename(argfile_path)
        if len(argfile_name) > 0:
            # Every run receives the argument file name as first parameter.
            job_params = ['{} {}'.format(argfile_name, x)
                          for x in params_list]
        else:
            job_params = params_list
        script_name = ntpath.basename(script_path)
        script_name_with_suffix = self.add_suffix_to_path(script_name, suffix)
        # The trailing '' keeps a path separator at the end of the directory.
        job_remote_path = os.path.join(remote_path, 'job' + suffix, '')

        job = {
            'suffix': suffix,
            'cluster_name': cluster_name,
            'config_name': config_name,
            'raw_remote_path': remote_path,
            'script_path': absolute_script_path,
            'argfile_path': argfile_path,
            'argfile_name': argfile_name,
            'params_list': job_params,
            'outfile_name': outfile_name,
            'script_name': script_name,
            'script_name_with_suffix': script_name_with_suffix,
            'remote_path': job_remote_path,
            'remote_script_path': os.path.join(job_remote_path,
                                               script_name_with_suffix),
            'local_path': local_path,
        }
        job_metadata = {job_name: job}
        self.job_metadata = job
        self._config_validator()
        self.vcluster_config.deep_set(['job-metadata'], job_metadata)

        # Shared list handed to every worker task.
        all_pids = Manager().list()
        all_jobs = [
            (self, '_run_remote_job_in_parallel', job, param_idx, param,
             all_pids)
            for param_idx, param in enumerate(job['params_list'])]
        pool = Pool(processes=self.runtime_config['proc_num'])
        try:
            pool.map(self._execute_in_parallel, all_jobs)
        finally:
            pool.close()
            pool.join()
        self.all_pids = all_pids

        # Snapshot of the entries recorded by the workers. The polling loop
        # below iterates this snapshot: the local metadata dict is never
        # given a 'nodes-pids' key in this method, so reading it from there
        # only worked if deep_set happened to mutate the local dict.
        submitted_pids = [pid for pid in all_pids]
        self.vcluster_config.deep_set(['job-metadata', job_name, 'nodes-pids'],
                                      submitted_pids)

        if not self.runtime_config['download-later']:
            pool = Pool(processes=self.runtime_config['download_proc_num'])
            print("collecting results")
            try:
                while len(all_pids) > 0:
                    time.sleep(3)
                    all_running_jobs = [
                        (self, '_fetch_results_in_parallel', job,
                         node_pid_tuple, all_pids)
                        for node_pid_tuple in submitted_pids
                        if node_pid_tuple in all_pids]
                    pool.map(self._execute_in_parallel, all_running_jobs)
                    print("waiting for other results if any...")
            finally:
                pool.close()
                pool.join()

            print("All of the remote results collected.")

    def set_param(self, target, name, parameter, value):
        """
        Set one parameter of a virtual cluster or runtime configuration in
        ``self.vcluster_config`` and print a confirmation.

        :param target: the entity type on which the parameter is going to be
                       set, ``'virtual-cluster'`` or ``'runtime-config'``
        :param name: the entity name on which the parameter is going to be set, e.g. test-config32
        :param parameter: name of the parameter to be set
        :param value: value of that parameter to be set
        :return: None
        :raises ValueError: if ``target`` is neither of the two known types
        """
        if target not in ('virtual-cluster', 'runtime-config'):
            raise ValueError("Target of variable set not found.")

        if target == 'virtual-cluster':
            section = 'virtual-cluster'
            confirmation = \
                "Virtual-cluster parameter {} set to {} successfully."
        else:
            section = 'runtime-config'
            confirmation = \
                "Runtime-configuration parameter {} set to {} successfully."

        self.vcluster_config.deep_set([section, name, parameter], value)
        print(confirmation.format(parameter, value))
示例#8
0
class SlurmCluster(object):

    def __init__(self):
        """
        Initializes the SlurmCluster class: loads the cloudmesh configuration,
        resets the job bookkeeping and opens the database handle.
        """
        # NOTE(review): ``self.batch_config`` is not created here (the
        # GenericConfig-based workspace set-up was disabled), yet several
        # methods of this class read it -- confirm where it should come from.
        self.cm_config = Config()
        self.all_jobIDs = []
        self.slurm_cluster = {}
        # Every field of the job description starts out unset.
        self.job = dict.fromkeys((
            'job_name',
            'cluster_name',
            'script_path',
            'executable_path',
            'destination',
            'source',
            'experiment_name',
            'companion_file',
        ))
        self.database = CmDatabase()

    @staticmethod
    def job_specification():

        # self.job_validator()

        data = {
            "cm": {
                "cloud": "karst_debug",
                "kind": "batch-job",
                "name": "job012",
            },
            "batch": {
                "source": "~/.cloudmesh/batch/dir",
                "destination": "~/.cloudmesh/dir/",
                "status": "running"
            }
        }

        return data

    # @DatabaseUpdate
    # def status(self,job_name):
    #     return {
    #         "cloud": self.job.cluster_name,
    #
    #     }


    # noinspection PyDictCreation
    @DatabaseUpdate()
    def create(self,
               job_name,
               cluster_name,
               script_path,
               executable_path,
               destination,
               source,
               experiment_name,
               companion_file):
        """
        Build the record of a job that is to run on a remote slurm cluster.

        The record is stored in ``self.job``. The path arguments must offer
        ``as_posix()``, which is used to store them as strings.

        :param job_name: name of the job to create
        :param cluster_name: slurm cluster on which the job is going to run;
                             its entry is read from the cloudmesh
                             configuration
        :param script_path: path of the slurm script
        :param executable_path: path of the executable that is going to be
                                run on the cluster via slurm script
        :param destination: path on the remote to which the scripts are
                            copied and from which they are run
        :param source: local path to which the results are going to be copied
        :param experiment_name: experiment name and suffix of the filenames in
                                the job
        :param companion_file: path of the file that has to be passed to the
                               executable as an argument, if any
        :return: a one-element list holding the job record, or None if the
                 database reports that the job already exists
        """
        uid = Name(order=["name","experiment_name"],
                   name=job_name,
                   experiment_name=experiment_name
                   ).id(name=job_name, experiment_name=experiment_name)
        print(uid)

        cm_section = {
            "cloud": cluster_name,
            "kind": "batch-job",
            "name": job_name,
            "cluster": self.cm_config.get('cloudmesh').get('cluster')[cluster_name],
        }
        batch_section = {
            "status": "pending",
            'script_path': script_path.as_posix(),
            'executable_path': executable_path.as_posix(),
            'destination': destination.as_posix(),
            'source': source.as_posix(),
            'experiment_name': experiment_name,
            'companion_file': companion_file.as_posix(),
        }
        # TODO: remove cloud and kind after fixing CmDatabased update
        self.job = {
            'uid': uid,
            "cloud": cluster_name,
            "kind": "batch-job",
            "name": job_name,
            "cm": cm_section,
            "batch": batch_section,
        }

        already_stored = self.database.exists(self.job)[0]
        if not already_stored:
            return [self.job]
        Console.error("Job already exists")
        return None

    @staticmethod
    def _execute_in_parallel(func_args):
        """
        This is a method used for running methods in parallel

        :param func_args:
        :return:
        """
        target_class = func_args[0]
        method_to_call = getattr(target_class, func_args[1])
        args = list(func_args[2:])
        return method_to_call(*args)

    def _fetch_results_in_parallel(self, job_metadata, job_id, all_job_ids):
        """
        Download the results of one remote job once it no longer shows up as
        active in ``qstat``.

        The job counts as finished when the ``qstat`` lookup for ``job_id``
        returns nothing or its output contains `` c `` (case-insensitive).
        In that case the remote job directory is copied below the local path,
        the copied script files (and the argument file for input type
        ``params+file``) are deleted locally and ``job_id`` is removed from
        ``all_job_ids``. Otherwise nothing happens.

        :param job_metadata: the dictionary containing the information about the previously submitted job
        :param job_id: id of the remote job to look up and collect
        :param all_job_ids: shared list of outstanding job ids
        :return: None
        """
        node = self.slurm_cluster
        ssh_config = path_expand(node['credentials']['sshconfigpath'])

        def run_ssh(*command):
            return self._ssh(node['name'], ssh_config, *command)

        def run_scp(*command):
            return self._scp(node['name'], ssh_config, *command)

        #
        # use the qstat from cloudmesh, we have a whole library for that
        #
        queue_entry = run_ssh("qstat -u $USER | grep %s" % job_id)
        if len(queue_entry) != 0 and ' c ' not in queue_entry.lower():
            # Still listed and not completed: try again on the next poll.
            return

        local_dir = job_metadata['local_path']
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)
        # TODO: REPLACE WITH .format
        run_scp('-r',
                '%s:%s' % (node['name'], job_metadata['remote_path']),
                os.path.join(local_dir, ''))

        # scp -r places the remote directory, under its own name, inside the
        # local path.
        fetched_dir = os.path.join(
            local_dir,
            os.path.basename(os.path.normpath(job_metadata['remote_path'])))
        os.remove(os.path.join(fetched_dir, job_metadata['script_name']))
        os.remove(os.path.join(fetched_dir, job_metadata['slurm_script_name']))
        if job_metadata['input_type'] == 'params+file':
            os.remove(os.path.join(fetched_dir, job_metadata['argfile_name']))

        all_job_ids.remove(job_id)
        # TODO: REPLACE WITH .format
        print("Results collected from %s for jobID %s" % (node['name'], job_id))

    @staticmethod
    def _ssh(hostname, sshconfigpath, *args):
        """
        Run a command on a remote host through ``ssh`` and return the first
        line it prints.

        :param hostname: hostname
        :param sshconfigpath: path to sshconfig for connecting to remote node
        :param args: the command words to be submitted via ssh. A trailing
                     bool is not part of the command: when it is True,
                     anything the command writes to stderr is not reported.
        :return: the first line of the command's standard output as a string
                 (including its trailing newline, if any), or an empty list
                 when the command printed nothing to standard output
        """
        hide_errors_flag = False
        # Honour the value of the trailing flag; previously any bool, even
        # False, suppressed the error report, and calling without command
        # words raised IndexError.
        if args and isinstance(args[-1], bool):
            hide_errors_flag = args[-1]
            args = args[:-1]
        #
        # should we use cloudmesh.common.Shell
        # should we have a better version of that
        #
        ssh = subprocess.Popen(["ssh", hostname, '-F', sshconfigpath, *args],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        # communicate() drains both pipes, waits for ssh to exit and closes
        # the pipes, so no process or file descriptor is left behind.
        stdout, stderr = ssh.communicate()
        if not stdout:
            if stderr and not hide_errors_flag:
                # TODO: REPLACE WITH .format
                print("ERROR in host %s: %s" % (hostname,
                                                stderr.splitlines(True)))
            return []
        first_line, newline, _ = stdout.partition(b'\n')
        return (first_line + newline).decode('utf-8', errors='replace')

    @staticmethod
    def _scp(hostname, sshconfigpath, *args):
        """
        Copy files from or to a remote host with ``scp``.

        If scp prints nothing to standard output but writes to standard
        error, the error output is printed.

        :param hostname: hostname, used only in the error message
        :param sshconfigpath: ssh config file
        :param args: arguments for scp (options, source and destination)
        :return: None
        """
        scp = subprocess.Popen(["scp", '-F', sshconfigpath, *args],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        # communicate() reads both pipes concurrently and waits for scp to
        # exit; reading stdout to the end before touching stderr could block
        # and never reaped the child process.
        stdout, stderr = scp.communicate()
        if not stdout and stderr:
            print("ERROR in host %s: %s" % (hostname, stderr.splitlines(True)))

    @staticmethod
    def add_suffix_to_path(path, suffix):
        """
        This method is used to add suffix to a path

        :param path: path
        :param suffix: suffix
        :return:
        """
        dir_path = os.path.dirname(path)
        full_filename = os.path.basename(path)
        filename, fileextention = os.path.splitext(full_filename)
        full_filename_new = filename + suffix + fileextention
        new_path = os.path.join(dir_path, full_filename_new)
        return new_path

    def clean_remote(self, job_name):
        """
        Delete the remote directory of a previously submitted job and report
        whether it is gone.

        The job metadata and its slurm cluster are read from
        ``self.batch_config``. After ``rm -rf`` the directory is listed
        again; an empty listing is taken as success.

        :param job_name: name of previously submitted job for which the nodes are going to be cleaned
        :return: None
        """
        job_metadata = self.batch_config.get('job-metadata')[job_name]
        cluster = self.batch_config.get('slurm_cluster')[
            job_metadata['slurm_cluster_name']]
        remote_path = job_metadata['remote_path']

        def quiet_ssh(*command):
            # The trailing True asks _ssh not to report stderr output.
            ssh_config = os.path.expanduser(
                cluster['credentials']['sshconfigpath'])
            return self._ssh(cluster['name'], ssh_config, *command, True)

        quiet_ssh('rm -rf {}'.format(remote_path))
        leftovers = quiet_ssh('ls {}'.format(remote_path))
        if len(leftovers) == 0:
            outcome = "Job {} cleaned successfully."
        else:
            outcome = "Error: Job {} could not be cleaned."
        print(outcome.format(job_name))

    def connection_test(self, slurm_cluster_name):
        """
        Test the connection to the connection node of a slurm cluster by
        running ``uname -a`` on it through ssh and print the outcome.

        :param slurm_cluster_name: name of the slurm cluster which is going to be tested
        :return: None
        """
        # A leftover debugging stub (database lookup of a hard-coded job name
        # followed by an unconditional return) used to make everything below
        # unreachable; it has been removed.
        batch_config = getattr(self, 'batch_config', None)
        if batch_config is not None:
            target_node_info = batch_config.get('slurm_cluster')[
                slurm_cluster_name]
        else:
            # NOTE(review): falls back to the cluster entry that create()
            # reads from the cloudmesh configuration; this assumes that entry
            # carries the same 'name' and 'credentials' -> 'sshconfigpath'
            # fields -- confirm.
            target_node_info = self.cm_config.get('cloudmesh').get('cluster')[
                slurm_cluster_name]

        ssh_config = os.path.expanduser(
            target_node_info['credentials']['sshconfigpath'])
        reply = self._ssh(target_node_info['name'], ssh_config, 'uname -a')
        if len(reply) > 0:
            print("Slurm Cluster {} is accessible.".format(
                target_node_info['name']))
        else:
            print("Error: Slurm Cluster {} cannot be accessed.".format(
                target_node_info['name']))

    def remove(self, target, key):
        """
        Used to remove virtual clusters and runtime configs

        :param target: type of entity to be removed
        :param key: keyname of the entity to be removed
        :return:
        """
        if target == 'slurm-cluster':
            self.batch_config.remove(['slurm_cluster'], key)
            print("Slurm-cluster {} removed successfully.".format(key))
        elif target == 'job':
            self.batch_config.remove(['job-metadata'], key)
            print("Job {} removed successfully.".format(key))
        else:
            raise ValueError("Target to remove not found.")

    def fetch(self, job_name):
        """
        Fetch the results of a previously submitted job from the slurm
        cluster.

        The job metadata and its slurm cluster are read from
        ``self.batch_config`` (the cluster is stored in
        ``self.slurm_cluster``). The recorded ``jobIDs`` are polled once per
        second through ``_fetch_results_in_parallel`` until the shared list
        of outstanding ids is empty.

        :param job_name: the previously submitted job name
        :return: None
        """
        job_metadata = self.batch_config.get('job-metadata')[job_name]
        self.slurm_cluster = self.batch_config.get('slurm_cluster')[
            job_metadata['slurm_cluster_name']]
        loaded_all_job_ids = [x for x in job_metadata['jobIDs']]
        # Shared list handed to the worker together with each job id.
        all_job_ids = Manager().list()
        all_job_ids.extend(loaded_all_job_ids)
        pool = Pool(processes=1)
        print("collecting results")
        try:
            while len(all_job_ids) > 0:
                time.sleep(1)
                all_running_jobs = [
                    (self, '_fetch_results_in_parallel', job_metadata, jobID,
                     all_job_ids)
                    for jobID in loaded_all_job_ids if jobID in all_job_ids]
                pool.map(self._execute_in_parallel, all_running_jobs)
                print("waiting for other results if any...")
        finally:
            # Always release the worker process, also when polling fails.
            pool.close()
            pool.join()
        print("All of the remote results collected.")

    '''
    @DatabaseUpdate
    def list(self, target, max_depth, current_depth=1, input_dict=None):
        """
        listing the target slurm clusters or job-metadata

        :param target: name of the virtual cluster to be listed
        :param max_depth: depth of information to be shown
        :param current_depth: current depth of printing information
        :param input_dict: used for recursion for depth of higher than 1
        :return:
        """
        if target == 'slurm-clusters' and input_dict is None:
            input_dict = self.batch_config.get('slurm_cluster')
        if target == 'jobs' and input_dict is None:
            input_dict = self.batch_config.get('job-metadata')
        elif input_dict is None:
            raise ValueError("Target of listing not found.")

        if max_depth >= current_depth:
            if type(input_dict) == dict:
                for key in input_dict:
                    key_to_print = key + ':' if max_depth >= current_depth else key
                    indent = current_depth if current_depth > 1 else current_depth - 1
                    print('\t' * indent, key_to_print)
                    if type(input_dict.get(key)) != dict:
                        print('\t' * (indent + 1), input_dict.get(key))
                    else:
                        for value in input_dict.get(key):
                            value_to_print = value + ':' if max_depth > current_depth else value
                            print('\t' * (indent + 1), value_to_print)
                            self.list(target, max_depth, input_dict=input_dict[key][value],
                                      current_depth=current_depth + 1)
            else:
                indent = current_depth if current_depth > 1 else current_depth - 1
                print('\t' * indent, input_dict)

        data = [{}, {}]
        return data
    '''

    def run(self, job_name):
        """
        Copy a previously created job to its slurm cluster and submit it.

        The job metadata and the target cluster are read from
        ``self.batch_config``. The remote job directory is created, the slurm
        script, the job script and (for input type ``params+file``) the
        argument file are copied over, the job is submitted with ``qsub`` and
        the id reported by ``qstat`` is stored under the job's ``jobIDs``.

        :param job_name: name of the job to run
        :return: None
        :raises RuntimeError: if the submission command returned no job id
        """
        job_metadata = self.batch_config.get('job-metadata')[job_name]
        cluster_name = job_metadata['slurm_cluster_name']
        slurm_cluster = self.batch_config.get('slurm_cluster').get(cluster_name)
        path = path_expand(slurm_cluster['credentials']['sshconfigpath'])

        def ssh_caller(*x):
            return self._ssh(slurm_cluster['name'], path, *x)

        def scp_caller(*x):
            return self._scp(slurm_cluster['name'], path, *x)

        # TODO replace with .format
        # The trailing True hides the error printed when the directory
        # already exists.
        ssh_caller('cd %s && mkdir job%s' % (job_metadata['raw_remote_path'],
                                             job_metadata['suffix']), True)
        scp_caller(job_metadata['slurm_script_path'],
                   '%s:%s' % (slurm_cluster['name'],
                              job_metadata['remote_slurm_script_path']))
        scp_caller(job_metadata['job_script_path'],
                   '%s:%s' % (slurm_cluster['name'],
                              job_metadata['remote_script_path']))
        ssh_caller('chmod +x', job_metadata['remote_script_path'])
        if job_metadata['input_type'].lower() == 'params+file':
            scp_caller(job_metadata['argfile_path'],
                       '%s:%s' % (slurm_cluster['name'],
                                  job_metadata['remote_path']))

        submit_output = ssh_caller(
            "cd %s && qsub %s && qstat -u $USER | tail -n 1 | awk '{print $1}'" %
            (job_metadata['remote_path'],
             job_metadata['remote_slurm_script_path']))
        if not submit_output:
            # _ssh returns an empty list when the command printed nothing;
            # calling .strip() on it used to fail with an AttributeError.
            raise RuntimeError(
                "Job {} could not be submitted: no job ID was returned by "
                "{}.".format(job_name, slurm_cluster['name']))
        remote_job_id = submit_output.strip('\n')
        print('Remote job ID: %s' % remote_job_id)
        # A plain list suffices for the single id; no Manager process needed.
        self.batch_config.deep_set(['job-metadata', job_name, 'jobIDs'],
                                   [remote_job_id])

    def set_param(self, target, name, parameter, value):
        """
        Used to set a specific parameter in the configuration

        :param target: the entity type on which the parameter is going to be set, e.g. runtime-config
        :param name: the entity name on which the parameter is going to be set, e.g. test-config32
        :param parameter: name of the parameter to be set
        :param value: value of that parameter to be set
        :return:
        """
        # TODO: .format see if .format(**local) works
        if target == 'slurm-cluster':
            self.batch_config.deep_set(['slurm_cluster', name, parameter], value)
            print("slurm-cluster parameter {} set to {} successfully.".format(parameter, value))
        elif target == 'job-metadata':
            self.batch_config.deep_set(['job-metadata', name, parameter], value)
            print("Job-metadata parameter {} set to {} successfully.".format(parameter, value))
        else:
            raise ValueError("Target of variable set not found.")
示例#9
0
class Queue(object):
    def __init__(self):
        """
        Initialize the Queue with a blank record and its helper objects.

        Besides the configuration, the blank ``info`` record and the database
        handle, this also sets up ``policyFunctionMap`` and
        ``settable_params`` so that ``pop`` and ``setParam`` do not raise an
        ``AttributeError`` on an instance for which ``create`` was never
        called.
        """

        self.cm_config = Config()
        self.info = {
            'uid': None,
            "cloud": None,
            "kind": "batch-queue",
            "name": None,
            "cm": {},
            "queue": {
                'policy': None,
                'status': None,
                'active': False,
                'charge': None,
                'unit': None,
                "numJobs": 0,
                "numRunningJobs": 0,
                'joblist': []
            }
        }
        self.database = CmDatabase()
        # maps a queue policy name to the method that removes the next job
        self.policyFunctionMap = {
            'FIFO': self.popFIFO,
            'FILO': self.popFILO
        }
        # list of parameters that can be set
        self.settable_params = ['policy', 'charge', 'unit']

    @DatabaseUpdate()
    def create(self, queue_name, cluster_name, policy, charge=None, unit=None):
        """
        Build the record for a new queue and register its policy handlers.

        ``self.info`` is replaced by the new record before the database is
        asked whether such a queue already exists. The ``queue`` section is
        itself a ``Munch`` and carries an empty ``joblist`` so that the
        attribute-style and list accesses used by the other methods work on
        a freshly created queue.

        :param queue_name: name of the queue to create
        :param cluster_name: slurm cluster on which the job is gonna run;
                             also used to look up the cluster entry under
                             ``cloudmesh`` -> ``cluster`` in the configuration
        :param policy: policy of the queue
        :param charge: charge of the queue
        :param unit: unit of the charge for the queue
        :return: a one-element list holding the new record, or None when the
                 database reports that the queue already exists
        """
        name = Name(order=["cloud", "name"],
                    cloud=cluster_name,
                    name=queue_name)
        uid = name.id(cloud=cluster_name, name=queue_name)
        print(uid)

        self.info = Munch({
            'uid': uid,
            "cloud": cluster_name,
            "kind": "batch-queue",
            "name": queue_name,
            "cm": {
                "cloud":
                cluster_name,
                "kind":
                "batch-queue",
                "name":
                queue_name,
                "cluster":
                self.cm_config.get('cloudmesh').get('cluster')[cluster_name]
            },
            # nested Munch: sibling methods use self.info.queue.<field>
            "queue": Munch({
                'policy': policy,
                'status': 'EMPTY',
                'active': False,
                'charge': charge,
                'unit': unit,
                "numJobs": 0,
                "numRunningJobs": 0,
                # same field the blank record in __init__ carries
                'joblist': []
            })
        })
        self.policyFunctionMap = Munch({
            'FIFO': self.popFIFO,
            'FILO': self.popFILO
        })
        # list of parameters that can be set
        self.settable_params = ['policy', 'charge', 'unit']
        if self.database.exists(self.info)[0]:
            Console.error("Queue already exists")
            return
        return [self.info]

    def findQueueByName(self, name):
        """
        Look up a queue in the database by its name.

        Not implemented yet: the method currently does nothing.

        :param name: name of the queue
        :return: None
        """
        # TODO: find queue info from the DB and set it to self.info
        return None

    def findQueueByCluster(self, clusterName):
        """
        Look up a queue in the database by the name of its cluster.

        Not implemented yet: the method currently does nothing.

        :param clusterName: name of the queue's cluster
        :return: None
        """
        # TODO: find queue info from the DB and set it to self.info
        return None

    def findAllQueues(self):
        """
        Look up all queues in the database.

        Not implemented yet: the method currently does nothing.

        :return: None
        """
        # TODO: find all queues info from the DB based on the ['cm']
        return None

    def listJobs(self):
        """
        List the jobs in the current queue.

        Not implemented yet: the method currently does nothing.

        :return: None
        """
        return None

    def removeQueue(self):
        """
        Remove the queue from the database.

        Not implemented yet: the method currently does nothing.

        :return: None
        """
        # TODO: remove the queues info from the DB based on the ['cm']
        return None

    @DatabaseUpdate()  # this should update the record not create a new one
    def push(self, job):
        """
        Append a job to the queue and refresh the job counter and status.

        The record is accessed with subscripts (as ``popFIFO`` and
        ``popFILO`` do) so the method works whether ``self.info`` and its
        ``queue`` section are plain dicts or ``Munch`` objects.

        :param job: the job to add to the queue
        :return: the updated queue record (``self.info``)
        """
        queue = self.info['queue']
        # tolerate a record that does not carry a job list yet
        queue.setdefault('joblist', []).append(job)
        queue['numJobs'] += 1
        self.updateStatus()
        return self.info

    @DatabaseUpdate()  # this should update the record not create a new one
    def pop(self):
        """
        Remove and return the next job according to the queue policy.

        The job is removed before the counter is decremented, so a failed
        removal (for instance from an empty job list) leaves ``numJobs`` and
        the status untouched.

        :return: the job selected by the policy handler
        :raises KeyError: if no handler is registered for the queue policy
        :raises IndexError: if the job list is empty
        """
        queue = self.info['queue']
        take_next = self.policyFunctionMap[queue['policy']]
        job = take_next()
        queue['numJobs'] -= 1
        self.updateStatus()
        return job

    def popFIFO(self):
        '''
        pop job from stack based on FIFO policy
        :param job:
        :return:
        '''
        return self.info['queue']['joblist'].pop(0)

    def popFILO(self):
        '''
        pop job from stack based on FIFO policy
        :param job:
        :return:
        '''
        return self.info['queue']['joblist'].pop()

    def isEmpty(self):
        '''
        checks if the queue is empty
        :return:
        '''
        if self.info.queue.numJobs > 0:
            return False
        return True

    @DatabaseUpdate()  # this should update the record not create a new one
    def activate(self):
        """
        Mark the queue as active.

        The flag is written with subscripts so the method works whether
        ``self.info`` and its ``queue`` section are plain dicts or ``Munch``
        objects.

        :return: the updated queue record (``self.info``)
        """
        # TODO: start submitting jobs, what's the rate for submission,
        #  is it parallel ?
        self.info['queue']['active'] = True
        return self.info

    @DatabaseUpdate()  # this should update the record not create a new one
    def deactivate(self):
        """
        Mark the queue as inactive.

        The flag is written with subscripts so the method works whether
        ``self.info`` and its ``queue`` section are plain dicts or ``Munch``
        objects.

        :return: the updated queue record (``self.info``)
        """
        # TODO: stop all jobs
        self.info['queue']['active'] = False
        return self.info

    @DatabaseUpdate()  # this should update the record not create a new one
    def updateStatus(self):
        """
        Derive the queue status from the number of jobs.

        The status becomes 'FULL' while at least one job is counted and
        'EMPTY' otherwise, so a queue that has been drained no longer stays
        marked as 'FULL'.

        :return: the updated queue record (``self.info``)
        """
        queue = self.info['queue']
        queue['status'] = 'FULL' if queue['numJobs'] > 0 else 'EMPTY'
        return self.info

    @DatabaseUpdate()
    def setParam(self, param, val):
        """
        Set one of the settable parameters of the queue.

        A parameter that is not listed in ``self.settable_params`` is left
        untouched. The value is written with subscripts so the method works
        whether ``self.info`` and its ``queue`` section are plain dicts or
        ``Munch`` objects.

        :param param: the parameter
        :param val: value of the parameter
        :return: the queue record (``self.info``)
        """
        if param in self.settable_params:
            self.info['queue'][param] = val
        return self.info