def transfer_files(self):
        """Transfer any input files for this job to the PBS submission node.

        Creates a per-job sub-directory (named after the job ID) under the
        configured storage job directory on the submission host, uploads
        each input file into it via SAGA-Python/SFTP, and records the
        resulting remote paths in ``self.transferred_input_files``.

        :raises JobError: if the base job directory does not exist, the
            per-job directory already exists, or an input file copy fails.
        """
        JobDeploymentBase.transfer_files(self)
        LOG.debug('Transfer files...')
        job_dir = self.platform_config.storage_job_directory
        host = self.platform_config.platform_service_host

        try:
            directory = Directory('sftp://%s%s' % (host, job_dir),
                                  session=self.session)
        except saga.BadParameter as e:
            LOG.error('The specified job directory does not exist on PBS '
                      'submission node <%s> (%s).' % (host, str(e)))
            # Bug fix: the message previously read "PBSsubmission" due to a
            # missing space between the two string fragments.
            raise JobError('The specified job directory does not exist on '
                           'PBS submission node <%s> (%s)' % (host, str(e)))

        try:
            # directory.make_dir() does not return a handle to the new
            # directory so we need to build the directory URL manually.
            directory.make_dir(self.job_config.job_id)
            job_data_dir = os.path.join(str(directory.url),
                                        self.job_config.job_id)
        except saga.NoSuccess as e:
            LOG.error('The specified job data directory already exists on '
                      'PBS submission node <%s> (%s).' % (host, str(e)))
            raise JobError('The specified job directory already exists on '
                           'PBS submission node <%s> (%s)' % (host, str(e)))

        # Jobs may legitimately have no input files (they may, for example,
        # pull their inputs from a remote location as part of the job
        # process) so return early in that case.
        if not self.job_config.input_files:
            LOG.debug('There are no input files to transfer for this job...')
            return

        self.transferred_input_files = []
        for f in self.job_config.input_files:
            try:
                f_obj = File('file://%s' % f, session=self.session)
                f_obj.copy(job_data_dir)
                dest_dir = os.path.join(directory.url.path,
                                        self.job_config.job_id)
                self.transferred_input_files.append(
                    os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
            # A bare "except:" here would also swallow KeyboardInterrupt and
            # SystemExit; catch Exception and log the underlying error.
            except Exception as e:
                LOG.error('Error copying the input file <%s> to the remote '
                          'platform (%s).' % (f, str(e)))
                raise JobError('Error copying the input file <%s> to the '
                               'remote platform.' % f)
# Exemplo n.º 2
# 0
    def collect_output(self, destination):
        """Collect job output from the remote host and copy it to
        *destination*.

        The contents of the job's working directory on the remote host are
        bundled into a single ``<job_id>.tar.gz`` archive via an SSH job,
        and the archive is then transferred to *destination*. A destination
        with no URL scheme is treated as a local ``file://`` path.

        :param destination: URL (or local path) to copy the output archive
            to.
        :raises ValueError: if the job configuration has no working
            directory set.
        """
        LOG.debug('Collect output...')

        # Bug fix: validate the working directory up front. Previously it
        # was only checked AFTER the archiving job had already run, so a
        # missing working_dir produced an archive in the wrong location
        # before the error was raised.
        working_dir = getattr(self.job_config, 'working_dir', None)
        if not working_dir:
            raise ValueError('There is no working directory set. Unable to '
                             'retrieve output files.')

        # TODO: Need to bundle the output files into a tar or similar archive
        # to pull them back to the host. There may be a large number of files
        # so this is preferable to pulling each file back separately.
        # For now we pull back files individually

        #=======================================================================
        # # ### Looking at running a separate SSH job to bundle the  output
        # # ### files into an archive that can then be transferred back.
        # # ### TODO: Need to find a cross-platform way of handling this.
        #=======================================================================

        # Work out whether we have an array of running nodes (e.g. cloud
        # nodes) or whether we're dealing with a single host. If the former
        # is true then we get the IP/hostname of the target resource from the
        # running_nodes array, otherwise we can just use the host variable.
        remote_host = self.host if not getattr(
            self, 'running_nodes',
            None) else self.running_nodes[0][0].public_ips[0]
        LOG.debug('Remote host for file transfer source: %s' % remote_host)

        LOG.debug('Preparing output archiving job...')
        archive_file = self.job_config.job_id + '.tar.gz'
        jd = Description()
        jd.environment = getattr(self.job_config, 'environment', {})
        # 'touch .' is a no-op executable; the actual archiving happens in
        # the shell command appended after the ';' separator.
        jd.executable = 'touch'
        jd.arguments = ['.', ';', 'tar', 'zcvf', archive_file, '*']
        jd.working_directory = working_dir
        self.svc = Service('ssh://%s/' % remote_host, session=self.session)
        self.job = self.svc.create_job(jd)
        LOG.debug('Running output archiving job...')
        self.job.run()
        self.job.wait()
        LOG.debug('Output archiving job complete...')

        # Get a list of the directories to pull the output files back from
        # TODO: For now we just pull the archive file from the master node
        # but assume that we also need to consider output generated on other
        # nodes.
        output_files = []
        #output_file_dirs = []
        #for node in self.running_nodes:
        #    node_ip = node.public_ips[0]
        #    output_file_dirs.append('sftp://%s%s' % (node_ip, working_dir))
        output_file_archive = 'sftp://%s%s' % (
            remote_host, os.path.join(working_dir, archive_file))

        LOG.debug('Output file archive: %s' % output_file_archive)

        output_files.append(output_file_archive)

        LOG.debug('Got output files: %s' % output_files)

        # Default to a local file URL when no scheme was provided.
        parsed_destination = urlparse.urlparse(destination)
        if parsed_destination.scheme == '':
            destination = 'file://' + destination

        for output_file in output_files:
            of = File(output_file, session=self.session)
            of.copy(destination)
# Exemplo n.º 3
# 0
    def transfer_files(self):
        """Transfer any input files for this job to the remote resource.

        Checks that the configured storage job directory exists on the
        resource, creates a per-job sub-directory (named after the job ID),
        uploads each input file into it via SAGA-Python/SFTP, and records
        the resulting remote paths in ``self.transferred_input_files``.

        :raises ConnectionError: if the remote resource refuses the
            connection.
        :raises StorageDirectoryNotFoundError: if the base job storage
            directory does not exist on the resource.
        :raises DirectoryExistsError: if the per-job directory already
            exists on the resource.
        :raises JobError: if an input file copy fails.
        """
        JobDeploymentBase.transfer_files(self)
        LOG.debug('SSH Deployer: Transfer files...')
        job_dir = self.platform_config.storage_job_directory
        # Check that the job storage directory exists and then create a
        # sub-directory specifically for this job.
        try:
            LOG.debug('URL for file transfer: <sftp://%s:%s%s>' %
                      (self.host, self.port, job_dir))
            directory = Directory('sftp://%s:%s%s' %
                                  (self.host, self.port, job_dir),
                                  session=self.session)
        except saga.BadParameter as e:
            LOG.error('Error setting up connection to resource directory.')
            if 'connection refused' in str(e).lower():
                raise ConnectionError('Unable to connect to remote resource '
                                      'to set up connection to directory.')

            raise StorageDirectoryNotFoundError(
                'The specified job data base '
                'directory does not exist on resource <%s> (%s)' %
                (self.host, str(e)))
        try:
            # directory.make_dir() does not return a handle to the new
            # directory so we need to build the directory URL manually.
            directory.make_dir(self.job_config.job_id)
            job_data_dir = os.path.join(str(directory.url),
                                        self.job_config.job_id)
        except saga.NoSuccess as e:
            LOG.error('The specified job data directory already exists on '
                      'resource <%s> (%s).' % (self.host, str(e)))
            raise DirectoryExistsError('The specified job directory already '
                                       'exists on resource <%s> (%s)' %
                                       (self.host, str(e)))

        # Jobs may legitimately have no input files (they may, for example,
        # pull their inputs from a remote location as part of the job
        # process) so return early in that case.
        if not self.job_config.input_files:
            LOG.debug('There are no input files to transfer for this job...')
            return

        self.transferred_input_files = []
        for f in self.job_config.input_files:
            try:
                f_obj = File('file://%s' % f, session=self.session)
                f_obj.copy(job_data_dir)
                dest_dir = os.path.join(directory.url.path,
                                        self.job_config.job_id)
                self.transferred_input_files.append(
                    os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
            # A bare "except:" here would also swallow KeyboardInterrupt and
            # SystemExit; catch Exception and log the underlying error.
            except Exception as e:
                LOG.error('Error copying the input file <%s> to the remote '
                          'platform (%s).' % (f, str(e)))
                raise JobError('Error copying the input file <%s> to the '
                               'remote platform.' % f)
    def transfer_files(self):
        """Transfer job input files to the master node of a (potentially
        multi-node) cloud deployment, then distribute them to slave nodes.

        Files are uploaded via SAGA-Python/SFTP into a per-job
        sub-directory of the configured storage job directory on the
        master node; their remote paths are recorded in
        ``self.transferred_input_files``. If slave nodes exist, the master
        is then directed to push the job data to each of them.

        :raises JobError: if copying an input file to the master fails.
        """
        JobDeploymentBase.transfer_files(self)
        # Here we transfer any input files to the relevant directory on the
        # target platform.
        # Use SAGA-Python to handle the file transfer.
        LOG.debug('Transfer files...')
        job_dir = self.platform_config.storage_job_directory

        # At this point we need to switch back to using the job security
        # context. If we were using unconfigured resources, these will have
        # been configured using an admin context by now.
        self.session = saga.Session(default=False)
        self.session.add_context(self.job_ctx)

        # Begin by checking if we're working with more than one instance, if
        # so we have a master and one or more slave nodes. We'll push the data
        # to the master and then direct the master to distribute it to the
        # slave nodes.
        master_node = self.running_nodes[0][0]
        slave_nodes = []
        if len(self.running_nodes) > 1:
            slave_nodes = [node[0] for node in self.running_nodes[1:]]

        # On the master node: Check that the job storage directory exists and
        # then create a sub-directory specifically for this job.

        # Each running_nodes entry is a tuple of (node object, IP list).
        # For now we work with the node object directly.
        node_ip = master_node.public_ips[0]
        try:
            directory = Directory('sftp://%s%s' % (node_ip, job_dir),
                                  session=self.session)
        except saga.BadParameter as e:
            # NOTE(review): this failure is deliberately logged rather than
            # raised (see the commented-out raise below), but if it fires,
            # 'directory' is left unbound and the make_dir() call below will
            # fail with a NameError — confirm whether this path can occur.
            LOG.error('The specified job directory does not exist on node '
                      '<%s> (%s).' % (node_ip, str(e)))
            #raise JobError('The specified job directory does not exist '
            #               'on node <%s> (%s)' % (node_ip, str(e)))
        try:
            # directory.make_dir() does not return a handle to the new directory
            # so need to create the directory URL manually.
            directory.make_dir(self.job_config.job_id)
        except saga.NoSuccess as e:
            # An already-existing job directory is treated as non-fatal
            # here: the upload below simply reuses it.
            LOG.warning('The specified job data directory already exists on '
                        'node <%s> (%s).' % (node_ip, str(e)))
            #raise JobError('The specified job directory already exists on '
            #               'on node <%s> (%s)' % (node_ip, str(e)))

        job_data_dir = os.path.join(str(directory.url), self.job_config.job_id)

        # Now upload the file(s) to the job data directory
        # and create an input file list containing the resulting locations
        # of the files.
        # There are some cases where jobs may not have input files (they may,
        # for example pull the input files from a remote location as part of
        # the job process) so we first check whether there are any input files
        # to process, if not, then return from this function
        if not self.job_config.input_files:
            LOG.debug('There are no input files to transfer for this job...')
            return

        self.transferred_input_files = []
        for f in self.job_config.input_files:
            try:
                f_obj = File('file://%s' % f, session=self.session)
                f_obj.copy(job_data_dir)
                dest_dir = os.path.join(directory.url.path,
                                        self.job_config.job_id)
                self.transferred_input_files.append(
                    os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
            except:
                LOG.error('Error copying the input file <%s> to the remote '
                          'platform.' % f)
                raise JobError('Error copying the input file <%s> to the '
                               'remote platform.' % f)

        # At this point input files have been successfully transferred to
        # the master node. We now direct the master node to send the files
        # to each of the slave nodes:
        if slave_nodes:
            slave_private_ips = [node.private_ips[0] for node in slave_nodes]
            self._distribute_job_data(master_node.public_ips[0],
                                      slave_private_ips,
                                      self.platform_config.user_id,
                                      self.platform_config.user_key_file,
                                      job_dir, self.job_config.job_id)
    def deploy_software(self, software_config=None):
        """Deploy the given software configuration(s) to the running nodes
        of an unconfigured image.

        For pre-configured images, or when no configuration is supplied,
        this is a no-op. Otherwise each configuration is validated against
        the target platform's OS/flavour and its install commands are run
        over SSH on every node using the admin context; a job account is
        also set up and the job key copied to the master node.

        :param software_config: a software configuration ID, or a list of
            IDs, to deploy. Defaults to None (nothing to deploy).
        :raises JobError: if no admin context is available, a configuration
            cannot be found, or a configuration targets a different
            OS/flavour than the platform.
        """
        JobDeploymentBase.deploy_software(self)
        # Here we undertake transfer of the code to the remote platform if this
        # is required. In many cases, the software is likely to already be
        # deployed on the target platform or may have been configured via a
        # tool such as cloud-init, puppet, etc at resource initialisation time.
        LOG.debug('Deploy software...')

        # If we're not using an unconfigured image, we don't need to run the
        # deploy software function.
        if not self.use_unconfigured:
            # Bug fix: this message previously said "so running software
            # deployment process" although this branch skips deployment.
            LOG.info('Using a pre-configured image so skipping software '
                     'deployment process...')
            return

        # Software deployment requires root access to the target node(s). This
        # should be possible using the key that has been passed to start the
        # resource(s).

        # If no software configuration is provided, we ignore this function
        # call and return. If a configuration is provided, we check that the
        # configuration is for the right type of platform and then deploy
        # the software.
        if not software_config:
            return

        # Accept a single configuration ID as well as a list of them.
        if not isinstance(software_config, list):
            software_config = [software_config]

        LOG.debug('Received a request to deploy the following software '
                  'configuration IDs to the target platforms: <%s>...' %
                  software_config)

        # Check that we have an admin security context available. If we don't
        # we can't connect to the remote resource(s) to do the required
        # configuration.
        if not self.admin_ctx:
            # Bug fix: this message was previously truncated mid-sentence.
            raise JobError(
                'deploy_software: There is no admin context '
                'available so it will not be possible to connect '
                'to remote resources to configure them. Ensure that '
                'an admin context is configured for this platform.')

        # Check that we can get each of the software configurations and that
        # each one supports the target deployment platform.
        scm = SoftwareConfigManager.get_instance()
        scm.init_configuration()

        os_name = self.platform_config.image_unconfigured_os
        flavour = self.platform_config.image_unconfigured_flavour
        admin_key_user = self.platform_config.image_unconfigured_admin_key_user
        admin_key_file = self.platform_config.image_unconfigured_admin_key_file

        sc_dict = {}
        for sc in software_config:
            try:
                conf = scm.get_software_configuration(sc)
                sc_dict[sc] = conf
            except ValueError as e:
                raise JobError('Job error - no software could be found for '
                               'the configuration id <%s>: %s' % (sc, str(e)))

            if not ((os_name == conf.software_os_type) and
                    (flavour == conf.software_os_flavour)):
                LOG.error(
                    'The OS <%s> and flavour <%s> in the provided software '
                    'configuration don\'t match the target platform with '
                    'OS <%s> and flavour <%s>.' %
                    (conf.software_os_type, conf.software_os_flavour, os_name,
                     flavour))
                raise JobError(
                    'The OS <%s> and flavour <%s> in the provided '
                    'software configuration don\'t match the target '
                    'platform with OS <%s> and flavour <%s>.' %
                    (conf.software_os_type, conf.software_os_flavour, os_name,
                     flavour))

        # If we reach this point we assume that each of the software
        # configurations has been found and they are for the right target
        # platform.
        for sc_key, sc_obj in sc_dict.items():
            install_commands = sc_obj.get_install_commands()

            # Now run each of the install commands synchronously on all of the
            # target machines to get the software installed.
            node_ips = [node[0].public_ips[0] for node in self.running_nodes]
            LOG.debug('Deploying to the following list of nodes: %s' %
                      node_ips)

            # Set up a new session using the admin user and key provided for
            # the unconfigured image.
            adm_session = saga.Session(default=False)
            adm_ctx = saga.Context("ssh")
            adm_ctx.user_id = admin_key_user
            adm_ctx.user_key = admin_key_file
            adm_session.add_context(adm_ctx)
            # Open shell connections to each of the machines.
            shell_conns = []
            opts = {}
            opts['ssh_options'] = {'StrictHostKeyChecking': 'no'}
            for node_ip in node_ips:
                conn = PTYShell('ssh://%s' % node_ip,
                                session=adm_session,
                                opts=opts)
                shell_conns.append(conn)
                # Bug fix: this previously tested 'conf', a stale variable
                # left over from the validation loop above, rather than the
                # configuration currently being deployed.
                if sc_obj.software_os_type == 'linux':
                    self._setup_job_account(conn, self.platform_config)
                else:
                    LOG.warning(
                        'Support for creation of job accounts on '
                        'platforms other than linux is not yet supported...')
            # Copy the job account key to the master node.
            job_session = saga.Session(default=False)
            job_session.add_context(self.job_ctx)

            keyfile = File('file://%s' % self.platform_config.user_key_file,
                           session=job_session)
            keyfile_target = shell_conns[0].url + os.path.join(
                self.platform_config.user_home, '.ssh', 'id_rsa')
            LOG.debug('Copying job key to target directory <%s>' %
                      keyfile_target)
            keyfile.copy(keyfile_target)
            for cmd in install_commands:
                for shell_connection in shell_conns:
                    if isinstance(cmd, SoftwareConfigFile):
                        LOG.debug(
                            'Software deployment: About to write data to '
                            'remote file <%s> on node <%s>' %
                            (cmd.filename, shell_connection.url))
                        shell_connection.write_to_remote(
                            cmd.data, cmd.filename)
                    else:
                        LOG.debug('Software deployment: About to run command '
                                  '<%s> on resource <%s>...' %
                                  (cmd, shell_connection.url))
                        # Non-root admin users need privilege escalation to
                        # install software.
                        if admin_key_user != 'root':
                            cmd = 'sudo ' + cmd
                        result, out, err = shell_connection.run_sync(cmd)
                        LOG.debug('Command completed - Exit code: <%s>, '
                                  'StdOut: <%s>, StdErr:\n<%s>' %
                                  (result, out, err))
    def initialise_resources(self,
                             prefer_unconfigured=True,
                             num_processes=1,
                             processes_per_node=1,
                             node_type='m1.small',
                             job_id=None,
                             retries=3,
                             software_config=None):
        """Start cloud resources for a job and wait until they are running
        and accessible over SSH.

        Selects a pre-configured or unconfigured image based on the
        platform configuration and *prefer_unconfigured*, validates the
        image and node size, starts the required number of nodes, waits
        for SSH accessibility, and writes an MPI machinefile to the master
        node.

        :param prefer_unconfigured: prefer the unconfigured image when both
            image types are available.
        :param num_processes: total number of processes the job requires.
        :param processes_per_node: processes to schedule per node.
        :param node_type: cloud resource size (flavour) identifier.
        :param job_id: instance name; a generated ID is used when None.
        :param retries: attempts made while waiting for node accessibility.
        :param software_config: required when an unconfigured image is used.
        :returns: the list of running (node, IP list) tuples.
        :raises JobError: if an unconfigured image is chosen without a
            software config, or nodes remain inaccessible after retries.
        :raises ResourceInitialisationError: if no image information is
            available, the image or node type cannot be found, or the
            platform cannot be contacted.
        :raises InvalidCredentialsError: if the platform rejects the
            credentials.
        """
        JobDeploymentBase.initialise_resources(self)
        # Start up the cloud resources here and wait for them to reach the
        # running state. Need to know the image ID that we're starting. The
        # image ID is available from the job configuration.
        image_id = None
        image_preconfigured_id = self.platform_config.image_preconfigured_id
        image_unconfigured_id = self.platform_config.image_unconfigured_id

        # Store whether or not we're using an unconfigured image - this
        # determines whether we end up running the deploy software function
        # or not.
        self.use_unconfigured = False
        if image_preconfigured_id and not image_unconfigured_id:
            image_id = image_preconfigured_id
            LOG.debug('Only a configured image identifier has been provided, '
                      'using image ID <%s>.' % image_id)
        elif (not image_preconfigured_id) and image_unconfigured_id:
            image_id = image_unconfigured_id
            self.use_unconfigured = True
            LOG.debug('Only an unconfigured image identifier has been '
                      'provided, using image ID <%s>.' % image_id)
            if not software_config:
                raise JobError(
                    'Only an unconfigured image identifier has been '
                    'provided but no software config has been specified. '
                    'Unable to continue...')
        elif image_preconfigured_id and image_unconfigured_id:
            LOG.debug('Both configured and unconfigured images provided...')
            if prefer_unconfigured:
                image_id = image_unconfigured_id
                self.use_unconfigured = True
                LOG.debug('Using unconfigured image ID <%s>.' % image_id)
                if not software_config:
                    raise JobError(
                        'An unconfigured image identifier has been '
                        'chosen but no software config has been specified. '
                        'Unable to continue...')
            else:
                image_id = image_preconfigured_id
                LOG.debug('Using pre-configured image ID <%s>.' % image_id)
        else:
            raise ResourceInitialisationError(
                'ERROR: No image information '
                'available in the platform configuration, unable '
                'to initialise resources.')

        # If we're using an unconfigured image, we need to prepare the admin
        # security context based on the information that should be provided
        # in the YAML file with the unconfigured image details.
        if self.use_unconfigured:
            self.admin_ctx = saga.Context("ssh")
            self.admin_ctx.user_id = self.platform_config.image_unconfigured_admin_key_user
            self.admin_ctx.user_key = self.platform_config.image_unconfigured_admin_key_file

        # Check that the image is present and then use the libcloud driver to
        # start the resources and return once they're running.
        # TODO: This is currently synchronous but could also be done
        # asynchronously using a callback to notify the caller when the nodes
        # are ready.
        img = None
        try:
            images = self.driver.list_images()
            for image in images:
                if image.id == image_id:
                    img = image
                    break
        except socket.error as e:
            raise ResourceInitialisationError(
                'ERROR contacting the remote '
                'cloud platform. Do you have an active network '
                'connection? - <%s>' % str(e))
        except Exception as e:
            LOG.debug('ERROR STRING: %s' % str(e))
            if str(e).startswith('Unauthorized:'):
                raise InvalidCredentialsError(
                    'ERROR: Access to the cloud '
                    'platform at <%s> was not authorised. Are your '
                    'credentials correct?' %
                    (self.platform_config.platform_service_host + ':' +
                     str(self.platform_config.platform_service_port)))
            else:
                raise ResourceInitialisationError(
                    'ERROR: The specified image <%s> '
                    'is not present on the target platform, unable '
                    'to start resources.' % image_id)
        # Bug fix: this check previously sat inside the try block, so the
        # "could not be found" error was immediately caught by the generic
        # "except Exception" handler and replaced with a different message.
        if img is None:
            raise ResourceInitialisationError('The specified image <%s> '
                                              'could not be found' %
                                              image_id)

        sizes = self.driver.list_sizes()
        size = next((s for s in sizes if s.id == node_type), None)
        if not size:
            raise ResourceInitialisationError(
                'ERROR: The specified resource '
                'size (node_type) <%s> is not present on the '
                'target platform. Unable to start resources. Have '
                'you set the node_type parameter in your job spec?' %
                node_type)

        # Get the keypair name from the configuration.
        # If we're using an unconfigured resource, we use the admin key pair
        # name if provided.
        if self.use_unconfigured and self.platform_config.image_unconfigured_admin_key_name:
            keypair_name = self.platform_config.image_unconfigured_admin_key_name
        else:
            keypair_name = self.platform_config.user_key_name

        # Get the number of resources from the job configuration.
        # TODO: Fix this to obtain number of cores per node from the cloud
        # platform. For now use the specified processes_per_node in the
        # job specification.
        cores_per_node = processes_per_node
        num_nodes = int(ceil(float(num_processes) / float(processes_per_node)))

        # At this point we know that the image is available and the specified
        # resource type is valid so we can request to start the instance(s).
        LOG.debug('About to start <%s> resources of type <%s> based on image '
                  '<%s (%s)> with keypair <%s>.' %
                  (num_nodes, size.name, img.id, img.name, keypair_name))

        # When starting a resource we need the name, image, type, keypair,
        # configuration data and details of the number of resources to start.
        name = job_id
        if not name:
            name = generate_instance_id()

        self.nodes = self.driver.create_node(name=name,
                                             image=img,
                                             size=size,
                                             ex_keyname=keypair_name,
                                             ex_mincount=num_nodes,
                                             ex_maxcount=num_nodes)

        # create_node may return a single node object rather than a list.
        if not isinstance(self.nodes, list):
            self.nodes = [self.nodes]

        self.running_nodes = self.driver.wait_until_running(self.nodes)

        # Before we return details of the running nodes, we need to check
        # that they're accessible - it takes some time for the nodes to boot
        # and become available.
        # TODO: Need to replace this wait with a reliable check as to whether
        # the server is up and running. Looks like, for now, this will need to
        # use Paramiko while awaiting updates on saga-python.
        LOG.debug('Checking node is available...')

        nodes_to_check = []
        for node in self.running_nodes:
            nodes_to_check.append(node[0].public_ips[0])

        res = self._wait_for_node_accessbility(
            nodes_to_check,
            self.platform_config.user_id,
            self.platform_config.user_key_file,
            retries=retries)
        if not res:
            # We still have nodes that are not available so assume there's a
            # problem and throw a job error.
            raise JobError('After <%s> retries, the following nodes are '
                           'still not accessible <%s>. Cancelling job.' %
                           (retries, nodes_to_check))

        # If we have multiple nodes, now is the time to create the machinefile
        # for MPI job runs.
        # For the machinefile we need the private IP of each node and the
        # number of cores.
        machinefile = tempfile.NamedTemporaryFile('w', delete=True)
        machinefile.write("# Machine file for MPI job runs\n")
        for node in self.running_nodes:
            machinefile.write(
                '%s slots=%s max_slots=%s\n' %
                (node[0].private_ips[0], cores_per_node, cores_per_node))
        machinefile.flush()
        LOG.debug('The following machinefile has been created:\n\n%s\n' %
                  machinefile.name)

        # The master node is always considered to be node 0 in
        # the self.running_nodes list.
        LOG.debug('Copying machinefile to master node...')
        saga_machinefile = File('file://%s' % machinefile.name,
                                session=self.session)
        saga_machinefile.copy('sftp://%s/tmp/machinefile' %
                              self.running_nodes[0][0].public_ips[0])
        machinefile.close()
        LOG.debug('machinefile copied to master node...')

        conn = PTYShell('ssh://%s' % self.running_nodes[0][0].public_ips[0],
                        session=self.session)
        conn.run_sync('chmod 644 /tmp/machinefile')
        LOG.debug('Set permissions on /tmp/machinefile on master node to 644.')

        return self.running_nodes