def transfer_files(self):
        """Create this job's directory on the PBS node and upload input files.

        Opens the configured storage directory on the platform service host
        over SFTP, creates a sub-directory named after the job ID and copies
        each configured input file into it. The remote path of every copied
        file is recorded in ``self.transferred_input_files``; that attribute
        is not assigned when the job has no input files.

        Raises:
            JobError: if the storage directory cannot be opened, the job
                sub-directory cannot be created, or an input file cannot
                be copied.
        """
        JobDeploymentBase.transfer_files(self)
        # Use SAGA-Python to handle the file transfer.
        LOG.debug('Transfer files...')
        job_dir = self.platform_config.storage_job_directory
        host = self.platform_config.platform_service_host
        job_id = self.job_config.job_id

        try:
            directory = Directory('sftp://%s%s' % (host, job_dir),
                                  session=self.session)
        except saga.BadParameter as e:
            LOG.error('The specified job directory does not exist on PBS '
                      'submission node <%s> (%s).' % (host, str(e)))
            raise JobError('The specified job directory does not exist on PBS '
                           'submission node <%s> (%s)' % (host, str(e)))

        try:
            directory.make_dir(job_id)
        except saga.NoSuccess as e:
            LOG.error('The specified job data directory already exists on '
                      'PBS submission node <%s> (%s).' % (host, str(e)))
            raise JobError('The specified job directory already exists on PBS '
                           'submission node <%s> (%s)' % (host, str(e)))

        # directory.make_dir() does not return a handle to the new directory
        # so the directory URL has to be built manually.
        job_data_dir = os.path.join(str(directory.url), job_id)

        # Some jobs have no input files (they may, for example, pull their
        # input from a remote location as part of the job process), in which
        # case there is nothing more to do.
        if not self.job_config.input_files:
            LOG.debug('There are no input files to transfer for this job...')
            return

        # Remote filesystem path (without scheme/host) of the job directory;
        # it is the same for every file so compute it once.
        dest_dir = os.path.join(directory.url.path, job_id)

        self.transferred_input_files = []
        for f in self.job_config.input_files:
            try:
                f_obj = File('file://%s' % f, session=self.session)
                f_obj.copy(job_data_dir)
                self.transferred_input_files.append(
                    os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
            except Exception as e:
                # Catch Exception rather than using a bare except so that
                # KeyboardInterrupt/SystemExit are not converted to JobError.
                LOG.error('Error copying the input file <%s> to the remote '
                          'platform (%s).' % (f, str(e)))
                raise JobError('Error copying the input file <%s> to the '
                               'remote platform.' % f)
Пример #2
0
    def collect_output(self, destination):
        """Collect job output and optionally delete the remote job directory.

        The base implementation of the file transfer code does not take a
        non-default port for the remote host into account. To work around
        this, ``self.host`` is temporarily set to ``host:port`` and is always
        restored to its original value before returning, including when an
        exception is raised.

        If ``job_config.delete_job_files`` is true, the job's sub-directory
        under the platform storage directory is removed recursively once the
        output has been collected.

        Raises:
            JobError: if the storage directory cannot be opened or the job
                directory cannot be removed.
        """
        host_tmp = self.host
        self.host = ('%s:%s' % (self.host, self.port))

        try:
            # Using the base implementation of job output file collection...
            JobDeploymentBase.collect_output(self, destination)

            # If job_config delete_job_files is True, we can now delete the
            # job files on the remote platform
            if self.job_config.delete_job_files:
                jobs_dir = self.platform_config.storage_job_directory
                job_id = self.job_config.job_id
                # Open the job storage directory so that this job's
                # sub-directory can be removed from it.
                try:
                    LOG.debug('URL for file job directory: sftp://%s%s' %
                              (self.host, jobs_dir))
                    directory = Directory(
                        'sftp://%s%s' % (self.host, jobs_dir),
                        session=self.session)
                except saga.BadParameter as e:
                    LOG.error('The specified job directory does not exist on '
                              'resource <%s> (%s).' % (self.host, str(e)))
                    raise JobError('The specified job directory does not '
                                   'exist on resource <%s> (%s)' %
                                   (self.host, str(e)))
                try:
                    LOG.debug('Deleting job directory after job completion '
                              '<sftp://%s%s/%s>' %
                              (self.host, jobs_dir, job_id))
                    directory.remove(job_id, RECURSIVE)
                except saga.NoSuccess as e:
                    LOG.error('The specified job data directory couldn\'t be '
                              'removed <%s> (%s).' % (job_id, str(e)))
                    raise JobError('The specified job data directory '
                                   'couldn\'t be removed <%s> (%s)' %
                                   (job_id, str(e)))
        finally:
            # Set the host value back to its original value, even on failure,
            # so later calls do not see (or re-append to) "host:port".
            self.host = host_tmp
Пример #3
0
    def transfer_files(self):
        """Create this job's directory on the SSH resource and upload inputs.

        Opens the configured storage directory on ``self.host``/``self.port``
        over SFTP, creates a sub-directory named after the job ID and copies
        each configured input file into it. The remote path of every copied
        file is recorded in ``self.transferred_input_files``; that attribute
        is not assigned when the job has no input files.

        Raises:
            ConnectionError: if opening the storage directory fails with a
                'connection refused' error.
            StorageDirectoryNotFoundError: if opening the storage directory
                fails for any other BadParameter reason.
            DirectoryExistsError: if the job sub-directory cannot be created.
            JobError: if an input file cannot be copied.
        """
        JobDeploymentBase.transfer_files(self)
        LOG.debug('SSH Deployer: Transfer files...')
        # Use SAGA-Python to handle the file transfer.
        job_dir = self.platform_config.storage_job_directory
        job_id = self.job_config.job_id

        # Check that the job storage directory exists and then create a
        # sub-directory specifically for this job.
        try:
            LOG.debug('URL for file transfer: <sftp://%s:%s%s>' %
                      (self.host, self.port, job_dir))
            directory = Directory('sftp://%s:%s%s' %
                                  (self.host, self.port, job_dir),
                                  session=self.session)
        except saga.BadParameter as e:
            LOG.error('Error setting up connection to resource directory.')
            if 'connection refused' in str(e).lower():
                raise ConnectionError('Unable to connect to remote resource '
                                      'to set up connection to directory.')

            raise StorageDirectoryNotFoundError(
                'The specified job data base '
                'directory does not exist on resource <%s> (%s)' %
                (self.host, str(e)))

        try:
            directory.make_dir(job_id)
        except saga.NoSuccess as e:
            LOG.error('The specified job data directory already exists on '
                      'resource <%s> (%s).' % (self.host, str(e)))
            raise DirectoryExistsError('The specified job directory already '
                                       'exists on resource <%s> (%s)' %
                                       (self.host, str(e)))

        # directory.make_dir() does not return a handle to the new directory
        # so the directory URL has to be built manually.
        job_data_dir = os.path.join(str(directory.url), job_id)

        # Some jobs have no input files (they may, for example, pull their
        # input from a remote location as part of the job process), in which
        # case there is nothing more to do.
        if not self.job_config.input_files:
            LOG.debug('There are no input files to transfer for this job...')
            return

        # Remote filesystem path (without scheme/host) of the job directory;
        # it is the same for every file so compute it once.
        dest_dir = os.path.join(directory.url.path, job_id)

        self.transferred_input_files = []
        for f in self.job_config.input_files:
            try:
                f_obj = File('file://%s' % f, session=self.session)
                f_obj.copy(job_data_dir)
                self.transferred_input_files.append(
                    os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
            except Exception as e:
                # Catch Exception rather than using a bare except so that
                # KeyboardInterrupt/SystemExit are not converted to JobError.
                LOG.error('Error copying the input file <%s> to the remote '
                          'platform (%s).' % (f, str(e)))
                raise JobError('Error copying the input file <%s> to the '
                               'remote platform.' % f)
    def _setup_job_account(self, pty_conn, platform_config):
        """Create and prepare the job user's account on a remote node.

        Runs ``useradd`` for ``platform_config.user_id`` over ``pty_conn``,
        makes sure the home directory, its ``.ssh`` directory and the
        platform storage directory exist, writes the user's public key to
        ``authorized_keys`` and finally hands ownership of the home directory
        tree to the job user.

        Args:
            pty_conn: connection to the remote node; must provide
                ``run_sync``, ``write_to_remote``, ``url`` and ``session``.
            platform_config: platform configuration supplying ``user_id``,
                ``user_home``, ``user_public_key``,
                ``image_unconfigured_admin_key_user`` and
                ``storage_job_directory``.

        Raises:
            JobError: if the home directory already exists before the account
                is created, or a required directory cannot be created.
        """
        user_id = platform_config.user_id
        user_home = platform_config.user_home
        public_key = platform_config.user_public_key
        admin_user = platform_config.image_unconfigured_admin_key_user
        ssh_dir = os.path.join(user_home, '.ssh')
        # Commands that need root privileges are run through sudo unless the
        # admin user is already root.
        sudo_prefix = '' if admin_user == 'root' else 'sudo '

        # Creating the job user on the remote node
        LOG.debug(
            'Creating job user account for user <%s> on remote node <%s>' %
            (user_id, pty_conn.url))
        # First check if the user directory exists; `test -d` exits with 1
        # when it does not, any other exit code is treated as a failure.
        result, out, err = pty_conn.run_sync('sudo test -d %s' % (user_home,))
        if result != 1:
            raise JobError(
                'The specified user home directory <%s> for the job '
                'user <%s> already exists. Unable to proceed with '
                'resource configuration.' % (user_home, user_id))

        cmd = sudo_prefix + 'useradd -d %s -m %s' % (user_home, user_id)
        result, out, err = pty_conn.run_sync(cmd)
        LOG.debug('useradd command completed - Exit code: <%s>, '
                  'StdOut: <%s>, StdErr:\n<%s>' % (result, out, err))

        # Check if user home created during user account creation
        # If account already existed, we may need to create the directory here
        try:
            home_dir = Directory(pty_conn.url + user_home,
                                 session=pty_conn.session)
        except BadParameter:
            # Assume home directory doesn't exist and create it here.
            rootdir = Directory(pty_conn.url + '/', session=pty_conn.session)
            rootdir.make_dir(user_home)
            home_dir = Directory(pty_conn.url + user_home,
                                 session=pty_conn.session)

        try:
            home_dir.make_dir(ssh_dir)
        except saga.NoSuccess as e:
            # An already-existing directory is fine; anything else is fatal.
            if 'exists' in str(e):
                LOG.debug('Directory <%s> already exists...' % ssh_dir)
            else:
                raise JobError('Unable to create the SSH directory in user '
                               'home <%s>...' % ssh_dir)

        try:
            home_dir.make_dir(platform_config.storage_job_directory)
        except saga.NoSuccess as e:
            if 'exists' in str(e):
                LOG.debug('Job data directory <%s> already exists...' %
                          platform_config.storage_job_directory)
            else:
                raise JobError('Unable to create platform data directory '
                               '<%s>.' % platform_config.storage_job_directory)

        # Write the public key to the authorized keys file on the remote node
        pty_conn.write_to_remote(
            public_key, os.path.join(ssh_dir, 'authorized_keys'))

        # Change ownership of all created directories/files to the job user.
        # Like useradd above, this needs sudo when the admin user is not
        # root; without it the chown to another user fails.
        pty_conn.run_sync(sudo_prefix + 'chown -R %s:%s %s' %
                          (user_id, user_id, user_home))
    def _wait_for_node_accessbility_saga(self,
                                         node_ip_list,
                                         user_id,
                                         key_file,
                                         port=22,
                                         retries=5,
                                         pre_check_delay=10):
        """Wait until every node in ``node_ip_list`` is reachable over SFTP.

        After an initial delay, a new SAGA session is created on
        ``self.session`` using the admin context when one is set, otherwise
        the job context. Each node is then probed by listing its root
        directory. Nodes that fail are retried, with a wait of
        ``10 * attempts_made`` seconds between rounds, until all nodes have
        responded or ``retries`` rounds have been made.

        Args:
            node_ip_list: addresses of the nodes to check.
            user_id: not used by this implementation.
            key_file: not used by this implementation.
            port: not used by this implementation; the connection URL does
                not include a port.
            retries: maximum number of rounds of connection attempts. The
                value was previously always overridden with 5 inside the
                body; 5 is now the default so callers relying on the default
                see no change, while an explicit value is honoured.
            pre_check_delay: seconds to sleep before the first attempt.

        Returns:
            True if every node was reached, False otherwise.

        Raises:
            NoSuccess: if authentication fails for any node.
        """
        attempts_made = 0
        connection_successful = False

        LOG.debug('Waiting <%s> seconds to check for resource accessibility.' %
                  (pre_check_delay))
        time.sleep(pre_check_delay)

        # Create an empty session with no contexts
        self.session = saga.Session(default=False)
        if self.admin_ctx:
            self.session.add_context(self.admin_ctx)
        else:
            self.session.add_context(self.job_ctx)

        # TODO: Shouldn't try other security contexts until we've tried one
        # context with all nodes, at present the connection fails because we
        # switch contexts before checking each node...
        while attempts_made < retries and not connection_successful:
            nodes_ok = []
            for ip in node_ip_list:
                try:
                    LOG.debug('Attempt <%s> to connect to remote resource '
                              '<%s> using SAGA...' % (attempts_made + 1, ip))
                    dir_obj = Directory('sftp://%s/' % ip,
                                        session=self.session)
                    LOG.debug('Triggering connection to remote node by '
                              'attempting root dir list...')
                    dir_obj.list()
                    LOG.debug('Connected to remote node successfully...')
                    dir_obj.close()
                    LOG.debug('Closed connection to remote node...')
                    nodes_ok.append(ip)
                except socket.timeout:
                    LOG.debug('Timed out trying to connect to <%s>...' % ip)
                except OSError as e:
                    LOG.debug('OSError trying to connect to <%s>: %s' %
                              (ip, str(e)))
                except NoSuccess as e:
                    LOG.debug(
                        'NoSuccess making connection to resource <%s>: %s' %
                        (ip, str(e)))
                except BadParameter as e:
                    LOG.debug('BadParameter making connection to resource <%s>'
                              ': %s' % (ip, str(e)))
                except AuthenticationFailed as e:
                    # Retrying with the same credentials cannot succeed, so
                    # give up immediately rather than waiting out the retries.
                    LOG.debug('Authentication failure when making connection '
                              'to resource <%s>: %s\nTrying next security '
                              'context...' % (ip, str(e)))
                    raise NoSuccess('No valid security context for '
                                    'connection to resource <%s>.' % ip)

            # Only nodes that have not yet responded are retried next round.
            node_ip_list = [
                item for item in node_ip_list if item not in nodes_ok
            ]
            # if node list is empty and all nodes are running set flag to true
            if not node_ip_list:
                connection_successful = True
            attempts_made += 1

            if not connection_successful and attempts_made < retries:
                # Back off a little longer after each failed round.
                wait_time = 10 * attempts_made
                LOG.debug(
                    'Waiting <%s> seconds before retrying connection...' %
                    wait_time)
                time.sleep(wait_time)

        if not connection_successful:
            LOG.debug('ERROR: Unable to connect to remote node...')
        else:
            LOG.debug('**** SAGA CONNECTION TO REMOTE NODE(S) SUCCESSFUL ****')

        return connection_successful
    def transfer_files(self):
        """Upload input files to the master node and distribute to slaves.

        Switches ``self.session`` to a new session holding only the job
        security context, creates a sub-directory named after the job ID in
        the storage directory on the first running node (the master) and
        copies each configured input file into it. The remote path of every
        copied file is recorded in ``self.transferred_input_files``. When
        more than one node is running, ``_distribute_job_data`` is then
        called so the master forwards the data to the remaining nodes.

        Nothing is uploaded or distributed when the job has no input files.

        Raises:
            JobError: if the storage directory cannot be opened on the
                master node or an input file cannot be copied.
        """
        JobDeploymentBase.transfer_files(self)
        # Use SAGA-Python to handle the file transfer.
        LOG.debug('Transfer files...')
        job_dir = self.platform_config.storage_job_directory
        job_id = self.job_config.job_id

        # At this point we need to switch back to using the job security
        # context. If we were using unconfigured resources, these will have
        # been configured using an admin context by now.
        self.session = saga.Session(default=False)
        self.session.add_context(self.job_ctx)

        # Each entry of running_nodes is a tuple whose first item is the node
        # object. The first node is the master; any others are slaves. We
        # push the data to the master and then direct the master to
        # distribute it to the slave nodes.
        master_node = self.running_nodes[0][0]
        slave_nodes = [node[0] for node in self.running_nodes[1:]]

        # On the master node: Check that the job storage directory exists and
        # then create a sub-directory specifically for this job.
        node_ip = master_node.public_ips[0]
        try:
            directory = Directory('sftp://%s%s' % (node_ip, job_dir),
                                  session=self.session)
        except saga.BadParameter as e:
            LOG.error('The specified job directory does not exist on node '
                      '<%s> (%s).' % (node_ip, str(e)))
            # Without a directory handle nothing below can work; previously
            # execution continued and failed with an unbound local variable.
            raise JobError('The specified job directory does not exist '
                           'on node <%s> (%s)' % (node_ip, str(e)))

        try:
            directory.make_dir(job_id)
        except saga.NoSuccess as e:
            # Deliberately non-fatal: carry on and use the directory.
            LOG.warning('The specified job data directory already exists on '
                        'node <%s> (%s).' % (node_ip, str(e)))

        # directory.make_dir() does not return a handle to the new directory
        # so the directory URL has to be built manually.
        job_data_dir = os.path.join(str(directory.url), job_id)

        # Some jobs have no input files (they may, for example, pull their
        # input from a remote location as part of the job process), in which
        # case there is nothing more to do.
        if not self.job_config.input_files:
            LOG.debug('There are no input files to transfer for this job...')
            return

        # Remote filesystem path (without scheme/host) of the job directory;
        # it is the same for every file so compute it once.
        dest_dir = os.path.join(directory.url.path, job_id)

        self.transferred_input_files = []
        for f in self.job_config.input_files:
            try:
                f_obj = File('file://%s' % f, session=self.session)
                f_obj.copy(job_data_dir)
                self.transferred_input_files.append(
                    os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
            except Exception as e:
                # Catch Exception rather than using a bare except so that
                # KeyboardInterrupt/SystemExit are not converted to JobError.
                LOG.error('Error copying the input file <%s> to the remote '
                          'platform (%s).' % (f, str(e)))
                raise JobError('Error copying the input file <%s> to the '
                               'remote platform.' % f)

        # At this point input files have been successfully transferred to
        # the master node. We now direct the master node to send the files
        # to each of the slave nodes:
        if slave_nodes:
            slave_private_ips = [node.private_ips[0] for node in slave_nodes]
            self._distribute_job_data(node_ip,
                                      slave_private_ips,
                                      self.platform_config.user_id,
                                      self.platform_config.user_key_file,
                                      job_dir, job_id)
    def _setup_job_account(self, pty_conn, platform_config):
        """Create and prepare the job user's account on a remote node.

        Runs ``useradd`` for ``platform_config.user_id`` over ``pty_conn``,
        makes sure the home directory, its ``.ssh`` directory and the
        platform storage directory exist (via ``mkdir -p``), writes the
        user's public key to ``authorized_keys`` and finally hands ownership
        of the home directory tree to the job user.

        Args:
            pty_conn: connection to the remote node; must provide
                ``run_sync``, ``write_to_remote``, ``url`` and ``session``.
            platform_config: platform configuration supplying ``user_id``,
                ``user_home``, ``user_public_key``,
                ``image_unconfigured_admin_key_user`` and
                ``storage_job_directory``.

        Raises:
            JobError: if the home directory already exists before the account
                is created, or a required directory cannot be created.
        """
        user_id = platform_config.user_id
        user_home = platform_config.user_home
        public_key = platform_config.user_public_key
        admin_user = platform_config.image_unconfigured_admin_key_user
        ssh_dir = os.path.join(user_home, '.ssh')
        storage_dir = platform_config.storage_job_directory

        # Creating the job user on the remote node
        LOG.debug(
            'Creating job user account for user <%s> on remote node <%s>' %
            (user_id, pty_conn.url))
        # First check if the user directory exists; `test -d` exits with 1
        # when it does not, any other exit code is treated as a failure.
        result, out, err = pty_conn.run_sync('sudo test -d %s' % (user_home,))
        if result != 1:
            raise JobError(
                'The specified user home directory <%s> for the job '
                'user <%s> already exists. Unable to proceed with '
                'resource configuration.' % (user_home, user_id))
        cmd = 'useradd -d %s -m %s' % (user_home, user_id)
        if admin_user != 'root':
            cmd = 'sudo ' + cmd
        result, out, err = pty_conn.run_sync(cmd)
        LOG.debug('useradd command completed - Exit code: <%s>, '
                  'StdOut: <%s>, StdErr:\n<%s>' % (result, out, err))

        # User that the connection's first security context logs in as; the
        # remaining privileged commands go through sudo unless this is root.
        current_user = pty_conn.session.contexts[0].user_id
        connected_as_root = current_user == 'root'

        def run_privileged(command):
            # Run `command` on the node, via sudo when not connected as root.
            if not connected_as_root:
                command = 'sudo ' + command
            return pty_conn.run_sync(command)

        # Check if user home created during user account creation
        # If account already existed, we may need to create the directory here
        try:
            Directory(pty_conn.url + user_home, session=pty_conn.session)
        except BadParameter:
            # Assume home directory doesn't exist and create it here.
            res, out, err = run_privileged('mkdir -p %s' % user_home)
            # The directory name must be passed too: the format string has
            # four placeholders.
            LOG.debug('Make directory <%s> result <%s>, out <%s>, err <%s>' %
                      (user_home, res, out, err))
            if res != 0:
                raise JobError('Unable to create the user home directory '
                               '<%s>...' % user_home)
            # Re-open to confirm the directory is now accessible.
            Directory(pty_conn.url + user_home, session=pty_conn.session)

        res, out, err = run_privileged('mkdir -p %s' % ssh_dir)
        LOG.debug('Make directory <%s> result <%s>, out <%s>, err <%s>' %
                  (ssh_dir, res, out, err))
        if res != 0:
            raise JobError('Unable to create the SSH directory in user '
                           'home <%s>...' % ssh_dir)

        res, out, err = run_privileged('mkdir -p %s' % storage_dir)
        LOG.debug('Make directory <%s> result <%s>, out <%s>, err <%s>' %
                  (storage_dir, res, out, err))
        if res != 0:
            raise JobError('Unable to create platform data directory '
                           '<%s>.' % storage_dir)

        # TODO: Need a much nicer way of handling this. Since
        # write_to_remote might upload the file as a non-root user into a
        # directory created with admin rights and hence owned by root, if the
        # user used by pty_conn is not root, we temporarily change ownership of
        # the home directory to the current user, write the public key and
        # then apply the chown of all files to the job user...
        if not connected_as_root:
            pty_conn.run_sync('sudo chown -R %s:%s %s' %
                              (current_user, current_user, user_home))

        # Write the public key to the authorized keys file on the remote node
        pty_conn.write_to_remote(
            public_key, os.path.join(ssh_dir, 'authorized_keys'))

        # Hand the whole home directory tree (including the authorized keys
        # file just written) over to the job user.
        run_privileged('chown -R %s:%s %s' % (user_id, user_id, user_home))