def initialise_resources(self, *args, **kwargs):
    """Set up the SAGA job service used to talk to the PBS platform.

    Resource initialisation is not required directly for this deployer;
    the call is used to initialise the connection with the PBS platform
    via the SAGA-Python library. Positional and keyword arguments are
    accepted but not used.

    Returns:
        None. The job service handle is stored on ``self.svc``.
    """
    JobDeploymentBase.initialise_resources(self)

    # The service URL is built from the host named in the platform
    # configuration and uses the pbs+ssh scheme.
    service_host = self.platform_config.platform_service_host
    service_url = 'pbs+ssh://%s/' % service_host
    self.svc = saga.job.Service(service_url, session=self.session)

    return None
def initialise_resources(self, resource_config=None, num_resources=1,
                         resource_type='m1.small', job_id=None):
    """Pick an image and ask the cloud driver to create a node from it.

    The image ID comes from the job configuration:

    * only a pre-configured image ID is set: that image is used;
    * only an unconfigured image ID is set: that image is used, but a
      ``resource_config`` must have been supplied;
    * both are set: the unconfigured image is used when
      ``resource_config`` is truthy, otherwise the pre-configured one.

    Args:
        resource_config: Resource configuration; only its truthiness is
            inspected here, to decide whether an unconfigured image may
            be used.
        num_resources: Number of resources reported in the debug log.
            NOTE(review): this value is not passed to ``create_node``,
            so a single create request is issued regardless - confirm
            whether that is intended.
        resource_type: Name of the node size, matched against the
            ``name`` of the entries returned by ``driver.list_sizes()``.
        job_id: Name given to the new node. When falsy a name is
            obtained from ``generate_instance_id()``.

    Returns:
        None.

    Raises:
        ResourceInitialisationError: If no usable image ID is
            configured, if only an unconfigured image is available
            without a ``resource_config``, if the platform cannot be
            contacted, if the image lookup fails, or if the requested
            size is not offered by the platform.
    """
    JobDeploymentBase.initialise_resources(self)

    # Work out which image ID to start, based on the job configuration.
    image_id_configured = self.job_config.image_id_pre_configured
    image_id_unconfigured = self.job_config.image_id_unconfigured

    if image_id_configured and not image_id_unconfigured:
        image_id = image_id_configured
        LOG.debug('Only a configured image identifier has been provided, '
                  'using image ID <%s>.' % image_id)
    elif (not image_id_configured) and image_id_unconfigured:
        if not resource_config:
            LOG.error('Only an unconfigured image ID provided but '
                      'no resource configuration has been provided.')
            raise ResourceInitialisationError(
                'ERROR: Only an unconfigured '
                'image type is available but no image '
                'configuration has been provided.')
        image_id = image_id_unconfigured
        LOG.debug('Only an unconfigured image identifier has been '
                  'provided, using image ID <%s>.' % image_id)
    elif image_id_configured and image_id_unconfigured:
        if resource_config:
            image_id = image_id_unconfigured
        else:
            image_id = image_id_configured
        LOG.debug('Both configured and unconfigured images provided, '
                  'using image ID <%s>.' % image_id)
    else:
        raise ResourceInitialisationError(
            'ERROR: No image information '
            'available in the platform configuration, unable '
            'to initialise resources.')

    # Check that the image is present on the platform.
    # TODO: This is currently synchronous but could also be done
    # asynchronously using a callback to notify the caller when the nodes
    # are ready.
    try:
        img = self.driver.get_image(image_id)
    except socket.error as e:
        raise ResourceInitialisationError(
            'ERROR contacting the remote '
            'cloud platform. Do you have an active network '
            'connection? - <%s>' % str(e))
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt
        # and SystemExit propagate instead of being reported as a
        # missing image.
        raise ResourceInitialisationError(
            'ERROR: The specified image <%s> '
            'is not present on the target platform, unable '
            'to start resources.' % image_id)

    # Resolve the requested size by name.
    sizes = self.driver.list_sizes()
    size = next((s for s in sizes if s.name == resource_type), None)
    if not size:
        raise ResourceInitialisationError(
            'ERROR: The specified resource '
            'size <%s> is not present on the target platform. '
            'Unable to start resources.' % resource_type)

    # Get the keypair name from the configuration
    keypair_name = self.job_config.key_name

    # At this point we know that the image is available and the specified
    # resource type is valid so we can request to start the instance(s)
    LOG.debug('About to start <%s> resources of type <%s> based on image '
              '<%s (%s)> with keypair <%s>.'
              % (num_resources, size.name, img.id, img.name, keypair_name))

    # When starting a resource we need the name, image, type and keypair.
    name = job_id
    if not name:
        name = generate_instance_id()

    self.driver.create_node(name=name, image=img, size=size,
                            ex_keyname=keypair_name)

    return None
def initialise_resources(self, *args, **kwargs):
    """Run the base-class initialisation; there is nothing else to set up.

    Positional and keyword arguments are accepted but not used. Besides
    delegating to ``JobDeploymentBase.initialise_resources`` the only
    action taken is to emit a debug log message.

    Returns:
        None.
    """
    JobDeploymentBase.initialise_resources(self)
    LOG.debug('SSH Deployer: Initialise resources - Nothing to do here...')
def initialise_resources(self, prefer_unconfigured=True, num_processes=1,
                         processes_per_node=1, node_type='m1.small',
                         job_id=None, retries=3, software_config=None):
    """Start cloud nodes for a job and prepare the master node for MPI.

    The steps performed are:

    1. choose an image ID from the platform configuration;
    2. for an unconfigured image, prepare an admin SSH security context;
    3. look up the image and node size through the cloud driver;
    4. create the node(s) and wait until the driver reports them running;
    5. wait until the nodes are accessible;
    6. write an MPI machinefile, copy it to ``/tmp/machinefile`` on the
       master node and set its permissions to 644.

    Args:
        prefer_unconfigured: When both a pre-configured and an
            unconfigured image ID are available, use the unconfigured
            one if truthy.
        num_processes: Total number of processes the job will run.
        processes_per_node: Processes per node; also written to the
            machinefile as the slot count of every node.
        node_type: Identifier of the node size, matched against the
            ``id`` of the entries returned by ``driver.list_sizes()``.
        job_id: Name given to the node(s). When falsy a name is obtained
            from ``generate_instance_id()``.
        retries: Passed to ``_wait_for_node_accessbility`` and reported
            in the error raised when nodes stay inaccessible.
        software_config: Must be truthy whenever an unconfigured image
            ends up being selected.

    Returns:
        The value returned by ``driver.wait_until_running`` (also stored
        on ``self.running_nodes``).

    Raises:
        JobError: If an unconfigured image is selected without a
            ``software_config``, or if the nodes are still not
            accessible after the accessibility check.
        InvalidCredentialsError: If the image lookup fails with an error
            whose text starts with ``'Unauthorized:'``.
        ResourceInitialisationError: If no image ID is configured, the
            platform cannot be contacted, the image cannot be found, or
            the node size is not offered by the platform.
    """
    JobDeploymentBase.initialise_resources(self)

    # Start up the cloud resources here and wait for them to reach the
    # running state. Need to know the image ID that we're starting. The
    # image ID is available from the platform configuration.
    image_id = None
    image_preconfigured_id = self.platform_config.image_preconfigured_id
    image_unconfigured_id = self.platform_config.image_unconfigured_id

    # Store whether or not we're using an unconfigured image - this
    # determines whether we end up running the deploy software function
    # or not.
    self.use_unconfigured = False

    if image_preconfigured_id and not image_unconfigured_id:
        image_id = image_preconfigured_id
        LOG.debug('Only a configured image identifier has been provided, '
                  'using image ID <%s>.' % image_id)
    elif (not image_preconfigured_id) and image_unconfigured_id:
        image_id = image_unconfigured_id
        self.use_unconfigured = True
        LOG.debug('Only an unconfigured image identifier has been '
                  'provided, using image ID <%s>.' % image_id)
        if not software_config:
            raise JobError(
                'Only an unconfigured image identifier has been '
                'provided but no software config has been specified. '
                'Unable to continue...')
    elif image_preconfigured_id and image_unconfigured_id:
        LOG.debug('Both configured and unconfigured images provided...')
        if prefer_unconfigured:
            image_id = image_unconfigured_id
            self.use_unconfigured = True
            LOG.debug('Using unconfigured image ID <%s>.' % image_id)
            if not software_config:
                raise JobError(
                    'An unconfigured image identifier has been '
                    'chosen but no software config has been specified. '
                    'Unable to continue...')
        else:
            image_id = image_preconfigured_id
            LOG.debug('Using pre-configured image ID <%s>.' % image_id)
    else:
        raise ResourceInitialisationError(
            'ERROR: No image information '
            'available in the platform configuration, unable '
            'to initialise resources.')

    # If we're using an unconfigured image, we need to prepare the admin
    # security context based on the information that should be provided
    # in the YAML file with the unconfigured image details.
    if self.use_unconfigured:
        self.admin_ctx = saga.Context("ssh")
        self.admin_ctx.user_id = \
            self.platform_config.image_unconfigured_admin_key_user
        self.admin_ctx.user_key = \
            self.platform_config.image_unconfigured_admin_key_file

    # Check that the image is present and then use the libcloud driver to
    # start the resources and return once they're running.
    # TODO: This is currently synchronous but could also be done
    # asynchronously using a callback to notify the caller when the nodes
    # are ready.
    img = None
    try:
        img = next((image for image in self.driver.list_images()
                    if image.id == image_id), None)
        if img is None:
            raise ResourceInitialisationError(
                'The specified image <%s> '
                'could not be found' % image_id)
    except socket.error as e:
        raise ResourceInitialisationError(
            'ERROR contacting the remote '
            'cloud platform. Do you have an active network '
            'connection? - <%s>' % str(e))
    except Exception as e:
        # The "could not be found" error raised above also lands here
        # (assuming ResourceInitialisationError derives from Exception),
        # so callers see the "not present on the target platform"
        # message below for a missing image.
        LOG.debug('ERROR STRING: %s' % str(e))
        if str(e).startswith('Unauthorized:'):
            raise InvalidCredentialsError(
                'ERROR: Access to the cloud '
                'platform at <%s> was not authorised. Are your '
                'credentials correct?' %
                (self.platform_config.platform_service_host + ':' +
                 str(self.platform_config.platform_service_port)))
        raise ResourceInitialisationError(
            'ERROR: The specified image <%s> '
            'is not present on the target platform, unable '
            'to start resources.' % image_id)

    sizes = self.driver.list_sizes()
    size = next((s for s in sizes if s.id == node_type), None)
    if not size:
        raise ResourceInitialisationError(
            'ERROR: The specified resource '
            'size (node_type) <%s> is not present on the '
            'target platform. Unable to start resources. Have '
            'you set the node_type parameter in your job spec?'
            % node_type)

    # Get the keypair name from the configuration.
    # If we're using an unconfigured resource, we use the admin key pair
    # name if provided.
    if (self.use_unconfigured and
            self.platform_config.image_unconfigured_admin_key_name):
        keypair_name = self.platform_config.image_unconfigured_admin_key_name
    else:
        keypair_name = self.platform_config.user_key_name

    # TODO: Obtain the number of cores per node from the cloud platform
    # and cap processes_per_node at that value. For now use the
    # processes_per_node given in the job specification.
    cores_per_node = processes_per_node
    num_nodes = int(ceil(float(num_processes) / float(processes_per_node)))

    # At this point we know that the image is available and the specified
    # resource type is valid so we can request to start the instance(s)
    LOG.debug('About to start <%s> resources of type <%s> based on image '
              '<%s (%s)> with keypair <%s>.'
              % (num_nodes, size.name, img.id, img.name, keypair_name))

    # When starting a resource we need the name, image, type, keypair,
    # configuration data and details of the number of resources to start.
    name = job_id
    if not name:
        name = generate_instance_id()

    self.nodes = self.driver.create_node(name=name, image=img, size=size,
                                         ex_keyname=keypair_name,
                                         ex_mincount=num_nodes,
                                         ex_maxcount=num_nodes)

    # Normalise a single returned node to a one-element list.
    if not isinstance(self.nodes, list):
        self.nodes = [self.nodes]

    self.running_nodes = self.driver.wait_until_running(self.nodes)

    # Before we return details of the running nodes, we need to check
    # that they're accessible - it takes some time for the nodes to boot
    # and become available.
    # TODO: Need to replace this wait with a reliable check as to whether
    # the server is up and running.
    LOG.debug('Checking node is available...')

    # Each running_nodes entry is indexed with [0] to reach the node
    # object; its first public IP is used for the accessibility check.
    nodes_to_check = [entry[0].public_ips[0]
                      for entry in self.running_nodes]

    res = self._wait_for_node_accessbility(
        nodes_to_check,
        self.platform_config.user_id,
        self.platform_config.user_key_file,
        retries=retries)
    if not res:
        # We still have nodes that are not available so assume there's a
        # problem and throw a job error.
        raise JobError('After <%s> retries, the following nodes are '
                       'still not accessible <%s>. Cancelling job.'
                       % (retries, nodes_to_check))

    # The master node is always considered to be node 0 in
    # the self.running_nodes list.
    master_ip = self.running_nodes[0][0].public_ips[0]

    # Now is the time to create the machinefile for MPI job runs.
    # For the machinefile we need the private IP of each node and the
    # number of cores. The ``with`` block guarantees the temporary file
    # is closed (and, with delete=True, removed) even if the copy fails.
    with tempfile.NamedTemporaryFile('w', delete=True) as machinefile:
        machinefile.write("# Machine file for MPI job runs\n")
        for entry in self.running_nodes:
            machinefile.write(
                '%s slots=%s max_slots=%s\n'
                % (entry[0].private_ips[0], cores_per_node, cores_per_node))
        # Flush so the content is on disk before it is copied by name.
        machinefile.flush()
        LOG.debug('The following machinefile has been created:\n\n%s\n'
                  % machinefile.name)

        LOG.debug('Copying machinefile to master node...')
        saga_machinefile = File('file://%s' % machinefile.name,
                                session=self.session)
        saga_machinefile.copy('sftp://%s/tmp/machinefile' % master_ip)
    LOG.debug('machinefile copied to master node...')

    conn = PTYShell('ssh://%s' % master_ip, session=self.session)
    conn.run_sync('chmod 644 /tmp/machinefile')
    LOG.debug('Set permissions on /tmp/machinefile on master node to 644.')

    return self.running_nodes