def load_many(self, obj_ids, add=True, flush=True):
    """
    Load objects given their IDs from persistent storage.

    Return a dictionary mapping task ID to the actual retrieved
    :class:`Task` object.
    """
    tasks = {}
    for task_id in obj_ids:
        try:
            tasks[task_id] = self.load(task_id, add, flush)
        except Exception as err:
            if gc3libs.error_ignored(
                    # context:
                    # - module
                    'session',
                    # - class
                    'Session',
                    # - method
                    'load',
                    # - actual error class
                    err.__class__.__name__,
                    # - additional keywords
                    'persistence',
            ):
                gc3libs.log.warning(
                    "Ignoring error from loading '%s': %s", task_id, err)
            else:
                # propagate exception back to caller
                raise
    return tasks
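# Hedged usage sketch for `load_many`: it assumes an already-populated
# `Session` object; the directory path and the way IDs are picked are
# examples only, not values from the library.
from gc3libs.session import Session

session = Session('./my_session')    # open an existing session directory
some_ids = list(session.tasks.keys())[:2]
reloaded = session.load_many(some_ids, add=False, flush=False)
for task_id, task in reloaded.items():
    print(task_id, task)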
def _load_session(self, **extra_args):
    """
    Load an existing session from disk.

    Keyword arguments are passed to the `make_store` factory
    method unchanged.

    Errors that occur while loading individual jobs from disk are
    ignored only if the `gc3libs.error_ignored` policy allows it;
    otherwise they are propagated back to the caller.
    """
    store_fname = os.path.join(self.path, self.STORE_URL_FILENAME)
    try:
        self.store_url = gc3libs.utils.read_contents(store_fname).strip()
    except IOError:
        gc3libs.log.info(
            "Unable to load session: file %s is missing.", store_fname)
        raise
    self.store = gc3libs.persistence.make_store(
        self.store_url, **extra_args)

    idx_filename = os.path.join(self.path, self.INDEX_FILENAME)
    # use a `with` block so the file is closed even if reading fails;
    # the previous `try`/`except: idx_fd.close()` would have raised
    # `NameError` if `open()` itself failed
    with open(idx_filename) as idx_fd:
        ids = idx_fd.read().split()

    try:
        start_file = os.path.join(
            self.path, self.TIMESTAMP_FILES['start'])
        self.created = os.stat(start_file).st_mtime
    except OSError:
        gc3libs.log.warning(
            "Unable to recover starting time from existing session:"
            " file %s is missing.", start_file)

    for task_id in ids:
        try:
            self.tasks[task_id] = self.store.load(task_id)
        except Exception as err:
            if gc3libs.error_ignored(
                    # context:
                    # - module
                    'session',
                    # - class
                    'Session',
                    # - method
                    'load',
                    # - actual error class
                    err.__class__.__name__,
                    # - additional keywords
                    'persistence',
            ):
                gc3libs.log.warning(
                    "Ignoring error from loading '%s': %s", task_id, err)
            else:
                # propagate exception back to caller
                raise
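# A minimal sketch of the on-disk layout `_load_session` reads back.
# The concrete file names are whatever the class constants
# `STORE_URL_FILENAME`, `INDEX_FILENAME` and `TIMESTAMP_FILES['start']`
# resolve to; the names used below are assumptions for the example.
import os

def make_fake_session(path, store_url, task_ids):
    os.makedirs(path)
    # STORE_URL_FILENAME: URL of the persistent task store
    with open(os.path.join(path, 'store.url'), 'w') as f:
        f.write(store_url + '\n')
    # INDEX_FILENAME: whitespace-separated list of task IDs
    with open(os.path.join(path, 'job_ids.db'), 'w') as f:
        f.write(' '.join(task_ids))
    # TIMESTAMP_FILES['start']: empty file whose mtime marks creation
    open(os.path.join(path, 'created'), 'w').close()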
def submit_job(self, job):
    """
    Submission to an OpenStack resource will usually happen in
    multiple steps, since creating a VM and attaching a resource
    to it will take some time. In order to return as soon as
    possible, the backend will raise a `RecoverableError` whenever
    submission is delayed. If a permanent error is detected (for
    instance, we cannot create VMs on the cloud), an
    `UnrecoverableError` is raised instead.

    In more detail, during submission the following happens:

    * First of all, the backend tries to submit the job to one of
      the already available subresources.

    * If none of them is able to submit the job, the backend
      checks if there is a VM in pending state; if there is one,
      it raises a `RecoverableError`, thus delaying submission.

    * If no VM in pending state is found, the `vm_pool_max_size`
      configuration option is checked. If we already reached the
      maximum number of VMs, an `UnrecoverableError` is raised.

    * If no VM in pending state is found and the number of VMs
      currently created is still less than `vm_pool_max_size` (or
      `vm_pool_max_size` is ``None``, which means no limit), then
      a new VM is created and a `RecoverableError` is raised.
    """
    # Updating the resource is needed in order to update the
    # subresources. This is not always done before `submit_job`
    # because of issue nr. 386:
    # http://code.google.com/p/gc3pie/issues/detail?id=386
    self.get_resource_status()
    pending_vms = set(vm.id for vm in self._vmpool.get_all_vms()
                      if vm.status in PENDING_STATES)

    image_id = self.get_image_id_for_job(job)
    # Check if the image id is valid
    if image_id not in [img.id for img in self._get_available_images()]:
        raise ConfigurationError(
            "Image ID %s not found in cloud %s"
            % (image_id, self.os_auth_url))

    instance_type = self.get_instance_type_for_job(job)
    if not instance_type:
        raise RuntimeError(
            "Unable to find a suitable instance type for"
            " application %s" % job)

    # First of all, try to submit to one of the subresources.
    for vm_id, subresource in self.subresources.items():
        if not subresource.updated:
            # The VM is probably still booting; skip to the next
            # one and add it to the list of "pending" VMs.
            pending_vms.add(vm_id)
            continue
        try:
            # Check that the required image id and instance type
            # are correct
            vm = self._get_vm(vm_id)
            if vm.image['id'] != image_id:
                continue
            subresource.submit_job(job)
            job.os_instance_id = vm_id
            job.changed = True
            gc3libs.log.info(
                "Job successfully submitted to remote resource %s.",
                subresource.name)
            return job
        except (LRMSSubmitError, InstanceNotFound) as ex:
            if gc3libs.error_ignored(
                    # context:
                    # - module
                    'openstack',
                    # - class
                    'OpenStackLrms',
                    # - method
                    'submit_job',
                    # - actual error class
                    ex.__class__.__name__,
                    # - additional keywords
                    'submit',
            ):
                gc3libs.log.debug(
                    "Ignoring error in submitting to resource '%s': %s",
                    subresource.name, ex)
            else:
                # propagate error back to caller
                raise

    # Couldn't submit to any resource.
    if not pending_vms:
        # No pending VM, and no resource available. Create a new VM.
        if not self.vm_pool_max_size \
                or len(self._vmpool) < self.vm_pool_max_size:
            user_data = self.get_user_data_for_job(job)
            vm = self._create_instance(
                image_id,
                name="GC3Pie_%s_%d" % (self.name, (len(self._vmpool) + 1)),
                instance_type=instance_type,
                user_data=user_data)
            pending_vms.add(vm.id)
            self._vmpool.add_vm(vm)
        else:
            raise MaximumCapacityReached(
                "Already running the maximum number of VMs on"
                " resource %s: %d VMs started, but max %d allowed"
                " by configuration."
                % (self.name, len(self._vmpool), self.vm_pool_max_size),
                do_log=True)

    # If we reached this point, we are waiting for a VM to be
    # ready, so delay the submission until we either can submit to
    # one of the available resources or until all the VMs are
    # ready.
    gc3libs.log.debug(
        "No available resource was found, but some VM is still in"
        " `pending` state. Waiting until the next iteration before"
        " creating a new VM. Pending VM ids: %s", pending_vms)
    raise LRMSSkipSubmissionToNextIteration(
        "Delaying submission until one of the VMs currently pending"
        " is ready. Pending VM ids: %s" % str.join(', ', pending_vms))
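# Hedged sketch of a driver loop around `submit_job`: the
# `LRMSSkipSubmissionToNextIteration` raised above is a recoverable
# condition, so a caller can simply retry on a later pass. `backend`,
# `task` and `poll_interval` are assumed names; the real GC3Pie Engine
# implements a more elaborate version of this logic.
import time

def submit_with_retry(backend, task, poll_interval=30):
    while True:
        try:
            return backend.submit_job(task)
        except LRMSSkipSubmissionToNextIteration:
            # a VM is still booting; try again on the next iteration
            time.sleep(poll_interval)
        except MaximumCapacityReached:
            # permanent condition: waiting here will not free a VM slot
            raise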
def get_resource_status(self):
    self.updated = False
    # Since we create the resource *before* the VM is actually up
    # & running, it's possible that the `frontend` value of the
    # resource points to a non-existent hostname. Therefore, we
    # have to update it with a valid public IP, if one is present.

    # Update status of known VMs
    for vm_id in self._vmpool:
        try:
            vm = self._vmpool.get_vm(vm_id, force_reload=True)
        except UnrecoverableError as ex:
            gc3libs.log.warning(
                "Removing stale information on VM `%s`. It has probably"
                " been deleted from outside GC3Pie.", vm_id)
            self._vmpool.remove_vm(vm_id)
            continue
        if vm.status in PENDING_STATES:
            # If the VM is still in a pending state, skip creation
            # of the resource
            continue
        elif vm.status in ERROR_STATES:
            # The VM is in error state: terminate it.
            gc3libs.log.error(
                "VM with id `%s` is in ERROR state."
                " Terminating it!", vm.id)
            vm.delete()
            self._vmpool.remove_vm(vm.id)
            self.subresources.pop(vm.id)
            continue
        elif vm.status == 'DELETED':
            gc3libs.log.info(
                "VM `%s` is in DELETED state. It has probably been"
                " terminated from outside GC3Pie. Removing it from the"
                " list of VMs.", vm.id)
            self._vmpool.remove_vm(vm.id)
            self.subresources.pop(vm.id)
            continue
        elif vm.status in ['SHUTOFF', 'SUSPENDED',
                           'RESCUE', 'VERIFY_RESIZE']:
            # The VM has probably been stopped or shut down from
            # outside GC3Pie.
            gc3libs.log.error(
                "VM with id `%s` is in permanent state `%s`.",
                vm.id, vm.status)
            continue

        # Get or create a resource associated to the vm
        subresource = self._get_subresource(vm)
        try:
            subresource.get_resource_status()
        except TransportError as ex:
            # TODO: get all the IPs and try to connect with each of
            # them, starting with `preferred_ip` if defined
            gc3libs.log.info(
                "Ignoring error in updating resource '%s': %s."
                " Trying other IPs.", subresource.name, ex)
            for ip in sum(vm.networks.values(), []):
                if vm.preferred_ip == ip:
                    continue
                vm.preferred_ip = ip
                subresource.frontend = ip
                gc3libs.log.info(
                    "Connection error. Trying with alternate IP"
                    " address %s", vm.preferred_ip)
                try:
                    subresource.get_resource_status()
                    break
                except Exception as ex:
                    gc3libs.log.info(
                        "Ignoring error in updating resource '%s': %s."
                        " The corresponding VM may not be ready yet.",
                        subresource.name, ex)
            else:
                # No IP allowed us to connect to the VM. Ensure this
                # resource is considered "pending", as we couldn't
                # update its status. (The `else` clause runs only if
                # the loop above did not `break`.)
                subresource.updated = False
        except Exception as ex:
            # XXX: Actually, we should try to identify the kind of
            # error we are getting. For instance, if the
            # configuration option `username` is wrong, we will
            # create VMs but never be able to submit jobs to them,
            # thus creating an increasing number of useless VMs on
            # the cloud.
            if gc3libs.error_ignored(
                    # context:
                    # - module
                    'openstack',
                    # - class
                    'OpenStackLrms',
                    # - method
                    'get_resource_status',
                    # - actual error class
                    ex.__class__.__name__,
                    # - additional keywords
                    'resource', 'status', 'update', 'vm',
            ):
                gc3libs.log.info(
                    "Ignoring error while updating resource '%s'."
                    " The corresponding VM may not be ready yet."
                    " Error: %s", subresource.name, ex)
            else:
                # propagate exception back to caller
                raise
    self._vmpool.update()
    return self
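# The alternate-IP retry above uses Python's `for ... else`: the
# `else` suite runs only when the loop finished without `break`, i.e.
# when no candidate address worked. Generic sketch of the same idiom
# (`addresses` and `probe` are placeholders, not library names):
def first_reachable(addresses, probe):
    for addr in addresses:
        try:
            probe(addr)
        except OSError:
            continue   # this address failed; try the next one
        break          # success: skip the `else` suite
    else:
        return None    # every address failed (or list was empty)
    return addr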
def submit_job(self, job):
    """
    Submission to an EC2 resource will usually happen in multiple
    steps, since creating a VM and attaching a resource to it will
    take some time. In order to return as soon as possible, the
    backend will raise a `RecoverableError` whenever submission is
    delayed. If a permanent error is detected (for instance, we
    cannot create VMs on the cloud), an `UnrecoverableError` is
    raised instead.

    In more detail, during submission the following happens:

    * First of all, the backend tries to submit the job to one of
      the already available subresources.

    * If none of them is able to submit the job, the backend
      checks if there is a VM in pending state; if there is one,
      it raises a `RecoverableError`, thus delaying submission.

    * If no VM in pending state is found, the `vm_pool_max_size`
      configuration option is checked. If we already reached the
      maximum number of VMs, an `UnrecoverableError` is raised.

    * If no VM in pending state is found and the number of VMs
      currently created is still less than `vm_pool_max_size` (or
      `vm_pool_max_size` is ``None``, which means no limit), then
      a new VM is created and a `RecoverableError` is raised.
    """
    self._connect()

    # Updating the resource is needed in order to update the
    # subresources. This is not always done before `submit_job`
    # because of issue nr. 386:
    # http://code.google.com/p/gc3pie/issues/detail?id=386
    self.get_resource_status()
    pending_vms = set(vm.id for vm in self._vmpool.get_all_vms()
                      if vm.state == 'pending')

    image_id = self.get_image_id_for_job(job)
    instance_type = self.get_instance_type_for_job(job)

    # Check that we can actually submit to a flavor like this
    # XXX: shouldn't this check be done by the Engine?
    if self._instance_type_specs:
        specs = self._instance_type_specs
        max_mem = specs['max_memory_per_core']
        max_cpus = specs['max_cores_per_job']
        if (job.requested_memory is not None
                and job.requested_memory > max_mem) \
                or (job.requested_cores is not None
                    and job.requested_cores > max_cpus):
            raise gc3libs.exceptions.LRMSSubmitError(
                "EC2 flavor %s does not have enough memory/cpus"
                " to run application %s"
                % (self.instance_type, job.jobname))

    # First of all, try to submit to one of the subresources.
    for vm_id, resource in self.subresources.items():
        if not resource.updated:
            # The VM is probably still booting; skip to the next
            # one and add it to the list of "pending" VMs.
            pending_vms.add(vm_id)
            continue
        try:
            # Check that the required image id and instance type
            # are correct
            vm = self._get_vm(vm_id)
            if (vm.image_id != image_id
                    or vm.instance_type != instance_type):
                continue
            resource.submit_job(job)
            job.ec2_instance_id = vm_id
            job.changed = True
            gc3libs.log.info(
                "Job successfully submitted to remote resource %s.",
                resource.name)
            return job
        except gc3libs.exceptions.LRMSSubmitError as ex:
            if gc3libs.error_ignored(
                    # context:
                    # - module
                    'ec2',
                    # - class
                    'EC2Lrms',
                    # - method
                    'submit_job',
                    # - actual error class
                    ex.__class__.__name__,
                    # - additional keywords
                    'submit',
            ):
                gc3libs.log.debug(
                    "Ignoring error in submitting to resource %s: %s",
                    resource.name, ex)
            else:
                # propagate exception to caller
                raise

    # Couldn't submit to any resource.
    if not pending_vms:
        # No pending VM, and no resource available. Create a new VM.
        if not self.vm_pool_max_size \
                or len(self._vmpool) < self.vm_pool_max_size:
            user_data = self.get_user_data_for_job(job)
            vm = self._create_instance(image_id,
                                       instance_type=instance_type,
                                       user_data=user_data)
            pending_vms.add(vm.id)
            self._vmpool.add_vm(vm)
        else:
            raise MaximumCapacityReached(
                "Already running the maximum number of VMs on"
                " resource %s: %d VMs started, but max %d allowed"
                " by configuration."
                % (self.name, len(self._vmpool), self.vm_pool_max_size),
                do_log=True)

    # If we reached this point, we are waiting for a VM to be
    # ready, so delay the submission until we either can submit to
    # one of the available resources or until all the VMs are
    # ready.
    gc3libs.log.debug(
        "No available resource was found, but some VM is still in"
        " `pending` state. Waiting until the next iteration before"
        " creating a new VM. Pending VM ids: %s", pending_vms)
    raise LRMSSkipSubmissionToNextIteration(
        "Delaying submission until one of the VMs currently pending"
        " is ready. Pending VM ids: %s" % str.join(', ', pending_vms))
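# Sketch of the memory/core admission check above, using the
# comparable quantities from `gc3libs.quantity` (`GB` is a unit
# object defined in that module; the numbers are made up):
from gc3libs.quantity import GB

requested_memory = 8 * GB
max_memory_per_core = 4 * GB
if requested_memory > max_memory_per_core:
    print("flavor too small: need %s, only %s available"
          % (requested_memory, max_memory_per_core))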
def get_resource_status(self):
    self.updated = False
    # Since we create the resource *before* the VM is actually up
    # & running, it's possible that the `frontend` value of the
    # resource points to a non-existent hostname. Therefore, we
    # have to update it with a valid public IP, if one is present.
    self._connect()
    # Update status of known VMs
    for vm_id in self._vmpool:
        try:
            vm = self._vmpool.get_vm(vm_id)
        except UnrecoverableError as ex:
            gc3libs.log.warning(
                "Removing stale information on VM `%s`. It has probably"
                " been deleted from outside GC3Pie.", vm_id)
            self._vmpool.remove_vm(vm_id)
            continue

        vm.update()

        if vm.state == 'pending':
            # If the VM is still in pending state, skip creation
            # of the resource
            continue
        elif vm.state == 'error':
            # The VM is in error state: terminate it.
            gc3libs.log.error(
                "VM with id `%s` is in ERROR state."
                " Terminating it!", vm.id)
            vm.terminate()
            self._vmpool.remove_vm(vm.id)
            # do not create a subresource for a VM we just terminated
            continue
        elif vm.state == 'terminated':
            gc3libs.log.info(
                "VM `%s` in TERMINATED state. It has probably been"
                " terminated from outside GC3Pie. Removing it from the"
                " list of VMs.", vm.id)
            self._vmpool.remove_vm(vm.id)
            continue
        elif vm.state in ['shutting-down', 'stopped']:
            # The VM has probably been stopped or shut down from
            # outside GC3Pie.
            gc3libs.log.error(
                "VM with id `%s` is in terminal state `%s`.",
                vm.id, vm.state)

        # Get or create a resource associated to the vm
        resource = self._get_subresource(vm)
        try:
            resource.get_resource_status()
        except TransportError as ex:
            for ip in [vm.public_dns_name, vm.private_ip_address]:
                if vm.preferred_ip == ip:
                    continue
                vm.preferred_ip = ip
                resource.frontend = ip
                gc3libs.log.info(
                    "Connection error. Trying with secondary IP"
                    " address %s", vm.preferred_ip)
                try:
                    resource.get_resource_status()
                    break
                except Exception as ex:
                    # XXX: exempted from the GC3Pie `error_ignored()`
                    # policy, since this is a kind of "expected"
                    # error -- it *will* happen if the VM has not
                    # booted up yet or while we are hunting for the
                    # correct address.
                    gc3libs.log.debug(
                        "Ignoring error in updating resource %s: %s."
                        " The corresponding VM may not be ready yet.",
                        resource.name, ex)
        except Exception as ex:
            # FIXME: Actually, we should try to identify the kind of
            # error we are getting. For instance, if the
            # configuration option `username` is wrong, we will
            # create VMs but never be able to submit jobs to them,
            # thus creating an increasing number of useless VMs on
            # the cloud.
            if gc3libs.error_ignored(
                    # context:
                    # - module
                    'ec2',
                    # - class
                    'EC2Lrms',
                    # - method
                    'get_resource_status',
                    # - actual error class
                    ex.__class__.__name__,
                    # - additional keywords
                    'vm',
            ):
                gc3libs.log.info(
                    "Ignoring error in updating resource %s: %s."
                    " The corresponding VM may not be ready yet.",
                    resource.name, ex)
            else:
                # propagate exception to caller
                raise

        if resource.updated:
            # Also update the instance_type specs, if not done yet
            if not self._instance_type_specs:
                # `specs` aliases the (empty) dict, so filling it
                # populates `self._instance_type_specs` in place
                specs = self._instance_type_specs
                specs['architecture'] = resource['architecture']
                specs['max_cores'] = resource['max_cores']
                specs['max_cores_per_job'] = resource['max_cores_per_job']
                specs['max_memory_per_core'] = resource['total_memory']
                self.update(specs)
    self._vmpool.update()
    return self
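# Note the aliasing at work in the `specs` block above: `specs` is
# bound to the *same* dict object as `self._instance_type_specs`, so
# assigning into `specs` fills the cached attribute in place.
# Self-contained demonstration:
cache = {}
specs = cache                   # alias, not a copy
specs['max_cores'] = 8
assert cache == {'max_cores': 8}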
def _setup_security_groups(self):
    """
    Check the current configuration and set up the security group
    if it does not exist.
    """
    if not self.security_group_name:
        gc3libs.log.error(
            "Group name in `security_group_name` configuration"
            " option cannot be empty!")
        return
    security_groups = self._conn.get_all_security_groups()
    groups = dict((g.name, g) for g in security_groups)
    # Check if the security group exists already
    if self.security_group_name not in groups:
        try:
            gc3libs.log.info("Creating security group %s",
                             self.security_group_name)
            security_group = self._conn.create_security_group(
                self.security_group_name,
                "GC3Pie_%s" % self.security_group_name)
        except Exception as ex:
            gc3libs.log.error("Error creating security group %s: %s",
                              self.security_group_name, ex)
            raise UnrecoverableError(
                "Error creating security group %s: %s"
                % (self.security_group_name, ex))

        for rule in self.security_group_rules:
            try:
                gc3libs.log.debug(
                    "Adding rule %s to security group %s.",
                    rule, self.security_group_name)
                security_group.authorize(**rule)
            except Exception as ex:
                if gc3libs.error_ignored(
                        # context:
                        # - module
                        'ec2',
                        # - class
                        'EC2Lrms',
                        # - method
                        'setup_security_groups',
                        # - actual error class
                        ex.__class__.__name__,
                        # - additional keywords
                        'setup', 'security', 'network', 'cloud',
                ):
                    gc3libs.log.info(
                        "Ignoring error adding rule %s"
                        " to security group %s: %s",
                        rule, self.security_group_name, ex)
                else:
                    # propagate exception to caller
                    raise
    else:
        # Check if the security group has all the rules we want
        security_group = groups[self.security_group_name]
        current_rules = []
        for rule in security_group.rules:
            rule_dict = {
                'ip_protocol': rule.ip_protocol,
                'from_port': int(rule.from_port),
                'to_port': int(rule.to_port),
                'cidr_ip': str(rule.grants[0]),
            }
            current_rules.append(rule_dict)
        for new_rule in self.security_group_rules:
            if new_rule not in current_rules:
                security_group.authorize(**new_rule)
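# Example of the rule dictionaries consumed by `authorize(**rule)`
# above; the keyword names match boto's EC2 `authorize` call. Opening
# SSH (and ping) to the whole Internet is purely illustrative:
security_group_rules = [
    {'ip_protocol': 'tcp', 'from_port': 22,
     'to_port': 22, 'cidr_ip': '0.0.0.0/0'},
    {'ip_protocol': 'icmp', 'from_port': -1,
     'to_port': -1, 'cidr_ip': '0.0.0.0/0'},
]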
def submit_job(self, job):
    """
    Submission to an EC2 resource will usually happen in multiple
    steps, since creating a VM and attaching a resource to it will
    take some time. In order to return as soon as possible, the
    backend will raise a `RecoverableError` whenever submission is
    delayed. If a permanent error is detected (for instance, we
    cannot create VMs on the cloud), an `UnrecoverableError` is
    raised instead.

    In more detail, during submission the following happens:

    * First of all, the backend tries to submit the job to one of
      the already available subresources.

    * If none of them is able to submit the job, the backend
      checks if there is a VM in pending state; if there is one,
      it raises a `RecoverableError`, thus delaying submission.

    * If no VM in pending state is found, the `vm_pool_max_size`
      configuration option is checked. If we already reached the
      maximum number of VMs, an `UnrecoverableError` is raised.

    * If no VM in pending state is found and the number of VMs
      currently created is still less than `vm_pool_max_size` (or
      `vm_pool_max_size` is ``None``, which means no limit), then
      a new VM is created and a `RecoverableError` is raised.
    """
    self._connect()

    # Updating the resource is needed in order to update the
    # subresources. This is not always done before `submit_job`
    # because of issue nr. 386:
    # https://github.com/uzh/gc3pie/issues/386
    self.get_resource_status()
    pending_vms = set(vm.id for vm in self._vmpool.get_all_vms()
                      if vm.state == 'pending')

    image_id = self.get_image_id_for_job(job)
    instance_type = self.get_instance_type_for_job(job)

    # Check that we can actually submit to a flavor like this
    # XXX: shouldn't this check be done by the Engine?
    if self._instance_type_specs:
        specs = self._instance_type_specs
        max_mem = specs['max_memory_per_core']
        max_cpus = specs['max_cores_per_job']
        if (job.requested_memory is not None
                and job.requested_memory > max_mem) \
                or (job.requested_cores is not None
                    and job.requested_cores > max_cpus):
            raise gc3libs.exceptions.LRMSSubmitError(
                "EC2 flavor %s does not have enough memory/cpus"
                " to run application %s"
                % (self.instance_type, job.jobname))

    # First of all, try to submit to one of the subresources.
    for vm_id, resource in self.subresources.items():
        if not resource.updated:
            # The VM is probably still booting; skip to the next
            # one and add it to the list of "pending" VMs.
            pending_vms.add(vm_id)
            continue
        try:
            # Check that the required image id and instance type
            # are correct
            vm = self._get_vm(vm_id)
            if (vm.image_id != image_id
                    or vm.instance_type != instance_type):
                continue
            resource.submit_job(job)
            job.execution._lrms_vm_id = vm_id
            job.changed = True
            gc3libs.log.info(
                "Job successfully submitted to remote resource %s.",
                resource.name)
            return job
        except gc3libs.exceptions.LRMSSubmitError as ex:
            if gc3libs.error_ignored(
                    # context:
                    # - module
                    'ec2',
                    # - class
                    'EC2Lrms',
                    # - method
                    'submit_job',
                    # - actual error class
                    ex.__class__.__name__,
                    # - additional keywords
                    'submit',
            ):
                gc3libs.log.debug(
                    "Ignoring error in submitting to resource %s: %s",
                    resource.name, ex)
            else:
                # propagate exception to caller
                raise

    # Couldn't submit to any resource.
    if not pending_vms:
        # No pending VM, and no resource available. Create a new VM.
        if not self.vm_pool_max_size \
                or len(self._vmpool) < self.vm_pool_max_size:
            user_data = self.get_user_data_for_job(job)
            vm = self._create_instance(image_id,
                                       instance_type=instance_type,
                                       user_data=user_data)
            pending_vms.add(vm.id)
            self._vmpool.add_vm(vm)
        else:
            raise MaximumCapacityReached(
                "Already running the maximum number of VMs on"
                " resource %s: %d VMs started, but max %d allowed"
                " by configuration."
                % (self.name, len(self._vmpool), self.vm_pool_max_size),
                do_log=True)

    # If we reached this point, we are waiting for a VM to be
    # ready, so delay the submission until we either can submit to
    # one of the available resources or until all the VMs are
    # ready.
    gc3libs.log.debug(
        "No available resource was found, but some VM is still in"
        " `pending` state. Waiting until the next iteration before"
        " creating a new VM. Pending VM ids: %s", pending_vms)
    raise ResourceNotReady(
        "Delaying submission until one of the VMs currently pending"
        " is ready. (Pending VM ids: %r)" % (pending_vms,))
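# The capacity guard above treats a false-y `vm_pool_max_size` (None
# or 0) as "no limit". The predicate in isolation:
def can_start_new_vm(pool_size, max_size):
    return not max_size or pool_size < max_size

assert can_start_new_vm(10, None)      # no limit configured
assert can_start_new_vm(3, 5)          # below the cap
assert not can_start_new_vm(5, 5)      # at capacity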
def submit_job(self, job):
    """
    Submission to an OpenStack resource will usually happen in
    multiple steps, since creating a VM and attaching a resource
    to it will take some time. In order to return as soon as
    possible, the backend will raise a `RecoverableError` whenever
    submission is delayed. If a permanent error is detected (for
    instance, we cannot create VMs on the cloud), an
    `UnrecoverableError` is raised instead.

    In more detail, during submission the following happens:

    * First of all, the backend tries to submit the job to one of
      the already available subresources.

    * If none of them is able to submit the job, the backend
      checks if there is a VM in pending state; if there is one,
      it raises a `RecoverableError`, thus delaying submission.

    * If no VM in pending state is found, the `vm_pool_max_size`
      configuration option is checked. If we already reached the
      maximum number of VMs, an `UnrecoverableError` is raised.

    * If no VM in pending state is found and the number of VMs
      currently created is still less than `vm_pool_max_size` (or
      `vm_pool_max_size` is ``None``, which means no limit), then
      a new VM is created and a `RecoverableError` is raised.
    """
    # Updating the resource is needed in order to update the
    # subresources. This is not always done before `submit_job`
    # because of issue nr. 386:
    # https://github.com/uzh/gc3pie/issues/386
    self.get_resource_status()
    pending_vms = set(vm.id for vm in self._vmpool.get_all_vms()
                      if vm.status in PENDING_STATES)

    image_id = self.get_image_id_for_job(job)
    # Check if the image id is valid
    if image_id not in [img.id for img in self._get_available_images()]:
        raise ConfigurationError(
            "Image ID %s not found in cloud %s"
            % (image_id, self.os_auth_url))

    instance_type = self.get_instance_type_for_job(job)
    if not instance_type:
        raise RuntimeError(
            "Unable to find a suitable instance type for"
            " application %s" % job)

    # First of all, try to submit to one of the subresources.
    for vm_id, subresource in self.subresources.items():
        if not subresource.updated:
            # The VM is probably still booting; skip to the next
            # one and add it to the list of "pending" VMs.
            pending_vms.add(vm_id)
            continue
        try:
            # Check that the required image id and instance type
            # are correct
            vm = self._get_vm(vm_id)
            if vm.image['id'] != image_id:
                continue
            subresource.submit_job(job)
            job.execution._lrms_vm_id = vm_id
            job.changed = True
            gc3libs.log.info(
                "Job successfully submitted to remote resource %s.",
                subresource.name)
            return job
        except (LRMSSubmitError, InstanceNotFound) as ex:
            if gc3libs.error_ignored(
                    # context:
                    # - module
                    'openstack',
                    # - class
                    'OpenStackLrms',
                    # - method
                    'submit_job',
                    # - actual error class
                    ex.__class__.__name__,
                    # - additional keywords
                    'submit',
            ):
                gc3libs.log.debug(
                    "Ignoring error in submitting to resource '%s': %s",
                    subresource.name, ex)
            else:
                # propagate error back to caller
                raise

    # Couldn't submit to any resource.
    if not pending_vms:
        # No pending VM, and no resource available. Create a new VM.
        if not self.vm_pool_max_size \
                or len(self._vmpool) < self.vm_pool_max_size:
            user_data = self.get_user_data_for_job(job)
            vm = self._create_instance(
                image_id,
                name="GC3Pie_%s_%d" % (self.name, (len(self._vmpool) + 1)),
                instance_type=instance_type,
                user_data=user_data)
            pending_vms.add(vm.id)
            self._vmpool.add_vm(vm)
        else:
            raise MaximumCapacityReached(
                "Already running the maximum number of VMs on"
                " resource %s: %d VMs started, but max %d allowed"
                " by configuration."
                % (self.name, len(self._vmpool), self.vm_pool_max_size),
                do_log=True)

    # If we reached this point, we are waiting for a VM to be
    # ready, so delay the submission until we either can submit to
    # one of the available resources or until all the VMs are
    # ready.
    gc3libs.log.debug(
        "No available resource was found, but some VM is still in"
        " `pending` state. Waiting until the next iteration before"
        " creating a new VM. Pending VM ids: %s", pending_vms)
    raise LRMSSkipSubmissionToNextIteration(
        "Delaying submission until one of the VMs currently pending"
        " is ready. Pending VM ids: %s" % str.join(', ', pending_vms))
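# `str.join(', ', pending_vms)` above is the unbound-method spelling
# of the more common `', '.join(...)`; both require the items to be
# strings already (the IDs below are made up):
ids = ['i-abc123', 'i-def456']
assert str.join(', ', ids) == ', '.join(ids)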