Example #1
    def load_many(self, obj_ids, add=True, flush=True):
        """
        Load objects given their IDs from persistent storage.

        Return a dictionary mapping task ID to the actual
        retrieved `Task`:class: object.
        """
        tasks = {}
        for task_id in obj_ids:
            try:
                tasks[task_id] = self.load(task_id, add, flush)
            except Exception as err:
                if gc3libs.error_ignored(
                        # context:
                        # - module
                        'session',
                        # - class
                        'Session',
                        # - method
                        'load',
                        # - actual error class
                        err.__class__.__name__,
                        # - additional keywords
                        'persistence',
                ):
                    gc3libs.log.warning("Ignoring error from loading '%s': %s",
                                        task_id, err)
                else:
                    # propagate exception back to caller
                    raise
        return tasks
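A minimal usage sketch for the method above. It assumes a session directory already exists on disk and that `Session` is importable from `gc3libs.session` as in the GC3Pie source tree; the path, the constructor call, and the printed attribute are illustrative only and may need adjusting.

from gc3libs.session import Session

# hypothetical path to a session previously created by a GC3Pie script
session = Session('/tmp/my_session')

# `session.tasks` is populated by `_load_session()` (see the next examples),
# so its keys are valid object IDs for `load_many()`
task_ids = list(session.tasks.keys())
reloaded = session.load_many(task_ids)
for task_id, task in reloaded.items():
    print(task_id, task.execution.state)   # `Task.execution.state` as in gc3libs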
Example #2
    def _load_session(self, **extra_args):
        """
        Load an existing session from disk.

        Keyword arguments are passed to the `make_store` factory
        method unchanged.

        Any error that occurs while loading jobs from disk is ignored,
        unless GC3Pie's error-ignore policy says it should be propagated.
        """
        try:
            store_fname = os.path.join(self.path, self.STORE_URL_FILENAME)
            self.store_url = gc3libs.utils.read_contents(store_fname).strip()
        except IOError:
            gc3libs.log.info(
                "Unable to load session: file %s is missing." % (store_fname))
            raise
        self.store = gc3libs.persistence.make_store(
            self.store_url, **extra_args)

        idx_filename = os.path.join(self.path, self.INDEX_FILENAME)
        # A context manager guarantees the index file is closed even
        # if reading it fails.
        with open(idx_filename) as idx_fd:
            ids = idx_fd.read().split()

        try:
            start_file = os.path.join(
                self.path, self.TIMESTAMP_FILES['start'])
            self.created = os.stat(start_file).st_mtime
        except OSError:
            gc3libs.log.warning(
                "Unable to recover starting time from existing session:"
                " file %s is missing." % (start_file))

        for task_id in ids:
            try:
                self.tasks[task_id] = self.store.load(task_id)
            except Exception as err:
                if gc3libs.error_ignored(
                        # context:
                        # - module
                        'session',
                        # - class
                        'Session',
                        # - method
                        'load',
                        # - actual error class
                        err.__class__.__name__,
                        # - additional keywords
                        'persistence',
                ):
                    gc3libs.log.warning(
                        "Ignoring error from loading '%s': %s", task_id, err)
                else:
                    # propagate exception back to caller
                    raise
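All of these examples gate their error handling on `gc3libs.error_ignored(...)`, passing a flat list of context keywords: module, class, method, the error class name, and a few extra tags. The snippet below is a simplified, hypothetical matcher of the same shape, not GC3Pie's actual implementation, meant only to show how such a keyword-driven ignore policy can be exercised; `UNIGNORED_KEYWORDS` and `error_ignored_sketch` are invented names.

# Hypothetical stand-in for a keyword-based error-ignore policy.
UNIGNORED_KEYWORDS = {'submit'}   # e.g. never swallow submission errors

def error_ignored_sketch(*context):
    """Return True if an error tagged with these context keywords may be ignored."""
    keywords = set(kw.lower() for kw in context)
    return not (keywords & UNIGNORED_KEYWORDS)

# mirrors the call sites in the examples above
try:
    raise IOError("simulated failure while loading a task")
except Exception as err:
    if error_ignored_sketch('session', 'Session', 'load',
                            err.__class__.__name__, 'persistence'):
        print("Ignoring error:", err)
    else:
        raise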
Example #3
    def _load_session(self, **extra_args):
        """
        Load an existing session from disk.

        Keyword arguments are passed to the `make_store` factory
        method unchanged.

        Any error that occurs while loading jobs from disk is ignored,
        unless GC3Pie's error-ignore policy says it should be propagated.
        """
        try:
            store_fname = os.path.join(self.path, self.STORE_URL_FILENAME)
            self.store_url = gc3libs.utils.read_contents(store_fname).strip()
        except IOError:
            gc3libs.log.info("Unable to load session: file %s is missing." %
                             (store_fname))
            raise
        self.store = gc3libs.persistence.make_store(self.store_url,
                                                    **extra_args)

        idx_filename = os.path.join(self.path, self.INDEX_FILENAME)
        # A context manager guarantees the index file is closed even
        # if reading it fails.
        with open(idx_filename) as idx_fd:
            ids = idx_fd.read().split()

        try:
            start_file = os.path.join(self.path, self.TIMESTAMP_FILES['start'])
            self.created = os.stat(start_file).st_mtime
        except OSError:
            gc3libs.log.warning(
                "Unable to recover starting time from existing session:"
                " file %s is missing." % (start_file))

        for task_id in ids:
            try:
                self.tasks[task_id] = self.store.load(task_id)
            except Exception as err:
                if gc3libs.error_ignored(
                        # context:
                        # - module
                        'session',
                        # - class
                        'Session',
                        # - method
                        'load',
                        # - actual error class
                        err.__class__.__name__,
                        # - additional keywords
                        'persistence',
                ):
                    gc3libs.log.warning("Ignoring error from loading '%s': %s",
                                        task_id, err)
                else:
                    # propagate exception back to caller
                    raise
Example #4
    def submit_job(self, job):
        """
        Submission on an OpenStack resource will usually happen in
        multiple steps, since creating a VM and attaching a resource
        to it will take some time.

        In order to return as soon as possible, the backend will raise
        a `RecoverableError` whenever submission is delayed.

        In case a permanent error is found (for instance, we cannot
        create VMs on the cloud), an `UnrecoverableError` is raised.

        In more detail, the following happens during submission:

        * First of all, the backend will try to submit the job to one
          of the already available subresources.

        * If none of them is able to submit the job, the backend will
          check if there is a VM in pending state, and in case there
          is one it will raise a `RecoverableError`, thus delaying
          submission.

        * If no VM in pending state is found, the `vm_pool_max_size`
          configuration option is checked. If we already reached the
          maximum number of VMs, an `UnrecoverableError` is raised.

        * If no VM in pending state is found and the number of VMs
          currently created is still less than `vm_pool_max_size` (or
          `vm_pool_max_size` is None, which means no limit), then a new
          VM is created and a `RecoverableError` is raised.

        """
        # Updating resource is needed to update the subresources. This
        # is not always done before the submit_job because of issue
        # nr.  386:
        #     http://code.google.com/p/gc3pie/issues/detail?id=386
        self.get_resource_status()
        pending_vms = set(vm.id for vm in self._vmpool.get_all_vms() if vm.status in PENDING_STATES)

        image_id = self.get_image_id_for_job(job)
        # Check if the image id is valid
        if image_id not in [img.id for img in self._get_available_images()]:
            raise ConfigurationError("Image ID %s not found in cloud " "%s" % (image_id, self.os_auth_url))

        instance_type = self.get_instance_type_for_job(job)
        if not instance_type:
            raise RuntimeError("Unable to find a suitable instance type for " "application %s" % job)

        # First of all, try to submit to one of the subresources.
        for vm_id, subresource in self.subresources.items():
            if not subresource.updated:
                # The VM is probably still booting, let's skip to the
                # next one and add it to the list of "pending" VMs.
                pending_vms.add(vm_id)
                continue
            try:
                # Check that the required image id and instance type
                # are correct
                vm = self._get_vm(vm_id)
                if vm.image["id"] != image_id:
                    continue
                subresource.submit_job(job)
                job.os_instance_id = vm_id
                job.changed = True
                gc3libs.log.info("Job successfully submitted to remote resource %s.", subresource.name)
                return job
            except (LRMSSubmitError, InstanceNotFound) as ex:
                if gc3libs.error_ignored(
                    # context:
                    # - module
                    "openstack",
                    # - class
                    "OpenStackLrms",
                    # - method
                    "submit_job",
                    # - actual error class
                    ex.__class__.__name__,
                    # - additional keywords
                    "submit",
                ):
                    gc3libs.log.debug("Ignoring error in submitting to resource '%s': %s", subresource.name, ex)
                else:
                    # propagate error back to caller
                    raise

        # Couldn't submit to any resource.
        if not pending_vms:
            # No pending VM, and no resource available. Create a new VM
            if not self.vm_pool_max_size or len(self._vmpool) < self.vm_pool_max_size:
                user_data = self.get_user_data_for_job(job)
                vm = self._create_instance(
                    image_id,
                    name="GC3Pie_%s_%d" % (self.name, (len(self._vmpool) + 1)),
                    instance_type=instance_type,
                    user_data=user_data,
                )
                pending_vms.add(vm.id)

                self._vmpool.add_vm(vm)
            else:
                raise MaximumCapacityReached(
                    "Already running the maximum number of VM on resource %s:"
                    " %d VMs started, but max %d allowed by configuration."
                    % (self.name, len(self._vmpool), self.vm_pool_max_size),
                    do_log=True,
                )

        # If we reached this point, we are waiting for a VM to be
        # ready, so delay the submission until we either can submit to
        # one of the available resources or until all the VMs are
        # ready.
        gc3libs.log.debug(
            "No available resource was found, but some VM is still in"
            " `pending` state. Waiting until the next iteration before"
            " creating a new VM. Pending VM ids: %s",
            pending_vms,
        )
        raise LRMSSkipSubmissionToNextIteration(
            "Delaying submission until some of the VMs currently pending"
            " is ready. Pending VM ids: %s" % str.join(", ", pending_vms)
        )
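The docstring above describes a protocol in which the backend deliberately raises `LRMSSkipSubmissionToNextIteration` to postpone submission while VMs boot. In real GC3Pie the `Engine` drives this retry loop; the function below is only an illustrative driver, assuming `backend` exposes `submit_job()` as in the example and that the exception class is importable from `gc3libs.exceptions`.

import time

import gc3libs
from gc3libs.exceptions import LRMSSkipSubmissionToNextIteration

def submit_with_retry(backend, app, interval=30, max_passes=20):
    """Illustrative driver: retry while the backend asks to wait for a VM."""
    for _ in range(max_passes):
        try:
            return backend.submit_job(app)
        except LRMSSkipSubmissionToNextIteration as delay:
            # a VM is still in `pending` state; try again on the next pass
            gc3libs.log.debug("Submission delayed: %s", delay)
            time.sleep(interval)
    raise RuntimeError("Gave up waiting for a pending VM to become ready")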
Example #5
    def get_resource_status(self):
        self.updated = False
        # Since we create the resource *before* the VM is actually up
        # & running, it's possible that the `frontend` value of the
        # resources points to a non-existent hostname. Therefore, we
        # have to update them with valid public_ip, if they are
        # present.

        # Update status of known VMs
        for vm_id in self._vmpool:
            try:
                vm = self._vmpool.get_vm(vm_id, force_reload=True)
            except UnrecoverableError as ex:
                gc3libs.log.warning(
                    "Removing stale information on VM `%s`. It has probably" " been deleted from outside GC3Pie.", vm_id
                )
                self._vmpool.remove_vm(vm_id)
                continue

            if vm.status in PENDING_STATES:
                # If VM is still in pending state, skip creation of
                # the resource
                continue
            elif vm.status in ERROR_STATES:
                # The VM is in error state: terminate it.
                gc3libs.log.error("VM with id `%s` is in ERROR state." " Terminating it!", vm.id)
                vm.delete()
                self._vmpool.remove_vm(vm.id)
                self.subresources.pop(vm.id)
                continue
            elif vm.status == "DELETED":
                gc3libs.log.info(
                    "VM `%s` in DELETE state. It has probably been terminated"
                    " from outside GC3Pie. Removing it from the list of VM.",
                    vm.id,
                )
                self._vmpool.remove_vm(vm.id)
                self.subresources.pop(vm.id)
                continue
            elif vm.status in ["SHUTOFF", "SUSPENDED", "RESCUE", "VERIFY_RESIZE"]:
                # The VM has probably been stopped or shut down from
                # outside GC3Pie.
                gc3libs.log.error("VM with id `%s` is in permanent state `%s`.", vm.id, vm.status)
                continue

            # Get or create a resource associated to the vm
            subresource = self._get_subresource(vm)
            try:
                subresource.get_resource_status()
            except TransportError as ex:
                # TODO: get all the IPs and try with all of them to connect.
                # Start with preferred_ip if defined
                gc3libs.log.info(
                    "Ignoring error in updating resource '%s': %s." " Trying other IPs.", subresource.name, ex
                )
                for ip in sum(vm.networks.values(), []):
                    if vm.preferred_ip == ip:
                        continue
                    vm.preferred_ip = ip
                    subresource.frontend = ip
                    gc3libs.log.info("Connection error. Trying with alternate IP address " "%s", vm.preferred_ip)
                    try:
                        subresource.get_resource_status()
                        break
                    except Exception as ex:
                        gc3libs.log.info(
                            "Ignoring error in updating resource '%s': %s."
                            " The corresponding VM may not be ready yet.",
                            subresource.name,
                            ex,
                        )
                # Unable to connect to the VM using any IP.  Ensure
                # this resource is considered "pending" as we couldn't
                # update its status
                subresource.updated = False
            except Exception as ex:
                # XXX: Actually, we should try to identify the kind of
                # error we are getting. For instance, if the
                # configuration option `username` is wrong, we will
                # create VMs but we will never be able to submit jobs
                # to them, thus causing an increasing number of
                # useless VMs created on the cloud.
                if gc3libs.error_ignored(
                    # context:
                    # - module
                    "openstack",
                    # - class
                    "OpenStackLrms",
                    # - method
                    "get_resource_status",
                    # - actual error class
                    ex.__class__.__name__,
                    # - additional keywords
                    "resource",
                    "status",
                    "update",
                    "vm",
                ):
                    gc3libs.log.info(
                        "Ignoring error while updating resource '%s'. "
                        "The corresponding VM may not be ready yet. Error: %s",
                        subresource.name,
                        ex,
                    )
                else:
                    # propagate exception back to caller
                    raise
        self._vmpool.update()
        return self
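The `TransportError` branch above cycles through every IP address the cloud reports for the VM until one answers, and finally marks the subresource as not updated if none does. The helper below isolates that pattern; it is a hypothetical refactoring, not part of the backend, and reuses the `subresource`/`vm` attribute names from the example.

def try_alternate_ips(subresource, vm, candidate_ips):
    """Try each candidate IP until `get_resource_status()` succeeds."""
    for ip in candidate_ips:
        if ip == vm.preferred_ip:
            continue                       # already tried this address
        vm.preferred_ip = ip
        subresource.frontend = ip
        try:
            subresource.get_resource_status()
            return True
        except Exception:
            continue                       # VM may not be reachable here yet
    # no address worked: keep the subresource flagged as "pending"
    subresource.updated = False
    return False

# in the example the candidates come from all networks of the VM:
#     try_alternate_ips(subresource, vm, sum(vm.networks.values(), []))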
Example #6
File: ec2.py  Project: ewiger/gc3pie
    def submit_job(self, job):
        """
        Submission on an EC2 resource will usually happen in multiple
        steps, since creating a VM and attaching a resource to it will
        take some time.

        In order to return as soon as possible, the backend will raise
        a `RecoverableError` whenever submission is delayed.

        In case a permanent error is found (for instance, we cannot
        create VMs on the cloud), an `UnrecoverableError` is raised.

        In more detail, the following happens during submission:

        * First of all, the backend will try to submit the job to one
          of the already available subresources.

        * If none of them is able to submit the job, the backend will
          check if there is a VM in pending state, and in case there
          is one it will raise a `RecoverableError`, thus delaying
          submission.

        * If no VM in pending state is found, the `vm_pool_max_size`
          configuration option is checked. If we already reached the
          maximum number of VMs, an `UnrecoverableError` is raised.

        * If no VM in pending state is found and the number of VMs
          currently created is still less than `vm_pool_max_size` (or
          `vm_pool_max_size` is None, which means no limit), then a new
          VM is created and a `RecoverableError` is raised.

        """
        self._connect()
        # Updating resource is needed to update the subresources. This
        # is not always done before the submit_job because of issue
        # nr.  386:
        #     http://code.google.com/p/gc3pie/issues/detail?id=386
        self.get_resource_status()

        pending_vms = set(vm.id for vm in self._vmpool.get_all_vms()
                          if vm.state == 'pending')

        image_id = self.get_image_id_for_job(job)
        instance_type = self.get_instance_type_for_job(job)
        # Check that we can actually submit to a flavor like this
        # XXX: this check shouldn't be done by the Engine???
        if self._instance_type_specs:
            specs = self._instance_type_specs
            max_mem = specs['max_memory_per_core']
            max_cpus = specs['max_cores_per_job']
            if (job.requested_memory is not None and
                job.requested_memory > max_mem) \
                or (job.requested_cores is not None and
                    job.requested_cores > max_cpus):
                raise gc3libs.exceptions.LRMSSubmitError(
                    "EC2 flavor %s does not have enough memory/cpus "
                    "to run application %s" % (
                        self.instance_type, job.jobname))

        # First of all, try to submit to one of the subresources.
        for vm_id, resource in self.subresources.items():
            if not resource.updated:
                # The VM is probably still booting, let's skip to the
                # next one and add it to the list of "pending" VMs.
                pending_vms.add(vm_id)
                continue
            try:
                # Check that the required image id and instance type
                # are correct
                vm = self._get_vm(vm_id)
                if (vm.image_id != image_id
                        or vm.instance_type != instance_type):
                    continue
                resource.submit_job(job)
                job.ec2_instance_id = vm_id
                job.changed = True
                gc3libs.log.info(
                    "Job successfully submitted to remote resource %s.",
                    resource.name)
                return job
            except gc3libs.exceptions.LRMSSubmitError as ex:
                if gc3libs.error_ignored(
                        # context:
                        # - module
                        'ec2',
                        # - class
                        'EC2Lrms',
                        # - method
                        'submit_job',
                        # - actual error class
                        ex.__class__.__name__,
                        # - additional keywords
                        'submit',
                ):
                    gc3libs.log.debug(
                        "Ignoring error in submitting to resource %s: %s. ",
                        resource.name, ex)
                else:
                    # propagate exception to caller
                    raise

        # Couldn't submit to any resource.
        if not pending_vms:
            # No pending VM, and no resource available. Create a new VM
            if not self.vm_pool_max_size \
                    or len(self._vmpool) < self.vm_pool_max_size:
                user_data = self.get_user_data_for_job(job)
                vm = self._create_instance(image_id,
                                           instance_type=instance_type,
                                           user_data=user_data)
                pending_vms.add(vm.id)

                self._vmpool.add_vm(vm)
            else:
                raise MaximumCapacityReached(
                    "Already running the maximum number of VM on resource %s:"
                    " %d VMs started, but max %d allowed by configuration."
                    % (self.name, len(self._vmpool), self.vm_pool_max_size),
                    do_log=True)

        # If we reached this point, we are waiting for a VM to be
        # ready, so delay the submission until we either can submit to
        # one of the available resources or until all the VMs are
        # ready.
        gc3libs.log.debug(
            "No available resource was found, but some VM is still in"
            " `pending` state. Waiting until the next iteration before"
            " creating a new VM. Pending VM ids: %s", pending_vms)
        raise LRMSSkipSubmissionToNextIteration(
            "Delaying submission until some of the VMs currently pending"
            " is ready. Pending VM ids: %s"
            % str.join(', ', pending_vms))
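Before looping over subresources, the EC2 variant checks the job's requested cores and memory against the flavor limits collected in `self._instance_type_specs`. The function below restates that guard in isolation; the dictionary keys match the example, while the sample values are made up (in GC3Pie the memory values are `gc3libs.quantity.Memory` objects rather than plain numbers).

def flavor_can_run(specs, requested_cores, requested_memory):
    """Mirror of the LRMSSubmitError guard in submit_job() above."""
    if (requested_cores is not None
            and requested_cores > specs['max_cores_per_job']):
        return False
    if (requested_memory is not None
            and requested_memory > specs['max_memory_per_core']):
        return False
    return True

# hypothetical flavor limits, with plain numbers standing in for
# gc3libs memory quantities:
specs = {'max_cores_per_job': 8, 'max_memory_per_core': 4}
assert flavor_can_run(specs, requested_cores=4, requested_memory=2)
assert not flavor_can_run(specs, requested_cores=16, requested_memory=2)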
Example #7
File: ec2.py  Project: ewiger/gc3pie
    def get_resource_status(self):
        self.updated = False
        # Since we create the resource *before* the VM is actually up
        # & running, it's possible that the `frontend` value of the
        # resources points to a non-existent hostname. Therefore, we
        # have to update them with valid public_ip, if they are
        # present.

        self._connect()
        # Update status of known VMs
        for vm_id in self._vmpool:
            try:
                vm = self._vmpool.get_vm(vm_id)
            except UnrecoverableError as ex:
                gc3libs.log.warning(
                    "Removing stale information on VM `%s`. It has probably"
                    " been deleted from outside GC3Pie.", vm_id)
                self._vmpool.remove_vm(vm_id)
                continue

            vm.update()
            if vm.state == 'pending':
                # If VM is still in pending state, skip creation of
                # the resource
                continue
            elif vm.state == 'error':
                # The VM is in error state: terminate it.
                gc3libs.log.error(
                    "VM with id `%s` is in ERROR state."
                    " Terminating it!", vm.id)
                vm.terminate()
                self._vmpool.remove_vm(vm.id)
            elif vm.state == 'terminated':
                gc3libs.log.info(
                    "VM `%s` in TERMINATED state. It has probably been"
                    " terminated from outside GC3Pie. Removing it from the"
                    " list of VM.", vm.id)
                self._vmpool.remove_vm(vm.id)
            elif vm.state in ['shutting-down', 'stopped']:
                # The VM has probably been stopped or shut down from
                # outside GC3Pie.
                gc3libs.log.error(
                    "VM with id `%s` is in terminal state `%s`.",
                    vm.id, vm.state)

            # Get or create a resource associated to the vm
            resource = self._get_subresource(vm)
            try:
                resource.get_resource_status()
            except TransportError as ex:
                for ip in [vm.public_dns_name, vm.private_ip_address]:
                    if vm.preferred_ip == ip:
                        continue
                    vm.preferred_ip = ip
                    resource.frontend = ip
                    gc3libs.log.info(
                        "Connection error. Trying with secondary IP"
                        " address %s", vm.preferred_ip)
                    try:
                        resource.get_resource_status()
                        break
                    except Exception as ex:
                        # XXX: I'm exempting this from the GC3Pie
                        # `error_ignored()` policy, since this is a kind of
                        # "expected" error -- it *will* happen if the VM has
                        # not booted up yet or if we're hunting for the correct
                        # address.
                        gc3libs.log.debug(
                            "Ignoring error in updating resource %s: %s"
                            " The corresponding VM may not be ready yet.",
                            resource.name, ex)
            except Exception as ex:
                # FIXME: Actually, we should try to identify the kind of
                # error we are getting. For instance, if the
                # configuration option `username` is wrong, we will
                # create VMs but we will never be able to submit jobs
                # to them, thus causing an increasing number of
                # useless VMs created on the cloud.
                if gc3libs.error_ignored(
                        # context:
                        # - module
                        'ec2',
                        # - class
                        'EC2Lrms',
                        # - method
                        'get_resource_status',
                        # - actual error class
                        ex.__class__.__name__,
                        # - additional keywords
                        'vm',
                ):
                    gc3libs.log.info(
                        "Ignoring error in updating resource %s: %s"
                        "The corresponding VM may not be ready yet.",
                        resource.name, ex)
                else:
                    # propagate exception to caller
                    raise
            if resource.updated:
                # Update also the instance_type specs, if not
                # already updated
                if not self._instance_type_specs:
                    specs = self._instance_type_specs
                    specs['architecture'] = resource['architecture']
                    specs['max_cores'] = resource['max_cores']
                    specs['max_cores_per_job'] = resource['max_cores_per_job']
                    specs['max_memory_per_core'] = resource['total_memory']
                    self.update(specs)

        self._vmpool.update()
        return self
Example #8
File: ec2.py  Project: ewiger/gc3pie
    def _setup_security_groups(self):
        """
        Check the current configuration and set up the security group
        if it does not exist.
        """
        if not self.security_group_name:
            gc3libs.log.error("Group name in `security_group_name`"
                              " configuration option cannot be empty!")
            return
        security_groups = self._conn.get_all_security_groups()
        groups = dict((g.name, g) for g in security_groups)
        # Check if the security group exists already
        if self.security_group_name not in groups:
            try:
                gc3libs.log.info("Creating security group %s",
                                 self.security_group_name)
                security_group = self._conn.create_security_group(
                    self.security_group_name,
                    "GC3Pie_%s" % self.security_group_name)
            except Exception as ex:
                gc3libs.log.error("Error creating security group %s: %s",
                                  self.security_group_name, ex)
                raise UnrecoverableError(
                    "Error creating security group %s: %s"
                    % (self.security_group_name, ex))

            for rule in self.security_group_rules:
                try:
                    gc3libs.log.debug(
                        "Adding rule %s to security group %s.",
                        rule, self.security_group_name)
                    security_group.authorize(**rule)
                except Exception as ex:
                    if gc3libs.error_ignored(
                            # context:
                            # - module
                            'ec2',
                            # - class
                            'EC2Lrms',
                            # - method
                            'setup_security_groups',
                            # - actual error class
                            ex.__class__.__name__,
                            # - additional keywords
                            'setup',
                            'security',
                            'network',
                            'cloud',
                    ):
                        gc3libs.log.info(
                            "Ignoring error adding rule %s"
                            " to security group %s: %s",
                            rule, self.security_group_name, ex)
                    else:
                        # propagate exception to caller
                        raise
        else:
            # Check if the security group has all the rules we want
            security_group = groups[self.security_group_name]
            current_rules = []
            for rule in security_group.rules:
                rule_dict = {
                    'ip_protocol': rule.ip_protocol,
                    'from_port': int(rule.from_port),
                    'to_port': int(rule.to_port),
                    'cidr_ip': str(rule.grants[0]),
                }
                current_rules.append(rule_dict)

            for new_rule in self.security_group_rules:
                if new_rule not in current_rules:
                    security_group.authorize(**new_rule)
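`_setup_security_groups()` expects `self.security_group_rules` to be a list of dictionaries whose keys match those rebuilt in `rule_dict` above (`ip_protocol`, `from_port`, `to_port`, `cidr_ip`), each passed verbatim to boto's `SecurityGroup.authorize()`. The concrete values below (an SSH-from-anywhere rule) are illustrative; in GC3Pie they come from the resource configuration.

# Illustrative shape of `security_group_rules` as consumed above.
security_group_rules = [
    {
        'ip_protocol': 'tcp',
        'from_port': 22,
        'to_port': 22,
        'cidr_ip': '0.0.0.0/0',
    },
]

# each rule is applied with:  security_group.authorize(**rule)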
Example #9
File: ec2.py  Project: smaffiol/gc3pie
    def submit_job(self, job):
        """
        Submission on an EC2 resource will usually happen in multiple
        steps, since creating a VM and attaching a resource to it will
        take some time.

        In order to return as soon as possible, the backend will raise
        a `RecoverableError` whenever submission is delayed.

        In case a permanent error is found (for instance, we cannot
        create VMs on the cloud), an `UnrecoverableError` is raised.

        In more detail, the following happens during submission:

        * First of all, the backend will try to submit the job to one
          of the already available subresources.

        * If none of them is able to submit the job, the backend will
          check if there is a VM in pending state, and in case there
          is one it will raise a `RecoverableError`, thus delaying
          submission.

        * If no VM in pending state is found, the `vm_pool_max_size`
          configuration option is checked. If we already reached the
          maximum number of VMs, an `UnrecoverableError` is raised.

        * If no VM in pending state is found and the number of VMs
          currently created is still less than `vm_pool_max_size` (or
          `vm_pool_max_size` is None, which means no limit), then a new
          VM is created and a `RecoverableError` is raised.

        """
        self._connect()
        # Updating resource is needed to update the subresources. This
        # is not always done before the submit_job because of issue
        # nr.  386:
        #     https://github.com/uzh/gc3pie/issues/386
        self.get_resource_status()

        pending_vms = set(vm.id for vm in self._vmpool.get_all_vms()
                          if vm.state == 'pending')

        image_id = self.get_image_id_for_job(job)
        instance_type = self.get_instance_type_for_job(job)
        # Check that we can actually submit to a flavor like this
        # XXX: this check shouldn't be done by the Engine???
        if self._instance_type_specs:
            specs = self._instance_type_specs
            max_mem = specs['max_memory_per_core']
            max_cpus = specs['max_cores_per_job']
            if (job.requested_memory is not None and
                job.requested_memory > max_mem) \
                or (job.requested_cores is not None and
                    job.requested_cores > max_cpus):
                raise gc3libs.exceptions.LRMSSubmitError(
                    "EC2 flavor %s does not have enough memory/cpus "
                    "to run application %s" %
                    (self.instance_type, job.jobname))

        # First of all, try to submit to one of the subresources.
        for vm_id, resource in self.subresources.items():
            if not resource.updated:
                # The VM is probably still booting, let's skip to the
                # next one and add it to the list of "pending" VMs.
                pending_vms.add(vm_id)
                continue
            try:
                # Check that the required image id and instance type
                # are correct
                vm = self._get_vm(vm_id)
                if (vm.image_id != image_id
                        or vm.instance_type != instance_type):
                    continue
                resource.submit_job(job)
                job.execution._lrms_vm_id = vm_id
                job.changed = True
                gc3libs.log.info(
                    "Job successfully submitted to remote resource %s.",
                    resource.name)
                return job
            except gc3libs.exceptions.LRMSSubmitError as ex:
                if gc3libs.error_ignored(
                        # context:
                        # - module
                        'ec2',
                        # - class
                        'EC2Lrms',
                        # - method
                        'submit_job',
                        # - actual error class
                        ex.__class__.__name__,
                        # - additional keywords
                        'submit',
                ):
                    gc3libs.log.debug(
                        "Ignoring error in submitting to resource %s: %s. ",
                        resource.name, ex)
                else:
                    # propagate exception to caller
                    raise

        # Couldn't submit to any resource.
        if not pending_vms:
            # No pending VM, and no resource available. Create a new VM
            if not self.vm_pool_max_size \
                    or len(self._vmpool) < self.vm_pool_max_size:
                user_data = self.get_user_data_for_job(job)
                vm = self._create_instance(image_id,
                                           instance_type=instance_type,
                                           user_data=user_data)
                pending_vms.add(vm.id)

                self._vmpool.add_vm(vm)
            else:
                raise MaximumCapacityReached(
                    "Already running the maximum number of VM on resource %s:"
                    " %d VMs started, but max %d allowed by configuration." %
                    (self.name, len(self._vmpool), self.vm_pool_max_size),
                    do_log=True)

        # If we reached this point, we are waiting for a VM to be
        # ready, so delay the submission until we either can submit to
        # one of the available resources or until all the VMs are
        # ready.
        gc3libs.log.debug(
            "No available resource was found, but some VM is still in"
            " `pending` state. Waiting until the next iteration before"
            " creating a new VM. Pending VM ids: %s", pending_vms)
        raise ResourceNotReady(
            "Delaying submission until one of the VMs currently pending"
            " is ready. (Pending VM ids: %r)" % (pending_vms, ))
Example #10
File: ec2.py  Project: smaffiol/gc3pie
    def get_resource_status(self):
        self.updated = False
        # Since we create the resource *before* the VM is actually up
        # & running, it's possible that the `frontend` value of the
        # resources points to a non-existent hostname. Therefore, we
        # have to update them with valid public_ip, if they are
        # present.

        self._connect()
        # Update status of known VMs
        for vm_id in self._vmpool:
            try:
                vm = self._vmpool.get_vm(vm_id)
            except UnrecoverableError as ex:
                gc3libs.log.warning(
                    "Removing stale information on VM `%s`. It has probably"
                    " been deleted from outside GC3Pie.", vm_id)
                self._vmpool.remove_vm(vm_id)
                continue

            vm.update()
            if vm.state == 'pending':
                # If VM is still in pending state, skip creation of
                # the resource
                continue
            elif vm.state == 'error':
                # The VM is in error state: terminate it.
                gc3libs.log.error(
                    "VM with id `%s` is in ERROR state."
                    " Terminating it!", vm.id)
                vm.terminate()
                self._vmpool.remove_vm(vm.id)
            elif vm.state == 'terminated':
                gc3libs.log.info(
                    "VM `%s` in TERMINATED state. It has probably been"
                    " terminated from outside GC3Pie. Removing it from the"
                    " list of VM.", vm.id)
                self._vmpool.remove_vm(vm.id)
            elif vm.state in ['shutting-down', 'stopped']:
                # The VM has probably been stopped or shut down from
                # outside GC3Pie.
                gc3libs.log.error("VM with id `%s` is in terminal state `%s`.",
                                  vm.id, vm.state)

            # Get or create a resource associated to the vm
            resource = self._get_subresource(vm)
            try:
                resource.get_resource_status()
            except TransportError as ex:
                for ip in [vm.public_dns_name, vm.private_ip_address]:
                    if vm.preferred_ip == ip:
                        continue
                    vm.preferred_ip = ip
                    resource.frontend = ip
                    gc3libs.log.info(
                        "Connection error. Trying with secondary IP"
                        " address %s", vm.preferred_ip)
                    try:
                        resource.get_resource_status()
                        break
                    except Exception as ex:
                        # XXX: I'm exempting this from the GC3Pie
                        # `error_ignored()` policy, since this is a kind of
                        # "expected" error -- it *will* happen if the VM has
                        # not booted up yet or if we're hunting for the correct
                        # address.
                        gc3libs.log.debug(
                            "Ignoring error in updating resource %s: %s"
                            " The corresponding VM may not be ready yet.",
                            resource.name, ex)
            except Exception as ex:
                # FIXME: Actually, we should try to identify the kind of
                # error we are getting. For instance, if the
                # configuration option `username` is wrong, we will
                # create VMs but we will never be able to submit jobs
                # to them, thus causing an increasing number of
                # useless VMs created on the cloud.
                if gc3libs.error_ignored(
                        # context:
                        # - module
                        'ec2',
                        # - class
                        'EC2Lrms',
                        # - method
                        'get_resource_status',
                        # - actual error class
                        ex.__class__.__name__,
                        # - additional keywords
                        'vm',
                ):
                    gc3libs.log.info(
                        "Ignoring error in updating resource %s: %s"
                        "The corresponding VM may not be ready yet.",
                        resource.name, ex)
                else:
                    # propagate exception to caller
                    raise
            if resource.updated:
                # Update also the instance_type specs, if not
                # already updated
                if not self._instance_type_specs:
                    specs = self._instance_type_specs
                    specs['architecture'] = resource['architecture']
                    specs['max_cores'] = resource['max_cores']
                    specs['max_cores_per_job'] = resource['max_cores_per_job']
                    specs['max_memory_per_core'] = resource['total_memory']
                    self.update(specs)

        self._vmpool.update()
        return self
Example #11
File: ec2.py  Project: smaffiol/gc3pie
    def _setup_security_groups(self):
        """
        Check the current configuration and set up the security group
        if it does not exist.
        """
        if not self.security_group_name:
            gc3libs.log.error("Group name in `security_group_name`"
                              " configuration option cannot be empty!")
            return
        security_groups = self._conn.get_all_security_groups()
        groups = dict((g.name, g) for g in security_groups)
        # Check if the security group exists already
        if self.security_group_name not in groups:
            try:
                gc3libs.log.info("Creating security group %s",
                                 self.security_group_name)
                security_group = self._conn.create_security_group(
                    self.security_group_name,
                    "GC3Pie_%s" % self.security_group_name)
            except Exception as ex:
                gc3libs.log.error("Error creating security group %s: %s",
                                  self.security_group_name, ex)
                raise UnrecoverableError(
                    "Error creating security group %s: %s" %
                    (self.security_group_name, ex))

            for rule in self.security_group_rules:
                try:
                    gc3libs.log.debug("Adding rule %s to security group %s.",
                                      rule, self.security_group_name)
                    security_group.authorize(**rule)
                except Exception as ex:
                    if gc3libs.error_ignored(
                            # context:
                            # - module
                            'ec2',
                            # - class
                            'EC2Lrms',
                            # - method
                            'setup_security_groups',
                            # - actual error class
                            ex.__class__.__name__,
                            # - additional keywords
                            'setup',
                            'security',
                            'network',
                            'cloud',
                    ):
                        gc3libs.log.info(
                            "Ignoring error adding rule %s"
                            " to security group %s: %s", rule,
                            self.security_group_name, ex)
                    else:
                        # propagate exception to caller
                        raise
        else:
            # Check if the security group has all the rules we want
            security_group = groups[self.security_group_name]
            current_rules = []
            for rule in security_group.rules:
                rule_dict = {
                    'ip_protocol': rule.ip_protocol,
                    'from_port': int(rule.from_port),
                    'to_port': int(rule.to_port),
                    'cidr_ip': str(rule.grants[0]),
                }
                current_rules.append(rule_dict)

            for new_rule in self.security_group_rules:
                if new_rule not in current_rules:
                    security_group.authorize(**new_rule)
Example #12
    def submit_job(self, job):
        """
        Submission on an OpenStack resource will usually happen in
        multiple steps, since creating a VM and attaching a resource
        to it will take some time.

        In order to return as soon as possible, the backend will raise
        a `RecoverableError` whenever submission is delayed.

        In case a permanent error is found (for instance, we cannot
        create VMs on the cloud), an `UnrecoverableError` is raised.

        In more detail, the following happens during submission:

        * First of all, the backend will try to submit the job to one
          of the already available subresources.

        * If none of them is able to submit the job, the backend will
          check if there is a VM in pending state, and in case there
          is one it will raise a `RecoverableError`, thus delaying
          submission.

        * If no VM in pending state is found, the `vm_pool_max_size`
          configuration option is checked. If we already reached the
          maximum number of VMs, an `UnrecoverableError` is raised.

        * If no VM in pending state is found and the number of VMs
          currently created is still less than `vm_pool_max_size` (or
          `vm_pool_max_size` is None, which means no limit), then a new
          VM is created and a `RecoverableError` is raised.

        """
        # Updating resource is needed to update the subresources. This
        # is not always done before the submit_job because of issue
        # nr.  386:
        #     https://github.com/uzh/gc3pie/issues/386
        self.get_resource_status()
        pending_vms = set(vm.id for vm in self._vmpool.get_all_vms()
                          if vm.status in PENDING_STATES)

        image_id = self.get_image_id_for_job(job)
        # Check if the image id is valid
        if image_id not in [img.id for img in self._get_available_images()]:
            raise ConfigurationError("Image ID %s not found in cloud "
                                     "%s" % (image_id, self.os_auth_url))

        instance_type = self.get_instance_type_for_job(job)
        if not instance_type:
            raise RuntimeError(
                "Unable to find a suitable instance type for "
                "application %s" % job)

        # First of all, try to submit to one of the subresources.
        for vm_id, subresource in self.subresources.items():
            if not subresource.updated:
                # The VM is probably still booting, let's skip to the
                # next one and add it to the list of "pending" VMs.
                pending_vms.add(vm_id)
                continue
            try:
                # Check that the required image id and instance type
                # are correct
                vm = self._get_vm(vm_id)
                if vm.image['id'] != image_id:
                    continue
                subresource.submit_job(job)
                job.execution._lrms_vm_id = vm_id
                job.changed = True
                gc3libs.log.info(
                    "Job successfully submitted to remote resource %s.",
                    subresource.name)
                return job
            except (LRMSSubmitError, InstanceNotFound) as ex:
                if gc3libs.error_ignored(
                        # context:
                        # - module
                        'openstack',
                        # - class
                        'OpenStackLrms',
                        # - method
                        'submit_job',
                        # - actual error class
                        ex.__class__.__name__,
                        # - additional keywords
                        'submit',
                ):
                    gc3libs.log.debug(
                        "Ignoring error in submitting to resource '%s': %s",
                        subresource.name, ex)
                else:
                    # propagate error back to caller
                    raise

        # Couldn't submit to any resource.
        if not pending_vms:
            # No pending VM, and no resource available. Create a new VM
            if not self.vm_pool_max_size \
                    or len(self._vmpool) < self.vm_pool_max_size:
                user_data = self.get_user_data_for_job(job)
                vm = self._create_instance(
                    image_id,
                    name="GC3Pie_%s_%d" % (self.name, (len(self._vmpool) + 1)),
                    instance_type=instance_type,
                    user_data=user_data)
                pending_vms.add(vm.id)

                self._vmpool.add_vm(vm)
            else:
                raise MaximumCapacityReached(
                    "Already running the maximum number of VM on resource %s:"
                    " %d VMs started, but max %d allowed by configuration."
                    % (self.name, len(self._vmpool), self.vm_pool_max_size),
                    do_log=True)

        # If we reached this point, we are waiting for a VM to be
        # ready, so delay the submission until we either can submit to
        # one of the available resources or until all the VMs are
        # ready.
        gc3libs.log.debug(
            "No available resource was found, but some VM is still in"
            " `pending` state. Waiting until the next iteration before"
            " creating a new VM. Pending VM ids: %s", pending_vms)
        raise LRMSSkipSubmissionToNextIteration(
            "Delaying submission until one of the VMs currently pending"
            " is ready. Pending VM ids: %s"
            % str.join(', ', pending_vms))
Example #13
    def get_resource_status(self):
        self.updated = False
        # Since we create the resource *before* the VM is actually up
        # & running, it's possible that the `frontend` value of the
        # resources points to a non-existent hostname. Therefore, we
        # have to update them with valid public_ip, if they are
        # present.

        # Update status of known VMs
        for vm_id in self._vmpool:
            try:
                vm = self._vmpool.get_vm(vm_id, force_reload=True)
            except UnrecoverableError as ex:
                gc3libs.log.warning(
                    "Removing stale information on VM `%s`. It has probably"
                    " been deleted from outside GC3Pie.", vm_id)
                self._vmpool.remove_vm(vm_id)
                continue

            if vm.status in PENDING_STATES:
                # If VM is still in pending state, skip creation of
                # the resource
                continue
            elif vm.status in ERROR_STATES:
                # The VM is in error state: terminate it.
                gc3libs.log.error(
                    "VM with id `%s` is in ERROR state."
                    " Terminating it!", vm.id)
                vm.delete()
                self._vmpool.remove_vm(vm.id)
                self.subresources.pop(vm.id)
                continue
            elif vm.status == 'DELETED':
                gc3libs.log.info(
                    "VM `%s` in DELETE state. It has probably been terminated"
                    " from outside GC3Pie. Removing it from the list of VM.",
                    vm.id)
                self._vmpool.remove_vm(vm.id)
                self.subresources.pop(vm.id)
                continue
            elif vm.status in ['SHUTOFF', 'SUSPENDED',
                               'RESCUE', 'VERIFY_RESIZE']:
                # The VM has probably been stopped or shut down from
                # outside GC3Pie.
                gc3libs.log.error(
                    "VM with id `%s` is in permanent state `%s`.",
                    vm.id, vm.status)
                continue

            # Get or create a resource associated to the vm
            subresource = self._get_subresource(vm)
            try:
                subresource.get_resource_status()
            except TransportError as ex:
                # TODO: get all the IPs and try with all of them to connect.
                # Start with preferred_ip if defined
                gc3libs.log.info(
                    "Ignoring error in updating resource '%s': %s."
                    " Trying other IPs.", subresource.name, ex)
                for ip in sum(vm.networks.values(), []):
                    if vm.preferred_ip == ip:
                        continue
                    vm.preferred_ip = ip
                    subresource.frontend = ip
                    gc3libs.log.info(
                        "Connection error. Trying with alternate IP address "
                        "%s", vm.preferred_ip)
                    try:
                        subresource.get_resource_status()
                        break
                    except Exception as ex:
                        gc3libs.log.info(
                            "Ignoring error in updating resource '%s': %s."
                            " The corresponding VM may not be ready yet.",
                            subresource.name, ex)
                # Unable to connect to the VM using any IP.  Ensure
                # this resource is considered "pending" as we couldn't
                # update its status
                subresource.updated = False
            except Exception as ex:
                # XXX: Actually, we should try to identify the kind of
                # error we are getting. For instance, if the
                # configuration option `username` is wrong, we will
                # create VMs but we will never be able to submit jobs
                # to them, thus causing an increasing number of
                # useless VMs created on the cloud.
                if gc3libs.error_ignored(
                        # context:
                        # - module
                        'openstack',
                        # - class
                        'OpenStackLrms',
                        # - method
                        'get_resource_status',
                        # - actual error class
                        ex.__class__.__name__,
                        # - additional keywords
                        'resource',
                        'status',
                        'update',
                        'vm',
                ):
                    gc3libs.log.info(
                        "Ignoring error while updating resource '%s'. "
                        "The corresponding VM may not be ready yet. Error: %s",
                        subresource.name, ex)
                else:
                    # propagate exception back to caller
                    raise
        self._vmpool.update()
        return self