Example #1
    def _load_plugins(self):
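        """Load the execution plugins enabled in the configuration.

        Uses a stevedore EnabledExtensionManager to load only the entry
        points from the 'broker.execution.plugins' namespace whose names
        are listed in the configuration, and registers each loaded
        extension object under its own name.
        """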
        config_plugins = api.plugins
        extension_manager = enabled.EnabledExtensionManager(
            check_func=lambda ext: ext.name in config_plugins,
            namespace='broker.execution.plugins',
            invoke_on_load=True)

        for ext in extension_manager.extensions:
            if ext.name in self.plugins:
                raise ex.ConfigurationError(
                    ("Plugin with name '%s' already exists.") % ext.name)
            ext.obj.name = ext.name
            self.plugins[ext.name] = ext.obj
            LOG.log("Plugin {plugin_name} loaded {entry_point}".format(
                plugin_name=ext.name, entry_point=ext.entry_point_target))

        if len(self.plugins) < len(config_plugins):
            loaded_plugins = set(six.iterkeys(self.plugins))
            requested_plugins = set(config_plugins)
            raise ex.ConfigurationError(
                ("Plugins couldn't be loaded: %s") %
                ", ".join(requested_plugins - loaded_plugins))
Example #2
    def _validate_settings(self, instance_name=None):
        """Load and validate provider settings

        Each provider's settings must include an instances list with
        specific instance details.
        General provider settings should live at the top level for that provider.
        One instance should have a "default" key set to True.

        :param instance_name: A string matching an instance name
        """
        section_name = self.__class__.__name__.upper()
        # make sure each instance isn't loading values from another
        fresh_settings = settings.get_fresh(section_name)
        instance, default = None, False
        for candidate in fresh_settings.instances:
            if instance_name in candidate:
                instance = candidate
                default = False
            elif (list(candidate.values())[0].get("default")
                  or len(fresh_settings.instances) == 1):
                instance = candidate
                default = True
        fresh_settings.update(list(instance.values())[0])
        if default:
            # if a default provider is selected, defer to loaded environment variables
            # settings[section_name] = fresh_settings
            # settings.execute_loaders(loaders=[dynaconf.loaders.env_loader])
            # ideal solution above. However, need to workaround until
            # https://github.com/rochacbruno/dynaconf/issues/511
            settings.execute_loaders()
            for key in fresh_settings.keys():
                if key in settings[section_name]:
                    fresh_settings[key] = settings[section_name][key]
            settings[section_name] = fresh_settings
        else:
            settings[section_name] = fresh_settings

        # temporary workaround for https://github.com/rochacbruno/dynaconf/issues/508
        # remove the current validators, add ours, and validate
        # then add the other validators back in and move on
        current_validators = settings.validators[:]
        settings.validators.clear()
        settings.validators.extend(self._validators)
        try:
            settings.validators.validate()
        except dynaconf.ValidationError as err:
            raise exceptions.ConfigurationError(err)
        settings.validators.extend(current_validators)
Example #3
    def start_application(self, data, spark_applications_ids, app_id):
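        """Create a Sahara cluster and run a Spark job on it.

        Reads broker parameters from the api module and user parameters
        from data, sizes the cluster with the optimizer (and, optionally,
        opportunistic instances), submits the job through the Swift or
        HDFS path, and deletes the cluster when the execution finishes.

        Returns the job status, or sets the application state to "Error"
        on failure.
        """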
        try:
            self.update_application_state("Running")

            # Broker Parameters
            cluster_id = None
            user = api.user
            password = api.password
            project_id = api.project_id
            auth_ip = api.auth_ip
            domain = api.domain
            public_key = api.public_key
            key_path = api.key_path
            log_path = api.log_path
            container = api.container
            hosts = api.hosts
            remote_hdfs = api.remote_hdfs
            swift_logdir = api.swift_logdir
            number_of_attempts = api.number_of_attempts
            dummy_opportunistic = api.dummy_opportunistic

            # User Request Parameters
            net_id = data['net_id']
            master_ng = data['master_ng']
            slave_ng = data['slave_ng']
            op_slave_ng = data['opportunistic_slave_ng']
            opportunism = str(data['opportunistic'])
            plugin = data['openstack_plugin']
            percentage = int(data['percentage'])
            job_type = data['job_type']
            version = data['version']
            args = data['args']
            main_class = data['main_class']
            dependencies = data['dependencies']
            job_template_name = data['job_template_name']
            job_binary_name = data['job_binary_name']
            job_binary_url = data['job_binary_url']
            image_id = data['image_id']
            monitor_plugin = data['monitor_plugin']
            expected_time = data['expected_time']
            collect_period = data['collect_period']
            number_of_jobs = data['number_of_jobs']
            starting_cap = data['starting_cap']

            # Optimizer Parameters
            app_name = data['app_name']
            days = 0

            if app_name.lower() == 'bulma':
                if 'days' in data.keys():
                    days = data['days']
                else:
                    self._log("""%s | 'days' parameter missing""" %
                              (time.strftime("%H:%M:%S")))
                    raise ex.ConfigurationError()

            # Openstack Components
            connector = os_connector.OpenStackConnector(plugin_log)

            sahara = connector.get_sahara_client(user, password, project_id,
                                                 auth_ip, domain)

            swift = connector.get_swift_client(user, password, project_id,
                                               auth_ip, domain)

            nova = connector.get_nova_client(user, password, project_id,
                                             auth_ip, domain)

            # Optimizer gets the vcpu size of flavor
            cores_per_slave = connector.get_vcpus_by_nodegroup(
                nova, sahara, slave_ng)

            cores, vms = optimizer.get_info(api.optimizer_url, expected_time,
                                            app_name, days)

            if cores <= 0:
                if 'cluster_size' in data.keys():
                    req_cluster_size = data['cluster_size']
                else:
                    self._log("""%s | 'cluster_size' parameter missing""" %
                              (time.strftime("%H:%M:%S")))
                    raise ex.ConfigurationError()
            else:
                req_cluster_size = int(
                    math.ceil(cores / float(cores_per_slave)))

            # Check opportunism
            if opportunism == "True":
                self._log("""%s | Checking if opportunistic instances
                          are available""" % (time.strftime("%H:%M:%S")))

                pred_cluster_size = optimizer.get_cluster_size(
                    api.optimizer_url, hosts, percentage, dummy_opportunistic)
            else:
                pred_cluster_size = req_cluster_size

            if pred_cluster_size > req_cluster_size:
                cluster_size = pred_cluster_size
            else:
                cluster_size = req_cluster_size

            self._log("%s | Cluster size: %s" %
                      (time.strftime("%H:%M:%S"), str(cluster_size)))

            self._log("%s | Creating cluster..." % (time.strftime("%H:%M:%S")))

            cluster_id = self._create_cluster(sahara, connector,
                                              req_cluster_size,
                                              pred_cluster_size, public_key,
                                              net_id, image_id, plugin,
                                              version, master_ng, slave_ng,
                                              op_slave_ng)

            self._log("%s | Cluster id: %s" %
                      (time.strftime("%H:%M:%S"), cluster_id))

            swift_path = self._is_swift_path(args)

            if cluster_id:
                master = connector.get_master_instance(
                    sahara, cluster_id)['internal_ip']

                self._log("%s | Master is %s" %
                          (time.strftime("%H:%M:%S"), master))

                workers = connector.get_worker_instances(sahara, cluster_id)
                workers_id = []

                for worker in workers:
                    workers_id.append(worker['instance_id'])

                self._log("%s | Configuring controller" %
                          (time.strftime("%H:%M:%S")))

                controller.setup_environment(api.controller_url, workers_id,
                                             starting_cap, data)

                if swift_path:
                    job_status = self._swift_spark_execution(
                        master, key_path, sahara, connector, job_binary_name,
                        job_binary_url, user, password, job_template_name,
                        job_type, plugin, cluster_size, args, main_class,
                        cluster_id, spark_applications_ids, workers_id, app_id,
                        expected_time, monitor_plugin, collect_period,
                        number_of_jobs, log_path, swift, container, data,
                        number_of_attempts)
                else:
                    job_status = self._hdfs_spark_execution(
                        master, remote_hdfs, key_path, args, job_binary_url,
                        main_class, dependencies, spark_applications_ids,
                        expected_time, monitor_plugin, collect_period,
                        number_of_jobs, workers_id, data, connector, swift,
                        swift_logdir, container, number_of_attempts)

            else:
                # FIXME: exception type
                self.update_application_state("Error")
                raise ex.ClusterNotCreatedException()

            # Delete cluster
            self._log("%s | Delete cluster: %s" %
                      (time.strftime("%H:%M:%S"), cluster_id))

            connector.delete_cluster(sahara, cluster_id)

            self._log("%s | Finished application execution" %
                      (time.strftime("%H:%M:%S")))

            return job_status

        except KeyError as ke:
            self._log("%s | Parameter missing in submission: %s, "
                      "please check the config file" %
                      (time.strftime("%H:%M:%S"), str(ke)))

            self._log("%s | Finished application execution with error" %
                      (time.strftime("%H:%M:%S")))

            self.update_application_state("Error")

        except ex.ConfigurationError:
            self._log("%s | Finished application execution with error" %
                      (time.strftime("%H:%M:%S")))

            self.update_application_state("Error")

        except SaharaAPIException:
            self._log("%s | There is not enough resource to create a cluster" %
                      (time.strftime("%H:%M:%S")))

            self._log("%s | Finished application execution with error" %
                      (time.strftime("%H:%M:%S")))

            self.update_application_state("Error")

        except Exception:
            if cluster_id is not None:
                self._log("%s | Delete cluster: %s" %
                          (time.strftime("%H:%M:%S"), cluster_id))
                connector.delete_cluster(sahara, cluster_id)

            self._log("%s | Unknown error, please report to administrators "
                      "of WP3 infrastructure" % (time.strftime("%H:%M:%S")))

            self._log("%s | Finished application execution with error" %
                      (time.strftime("%H:%M:%S")))

            self.update_application_state("Error")
Example #4
    def _hdfs_spark_execution(self, master, remote_hdfs, key_path, args,
                              job_bin_url, main_class, dependencies,
                              spark_applications_ids, expected_time,
                              monitor_plugin, collect_period, number_of_jobs,
                              workers_id, data, connector, swift, swift_logdir,
                              container, number_of_attempts):
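        """Run a Spark job whose binary lives in a remote HDFS.

        Copies the binary from HDFS to the cluster, enables the Spark
        event log, submits the job, keeps the monitor and controller
        services running while it executes, and finally copies the event
        log back and uploads it to Swift.
        """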

        job_exec_id = str(uuid.uuid4())[0:7]
        self._log("%s | Job execution ID: %s" %
                  (time.strftime("%H:%M:%S"), job_exec_id))

        # Defining params
        local_path = '/tmp/spark-jobs/' + job_exec_id + '/'
        # remote_path = 'ubuntu@' + master + ':' + local_path

        job_input_paths, job_output_path, job_params = (hdfs.get_job_params(
            key_path, remote_hdfs, args))

        job_binary_path = hdfs.get_path(job_bin_url)

        # Create temporary job directories
        self._log("%s | Create temporary job directories" %
                  (time.strftime("%H:%M:%S")))
        self._mkdir(local_path)

        # Create cluster directories
        self._log("%s | Creating cluster directories" %
                  (time.strftime("%H:%M:%S")))
        remote.execute_command(master, key_path, 'mkdir -p %s' % local_path)

        # Get job binary from hdfs
        self._log("%s | Get job binary from hdfs" %
                  (time.strftime("%H:%M:%S")))
        remote.copy_from_hdfs(master, key_path, remote_hdfs, job_binary_path,
                              local_path)

        # Enabling event log on cluster
        self._log("%s | Enabling event log on cluster" %
                  (time.strftime("%H:%M:%S")))
        self._enable_event_log(master, key_path, local_path)

        # Submit job
        self._log("%s | Starting job" % (time.strftime("%H:%M:%S")))

        local_binary_file = (
            local_path + remote.list_directory(key_path, master, local_path))

        spark_job = self._submit_job(master, key_path, main_class,
                                     dependencies, local_binary_file, args)

        spark_app_id = spark.get_running_app(master, spark_applications_ids,
                                             number_of_attempts)

        if spark_app_id is None:
            self._log("%s | Error on submission of application, "
                      "please check the config file" %
                      (time.strftime("%H:%M:%S")))

            (output, err) = spark_job.communicate()
            self.stdout.log(output)
            self.stderr.log(err)

            raise ex.ConfigurationError()

        spark_applications_ids.append(spark_app_id)

        info_plugin = {
            "spark_submisson_url": "http://" + master,
            "expected_time": expected_time,
            "number_of_jobs": number_of_jobs
        }

        self._log("%s | Starting monitor" % (time.strftime("%H:%M:%S")))
        monitor.start_monitor(api.monitor_url, spark_app_id, monitor_plugin,
                              info_plugin, collect_period)
        self._log("%s | Starting controller" % (time.strftime("%H:%M:%S")))
        controller.start_controller(api.controller_url, spark_app_id,
                                    workers_id, data)

        (output, err) = spark_job.communicate()

        self._log("%s | Stopping monitor" % (time.strftime("%H:%M:%S")))
        monitor.stop_monitor(api.monitor_url, spark_app_id)
        self._log("%s | Stopping controller" % (time.strftime("%H:%M:%S")))
        controller.stop_controller(api.controller_url, spark_app_id)

        self.stdout.log(output)
        self.stderr.log(err)

        self._log("%s | Copy log from cluster" % (time.strftime("%H:%M:%S")))
        event_log_path = local_path + 'eventlog/'
        self._mkdir(event_log_path)

        remote_event_log_path = 'ubuntu@%s:%s%s' % (master, local_path,
                                                    spark_app_id)

        remote.copy(key_path, remote_event_log_path, event_log_path)

        self._log("%s | Upload log to Swift" % (time.strftime("%H:%M:%S")))
        connector.upload_directory(swift, event_log_path, swift_logdir,
                                   container)

        spark_applications_ids.remove(spark_app_id)

        self.update_application_state("OK")

        return 'OK'
Example #5
    def start_application(self, data):
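        """Run a Spark job on a Mesos cluster over SSH.

        Downloads the job binary to the cluster, builds and runs the
        Spark submission command, maps the Mesos executors to their VM
        ids through the OpenNebula onevm CLI, and drives the monitor and
        controller services while the job runs.
        """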
        try:
            self.update_application_state("Running")
            plugin_log.log("%s | Starting application execution" %
                           (time.strftime("%H:%M:%S")))

            binary_url = str(data['binary_url'])
            execution_class = str(data['execution_class'])
            execution_parameters = str(data['execution_parameters'])
            expected_time = int(data['expected_time'])
            number_of_jobs = int(data['number_of_jobs'])
            starting_cap = int(data['starting_cap'])

            # Optimizer integration
            app_name = data['app_name']
            days = 0

            if app_name.lower() == 'bulma':
                if 'days' in data.keys():
                    days = data['days']
                else:
                    self._log("""%s | 'days' parameter missing""" %
                              (time.strftime("%H:%M:%S")))
                    raise ex.ConfigurationError()

            cores, vms = optimizer.get_info(api.optimizer_url, expected_time,
                                            app_name, days)
            optimizer_command = ''
            if cores >= 0:
                optimizer_command = ' --total-executor-cores %d ' % cores

            plugin_log.log("%s | Submission id: %s" %
                           (time.strftime("%H:%M:%S"), self.app_id))

            plugin_log.log("%s | Connecting with Mesos cluster..." %
                           (time.strftime("%H:%M:%S")))

            conn = ssh.get_connection(api.mesos_url, api.cluster_username,
                                      api.cluster_password,
                                      api.cluster_key_path)

            plugin_log.log("%s | Connected with Mesos cluster" %
                           (time.strftime("%H:%M:%S")))

            # Build the Spark submission command line for the job.
            # A non-empty execution class means the job binary is a jar;
            # an empty one means the binary is a Python script.
            if execution_class != "" and execution_class is not None:
                binary_path = '~/exec_bin.jar'
                spark_run = ('sudo %s --name %s ' + '--master mesos://%s:%s ' +
                             optimizer_command + '--class %s %s %s')
            else:
                binary_path = '~/exec_bin.py'
                spark_run = ('sudo %s --name %s ' + '--master mesos://%s:%s ' +
                             optimizer_command + '%s %s %s')

            plugin_log.log("%s | Download the binary to cluster" %
                           (time.strftime("%H:%M:%S")))

            try:
                stdin, stdout, stderr = conn.exec_command(
                    'wget %s -O %s' % (binary_url, binary_path))

                plugin_log.log("%s | Waiting for download the binary..." %
                               (time.strftime("%H:%M:%S")))

                # TODO: Fix possible wget error
                stdout.read()
                plugin_log.log("%s | Binary downloaded" %
                               (time.strftime("%H:%M:%S")))

            except Exception as e:
                plugin_log.log("%s | Error downloading binary" %
                               (time.strftime("%H:%M:%S")))
                self.update_application_state("Error")
                return "Error"

            i, o, e = conn.exec_command(
                spark_run %
                (api.spark_path, self.app_id, api.mesos_url, api.mesos_port,
                 execution_class, binary_path, execution_parameters))

            # Discover the executors' IPs from Mesos
            # and their VM ids on KVM using those IPs
            list_vms_one = (
                'onevm list --user %s --password %s --endpoint %s' %
                (api.one_username, api.one_password, api.one_url))

            stdin, stdout, stderr = conn.exec_command(list_vms_one)

            list_response = stdout.read()

            vms_ips, master = mesos.get_executors_ip(conn, self.frameworks_url,
                                                     self.app_id)
            plugin_log.log("%s | Master: %s" %
                           (time.strftime("%H:%M:%S"), master))

            plugin_log.log("%s | Executors: %s" %
                           (time.strftime("%H:%M:%S"), vms_ips))

            vms_ids = mesos.extract_vms_ids(list_response)
            plugin_log.log("%s | Executors IDs: %s" %
                           (time.strftime("%H:%M:%S"), vms_ids))

            executors_vms_ids = []
            for ip in vms_ips:
                for vm_id in vms_ids:
                    vm_info_one = (
                        'onevm show %s '
                        '--user %s '
                        '--password %s '
                        '--endpoint %s' %
                        (vm_id, api.one_username, api.one_password, api.one_url))

                    stdin, stdout, stderr = conn.exec_command(vm_info_one)
                    if ip in stdout.read():
                        executors_vms_ids.append(vm_id)
                        break

            plugin_log.log("%s | Executors IDs: %s" %
                           (time.strftime("%H:%M:%S"), executors_vms_ids))

            # Set up the initial CPU cap configuration
            controller.setup_environment(api.controller_url, executors_vms_ids,
                                         starting_cap, data)

            info_plugin = {
                "spark_submisson_url": master,
                "expected_time": expected_time,
                "number_of_jobs": number_of_jobs
            }

            plugin_log.log("%s | Starting monitor" %
                           (time.strftime("%H:%M:%S")))
            monitor.start_monitor(api.monitor_url, self.app_id, 'spark-mesos',
                                  info_plugin, 2)

            plugin_log.log("%s | Starting controller" %
                           (time.strftime("%H:%M:%S")))
            controller.start_controller(api.controller_url, self.app_id,
                                        executors_vms_ids, data)

            # Reading the command output blocks the plugin execution
            # until the job is done
            print(o.read())

            plugin_log.log("%s | Stopping monitor" %
                           (time.strftime("%H:%M:%S")))
            monitor.stop_monitor(api.monitor_url, self.app_id)

            plugin_log.log("%s | Stopping controller" %
                           (time.strftime("%H:%M:%S")))
            controller.stop_controller(api.controller_url, self.app_id)

            plugin_log.log("%s | Remove binaries" %
                           (time.strftime("%H:%M:%S")))
            conn.exec_command('rm -rf ~/exec_bin.*')

            plugin_log.log("%s | Finished application execution" %
                           (time.strftime("%H:%M:%S")))

            self.update_application_state("OK")
            return 'OK'

        except Exception as e:
            plugin_log.log(str(e))
            print(str(e))
            self.update_application_state("Error")
Example #6
    def _hdfs_spark_execution(self, master, remote_hdfs, key_path, args,
                              job_bin_url, main_class, dependencies,
                              spark_applications_ids, number_of_attempts):
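        """Run a Spark job whose binary lives in a remote HDFS.

        Simplified variant of the execution above: copies the binary from
        HDFS, enables the event log, submits the job, logs its output and
        copies the event log back from the cluster.
        """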

        job_exec_id = str(uuid.uuid4())[0:7]
        self._log("%s | Job execution ID: %s" %
                  (time.strftime("%H:%M:%S"), job_exec_id))

        # Defining params
        local_path = '/tmp/spark-jobs/' + job_exec_id + '/'

        job_binary_path = hdfs.get_path(job_bin_url)

        # Create temporary job directories
        self._log("%s | Create temporary job directories" %
                  time.strftime("%H:%M:%S"))
        self._mkdir(local_path)

        # Create cluster directories
        self._log("%s | Creating cluster directories" %
                  time.strftime("%H:%M:%S"))
        remote.execute_command(master, key_path, 'mkdir -p %s' % local_path)

        # Get job binary from hdfs
        self._log("%s | Get job binary from hdfs" % time.strftime("%H:%M:%S"))
        remote.copy_from_hdfs(master, key_path, remote_hdfs, job_binary_path,
                              local_path)

        # Enabling event log on cluster
        self._log("%s | Enabling event log on cluster" %
                  time.strftime("%H:%M:%S"))
        self._enable_event_log(master, key_path, local_path)

        # Submit job
        self._log("%s | Starting job" % time.strftime("%H:%M:%S"))

        local_binary_file = (
            local_path + remote.list_directory(key_path, master, local_path))

        spark_job = self._submit_job(master, key_path, main_class,
                                     dependencies, local_binary_file, args)

        spark_app_id = spark.get_running_app(master, spark_applications_ids,
                                             number_of_attempts)

        if spark_app_id is None:
            self._log("%s | Error on submission of application, "
                      "please check the config file" %
                      time.strftime("%H:%M:%S"))

            (output, err) = spark_job.communicate()
            self.stdout.log(output)
            self.stderr.log(err)

            raise ex.ConfigurationError()

        spark_applications_ids.append(spark_app_id)

        (output, err) = spark_job.communicate()

        self.stdout.log(output)
        self.stderr.log(err)

        self._log("%s | Copy log from cluster" % (time.strftime("%H:%M:%S")))
        event_log_path = local_path + 'eventlog/'
        self._mkdir(event_log_path)

        remote_event_log_path = 'ubuntu@%s:%s%s' % (master, local_path,
                                                    spark_app_id)

        remote.copy(key_path, remote_event_log_path, event_log_path)

        spark_applications_ids.remove(spark_app_id)

        self.update_application_state("OK")

        return 'OK'