def _load_plugins(self):
    config_plugins = api.plugins
    extension_manager = enabled.EnabledExtensionManager(
        check_func=lambda ext: ext.name in config_plugins,
        namespace='broker.execution.plugins',
        invoke_on_load=True)

    for ext in extension_manager.extensions:
        if ext.name in self.plugins:
            raise ex.ConfigurationError(
                ("Plugin with name '%s' already exists.") % ext.name)

        ext.obj.name = ext.name
        self.plugins[ext.name] = ext.obj

        LOG.log("Plugin {plugin_name} loaded {entry_point}".format(
            plugin_name=ext.name,
            entry_point=ext.entry_point_target))

    if len(self.plugins) < len(config_plugins):
        loaded_plugins = set(six.iterkeys(self.plugins))
        requested_plugins = set(config_plugins)
        raise ex.ConfigurationError(
            ("Plugins couldn't be loaded: %s")
            % ", ".join(requested_plugins - loaded_plugins))
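# Illustrative sketch (not part of the source): _load_plugins above discovers
# plugins through stevedore's EnabledExtensionManager, so an execution plugin
# only needs to be published as an entry point in the
# 'broker.execution.plugins' namespace and enabled in the configuration read
# as api.plugins. The package, module, and class names below are hypothetical
# placeholders, as is the plugin's method set.
#
# setup.cfg of a hypothetical plugin package:
#   [entry_points]
#   broker.execution.plugins =
#       noop = my_broker_plugins.noop:NoopExecutionPlugin
#
# my_broker_plugins/noop.py
class NoopExecutionPlugin(object):
    """Minimal plugin object; the 'name' attribute is set by _load_plugins."""

    def start_application(self, data):
        # A real plugin would run the submitted application here.
        return 'OK'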
def _validate_settings(self, instance_name=None):
    """Load and validate provider settings.

    Each provider's settings must include an ``instances`` list with
    specific instance details. General provider settings should live at
    the top level for that provider. One instance should have a
    ``default`` key set to True.

    :param instance_name: A string matching an instance name
    """
    section_name = self.__class__.__name__.upper()
    # make sure each instance isn't loading values from another
    fresh_settings = settings.get_fresh(section_name)
    instance, default = None, False
    for candidate in fresh_settings.instances:
        if instance_name in candidate:
            instance = candidate
            default = False
        elif (list(candidate.values())[0].get("default")
              or len(fresh_settings.instances) == 1):
            instance = candidate
            default = True
    fresh_settings.update(list(instance.values())[0])
    if default:
        # if a default provider is selected, defer to loaded environment variables
        # settings[section_name] = fresh_settings
        # settings.execute_loaders(loaders=[dynaconf.loaders.env_loader])
        # ideal solution above. However, need to work around until
        # https://github.com/rochacbruno/dynaconf/issues/511 is resolved
        settings.execute_loaders()
        for key in fresh_settings.keys():
            if key in settings[section_name]:
                fresh_settings[key] = settings[section_name][key]
        settings[section_name] = fresh_settings
    else:
        settings[section_name] = fresh_settings
    # temporary workaround for https://github.com/rochacbruno/dynaconf/issues/508
    # remove the current validators, add ours, and validate,
    # then add the other validators back in and move on
    current_validators = settings.validators[:]
    settings.validators.clear()
    settings.validators.extend(self._validators)
    try:
        settings.validators.validate()
    except dynaconf.ValidationError as err:
        raise exceptions.ConfigurationError(err)
    settings.validators.extend(current_validators)
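# Illustrative sketch (not part of the source): _validate_settings above
# expects the provider section to carry its general settings at the top level
# plus an 'instances' list, with one instance flagged "default": true, and it
# validates the merged result against the provider's _validators. The provider
# name, keys, and file layout below are assumptions for illustration only.
#
# settings file (e.g. a dynaconf YAML source), hypothetical section:
#   EXAMPLEPROVIDER:
#     timeout: 60                  # general provider setting (top level)
#     instances:
#       - staging:
#           hostname: staging.example.com
#       - production:
#           hostname: prod.example.com
#           default: true          # used when no instance_name is passed
#
# A provider subclass might then declare validators such as:
from dynaconf import Validator

_validators = [
    Validator("EXAMPLEPROVIDER.hostname", must_exist=True),
    Validator("EXAMPLEPROVIDER.timeout", is_type_of=int, default=60),
]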
def start_application(self, data, spark_applications_ids, app_id):
    try:
        self.update_application_state("Running")

        # Broker Parameters
        cluster_id = None
        user = api.user
        password = api.password
        project_id = api.project_id
        auth_ip = api.auth_ip
        domain = api.domain
        public_key = api.public_key
        key_path = api.key_path
        log_path = api.log_path
        container = api.container
        hosts = api.hosts
        remote_hdfs = api.remote_hdfs
        swift_logdir = api.swift_logdir
        number_of_attempts = api.number_of_attempts
        dummy_opportunistic = api.dummy_opportunistic

        # User Request Parameters
        net_id = data['net_id']
        master_ng = data['master_ng']
        slave_ng = data['slave_ng']
        op_slave_ng = data['opportunistic_slave_ng']
        opportunism = str(data['opportunistic'])
        plugin = data['openstack_plugin']
        percentage = int(data['percentage'])
        job_type = data['job_type']
        version = data['version']
        args = data['args']
        main_class = data['main_class']
        dependencies = data['dependencies']
        job_template_name = data['job_template_name']
        job_binary_name = data['job_binary_name']
        job_binary_url = data['job_binary_url']
        image_id = data['image_id']
        monitor_plugin = data['monitor_plugin']
        expected_time = data['expected_time']
        collect_period = data['collect_period']
        number_of_jobs = data['number_of_jobs']
        starting_cap = data['starting_cap']

        # Optimizer Parameters
        app_name = data['app_name']
        days = 0
        if app_name.lower() == 'bulma':
            if 'days' in data.keys():
                days = data['days']
            else:
                self._log("%s | 'days' parameter missing" %
                          (time.strftime("%H:%M:%S")))
                raise ex.ConfigurationError()

        # Openstack Components
        connector = os_connector.OpenStackConnector(plugin_log)

        sahara = connector.get_sahara_client(user, password, project_id,
                                             auth_ip, domain)

        swift = connector.get_swift_client(user, password, project_id,
                                           auth_ip, domain)

        nova = connector.get_nova_client(user, password, project_id,
                                         auth_ip, domain)

        # Optimizer gets the vcpu size of the flavor
        cores_per_slave = connector.get_vcpus_by_nodegroup(nova, sahara,
                                                           slave_ng)

        cores, vms = optimizer.get_info(api.optimizer_url, expected_time,
                                        app_name, days)

        if cores <= 0:
            if 'cluster_size' in data.keys():
                req_cluster_size = data['cluster_size']
            else:
                self._log("%s | 'cluster_size' parameter missing" %
                          (time.strftime("%H:%M:%S")))
                raise ex.ConfigurationError()
        else:
            req_cluster_size = int(
                math.ceil(cores / float(cores_per_slave)))

        # Check opportunism
        if opportunism == "True":
            self._log("%s | Checking if opportunistic instances "
                      "are available" % (time.strftime("%H:%M:%S")))

            pred_cluster_size = optimizer.get_cluster_size(
                api.optimizer_url, hosts, percentage, dummy_opportunistic)
        else:
            pred_cluster_size = req_cluster_size

        if pred_cluster_size > req_cluster_size:
            cluster_size = pred_cluster_size
        else:
            cluster_size = req_cluster_size

        self._log("%s | Cluster size: %s" %
                  (time.strftime("%H:%M:%S"), str(cluster_size)))

        self._log("%s | Creating cluster..." %
                  (time.strftime("%H:%M:%S")))

        cluster_id = self._create_cluster(sahara, connector,
                                          req_cluster_size,
                                          pred_cluster_size, public_key,
                                          net_id, image_id, plugin,
                                          version, master_ng, slave_ng,
                                          op_slave_ng)

        self._log("%s | Cluster id: %s" %
                  (time.strftime("%H:%M:%S"), cluster_id))

        swift_path = self._is_swift_path(args)

        if cluster_id:
            master = connector.get_master_instance(
                sahara, cluster_id)['internal_ip']

            self._log("%s | Master is %s" %
                      (time.strftime("%H:%M:%S"), master))

            workers = connector.get_worker_instances(sahara, cluster_id)
            workers_id = []

            for worker in workers:
                workers_id.append(worker['instance_id'])

            self._log("%s | Configuring controller" %
                      (time.strftime("%H:%M:%S")))

            controller.setup_environment(api.controller_url, workers_id,
                                         starting_cap, data)

            if swift_path:
                job_status = self._swift_spark_execution(
                    master, key_path, sahara, connector, job_binary_name,
                    job_binary_url, user, password, job_template_name,
                    job_type, plugin, cluster_size, args, main_class,
                    cluster_id, spark_applications_ids, workers_id, app_id,
                    expected_time, monitor_plugin, collect_period,
                    number_of_jobs, log_path, swift, container, data,
                    number_of_attempts)
            else:
                job_status = self._hdfs_spark_execution(
                    master, remote_hdfs, key_path, args, job_binary_url,
                    main_class, dependencies, spark_applications_ids,
                    expected_time, monitor_plugin, collect_period,
                    number_of_jobs, workers_id, data, connector, swift,
                    swift_logdir, container, number_of_attempts)
        else:
            # FIXME: exception type
            self.update_application_state("Error")
            raise ex.ClusterNotCreatedException()

        # Delete cluster
        self._log("%s | Delete cluster: %s" %
                  (time.strftime("%H:%M:%S"), cluster_id))
        connector.delete_cluster(sahara, cluster_id)

        self._log("%s | Finished application execution" %
                  (time.strftime("%H:%M:%S")))

        return job_status

    except KeyError as ke:
        self._log("%s | Parameter missing in submission: %s, "
                  "please check the config file" %
                  (time.strftime("%H:%M:%S"), str(ke)))
        self._log("%s | Finished application execution with error" %
                  (time.strftime("%H:%M:%S")))
        self.update_application_state("Error")

    except ex.ConfigurationError:
        self._log("%s | Finished application execution with error" %
                  (time.strftime("%H:%M:%S")))
        self.update_application_state("Error")

    except SaharaAPIException:
        self._log("%s | There is not enough resource to create a cluster" %
                  (time.strftime("%H:%M:%S")))
        self._log("%s | Finished application execution with error" %
                  (time.strftime("%H:%M:%S")))
        self.update_application_state("Error")

    except Exception:
        if cluster_id is not None:
            self._log("%s | Delete cluster: %s" %
                      (time.strftime("%H:%M:%S"), cluster_id))
            connector.delete_cluster(sahara, cluster_id)

        self._log("%s | Unknown error, please report to administrators "
                  "of WP3 infrastructure" % (time.strftime("%H:%M:%S")))
        self._log("%s | Finished application execution with error" %
                  (time.strftime("%H:%M:%S")))
        self.update_application_state("Error")
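# Illustrative sketch (not part of the source): a submission payload carrying
# the keys that the Sahara/Spark start_application above reads from 'data'.
# All values are placeholders; 'days' and 'cluster_size' are only consulted in
# the cases noted in the code (app_name == 'bulma', or when the optimizer
# returns cores <= 0).
example_submission = {
    'net_id': 'PLACEHOLDER-NET-ID',
    'master_ng': 'PLACEHOLDER-MASTER-NODEGROUP',
    'slave_ng': 'PLACEHOLDER-SLAVE-NODEGROUP',
    'opportunistic_slave_ng': 'PLACEHOLDER-OPPORTUNISTIC-NODEGROUP',
    'opportunistic': 'False',
    'openstack_plugin': 'PLACEHOLDER-SAHARA-PLUGIN',
    'percentage': 100,
    'job_type': 'PLACEHOLDER-JOB-TYPE',
    'version': 'PLACEHOLDER-VERSION',
    'args': ['PLACEHOLDER-INPUT-PATH', 'PLACEHOLDER-OUTPUT-PATH'],
    'main_class': 'org.example.Main',
    'dependencies': '',
    'job_template_name': 'PLACEHOLDER-TEMPLATE',
    'job_binary_name': 'PLACEHOLDER-BINARY-NAME',
    'job_binary_url': 'PLACEHOLDER-BINARY-URL',
    'image_id': 'PLACEHOLDER-IMAGE-ID',
    'monitor_plugin': 'PLACEHOLDER-MONITOR-PLUGIN',
    'expected_time': 600,
    'collect_period': 5,
    'number_of_jobs': 1,
    'starting_cap': 50,
    'app_name': 'example-app',
    # 'days': 1,            # required only when app_name.lower() == 'bulma'
    # 'cluster_size': 3,    # required only when the optimizer returns cores <= 0
}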
def _hdfs_spark_execution(self, master, remote_hdfs, key_path, args,
                          job_bin_url, main_class, dependencies,
                          spark_applications_ids, expected_time,
                          monitor_plugin, collect_period, number_of_jobs,
                          workers_id, data, connector, swift,
                          swift_logdir, container, number_of_attempts):

    job_exec_id = str(uuid.uuid4())[0:7]
    self._log("%s | Job execution ID: %s" %
              (time.strftime("%H:%M:%S"), job_exec_id))

    # Defining params
    local_path = '/tmp/spark-jobs/' + job_exec_id + '/'
    # remote_path = 'ubuntu@' + master + ':' + local_path

    job_input_paths, job_output_path, job_params = (
        hdfs.get_job_params(key_path, remote_hdfs, args))

    job_binary_path = hdfs.get_path(job_bin_url)

    # Create temporary job directories
    self._log("%s | Create temporary job directories" %
              (time.strftime("%H:%M:%S")))
    self._mkdir(local_path)

    # Create cluster directories
    self._log("%s | Creating cluster directories" %
              (time.strftime("%H:%M:%S")))
    remote.execute_command(master, key_path, 'mkdir -p %s' % local_path)

    # Get job binary from hdfs
    self._log("%s | Get job binary from hdfs" %
              (time.strftime("%H:%M:%S")))
    remote.copy_from_hdfs(master, key_path, remote_hdfs,
                          job_binary_path, local_path)

    # Enabling event log on cluster
    self._log("%s | Enabling event log on cluster" %
              (time.strftime("%H:%M:%S")))
    self._enable_event_log(master, key_path, local_path)

    # Submit job
    self._log("%s | Starting job" % (time.strftime("%H:%M:%S")))

    local_binary_file = (
        local_path + remote.list_directory(key_path, master, local_path))

    spark_job = self._submit_job(master, key_path, main_class,
                                 dependencies, local_binary_file, args)

    spark_app_id = spark.get_running_app(master, spark_applications_ids,
                                         number_of_attempts)
    if spark_app_id is None:
        self._log("%s | Error on submission of application, "
                  "please check the config file" %
                  (time.strftime("%H:%M:%S")))

        (output, err) = spark_job.communicate()
        self.stdout.log(output)
        self.stderr.log(err)

        raise ex.ConfigurationError()

    spark_applications_ids.append(spark_app_id)

    info_plugin = {
        "spark_submisson_url": "http://" + master,
        "expected_time": expected_time,
        "number_of_jobs": number_of_jobs
    }

    self._log("%s | Starting monitor" % (time.strftime("%H:%M:%S")))
    monitor.start_monitor(api.monitor_url, spark_app_id, monitor_plugin,
                          info_plugin, collect_period)

    self._log("%s | Starting controller" % (time.strftime("%H:%M:%S")))
    controller.start_controller(api.controller_url, spark_app_id,
                                workers_id, data)

    (output, err) = spark_job.communicate()

    self._log("%s | Stopping monitor" % (time.strftime("%H:%M:%S")))
    monitor.stop_monitor(api.monitor_url, spark_app_id)

    self._log("%s | Stopping controller" % (time.strftime("%H:%M:%S")))
    controller.stop_controller(api.controller_url, spark_app_id)

    self.stdout.log(output)
    self.stderr.log(err)

    self._log("%s | Copy log from cluster" % (time.strftime("%H:%M:%S")))

    event_log_path = local_path + 'eventlog/'
    self._mkdir(event_log_path)

    remote_event_log_path = 'ubuntu@%s:%s%s' % (master, local_path,
                                                spark_app_id)

    remote.copy(key_path, remote_event_log_path, event_log_path)

    self._log("%s | Upload log to Swift" % (time.strftime("%H:%M:%S")))
    connector.upload_directory(swift, event_log_path, swift_logdir,
                               container)

    spark_applications_ids.remove(spark_app_id)

    self.update_application_state("OK")

    return 'OK'
def start_application(self, data):
    try:
        self.update_application_state("Running")
        plugin_log.log("%s | Starting application execution" %
                       (time.strftime("%H:%M:%S")))

        binary_url = str(data['binary_url'])
        execution_class = str(data['execution_class'])
        execution_parameters = str(data['execution_parameters'])
        expected_time = int(data['expected_time'])
        number_of_jobs = int(data['number_of_jobs'])
        starting_cap = int(data['starting_cap'])

        # Optimizer integration
        app_name = data['app_name']
        days = 0
        if app_name.lower() == 'bulma':
            if 'days' in data.keys():
                days = data['days']
            else:
                self._log("%s | 'days' parameter missing" %
                          (time.strftime("%H:%M:%S")))
                raise ex.ConfigurationError()

        cores, vms = optimizer.get_info(api.optimizer_url, expected_time,
                                        app_name, days)
        optimizer_command = ''
        if cores >= 0:
            optimizer_command = ' --total-executor-cores %d ' % cores

        plugin_log.log("%s | Submission id: %s" %
                       (time.strftime("%H:%M:%S"), self.app_id))

        plugin_log.log("%s | Connecting with Mesos cluster..." %
                       (time.strftime("%H:%M:%S")))

        conn = ssh.get_connection(api.mesos_url, api.cluster_username,
                                  api.cluster_password,
                                  api.cluster_key_path)

        plugin_log.log("%s | Connected with Mesos cluster" %
                       (time.strftime("%H:%M:%S")))

        # Execute all the commands needed
        # to run a spark job from the command line
        if execution_class != "" and execution_class is not None:
            # If the class field is empty, it means that the
            # job binary is a python file
            binary_path = '~/exec_bin.jar'
            spark_run = ('sudo %s --name %s ' +
                         '--master mesos://%s:%s ' +
                         optimizer_command +
                         '--class %s %s %s')
        else:
            binary_path = '~/exec_bin.py'
            spark_run = ('sudo %s --name %s ' +
                         '--master mesos://%s:%s ' +
                         optimizer_command +
                         '%s %s %s')

        plugin_log.log("%s | Downloading the binary to the cluster" %
                       (time.strftime("%H:%M:%S")))

        try:
            stdin, stdout, stderr = conn.exec_command(
                'wget %s -O %s' % (binary_url, binary_path))

            plugin_log.log("%s | Waiting for the binary download..." %
                           (time.strftime("%H:%M:%S")))
            # TODO: Fix possible wget error
            stdout.read()

            plugin_log.log("%s | Binary downloaded" %
                           (time.strftime("%H:%M:%S")))
        except Exception:
            plugin_log.log("%s | Error downloading binary" %
                           (time.strftime("%H:%M:%S")))
            self.update_application_state("Error")
            return "Error"

        i, o, e = conn.exec_command(
            spark_run % (api.spark_path, self.app_id,
                         api.mesos_url, api.mesos_port,
                         execution_class, binary_path,
                         execution_parameters))

        # Discover the IPs of the executors from Mesos
        # and their IDs on KVM using the IPs
        list_vms_one = (
            'onevm list --user %s --password %s --endpoint %s' %
            (api.one_username, api.one_password, api.one_url))

        stdin, stdout, stderr = conn.exec_command(list_vms_one)

        list_response = stdout.read()

        vms_ips, master = mesos.get_executors_ip(conn, self.frameworks_url,
                                                 self.app_id)
        plugin_log.log("%s | Master: %s" %
                       (time.strftime("%H:%M:%S"), master))

        plugin_log.log("%s | Executors: %s" %
                       (time.strftime("%H:%M:%S"), vms_ips))

        vms_ids = mesos.extract_vms_ids(list_response)
        plugin_log.log("%s | Executors IDs: %s" %
                       (time.strftime("%H:%M:%S"), vms_ids))

        executors_vms_ids = []
        for ip in vms_ips:
            for vm_id in vms_ids:
                vm_info_one = (
                    'onevm show %s '
                    '--user %s '
                    '--password %s '
                    '--endpoint %s' % (vm_id, api.one_username,
                                       api.one_password, api.one_url))

                stdin, stdout, stderr = conn.exec_command(vm_info_one)

                if ip in stdout.read():
                    executors_vms_ids.append(vm_id)
                    break

        plugin_log.log("%s | Executors IDs: %s" %
                       (time.strftime("%H:%M:%S"), executors_vms_ids))

        # Set up the initial configuration of the cpu cap
        controller.setup_environment(api.controller_url, executors_vms_ids,
                                     starting_cap, data)

        info_plugin = {
            "spark_submisson_url": master,
            "expected_time": expected_time,
            "number_of_jobs": number_of_jobs
        }

        plugin_log.log("%s | Starting monitor" %
                       (time.strftime("%H:%M:%S")))
        monitor.start_monitor(api.monitor_url, self.app_id,
                              'spark-mesos', info_plugin, 2)

        plugin_log.log("%s | Starting controller" %
                       (time.strftime("%H:%M:%S")))
        controller.start_controller(api.controller_url, self.app_id,
                                    executors_vms_ids, data)

        # This read blocks the plugin execution
        # until the job is done
        print(o.read())

        plugin_log.log("%s | Stopping monitor" %
                       (time.strftime("%H:%M:%S")))
        monitor.stop_monitor(api.monitor_url, self.app_id)

        plugin_log.log("%s | Stopping controller" %
                       (time.strftime("%H:%M:%S")))
        controller.stop_controller(api.controller_url, self.app_id)

        plugin_log.log("%s | Removing binaries" %
                       (time.strftime("%H:%M:%S")))
        conn.exec_command('rm -rf ~/exec_bin.*')

        plugin_log.log("%s | Finished application execution" %
                       (time.strftime("%H:%M:%S")))

        self.update_application_state("OK")
        return 'OK'

    except Exception as e:
        plugin_log.log(str(e))
        print(str(e))
        self.update_application_state("Error")
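# Illustrative sketch (not part of the source): a submission payload with the
# keys the Mesos start_application above reads from 'data'. Values are
# placeholders; leave 'execution_class' empty for a Python binary, and add
# 'days' only when app_name.lower() == 'bulma'.
example_mesos_submission = {
    'binary_url': 'PLACEHOLDER-BINARY-URL',
    'execution_class': 'org.example.Main',   # '' when the binary is a .py file
    'execution_parameters': 'arg1 arg2',
    'expected_time': 600,
    'number_of_jobs': 1,
    'starting_cap': 50,
    'app_name': 'example-app',
    # 'days': 1,   # required only when app_name.lower() == 'bulma'
}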
def _hdfs_spark_execution(self, master, remote_hdfs, key_path, args,
                          job_bin_url, main_class, dependencies,
                          spark_applications_ids, number_of_attempts):

    job_exec_id = str(uuid.uuid4())[0:7]
    self._log("%s | Job execution ID: %s" %
              (time.strftime("%H:%M:%S"), job_exec_id))

    # Defining params
    local_path = '/tmp/spark-jobs/' + job_exec_id + '/'
    job_binary_path = hdfs.get_path(job_bin_url)

    # Create temporary job directories
    self._log("%s | Create temporary job directories" %
              time.strftime("%H:%M:%S"))
    self._mkdir(local_path)

    # Create cluster directories
    self._log("%s | Creating cluster directories" %
              time.strftime("%H:%M:%S"))
    remote.execute_command(master, key_path, 'mkdir -p %s' % local_path)

    # Get job binary from hdfs
    self._log("%s | Get job binary from hdfs" %
              time.strftime("%H:%M:%S"))
    remote.copy_from_hdfs(master, key_path, remote_hdfs,
                          job_binary_path, local_path)

    # Enabling event log on cluster
    self._log("%s | Enabling event log on cluster" %
              time.strftime("%H:%M:%S"))
    self._enable_event_log(master, key_path, local_path)

    # Submit job
    self._log("%s | Starting job" % time.strftime("%H:%M:%S"))

    local_binary_file = (
        local_path + remote.list_directory(key_path, master, local_path))

    spark_job = self._submit_job(master, key_path, main_class,
                                 dependencies, local_binary_file, args)

    spark_app_id = spark.get_running_app(master, spark_applications_ids,
                                         number_of_attempts)

    if spark_app_id is None:
        self._log("%s | Error on submission of application, "
                  "please check the config file" %
                  time.strftime("%H:%M:%S"))

        (output, err) = spark_job.communicate()
        self.stdout.log(output)
        self.stderr.log(err)

        raise ex.ConfigurationError()

    spark_applications_ids.append(spark_app_id)

    (output, err) = spark_job.communicate()

    self.stdout.log(output)
    self.stderr.log(err)

    self._log("%s | Copy log from cluster" % (time.strftime("%H:%M:%S")))

    event_log_path = local_path + 'eventlog/'
    self._mkdir(event_log_path)

    remote_event_log_path = 'ubuntu@%s:%s%s' % (master, local_path,
                                                spark_app_id)

    remote.copy(key_path, remote_event_log_path, event_log_path)

    spark_applications_ids.remove(spark_app_id)

    self.update_application_state("OK")
    return 'OK'