def delete_job_resources(self):
    try:
        if self.enable_visualizer:
            visualizer.stop_visualization(api.visualizer_url, self.app_id,
                                          self.data['visualizer_info'])
        monitor.stop_monitor(api.monitor_url, self.app_id)
        controller.stop_controller(api.controller_url, self.app_id)

        self.visualizer_url = "Url is dead!"
        KUBEJOBS_LOG.log("Stopped services")

        # Terminate the Kubernetes job resources if the application
        # was not already terminated
        if not self.get_application_state() == 'terminated':
            self.k8s.terminate_job(self.app_id)
    except Exception:
        KUBEJOBS_LOG.log("Job " + self.app_id +
                         " resources already deleted!")

    self.del_resources_authorization = False
    self.persist_state()
def _hdfs_spark_execution(self, master, remote_hdfs, key_path, args,
                          job_bin_url, main_class, dependencies,
                          spark_applications_ids, expected_time,
                          monitor_plugin, collect_period, number_of_jobs,
                          workers_id, data, connector, swift,
                          swift_logdir, container, number_of_attempts):

    job_exec_id = str(uuid.uuid4())[0:7]
    self._log("%s | Job execution ID: %s" %
              (time.strftime("%H:%M:%S"), job_exec_id))

    # Defining params
    local_path = '/tmp/spark-jobs/' + job_exec_id + '/'
    # remote_path = 'ubuntu@' + master + ':' + local_path

    job_input_paths, job_output_path, job_params = (
        hdfs.get_job_params(key_path, remote_hdfs, args))

    job_binary_path = hdfs.get_path(job_bin_url)

    # Create temporary job directories
    self._log("%s | Create temporary job directories" %
              (time.strftime("%H:%M:%S")))
    self._mkdir(local_path)

    # Create cluster directories
    self._log("%s | Creating cluster directories" %
              (time.strftime("%H:%M:%S")))
    remote.execute_command(master, key_path, 'mkdir -p %s' % local_path)

    # Get job binary from hdfs
    self._log("%s | Get job binary from hdfs" %
              (time.strftime("%H:%M:%S")))
    remote.copy_from_hdfs(master, key_path, remote_hdfs,
                          job_binary_path, local_path)

    # Enabling event log on cluster
    self._log("%s | Enabling event log on cluster" %
              (time.strftime("%H:%M:%S")))
    self._enable_event_log(master, key_path, local_path)

    # Submit job
    self._log("%s | Starting job" % (time.strftime("%H:%M:%S")))

    local_binary_file = (
        local_path + remote.list_directory(key_path, master, local_path))

    spark_job = self._submit_job(master, key_path, main_class,
                                 dependencies, local_binary_file, args)

    spark_app_id = spark.get_running_app(master, spark_applications_ids,
                                         number_of_attempts)
    if spark_app_id is None:
        self._log("%s | Error on submission of application, "
                  "please check the config file" %
                  (time.strftime("%H:%M:%S")))

        (output, err) = spark_job.communicate()
        self.stdout.log(output)
        self.stderr.log(err)
        raise ex.ConfigurationError()

    spark_applications_ids.append(spark_app_id)

    info_plugin = {
        "spark_submisson_url": "http://" + master,
        "expected_time": expected_time,
        "number_of_jobs": number_of_jobs
    }

    self._log("%s | Starting monitor" % (time.strftime("%H:%M:%S")))
    monitor.start_monitor(api.monitor_url, spark_app_id, monitor_plugin,
                          info_plugin, collect_period)

    self._log("%s | Starting controller" % (time.strftime("%H:%M:%S")))
    controller.start_controller(api.controller_url, spark_app_id,
                                workers_id, data)

    (output, err) = spark_job.communicate()

    self._log("%s | Stopping monitor" % (time.strftime("%H:%M:%S")))
    monitor.stop_monitor(api.monitor_url, spark_app_id)

    self._log("%s | Stopping controller" % (time.strftime("%H:%M:%S")))
    controller.stop_controller(api.controller_url, spark_app_id)

    self.stdout.log(output)
    self.stderr.log(err)

    self._log("%s | Copy log from cluster" % (time.strftime("%H:%M:%S")))

    event_log_path = local_path + 'eventlog/'
    self._mkdir(event_log_path)

    remote_event_log_path = 'ubuntu@%s:%s%s' % (master, local_path,
                                                spark_app_id)
    remote.copy(key_path, remote_event_log_path, event_log_path)

    self._log("%s | Upload log to Swift" % (time.strftime("%H:%M:%S")))
    connector.upload_directory(swift, event_log_path, swift_logdir,
                               container)

    spark_applications_ids.remove(spark_app_id)
    self.update_application_state("OK")
    return 'OK'
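# A minimal sketch (assumption, not the project's implementation) of how the
# _submit_job() helper used above could look: the caller treats its return
# value like a subprocess handle and calls .communicate() on it, so starting
# spark-submit on the master over SSH with subprocess.Popen would fit. The
# exact spark-submit flags and the 'ubuntu' user are assumptions.
def _submit_job_sketch(master, key_path, main_class, dependencies,
                       binary_file, args):
    import subprocess

    spark_submit = 'spark-submit '
    if main_class:
        spark_submit += '--class %s ' % main_class
    if dependencies:
        spark_submit += '--jars %s ' % dependencies
    spark_submit += '%s %s' % (binary_file, args)

    # Popen gives the caller a handle whose communicate() returns
    # (stdout, stderr) once spark-submit exits on the master.
    return subprocess.Popen(
        ['ssh', '-i', key_path, 'ubuntu@%s' % master, spark_submit],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)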
def _swift_spark_execution(self, master, key_path, sahara, connector,
                           job_binary_name, job_binary_url, user, password,
                           job_template_name, job_type, plugin,
                           cluster_size, args, main_class, cluster_id,
                           spark_applications_ids, workers_id, app_id,
                           expected_time, monitor_plugin, collect_period,
                           number_of_jobs, log_path, swift, container,
                           data, number_of_attempts):

    # Preparing job
    job_binary_id = self._get_job_binary_id(sahara, connector,
                                            job_binary_name,
                                            job_binary_url, user, password)

    mains = [job_binary_id]
    job_template_id = self._get_job_template_id(sahara, connector, mains,
                                                job_template_name,
                                                job_type)

    self._log("%s | Starting job..." % (time.strftime("%H:%M:%S")))

    # Running job
    # What is os_utils?
    # configs = os_utils.get_job_config(connector, plugin,
    #                                   cluster_size, user, password,
    #                                   args, main_class)
    configs = None
    job = connector.create_job_execution(sahara, job_template_id,
                                         cluster_id, configs=configs)

    self._log("%s | Created job" % (time.strftime("%H:%M:%S")))

    spark_app_id = spark.get_running_app(master, spark_applications_ids,
                                         number_of_attempts)
    spark_applications_ids.append(spark_app_id)

    self._log("%s | Spark app id: %s" %
              (time.strftime("%H:%M:%S"), spark_app_id))

    job_exec_id = job.id

    for worker_id in workers_id:
        instances_log.log("%s|%s" % (app_id, worker_id))

    job_status = connector.get_job_status(sahara, job_exec_id)
    self._log("%s | Sahara job status: %s" %
              (time.strftime("%H:%M:%S"), job_status))

    info_plugin = {
        "spark_submisson_url": "http://" + master,
        "expected_time": expected_time,
        "number_of_jobs": number_of_jobs
    }

    self._log("%s | Starting monitor" % (time.strftime("%H:%M:%S")))
    monitor.start_monitor(api.monitor_url, spark_app_id, monitor_plugin,
                          info_plugin, collect_period)

    self._log("%s | Starting controller" % (time.strftime("%H:%M:%S")))
    controller.start_controller(api.controller_url, spark_app_id,
                                workers_id, data)

    job_status = self._wait_on_job_finish(sahara, connector, job_exec_id,
                                          app_id)

    self._log("%s | Stopping monitor" % (time.strftime("%H:%M:%S")))
    monitor.stop_monitor(api.monitor_url, spark_app_id)

    self._log("%s | Stopping controller" % (time.strftime("%H:%M:%S")))
    controller.stop_controller(api.controller_url, spark_app_id)

    spark_applications_ids.remove(spark_app_id)
    self._log("Finished application execution")

    if connector.is_job_completed(job_status):
        self.update_application_state("OK")
    if connector.is_job_failed(job_status):
        self.update_application_state("Error")

    return job_status
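# A minimal sketch (assumption) of the _wait_on_job_finish() helper called
# above: poll Sahara through the connector until the job reaches a terminal
# state, then return that status. The polling interval and the absence of a
# timeout are assumptions; the real helper may also log per-iteration status.
def _wait_on_job_finish_sketch(sahara, connector, job_exec_id, app_id,
                               poll_interval=5):
    import time

    while True:
        job_status = connector.get_job_status(sahara, job_exec_id)
        if (connector.is_job_completed(job_status) or
                connector.is_job_failed(job_status)):
            return job_status
        time.sleep(poll_interval)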
def start_application(self, data):
    try:
        self.update_application_state("Running")

        plugin_log.log("%s | Starting application execution" %
                       (time.strftime("%H:%M:%S")))

        binary_url = str(data['binary_url'])
        execution_class = str(data['execution_class'])
        execution_parameters = str(data['execution_parameters'])
        expected_time = int(data['expected_time'])
        number_of_jobs = int(data['number_of_jobs'])
        starting_cap = int(data['starting_cap'])

        # Optimizer integration
        app_name = data['app_name']
        days = 0
        if app_name.lower() == 'bulma':
            if 'days' in data.keys():
                days = data['days']
            else:
                self._log("%s | 'days' parameter missing" %
                          (time.strftime("%H:%M:%S")))
                raise ex.ConfigurationError()

        cores, vms = optimizer.get_info(api.optimizer_url, expected_time,
                                        app_name, days)
        optimizer_command = ''
        if cores >= 0:
            optimizer_command = ' --total-executor-cores %d ' % cores

        plugin_log.log("%s | Submission id: %s" %
                       (time.strftime("%H:%M:%S"), self.app_id))

        plugin_log.log("%s | Connecting with Mesos cluster..." %
                       (time.strftime("%H:%M:%S")))

        conn = ssh.get_connection(api.mesos_url, api.cluster_username,
                                  api.cluster_password,
                                  api.cluster_key_path)

        plugin_log.log("%s | Connected with Mesos cluster" %
                       (time.strftime("%H:%M:%S")))

        # Build the spark-submit command line. If the execution class
        # field is empty, the job binary is a Python script; otherwise
        # it is a jar that needs --class.
        if execution_class != "" and execution_class is not None:
            binary_path = '~/exec_bin.jar'
            spark_run = ('sudo %s --name %s ' +
                         '--master mesos://%s:%s ' +
                         optimizer_command +
                         '--class %s %s %s')
        else:
            binary_path = '~/exec_bin.py'
            spark_run = ('sudo %s --name %s ' +
                         '--master mesos://%s:%s ' +
                         optimizer_command +
                         '%s %s %s')

        plugin_log.log("%s | Download the binary to cluster" %
                       (time.strftime("%H:%M:%S")))

        try:
            stdin, stdout, stderr = conn.exec_command(
                'wget %s -O %s' % (binary_url, binary_path))

            plugin_log.log("%s | Waiting for download of the binary..." %
                           (time.strftime("%H:%M:%S")))

            # TODO: Fix possible wget error
            stdout.read()

            plugin_log.log("%s | Binary downloaded" %
                           (time.strftime("%H:%M:%S")))
        except Exception:
            plugin_log.log("%s | Error downloading binary" %
                           (time.strftime("%H:%M:%S")))
            self.update_application_state("Error")
            return "Error"

        i, o, e = conn.exec_command(
            spark_run % (api.spark_path, self.app_id, api.mesos_url,
                         api.mesos_port, execution_class, binary_path,
                         execution_parameters))

        # Discover the IPs of the executors from Mesos
        # and their IDs on KVM using those IPs
        list_vms_one = (
            'onevm list --user %s --password %s --endpoint %s' %
            (api.one_username, api.one_password, api.one_url))

        stdin, stdout, stderr = conn.exec_command(list_vms_one)
        list_response = stdout.read()

        vms_ips, master = mesos.get_executors_ip(conn, self.frameworks_url,
                                                 self.app_id)
        plugin_log.log("%s | Master: %s" %
                       (time.strftime("%H:%M:%S"), master))

        plugin_log.log("%s | Executors: %s" %
                       (time.strftime("%H:%M:%S"), vms_ips))

        vms_ids = mesos.extract_vms_ids(list_response)
        plugin_log.log("%s | Executors IDs: %s" %
                       (time.strftime("%H:%M:%S"), vms_ids))

        executors_vms_ids = []
        for ip in vms_ips:
            for id in vms_ids:
                vm_info_one = (
                    'onevm show %s '
                    '--user %s '
                    '--password %s '
                    '--endpoint %s' % (id, api.one_username,
                                       api.one_password, api.one_url))

                stdin, stdout, stderr = conn.exec_command(vm_info_one)

                if ip in stdout.read():
                    executors_vms_ids.append(id)
                    break

        plugin_log.log("%s | Executors VMs IDs: %s" %
                       (time.strftime("%H:%M:%S"), executors_vms_ids))

        # Set up the initial configuration of CPU cap
        controller.setup_environment(api.controller_url, executors_vms_ids,
                                     starting_cap, data)

        info_plugin = {
            "spark_submisson_url": master,
            "expected_time": expected_time,
            "number_of_jobs": number_of_jobs
        }

        plugin_log.log("%s | Starting monitor" %
                       (time.strftime("%H:%M:%S")))
        monitor.start_monitor(api.monitor_url, self.app_id, 'spark-mesos',
                              info_plugin, 2)

        plugin_log.log("%s | Starting controller" %
                       (time.strftime("%H:%M:%S")))
        controller.start_controller(api.controller_url, self.app_id,
                                    executors_vms_ids, data)

        # This call blocks the plugin execution
        # until the job is done
        print o.read()

        plugin_log.log("%s | Stopping monitor" %
                       (time.strftime("%H:%M:%S")))
        monitor.stop_monitor(api.monitor_url, self.app_id)

        plugin_log.log("%s | Stopping controller" %
                       (time.strftime("%H:%M:%S")))
        controller.stop_controller(api.controller_url, self.app_id)

        plugin_log.log("%s | Remove binaries" %
                       (time.strftime("%H:%M:%S")))
        conn.exec_command('rm -rf ~/exec_bin.*')

        plugin_log.log("%s | Finished application execution" %
                       (time.strftime("%H:%M:%S")))

        self.update_application_state("OK")
        return 'OK'

    except Exception as e:
        plugin_log.log(e.message)
        print e.message
        self.update_application_state("Error")
def start_application(self, data):
    try:
        self.update_application_state("Running")

        user = api.user
        password = api.password
        project_id = api.project_id
        auth_ip = api.auth_ip
        domain = api.domain
        public_key = api.public_key

        connector = os_connector.OpenStackConnector(LOG)

        nova = connector.get_nova_client(user, password, project_id,
                                         auth_ip, domain)

        monitor_plugin = data['monitor_plugin']
        expected_time = data['expected_time']
        log_path = data['log_path']
        image_id = data['image_id']
        flavor_id = data['flavor_id']
        command = data['command']
        cluster_size = data['cluster_size']
        starting_cap = data['scaling_parameters']["starting_cap"]

        app_start_time = 0
        app_end_time = 0

        LOG.log("Creating instance(s)")
        print "Creating instance(s)..."

        # Create a number of instances to run the application based on
        # cluster_size, image_id, flavor_id and public_key
        instances = self._create_instances(nova, connector, image_id,
                                           flavor_id, public_key,
                                           cluster_size)

        LOG.log("Waiting until instances become active...")
        print "Waiting until instances become active..."

        # Retrieve network information from all instances once they
        # reach the ACTIVE state
        instances_nets = []
        for instance_id in instances:
            instance_status = connector.get_instance_status(nova,
                                                            instance_id)
            while instance_status != 'ACTIVE':
                instance_status = connector.get_instance_status(
                    nova, instance_id)

            instance_ips = connector.get_instance_networks(nova,
                                                           instance_id)
            instances_nets.append(instance_ips)
            time.sleep(5)

        time.sleep(30)

        LOG.log("Checking if ssh is available")
        print "Checking if ssh is available"

        # Verify if ssh is available for any ip address of each instance
        instances_ips = []
        for instance_net in instances_nets:
            for net_ip_list in instance_net.values():
                for ip in net_ip_list:
                    attempts = 2
                    while attempts != -1:
                        try:
                            conn = self._get_ssh_connection(ip,
                                                            api.key_path)
                            instances_ips.append(ip)
                            attempts = -1
                        except Exception as e:
                            LOG.log("Fail to connect")
                            LOG.log(e.message)

                            print "Fail to connect"
                            print e.message

                            attempts -= 1
                            time.sleep(30)

        LOG.log("Setting up environment")
        print "Setting up environment"

        # Set CPU cap in all instances
        controller.setup_environment(api.controller_url, instances,
                                     starting_cap, data)

        # Execute application and start monitor and controller services
        applications = []
        for ip in instances_ips:
            LOG.log("Executing commands into the instance")
            print "Executing commands into the instance"

            # TODO Check if exec_command will work without blocking exec
            conn = self._get_ssh_connection(ip, api.key_path)
            conn.exec_command(command)

            app_start_time = time.time()

            app_id = "app-os-generic" + str(uuid.uuid4())[:8]
            applications.append(app_id)

            info_plugin = {
                "host_ip": ip,
                "log_path": log_path,
                "expected_time": expected_time
            }

            collect_period = 1
            try:
                LOG.log("Starting monitoring")
                print "Starting monitoring"

                monitor.start_monitor(api.monitor_url, app_id,
                                      monitor_plugin, info_plugin,
                                      collect_period)

                LOG.log("Starting scaling")
                print "Starting scaling"

                controller.start_controller(api.controller_url, app_id,
                                            instances, data)
            except Exception as e:
                LOG.log(e.message)
                print e.message

        # Stop monitor and controller when each application stops
        application_running = True
        while application_running:
            status_instances = []
            for instance_id in instances:
                status = connector.get_instance_status(nova, instance_id)
                status_instances.append(status)

            if self._instances_down(status_instances):
                application_running = False
                app_end_time = time.time()

                LOG.log("Application finished")
                print "Application finished"

                for app_id in applications:
                    LOG.log("Stopping monitoring")
                    print "Stopping monitoring"
                    monitor.stop_monitor(api.monitor_url, app_id)

                    LOG.log("Stopping scaling")
                    print "Stopping scaling"
                    controller.stop_controller(api.controller_url, app_id)
            else:
                time.sleep(2)

        LOG.log("Removing instances...")
        print "Removing instances..."

        # Remove instances after the end of all applications
        self._remove_instances(nova, connector, instances)

        LOG.log("Finished application execution")
        print "Finished application execution"

        application_time = app_end_time - app_start_time
        application_time_log.log("%s|%.0f|%.0f" %
                                 (app_id, app_start_time,
                                  application_time))
        self.application_time = application_time
        self.start_time = app_start_time

        self.update_application_state("OK")
        return str(application_time)

    except Exception as e:
        LOG.log(e.message)
        print e.message
        self.update_application_state("Error")
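# A minimal sketch (assumption) of the _instances_down() helper used in the
# wait loop above: the application is considered finished once no instance is
# still ACTIVE. Which Nova states count as "down" here is an assumption.
def _instances_down_sketch(status_instances):
    return all(status != 'ACTIVE' for status in status_instances)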
def start_application(self, data):
    try:
        # Download the file that contains the work items
        jobs = requests.get(data['redis_workload']).text.\
            split('\n')[:-1]

        # Provision a redis database for the job. Die in case of error.
        # TODO(clenimar): configure ``timeout`` via a request param,
        # e.g. api.redis_creation_timeout.
        redis_ip, redis_port = self.k8s.provision_redis_or_die(self.app_id)
        # agent_port = k8s.create_cpu_agent(self.app_id)

        # Inject REDIS_HOST in the environment
        data['env_vars']['REDIS_HOST'] = 'redis-%s' % self.app_id

        # Inject SCONE_CONFIG_ID in the environment
        # FIXME: make SCONE_CONFIG_ID optional in submission
        data['env_vars']['SCONE_CONFIG_ID'] = data['config_id']

        # Create a new Redis client and fill the work queue
        if self.rds is None:
            self.rds = redis.StrictRedis(host=redis_ip, port=redis_port)

        queue_size = len(jobs)

        # Check if a visualizer will be created
        self.enable_visualizer = data['enable_visualizer']

        # Create all visualizer components
        if self.enable_visualizer:
            # Specify the datasource to be used in the visualization
            datasource_type = data['visualizer_info']['datasource_type']

            if datasource_type == "influxdb":
                database_data = k8s.create_influxdb(self.app_id)
                # TODO(javan): change the name of redis_ip to node_ip
                # in the configuration file
                database_data.update({"url": api.redis_ip})
                data['monitor_info'].update(
                    {'database_data': database_data})
                data['visualizer_info'].update(
                    {'database_data': database_data})

            data['monitor_info'].update(
                {'datasource_type': datasource_type})

            print "Creating visualization platform"

            data['visualizer_info'].update({
                'enable_visualizer': data['enable_visualizer'],
                'plugin': data['monitor_plugin'],
                'visualizer_plugin': data['visualizer_plugin'],
                'username': data['username'],
                'password': data['password']
            })

            visualizer.start_visualization(api.visualizer_url,
                                           self.app_id,
                                           data['visualizer_info'])

            self.visualizer_url = visualizer.get_visualizer_url(
                api.visualizer_url, self.app_id)

            print "Dashboard of the job created on: %s" % (
                self.visualizer_url)

        print "Creating Redis queue"
        for job in jobs:
            self.rds.rpush("job", job)

        print "Creating Job"
        self.k8s.create_job(self.app_id, data['cmd'], data['img'],
                            data['init_size'], data['env_vars'],
                            config_id=data["config_id"])

        starting_time = datetime.datetime.now().\
            strftime('%Y-%m-%dT%H:%M:%S.%fGMT')

        # Starting monitor
        data['monitor_info'].update({
            'count_jobs_url': api.count_queue,
            'number_of_jobs': queue_size,
            'submission_time': starting_time,
            'redis_ip': redis_ip,
            'redis_port': redis_port,
            'enable_visualizer': self.enable_visualizer
        })  # , 'cpu_agent_port': agent_port})

        monitor.start_monitor(api.monitor_url, self.app_id,
                              data['monitor_plugin'],
                              data['monitor_info'], 2)

        # Starting controller
        data.update({'redis_ip': redis_ip, 'redis_port': redis_port})
        controller.start_controller_k8s(api.controller_url,
                                        self.app_id, data)

        while not self.job_completed and not self.terminated:
            self.update_application_state("ongoing")
            self.job_completed = self.k8s.completed(self.app_id)
            time.sleep(1)

        # Stop monitor, controller and visualizer
        if self.get_application_state() == "ongoing":
            self.update_application_state("completed")

        print "Job finished"

        time.sleep(float(30))

        if self.enable_visualizer:
            visualizer.stop_visualization(api.visualizer_url, self.app_id,
                                          data['visualizer_info'])
        monitor.stop_monitor(api.monitor_url, self.app_id)
        controller.stop_controller(api.controller_url, self.app_id)

        print "Stopped services"

        # Delete redis resources if the application was not
        # already terminated
        if not self.get_application_state() == 'terminated':
            self.k8s.delete_redis_resources(self.app_id)

    except Exception as ex:
        self.update_application_state("error")
        print "ERROR: %s" % ex

    print "Application finished."
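# A minimal sketch (assumption) of how a worker inside the Kubernetes job
# could drain the Redis queue filled above with RPUSH: pop items from the
# "job" list until it is empty and hand each one to the job command. The
# queue name matches the one used above; how an item is executed and when
# the worker exits are assumptions.
import redis


def consume_jobs_sketch(redis_host, redis_port, run_item):
    rds = redis.StrictRedis(host=redis_host, port=redis_port)
    while True:
        item = rds.lpop("job")
        if item is None:
            break  # queue drained; the worker can exit
        run_item(item)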
def start_application(self, data):
    try:
        # Download the file that contains the work items
        jobs = requests.get(data['redis_workload']).text.\
            split('\n')[:-1]

        # Provision a redis database for the job. Die in case of error.
        # TODO(clenimar): configure ``timeout`` via a request param,
        # e.g. api.redis_creation_timeout.
        redis_ip, redis_port = k8s.provision_redis_or_die(self.app_id)

        self.rds = redis.StrictRedis(host=redis_ip, port=redis_port)

        queue_size = len(jobs)

        print "Creating Redis queue"
        for job in jobs:
            self.rds.rpush("job", job)

        print "Creating Job"
        k8s.create_job(self.app_id, data['cmd'], data['img'],
                       data['init_size'], data['env_vars'],
                       config_id=data["config_id"])

        starting_time = datetime.datetime.now().\
            strftime('%Y-%m-%dT%H:%M:%S.%fGMT')

        # Starting monitor
        data['monitor_info'].update({
            'count_jobs_url': api.count_queue,
            'number_of_jobs': queue_size,
            'submission_time': starting_time,
            'redis_ip': redis_ip,
            'redis_port': redis_port,
            'graphic_metrics': data['graphic_metrics']
        })

        monitor.start_monitor(api.monitor_url, self.app_id,
                              data['monitor_plugin'],
                              data['monitor_info'], 2)

        # Starting controller
        data.update({'redis_ip': redis_ip, 'redis_port': redis_port})
        controller.start_controller_k8s(api.controller_url,
                                        self.app_id, data)

        while not self.job_completed and not self.terminated:
            self.update_application_state("ongoing")
            self.job_completed = k8s.completed(self.app_id)
            time.sleep(1)

        # Stop monitor and controller
        if self.get_application_state() == "ongoing":
            self.update_application_state("completed")

        print "Job finished"

        monitor.stop_monitor(api.monitor_url, self.app_id)
        controller.stop_controller(api.controller_url, self.app_id)

        print "Stopped services"

        # Delete redis resources if the application was not
        # already terminated
        time.sleep(float(30))
        if not self.get_application_state() == 'terminated':
            k8s.delete_redis_resources(self.app_id)

    except Exception as ex:
        self.update_application_state("error")
        print "ERROR: %s" % ex

    print "Application finished."