Exemplo n.º 1
0
    def delete_job_resources(self):
        """Stop the services attached to this job and release its resources.

        Stops the visualizer (only when it was enabled), the monitor and the
        controller, then asks Kubernetes to terminate the job unless the
        application state is already 'terminated'. Any exception raised along
        the way is reported as "resources already deleted" and swallowed.
        Whatever the outcome, the deletion authorization flag is cleared and
        the object state is persisted.
        """
        job_id = self.app_id
        try:
            # The visualizer only has to be stopped when it was requested.
            if self.enable_visualizer:
                visualizer_info = self.data['visualizer_info']
                visualizer.stop_visualization(api.visualizer_url, job_id,
                                              visualizer_info)

            monitor.stop_monitor(api.monitor_url, job_id)
            controller.stop_controller(api.controller_url, job_id)

            self.visualizer_url = "Url is dead!"
            KUBEJOBS_LOG.log("Stoped services")

            # A job that is already terminated needs no Kubernetes call.
            if self.get_application_state() != 'terminated':
                self.k8s.terminate_job(job_id)
        except Exception:
            # Best effort: every failure is treated as "nothing left to do".
            KUBEJOBS_LOG.log("Job " + job_id +
                             " resources already deleted!")

        self.del_resources_authorization = False
        self.persist_state()
Exemplo n.º 2
0
    def _hdfs_spark_execution(self, master, remote_hdfs, key_path, args,
                              job_bin_url, main_class, dependencies,
                              spark_applications_ids, expected_time,
                              monitor_plugin, collect_period, number_of_jobs,
                              workers_id, data, connector, swift, swift_logdir,
                              container, number_of_attempts):
        """Submit a Spark job fetched from HDFS and supervise it to the end.

        The job binary is copied from HDFS into a per-execution directory on
        the master, the job is submitted, and the monitor and controller
        services run for as long as the job does. Afterwards the job output
        is logged, the event log is copied from the master and uploaded to
        Swift, and the application state is set to "OK".

        Args:
            master: address of the Spark master; also used to build the
                submission URL given to the monitor.
            remote_hdfs: HDFS endpoint handed to the hdfs/remote helpers.
            key_path: key path handed to the hdfs/remote helpers.
            args: job arguments, forwarded to the submission.
            job_bin_url: HDFS URL of the job binary.
            main_class: forwarded to the submission.
            dependencies: forwarded to the submission.
            spark_applications_ids: list of running application ids; the new
                id is appended while the job runs and removed afterwards.
            expected_time: forwarded to the monitor.
            monitor_plugin: forwarded to the monitor.
            collect_period: forwarded to the monitor.
            number_of_jobs: forwarded to the monitor.
            workers_id: forwarded to the controller.
            data: forwarded to the controller.
            connector: object used to upload the event log directory.
            swift: forwarded to the upload.
            swift_logdir: forwarded to the upload.
            container: forwarded to the upload.
            number_of_attempts: forwarded to ``spark.get_running_app``.

        Returns:
            The string 'OK'.

        Raises:
            ex.ConfigurationError: when no running Spark application is found
                after the submission.
        """

        def announce(template, *details):
            # Every progress message starts with the current wall-clock time.
            self._log(template % ((time.strftime("%H:%M:%S"),) + details))

        job_exec_id = str(uuid.uuid4())[:7]
        announce("%s | Job execution ID: %s", job_exec_id)

        # The same working directory path is used locally and on the master.
        local_path = '/tmp/spark-jobs/' + job_exec_id + '/'

        _input_paths, _output_path, _job_params = hdfs.get_job_params(
            key_path, remote_hdfs, args)
        job_binary_path = hdfs.get_path(job_bin_url)

        announce("%s | Create temporary job directories")
        self._mkdir(local_path)

        announce("%s | Creating cluster directories")
        remote.execute_command(master, key_path, 'mkdir -p %s' % local_path)

        announce("%s | Get job binary from hdfs")
        remote.copy_from_hdfs(master, key_path, remote_hdfs, job_binary_path,
                              local_path)

        announce("%s | Enabling event log on cluster")
        self._enable_event_log(master, key_path, local_path)

        announce("%s | Starting job")
        binary_name = remote.list_directory(key_path, master, local_path)
        local_binary_file = local_path + binary_name
        spark_job = self._submit_job(master, key_path, main_class,
                                     dependencies, local_binary_file, args)

        spark_app_id = spark.get_running_app(master, spark_applications_ids,
                                             number_of_attempts)

        if spark_app_id is None:
            # The submission never showed up: dump its output and give up.
            announce("%s | Error on submission of application, "
                     "please check the config file")
            failed_out, failed_err = spark_job.communicate()
            self.stdout.log(failed_out)
            self.stderr.log(failed_err)
            raise ex.ConfigurationError()

        spark_applications_ids.append(spark_app_id)

        info_plugin = {
            "spark_submisson_url": "http://" + master,
            "expected_time": expected_time,
            "number_of_jobs": number_of_jobs,
        }

        announce("%s | Starting monitor")
        monitor.start_monitor(api.monitor_url, spark_app_id, monitor_plugin,
                              info_plugin, collect_period)
        announce("%s | Starting controller")
        controller.start_controller(api.controller_url, spark_app_id,
                                    workers_id, data)

        # Blocks until the submitted process exits.
        job_out, job_err = spark_job.communicate()

        announce("%s | Stopping monitor")
        monitor.stop_monitor(api.monitor_url, spark_app_id)
        announce("%s | Stopping controller")
        controller.stop_controller(api.controller_url, spark_app_id)

        self.stdout.log(job_out)
        self.stderr.log(job_err)

        announce("%s | Copy log from cluster")
        event_log_path = local_path + 'eventlog/'
        self._mkdir(event_log_path)
        remote_event_log_path = 'ubuntu@%s:%s%s' % (master, local_path,
                                                    spark_app_id)
        remote.copy(key_path, remote_event_log_path, event_log_path)

        announce("%s | Upload log to Swift")
        connector.upload_directory(swift, event_log_path, swift_logdir,
                                   container)

        spark_applications_ids.remove(spark_app_id)
        self.update_application_state("OK")

        return 'OK'
Exemplo n.º 3
0
    def _swift_spark_execution(self, master, key_path, sahara, connector,
                               job_binary_name, job_binary_url, user, password,
                               job_template_name, job_type, plugin,
                               cluster_size, args, main_class, cluster_id,
                               spark_applications_ids, workers_id, app_id,
                               expected_time, monitor_plugin, collect_period,
                               number_of_jobs, log_path, swift, container,
                               data, number_of_attempts):
        """Run a Sahara job execution and supervise it until it finishes.

        Resolves the job binary and job template ids, creates the job
        execution on the given cluster, and keeps the monitor and controller
        services running until ``_wait_on_job_finish`` returns. The
        application state is then set to "OK" or "Error" according to the
        final job status reported by the connector.

        Args:
            master: address of the Spark master; also used to build the
                submission URL given to the monitor.
            sahara: client handed to every connector call.
            connector: object used to create and inspect the job execution.
            job_binary_name: used to resolve the job binary id.
            job_binary_url: used to resolve the job binary id.
            user: used to resolve the job binary id.
            password: used to resolve the job binary id.
            job_template_name: used to resolve the job template id.
            job_type: used to resolve the job template id.
            cluster_id: cluster on which the job execution is created.
            spark_applications_ids: list of running application ids; the new
                id is appended while the job runs and removed afterwards.
            workers_id: logged together with ``app_id`` and forwarded to the
                controller.
            app_id: logged with each worker id and passed to the wait helper.
            expected_time: forwarded to the monitor.
            monitor_plugin: forwarded to the monitor.
            collect_period: forwarded to the monitor.
            number_of_jobs: forwarded to the monitor.
            data: forwarded to the controller.
            number_of_attempts: forwarded to ``spark.get_running_app``.
            key_path, plugin, cluster_size, args, main_class, log_path,
            swift, container: accepted but not used by this method.

        Returns:
            The job status returned by ``_wait_on_job_finish``.
        """

        def announce(template, *details):
            # Every progress message starts with the current wall-clock time.
            self._log(template % ((time.strftime("%H:%M:%S"),) + details))

        # Preparing job
        job_binary_id = self._get_job_binary_id(sahara, connector,
                                                job_binary_name,
                                                job_binary_url, user, password)
        job_template_id = self._get_job_template_id(sahara, connector,
                                                    [job_binary_id],
                                                    job_template_name,
                                                    job_type)

        announce("%s | Starting job...")

        # Running job. No job configuration is built here, so the execution
        # is created with ``configs=None``.
        job = connector.create_job_execution(sahara,
                                             job_template_id,
                                             cluster_id,
                                             configs=None)
        announce("%s | Created job")

        spark_app_id = spark.get_running_app(master, spark_applications_ids,
                                             number_of_attempts)
        spark_applications_ids.append(spark_app_id)
        announce("%s | Spark app id")

        job_exec_id = job.id

        for worker in workers_id:
            instances_log.log("%s|%s" % (app_id, worker))

        job_status = connector.get_job_status(sahara, job_exec_id)
        announce("%s | Sahara job status: %s", job_status)

        info_plugin = {
            "spark_submisson_url": "http://" + master,
            "expected_time": expected_time,
            "number_of_jobs": number_of_jobs,
        }

        announce("%s | Starting monitor")
        monitor.start_monitor(api.monitor_url, spark_app_id, monitor_plugin,
                              info_plugin, collect_period)
        announce("%s | Starting controller")
        controller.start_controller(api.controller_url, spark_app_id,
                                    workers_id, data)

        job_status = self._wait_on_job_finish(sahara, connector, job_exec_id,
                                              app_id)

        announce("%s | Stopping monitor")
        monitor.stop_monitor(api.monitor_url, spark_app_id)
        announce("%s | Stopping controller")
        controller.stop_controller(api.controller_url, spark_app_id)

        spark_applications_ids.remove(spark_app_id)

        self._log("Finished application execution")

        # Both predicates are always evaluated, one after the other.
        if connector.is_job_completed(job_status):
            self.update_application_state("OK")

        if connector.is_job_failed(job_status):
            self.update_application_state("Error")

        return job_status
Exemplo n.º 4
0
    def start_application(self, data):
        """Run a Spark application on the Mesos cluster and supervise it.

        The binary is downloaded to the cluster over SSH and submitted with
        the launcher at ``api.spark_path``. The executors' VM ids are then
        resolved so that the controller and the monitor can be started. The
        call blocks until the job's standard output is exhausted, then stops
        both services and removes the downloaded binary.

        Args:
            data: submission parameters. Reads 'binary_url',
                'execution_class', 'execution_parameters', 'expected_time',
                'number_of_jobs', 'starting_cap' and 'app_name', plus 'days'
                when the application name is 'bulma'. An empty or None
                'execution_class' selects the Python binary path.

        Returns:
            'OK' on success, "Error" when downloading the binary raises, and
            None when any other exception is caught. The application state
            is set to "Error" in both failure cases.
        """

        def stamp():
            return time.strftime("%H:%M:%S")

        def describe(error):
            # ``BaseException.message`` is deprecated and is empty for
            # exceptions built with several arguments, so the exception
            # itself is rendered; repr() is the fallback for text that
            # cannot be encoded.
            try:
                return str(error)
            except UnicodeError:
                return repr(error)

        try:
            self.update_application_state("Running")
            plugin_log.log("%s | Starting application execution" % stamp())

            binary_url = str(data['binary_url'])
            # None must select the Python branch below; str(None) would
            # otherwise be submitted as the class name "None".
            raw_class = data['execution_class']
            execution_class = "" if raw_class is None else str(raw_class)
            execution_parameters = str(data['execution_parameters'])
            expected_time = int(data['expected_time'])
            number_of_jobs = int(data['number_of_jobs'])
            starting_cap = int(data['starting_cap'])

            # Optimizer integration
            app_name = data['app_name']
            days = 0

            if app_name.lower() == 'bulma':
                if 'days' not in data:
                    self._log("%s | 'days' parameter missing" % stamp())
                    raise ex.ConfigurationError()
                days = data['days']

            cores, vms = optimizer.get_info(api.optimizer_url, expected_time,
                                            app_name, days)
            # A negative core count means no core limit is passed to Spark.
            optimizer_command = ''
            if cores >= 0:
                optimizer_command = ' --total-executor-cores %d ' % cores

            plugin_log.log("%s | Submission id: %s" % (stamp(), self.app_id))

            plugin_log.log("%s | Connecting with Mesos cluster..." % stamp())
            conn = ssh.get_connection(api.mesos_url, api.cluster_username,
                                      api.cluster_password,
                                      api.cluster_key_path)
            plugin_log.log("%s | Connected with Mesos cluster" % stamp())

            # Build the command line template that submits the job. A
            # non-empty class means a jar binary; an empty class means the
            # binary is a Python script.
            if execution_class:
                binary_path = '~/exec_bin.jar'
                job_arguments = '--class %s %s %s'
            else:
                binary_path = '~/exec_bin.py'
                job_arguments = '%s %s %s'
            spark_run = ('sudo %s --name %s ' + '--master mesos://%s:%s ' +
                         optimizer_command + job_arguments)

            plugin_log.log("%s | Download the binary to cluster" % stamp())

            try:
                # NOTE(review): binary_url comes from the request and is
                # interpolated unquoted into a shell command - confirm it is
                # validated upstream.
                stdin, stdout, stderr = conn.exec_command(
                    'wget %s -O %s' % (binary_url, binary_path))

                plugin_log.log("%s | Waiting for download the binary..." %
                               stamp())

                # TODO: Fix possible wget error
                stdout.read()
                plugin_log.log("%s | Binary downloaded" % stamp())

            except Exception:
                plugin_log.log("%s | Error downloading binary" % stamp())
                self.update_application_state("Error")
                return "Error"

            # All three handles stay bound until the job has finished.
            job_stdin, job_stdout, job_stderr = conn.exec_command(
                spark_run %
                (api.spark_path, self.app_id, api.mesos_url, api.mesos_port,
                 execution_class, binary_path, execution_parameters))

            # Discover the executors' ips from Mesos and map them to the
            # ids of the VMs that own those ips.
            list_vms_one = (
                'onevm list --user %s --password %s --endpoint %s' %
                (api.one_username, api.one_password, api.one_url))

            stdin, stdout, stderr = conn.exec_command(list_vms_one)
            list_response = stdout.read()

            vms_ips, master = mesos.get_executors_ip(conn, self.frameworks_url,
                                                     self.app_id)
            plugin_log.log("%s | Master: %s" % (stamp(), master))
            plugin_log.log("%s | Executors: %s" % (stamp(), vms_ips))

            vms_ids = mesos.extract_vms_ids(list_response)
            plugin_log.log("%s | Executors IDs: %s" % (stamp(), vms_ids))

            # Each VM description is fetched at most once and reused for
            # every executor ip.
            vm_details = {}
            executors_vms_ids = []
            for ip in vms_ips:
                for vm_id in vms_ids:
                    if vm_id not in vm_details:
                        vm_info_one = (
                            'onevm show %s '
                            '--user %s '
                            '--password %s '
                            '--endpoint %s' %
                            (vm_id, api.one_username, api.one_password,
                             api.one_url))
                        stdin, stdout, stderr = conn.exec_command(vm_info_one)
                        vm_details[vm_id] = stdout.read()

                    if ip in vm_details[vm_id]:
                        executors_vms_ids.append(vm_id)
                        break

            plugin_log.log("%s | Executors IDs: %s" %
                           (stamp(), executors_vms_ids))

            # Set up the initial configuration of cpu cap
            controller.setup_environment(api.controller_url, executors_vms_ids,
                                         starting_cap, data)

            info_plugin = {
                "spark_submisson_url": master,
                "expected_time": expected_time,
                "number_of_jobs": number_of_jobs,
            }

            plugin_log.log("%s | Starting monitor" % stamp())
            monitor.start_monitor(api.monitor_url, self.app_id, 'spark-mesos',
                                  info_plugin, 2)

            plugin_log.log("%s | Starting controller" % stamp())
            controller.start_controller(api.controller_url, self.app_id,
                                        executors_vms_ids, data)

            # Reading the job output blocks the plugin until the execution
            # is done.
            print(job_stdout.read())

            plugin_log.log("%s | Stopping monitor" % stamp())
            monitor.stop_monitor(api.monitor_url, self.app_id)

            plugin_log.log("%s | Stopping controller" % stamp())
            controller.stop_controller(api.controller_url, self.app_id)

            plugin_log.log("%s | Remove binaries" % stamp())
            conn.exec_command('rm -rf ~/exec_bin.*')

            plugin_log.log("%s | Finished application execution" % stamp())

            self.update_application_state("OK")
            return 'OK'

        except Exception as error:
            failure = describe(error)
            plugin_log.log(failure)
            print(failure)
            self.update_application_state("Error")
Exemplo n.º 5
0
    def start_application(self, data):
        """Boot OpenStack instances, run a command on them and supervise it.

        Creates ``cluster_size`` instances, waits for each to become ACTIVE,
        probes their addresses for SSH, runs ``command`` on every reachable
        address and starts one monitor/controller pair per execution. When
        ``_instances_down`` reports that the instances are down, the
        services are stopped, the instances are removed and the elapsed time
        is logged.

        Args:
            data: submission parameters. Reads 'monitor_plugin',
                'expected_time', 'log_path', 'image_id', 'flavor_id',
                'command', 'cluster_size' and
                'scaling_parameters'['starting_cap'].

        Returns:
            The elapsed application time in seconds, as a string, on
            success; None when an exception is caught (the application
            state is then set to "Error").
        """
        # Delay between two status polls of the same instance, in seconds.
        poll_interval = 2
        # Number of SSH connection attempts per address, and the delay
        # after each failed attempt, in seconds.
        ssh_attempts = 3
        ssh_retry_delay = 30

        def describe(error):
            # ``BaseException.message`` is deprecated and is empty for
            # exceptions built with several arguments, so the exception
            # itself is rendered; repr() is the fallback for text that
            # cannot be encoded.
            try:
                return str(error)
            except UnicodeError:
                return repr(error)

        def report(message):
            LOG.log(message)
            print(message)

        try:
            self.update_application_state("Running")

            user = api.user
            password = api.password
            project_id = api.project_id
            auth_ip = api.auth_ip
            domain = api.domain
            public_key = api.public_key

            connector = os_connector.OpenStackConnector(LOG)
            nova = connector.get_nova_client(user, password, project_id,
                                             auth_ip, domain)

            monitor_plugin = data['monitor_plugin']
            expected_time = data['expected_time']
            log_path = data['log_path']
            image_id = data['image_id']
            flavor_id = data['flavor_id']
            command = data['command']
            cluster_size = data['cluster_size']
            starting_cap = data['scaling_parameters']["starting_cap"]

            app_start_time = 0
            app_end_time = 0

            LOG.log("Creating instance(s)")
            print("Creating instance(s)...")

            # Create a number of instances to run the application based on
            # cluster_size, image_id, flavor_id and public_key
            instances = self._create_instances(nova, connector, image_id,
                                               flavor_id, public_key,
                                               cluster_size)

            report("Waiting until instance become active...")

            # Retrieve network information from all instances when they
            # reach ACTIVE state
            instances_nets = []
            for instance_id in instances:
                instance_status = connector.get_instance_status(
                    nova, instance_id)
                while instance_status != 'ACTIVE':
                    # Pause between polls instead of issuing back-to-back
                    # status requests.
                    time.sleep(poll_interval)
                    instance_status = connector.get_instance_status(
                        nova, instance_id)

                instance_ips = connector.get_instance_networks(
                    nova, instance_id)

                instances_nets.append(instance_ips)
                time.sleep(5)

            time.sleep(30)

            report("Checking if ssh is available")

            # Collect every address that accepts an SSH connection.
            instances_ips = []
            for instance_net in instances_nets:
                for net_ip_list in instance_net.values():
                    for ip in net_ip_list:
                        for _ in range(ssh_attempts):
                            try:
                                self._get_ssh_connection(ip, api.key_path)
                            except Exception as error:
                                report("Fail to connect")
                                report(describe(error))
                                time.sleep(ssh_retry_delay)
                            else:
                                instances_ips.append(ip)
                                break

            report("Setting up environment")

            # Set CPU cap in all instances
            controller.setup_environment(api.controller_url, instances,
                                         starting_cap, data)

            # Execute application and start monitor and controller service.
            collect_period = 1
            applications = []
            for ip in instances_ips:
                report("Executing commands into the instance")

                # TODO Check if exec_command will work without blocking exec

                conn = self._get_ssh_connection(ip, api.key_path)

                conn.exec_command(command)
                app_start_time = time.time()

                app_id = "app-os-generic" + str(uuid.uuid4())[:8]
                applications.append(app_id)

                info_plugin = {
                    "host_ip": ip,
                    "log_path": log_path,
                    "expected_time": expected_time,
                }

                try:
                    report("Starting monitoring")
                    monitor.start_monitor(api.monitor_url, app_id,
                                          monitor_plugin, info_plugin,
                                          collect_period)

                    report("Starting scaling")
                    controller.start_controller(api.controller_url, app_id,
                                                instances, data)

                except Exception as error:
                    # A service that fails to start does not abort the run.
                    report(describe(error))

            # Wait until the instances are reported as down.
            while True:
                status_instances = [
                    connector.get_instance_status(nova, instance_id)
                    for instance_id in instances
                ]
                if self._instances_down(status_instances):
                    break
                time.sleep(poll_interval)

            app_end_time = time.time()
            report("Application finished")

            # Stop monitor and controller of each application
            for started_app_id in applications:
                report("Stopping monitoring")
                monitor.stop_monitor(api.monitor_url, started_app_id)

                report("Stopping scaling")
                controller.stop_controller(api.controller_url, started_app_id)

            report("Removing instances...")

            # Remove instances after the end of all applications
            self._remove_instances(nova, connector, instances)

            report("Finished application execution")

            # The time log is keyed by the last started application. When
            # nothing was started the run is reported as an error with an
            # explicit reason.
            if not applications:
                raise RuntimeError("No application was started")
            last_app_id = applications[-1]

            application_time = app_end_time - app_start_time
            application_time_log.log(
                "%s|%.0f|%.0f" % (last_app_id, app_start_time,
                                  application_time))

            self.application_time = application_time
            self.start_time = app_start_time
            self.update_application_state("OK")

            return str(application_time)

        except Exception as error:
            report(describe(error))
            self.update_application_state("Error")
Exemplo n.º 6
0
    def start_application(self, data):
        """Run a work-queue job on Kubernetes and supervise it to the end.

        Downloads the workload (one item per line), provisions a Redis
        instance for the job, optionally starts the visualizer, fills the
        "job" queue, creates the Kubernetes job and starts the monitor and
        controller. It then polls until the job completes or is terminated,
        stops the services and deletes the Redis resources unless the
        application state is 'terminated'.

        Any exception sets the application state to "error" and is printed;
        nothing is raised to the caller.

        Args:
            data: submission parameters. Reads 'redis_workload', 'env_vars',
                'config_id', 'enable_visualizer', 'monitor_info',
                'monitor_plugin', 'cmd', 'img' and 'init_size', plus
                'visualizer_info', 'visualizer_plugin', 'username' and
                'password' when the visualizer is enabled. The dictionary is
                updated in place.

        Returns:
            None.
        """
        try:
            # Download the file that contains the work items, one per line.
            # Only the empty tail left by a trailing newline is dropped, so
            # a workload without a final newline keeps its last item.
            workload = requests.get(data['redis_workload']).text
            jobs = workload.split('\n')
            if jobs[-1] == '':
                jobs.pop()

            # Provision a redis database for the job. Die in case of error.
            # TODO(clenimar): configure ``timeout`` via a request param,
            # e.g. api.redis_creation_timeout.
            redis_ip, redis_port = self.k8s.provision_redis_or_die(self.app_id)

            # inject REDIS_HOST in the environment
            data['env_vars']['REDIS_HOST'] = 'redis-%s' % self.app_id

            # inject SCONE_CONFIG_ID in the environment
            # FIXME: make SCONE_CONFIG_ID optional in submission
            data['env_vars']['SCONE_CONFIG_ID'] = data['config_id']

            # create a new Redis client (unless one is already set)
            if self.rds is None:
                self.rds = redis.StrictRedis(host=redis_ip, port=redis_port)

            queue_size = len(jobs)

            # Check if a visualizer will be created
            self.enable_visualizer = data['enable_visualizer']

            # Create all visualizer components
            if self.enable_visualizer:
                # Specify the datasource to be used in the visualization
                datasource_type = data['visualizer_info']['datasource_type']

                if datasource_type == "influxdb":
                    # NOTE(review): this call uses the module-level ``k8s``
                    # while the rest of the method uses ``self.k8s`` -
                    # confirm both refer to the same helper.
                    database_data = k8s.create_influxdb(self.app_id)
                    # TODO {javan} change name of redis_ip to node_ip in
                    # configuration file
                    database_data.update({"url": api.redis_ip})
                    data['monitor_info'].update(
                        {'database_data': database_data})
                    data['visualizer_info'].update(
                        {'database_data': database_data})

                data['monitor_info'].update(
                    {'datasource_type': datasource_type})

                print("Creating Visualization plataform")

                data['visualizer_info'].update({
                    'enable_visualizer': data['enable_visualizer'],
                    'plugin': data['monitor_plugin'],
                    'visualizer_plugin': data['visualizer_plugin'],
                    'username': data['username'],
                    'password': data['password'],
                })

                visualizer.start_visualization(api.visualizer_url, self.app_id,
                                               data['visualizer_info'])

                self.visualizer_url = visualizer.get_visualizer_url(
                    api.visualizer_url, self.app_id)

                print("Dashboard of the job created on: %s" % (
                    self.visualizer_url))

            # Fill the work queue
            print("Creating Redis queue")
            for job in jobs:
                self.rds.rpush("job", job)

            print("Creating Job")

            self.k8s.create_job(self.app_id,
                                data['cmd'],
                                data['img'],
                                data['init_size'],
                                data['env_vars'],
                                config_id=data["config_id"])

            starting_time = datetime.datetime.now().strftime(
                '%Y-%m-%dT%H:%M:%S.%fGMT')

            # Starting monitor
            data['monitor_info'].update({
                'count_jobs_url': api.count_queue,
                'number_of_jobs': queue_size,
                'submission_time': starting_time,
                'redis_ip': redis_ip,
                'redis_port': redis_port,
                'enable_visualizer': self.enable_visualizer,
            })

            monitor.start_monitor(api.monitor_url, self.app_id,
                                  data['monitor_plugin'], data['monitor_info'],
                                  2)

            # Starting controller
            data.update({'redis_ip': redis_ip, 'redis_port': redis_port})
            controller.start_controller_k8s(api.controller_url, self.app_id,
                                            data)

            # Poll once per second until the job completes or is terminated.
            while not self.job_completed and not self.terminated:
                self.update_application_state("ongoing")
                self.job_completed = self.k8s.completed(self.app_id)
                time.sleep(1)

            # A state other than "ongoing" is left untouched.
            if self.get_application_state() == "ongoing":
                self.update_application_state("completed")

            print("Job finished")

            time.sleep(30)

            # Stop visualizer, monitor and controller
            if self.enable_visualizer:
                visualizer.stop_visualization(api.visualizer_url, self.app_id,
                                              data['visualizer_info'])
            monitor.stop_monitor(api.monitor_url, self.app_id)
            controller.stop_controller(api.controller_url, self.app_id)
            print("Stoped services")

            # delete redis resources
            if self.get_application_state() != 'terminated':
                self.k8s.delete_redis_resources(self.app_id)

        except Exception as error:
            self.update_application_state("error")
            print("ERROR: %s" % error)

        print("Application finished.")
Exemplo n.º 7
0
    def start_application(self, data):
        """Run a work-queue job on Kubernetes and supervise it to the end.

        Downloads the workload (one item per line), provisions a Redis
        instance for the job, fills the "job" queue, creates the Kubernetes
        job and starts the monitor and controller. It then polls until the
        job completes or is terminated, stops both services and deletes the
        Redis resources unless the application state is 'terminated'.

        Any exception sets the application state to "error" and is printed;
        nothing is raised to the caller.

        Args:
            data: submission parameters. Reads 'redis_workload', 'cmd',
                'img', 'init_size', 'env_vars', 'config_id', 'monitor_info',
                'graphic_metrics' and 'monitor_plugin'. The dictionary is
                updated in place.

        Returns:
            None.
        """
        try:
            # Download the file that contains the work items, one per line.
            # Only the empty tail left by a trailing newline is dropped, so
            # a workload without a final newline keeps its last item.
            workload = requests.get(data['redis_workload']).text
            jobs = workload.split('\n')
            if jobs[-1] == '':
                jobs.pop()

            # Provision a redis database for the job. Die in case of error.
            # TODO(clenimar): configure ``timeout`` via a request param,
            # e.g. api.redis_creation_timeout.
            redis_ip, redis_port = k8s.provision_redis_or_die(self.app_id)

            self.rds = redis.StrictRedis(host=redis_ip, port=redis_port)
            queue_size = len(jobs)

            # Fill the work queue
            print("Creating Redis queue")
            for job in jobs:
                self.rds.rpush("job", job)

            print("Creating Job")

            k8s.create_job(self.app_id,
                           data['cmd'],
                           data['img'],
                           data['init_size'],
                           data['env_vars'],
                           config_id=data["config_id"])

            starting_time = datetime.datetime.now().strftime(
                '%Y-%m-%dT%H:%M:%S.%fGMT')

            # Starting monitor
            data['monitor_info'].update({
                'count_jobs_url': api.count_queue,
                'number_of_jobs': queue_size,
                'submission_time': starting_time,
                'redis_ip': redis_ip,
                'redis_port': redis_port,
                'graphic_metrics': data['graphic_metrics'],
            })

            monitor.start_monitor(api.monitor_url, self.app_id,
                                  data['monitor_plugin'], data['monitor_info'],
                                  2)

            # Starting controller
            data.update({'redis_ip': redis_ip, 'redis_port': redis_port})
            controller.start_controller_k8s(api.controller_url, self.app_id,
                                            data)

            # Poll once per second until the job completes or is terminated.
            while not self.job_completed and not self.terminated:
                self.update_application_state("ongoing")
                self.job_completed = k8s.completed(self.app_id)
                time.sleep(1)

            # A state other than "ongoing" is left untouched.
            if self.get_application_state() == "ongoing":
                self.update_application_state("completed")

            # Stop monitor and controller
            print("job finished")
            monitor.stop_monitor(api.monitor_url, self.app_id)
            controller.stop_controller(api.controller_url, self.app_id)
            print("stoped services")

            # delete redis resources
            time.sleep(30)
            if self.get_application_state() != 'terminated':
                k8s.delete_redis_resources(self.app_id)

        except Exception as error:
            self.update_application_state("error")
            print("ERROR: %s" % error)

        print("Application finished.")