예제 #1
0
    def start_job(self):
        """Run every playbook of the job template, one after the other.

        Creates the result handler, reads the job template, sends the
        STARTING job log and validates ``self.job_data`` against the
        template input schema when one is defined. Each playbook is then
        executed through a ``JobManager``; its output is merged into
        ``self.job_input`` so that the next playbook can use it.

        The Sandesh connection is always closed at the end. When the job
        ended in failure, or an exception was caught, the process is
        terminated through ``sys.exit`` with the error message.
        """
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)

            job_template = self.job_utils.read_job_template()
            self.job_template = job_template

            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name,
                                            msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()

            job_percent = None
            # the weightage array is only needed (and only read below)
            # when several playbooks are chained
            if len(playbook_list) > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list]

            for i in range(0, len(playbook_list)):

                # check if it is a multi device playbook
                playbooks = job_template.get_job_template_playbooks()
                play_info = playbooks.playbook_info[i]
                multi_device_playbook = play_info.multi_device_playbook

                if len(playbook_list) > 1:
                    # get the job percentage based on the weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100, task_seq_number=i + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100)[0]  # using equal weightage

                retry_devices = None
                while True:
                    job_mgr = JobManager(self._logger, self._vnc_api,
                                         self.job_input, self.job_log_utils,
                                         job_template,
                                         self.result_handler, self.job_utils,
                                         i, job_percent, self._zk_client)
                    self.job_mgr = job_mgr
                    job_mgr.start_job()

                    # retry the playbook execution if retry_devices is added
                    # to the playbook output; stop retrying on failure, when
                    # nothing is left to retry or when an abort was requested
                    job_status = self.result_handler.job_result_status
                    retry_devices = self.result_handler.get_retry_devices()
                    if job_status == JobStatus.FAILURE or not retry_devices \
                            or self.abort_flag:
                        break
                    self.job_input['device_json'] = retry_devices

                # the playbook may ask to end the workflow early
                pb_output = self.result_handler.playbook_output or {}

                if pb_output.get('early_exit'):
                    break

                # stop the workflow if playbook failed
                if self.result_handler.job_result_status == JobStatus.FAILURE:

                    # stop workflow only if its a single device job or
                    # it is a multi device playbook
                    # and all the devices have failed some job execution
                    # declare it as failure and the stop the workflow

                    if not multi_device_playbook or \
                            (multi_device_playbook and
                             len(self.result_handler.failed_device_jobs) ==
                             len(self.job_input.get('device_json'))):
                        self._logger.error(
                            "Stop the workflow on the failed Playbook.")
                        break

                    elif not retry_devices:
                        # it is a multi device playbook but one of
                        # the device jobs have failed. This means we should
                        # still declare the operation as success. We declare
                        # workflow as success even if one of the devices has
                        # succeeded the job

                        self.result_handler.job_result_status =\
                            JobStatus.SUCCESS

                if self.abort_flag:
                    err_msg = "ABORTING NOW..."
                    self._logger.info(err_msg)
                    self.result_handler.update_job_status(JobStatus.FAILURE,
                                                          err_msg)
                    break

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                # read the device_data output of the playbook
                # and update the job input so that it can be used in next
                # iteration
                if not multi_device_playbook:
                    device_json = pb_output.pop('device_json', None)
                    self.job_input['device_json'] = device_json

                self.job_input.get('input', {}).update(pb_output)

            # create job completion log and update job UVE
            self.result_handler.create_job_summary_log(
                job_template.fq_name)

            # in case of failures, exit the job manager process with failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # the failure may have happened before the result handler or the
            # job template existed; never let the handler itself raise
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # same guards as above: dereferencing a missing template here
            # would replace the real error with an AttributeError and skip
            # the sys.exit() below
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)
예제 #2
0
class ExecutableManager(object):
    """Runs the executables listed in a job template as child processes."""

    def __init__(self, logger, vnc_api, job_input, job_log_utils):
        """Parse the job input and read the job template.

        :param logger: logger used for every message of this manager
        :param vnc_api: API handle passed on to ``JobUtils``
        :param job_input: dict-like job input, see ``parse_job_input``
        :param job_log_utils: helper used to send job logs
        """
        self._logger = logger
        self.vnc_api = vnc_api
        self.vnc_api_init_params = None
        self.api_server_host = None
        self.auth_token = None
        self.contrail_cluster_id = None
        self.sandesh_args = None
        self.job_log_utils = job_log_utils
        self.job_input = job_input
        self.job_utils = None
        # passed as the timeout of Popen.communicate() in start_job
        self.executable_timeout = 1800
        self.job_template = None
        self.job_execution_id = None
        self.job_template_id = None
        self.job_data = None
        self.fabric_fq_name = None
        self.result_handler = None
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id,
                                  self.job_template_id,
                                  self._logger, self.vnc_api)
        self.job_template = self.job_utils.read_job_template()
        self.job_file_write = JobFileWrite(self._logger)

    def parse_job_input(self, job_input_json):
        """Copy the known fields of the job input onto the instance.

        Missing keys are stored as ``None``; nothing is validated here.

        :param job_input_json: dict-like object holding the job input
        """
        # job input should have job_template_id and execution_id field
        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json.get('job_execution_id')
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')
        self.auth_token = job_input_json.get('auth_token')
        self.contrail_cluster_id = job_input_json.get('contrail_cluster_id')
        self.sandesh_args = job_input_json.get('args')
        self.vnc_api_init_params = job_input_json.get('vnc_api_init_params')
        self.api_server_host = job_input_json.get('api_server_host')

    def _validate_job_input(self, input_schema, ip_json):
        """Validate the job input against the template input schema.

        :param input_schema: JSON schema, either already parsed or as a
            JSON string
        :param ip_json: job input to validate
        :raises JobException: when ``ip_json`` is ``None`` or when parsing
            the schema or validating the input fails
        """
        if ip_json is None:
            msg = MsgBundle.getMessage(
                MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg,
                               self.job_execution_id)
        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, basestring):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            # a successful validation is not an error: log it at debug level
            self._logger.debug("Input Schema Validation Successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def gather_job_args(self):
        """Build the dict that is serialized into the ``--job-input`` option.

        :returns: dict with the job data, the template identity and the
            connection parameters parsed from the job input
        """
        return {
            'input': self.job_data,
            'job_template_id': self.job_template.get_uuid(),
            'job_template_fqname': self.job_template.fq_name,
            'fabric_fq_name': self.fabric_fq_name,
            'auth_token': self.auth_token,
            'contrail_cluster_id': self.contrail_cluster_id,
            'api_server_host': self.api_server_host,
            'job_execution_id': self.job_execution_id,
            'sandesh_args': self.sandesh_args,
            'vnc_api_init_params': self.vnc_api_init_params,
        }

    def start_job(self):
        """Run each executable of the job template and record its status.

        Sends the STARTING job log, validates the job input when the
        template defines an input schema, then starts every executable as
        a child process and waits up to ``self.executable_timeout`` for it.
        The "job_summary" entry is written as INPROGRESS, then FAILED or
        COMPLETED depending on the return code.

        The Sandesh connection is always closed at the end. When an
        exception was caught the process is terminated through
        ``sys.exit`` with the error message.
        """
        self._logger.info("Starting Executable")
        job_error_msg = None
        job_template = self.job_template
        try:
            # create job UVE and log
            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)

            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name,
                                            msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            executable_list = job_template.get_job_template_executables()\
                .get_executable_info()
            for executable in executable_list:
                exec_path = executable.get_executable_path()
                # NOTE(review): the executable args are read but never handed
                # to the child process - confirm whether that is intended
                exec_args = executable.get_executable_args()
                job_input_args = self.gather_job_args()
                exec_process = None
                try:
                    exec_process = subprocess32.Popen(
                        [exec_path,
                         "--job-input",
                         json.dumps(job_input_args),
                         '--debug', 'True'],
                        close_fds=True, cwd='/',
                        stdout=subprocess32.PIPE,
                        stderr=subprocess32.PIPE)
                    self.job_file_write.write_to_file(
                        self.job_execution_id,
                        "job_summary",
                        JobFileWrite.JOB_LOG,
                        {"job_status": "INPROGRESS"})
                    msg = "Child process pid = " + str(exec_process.pid)
                    self._logger.info(msg)
                    (out, err) = exec_process.communicate(
                        timeout=self.executable_timeout)

                    self._logger.notice(str(out))
                    self._logger.notice(str(err))
                except subprocess32.TimeoutExpired as timeout_exp:
                    # communicate() does not kill the child on timeout:
                    # kill it, then finish the communication so the child
                    # is reaped and its pipes are drained
                    if exec_process is not None:
                        exec_process.kill()
                        exec_process.communicate()
                    msg = MsgBundle.getMessage(
                        MsgBundle.RUN_EXECUTABLE_PROCESS_TIMEOUT,
                        exec_path=exec_path,
                        exc_msg=repr(timeout_exp))
                    raise JobException(msg, self.job_execution_id)

                self._logger.info(exec_process.returncode)
                self._logger.info("Executable Completed")
                if exec_process.returncode != 0:
                    self.job_file_write.write_to_file(
                        self.job_execution_id,
                        "job_summary",
                        JobFileWrite.JOB_LOG,
                        {"job_status": "FAILED"})
                    msg = MsgBundle.getMessage(
                        MsgBundle.EXECUTABLE_RETURN_WITH_ERROR,
                        exec_uri=exec_path)
                    self._logger.error(msg)
                else:
                    self.job_file_write.write_to_file(
                        self.job_execution_id,
                        "job_summary",
                        JobFileWrite.JOB_LOG,
                        {"job_status": "COMPLETED"})

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # the result handler may not exist yet if its creation failed
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # same guards as above so that the handler cannot raise and hide
            # the original error
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)
예제 #3
0
class WFManager(object):
    """Workflow manager that runs the playbooks of a job template in order."""

    def __init__(self, logger, vnc_api, job_input, job_log_utils, zk_client):
        """Initializes workflow manager.

        Parses the job input and installs ``job_mgr_abort_signal_handler``
        for SIGABRT and SIGUSR1.

        :param logger: logger used for every message of this manager
        :param vnc_api: API handle passed on to ``JobUtils``/``JobManager``
        :param job_input: dict-like job input, see ``parse_job_input``
        :param job_log_utils: helper used to send job logs
        :param zk_client: client handed to every ``JobManager``
        :raises Exception: when the job input has no template id or no
            execution id
        """
        self._logger = logger
        self._vnc_api = vnc_api
        self.job_input = job_input
        self.job_log_utils = job_log_utils
        self.job_execution_id = None
        self.job_template_id = None
        self.device_json = None
        self.result_handler = None
        self.job_data = None
        self.fabric_fq_name = None
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id,
                                  self.job_template_id,
                                  self._logger, self._vnc_api)
        self._zk_client = zk_client
        self.job_mgr = None
        self.job_template = None
        self.abort_flag = False
        signal.signal(signal.SIGABRT, self.job_mgr_abort_signal_handler)
        signal.signal(signal.SIGUSR1, self.job_mgr_abort_signal_handler)
        logger.debug("Job manager initialized")

    def parse_job_input(self, job_input_json):
        """Check the mandatory job input fields and store them.

        :param job_input_json: dict-like object holding the job input
        :raises Exception: when ``job_template_id`` or ``job_execution_id``
            is missing
        """
        # job input should have job_template_id and execution_id field
        if job_input_json.get('job_template_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_TEMPLATE_MISSING)
            raise Exception(msg)

        if job_input_json.get('job_execution_id') is None:
            msg = MsgBundle.getMessage(
                MsgBundle.JOB_EXECUTION_ID_MISSING)
            raise Exception(msg)

        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json.get('job_execution_id')
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')

    def _validate_job_input(self, input_schema, ip_json):
        """Validate the job input against the template input schema.

        :param input_schema: JSON schema, either already parsed or as a
            JSON string
        :param ip_json: job input to validate
        :raises JobException: when ``ip_json`` is ``None`` or when parsing
            the schema or validating the input fails
        """
        if ip_json is None:
            msg = MsgBundle.getMessage(
                MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg,
                               self.job_execution_id)
        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, basestring):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            self._logger.debug("Input Schema Validation Successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def start_job(self):
        """Run every playbook of the job template, one after the other.

        Creates the result handler, reads the job template, sends the
        STARTING job log and validates ``self.job_data`` against the
        template input schema when one is defined. Each playbook is then
        executed through a ``JobManager``; its output is merged into
        ``self.job_input`` so that the next playbook can use it.

        The Sandesh connection is always closed at the end. When the job
        ended in failure, or an exception was caught, the process is
        terminated through ``sys.exit`` with the error message.
        """
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)

            job_template = self.job_utils.read_job_template()
            self.job_template = job_template

            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name,
                                            msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()

            job_percent = None
            # the weightage array is only needed (and only read below)
            # when several playbooks are chained
            if len(playbook_list) > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list]

            for i in range(0, len(playbook_list)):

                # check if it is a multi device playbook
                playbooks = job_template.get_job_template_playbooks()
                play_info = playbooks.playbook_info[i]
                multi_device_playbook = play_info.multi_device_playbook

                if len(playbook_list) > 1:
                    # get the job percentage based on the weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100, task_seq_number=i + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100)[0]  # using equal weightage

                retry_devices = None
                while True:
                    job_mgr = JobManager(self._logger, self._vnc_api,
                                         self.job_input, self.job_log_utils,
                                         job_template,
                                         self.result_handler, self.job_utils,
                                         i, job_percent, self._zk_client)
                    self.job_mgr = job_mgr
                    job_mgr.start_job()

                    # retry the playbook execution if retry_devices is added
                    # to the playbook output; stop retrying on failure, when
                    # nothing is left to retry or when an abort was requested
                    job_status = self.result_handler.job_result_status
                    retry_devices = self.result_handler.get_retry_devices()
                    if job_status == JobStatus.FAILURE or not retry_devices \
                            or self.abort_flag:
                        break
                    self.job_input['device_json'] = retry_devices

                # the playbook may ask to end the workflow early
                pb_output = self.result_handler.playbook_output or {}

                if pb_output.get('early_exit'):
                    break

                # stop the workflow if playbook failed
                if self.result_handler.job_result_status == JobStatus.FAILURE:

                    # stop workflow only if its a single device job or
                    # it is a multi device playbook
                    # and all the devices have failed some job execution
                    # declare it as failure and the stop the workflow

                    if not multi_device_playbook or \
                            (multi_device_playbook and
                             len(self.result_handler.failed_device_jobs) ==
                             len(self.job_input.get('device_json'))):
                        self._logger.error(
                            "Stop the workflow on the failed Playbook.")
                        break

                    elif not retry_devices:
                        # it is a multi device playbook but one of
                        # the device jobs have failed. This means we should
                        # still declare the operation as success. We declare
                        # workflow as success even if one of the devices has
                        # succeeded the job

                        self.result_handler.job_result_status =\
                            JobStatus.SUCCESS

                # set by the SIGUSR1 handler: stop after the current playbook
                if self.abort_flag:
                    err_msg = "ABORTING NOW..."
                    self._logger.info(err_msg)
                    self.result_handler.update_job_status(JobStatus.FAILURE,
                                                          err_msg)
                    break

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                # read the device_data output of the playbook
                # and update the job input so that it can be used in next
                # iteration
                if not multi_device_playbook:
                    device_json = pb_output.pop('device_json', None)
                    self.job_input['device_json'] = device_json

                self.job_input.get('input', {}).update(pb_output)

            # create job completion log and update job UVE
            self.result_handler.create_job_summary_log(
                job_template.fq_name)

            # in case of failures, exit the job manager process with failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # the failure may have happened before the result handler or the
            # job template existed; never let the handler itself raise
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # same guards as above: dereferencing a missing template here
            # would replace the real error with an AttributeError and skip
            # the sys.exit() below
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)

    def job_mgr_abort_signal_handler(self, signalnum, frame):
        """Handle the abort signals registered in ``__init__``.

        SIGABRT aborts the running playbook, records the failure and exits
        the process. SIGUSR1 only sets ``abort_flag``, which ``start_job``
        checks once the current playbook has finished.

        :param signalnum: number of the received signal
        :param frame: current stack frame (unused)
        """
        if signalnum == signal.SIGABRT:
            # Force abort; kill all playbooks, then exit
            err_msg = "Job aborting..."
            self._logger.info(err_msg)
            try:
                self.job_mgr.job_handler.playbook_abort()
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                self.result_handler.create_job_summary_log(
                    self.job_template.fq_name)
                # SystemExit is not an Exception subclass, so it is not
                # swallowed by the handler below
                sys.exit()
            except Exception:
                self._logger.error("Failed to force abort")
                # keep the reason of the failed abort in the log
                self._logger.error("%s" % traceback.format_exc())
        elif signalnum == signal.SIGUSR1:
            # Graceful abort; Exit after current playbook
            self._logger.info("Job will abort upon playbook completion...")
            self.abort_flag = True
예제 #4
0
    def start_job(self):
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            msg = MsgBundle.getMessage(MsgBundle.START_JOB_MESSAGE,
                                       job_execution_id=self.job_execution_id)
            self._logger.debug(msg)

            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)

            job_template = self.job_utils.read_job_template()

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name,
                                            msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()

            job_percent = None
            # calculate job percentage for each playbook
            if len(playbook_list) > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list
                ]

            for i in range(0, len(playbook_list)):

                if len(playbook_list) > 1:
                    # get the job percentage based on weightage of each plabook
                    # when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100, task_seq_number=i + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100)[0]  # using equal weightage

                job_mgr = JobManager(self._logger, self._vnc_api,
                                     self.job_input, self.job_log_utils,
                                     job_template, self.result_handler,
                                     self.job_utils, i, job_percent)

                job_mgr.start_job()

                # stop the workflow if playbook failed
                if self.result_handler.job_result_status == JobStatus.FAILURE:
                    self._logger.error(
                        "Stop the workflow on the failed Playbook.")
                    break

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                # read the device_data output of the playbook
                # and update the job input so that it can be used in next
                # iteration
                if not self.job_input.get('device_json'):
                    device_json = pb_output.get('device_json')
                    self.job_input['device_json'] = device_json

                if not self.job_input.get('prev_pb_output'):
                    self.job_input['prev_pb_output'] = pb_output
                else:
                    self.job_input['prev_pb_output'].update(pb_output)
                self.job_input.get('input', {}).update(pb_output)

            # create job completion log and update job UVE
            self.result_handler.create_job_summary_log(job_template.fq_name)

            # in case of failures, exit the job manager process with failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            self.result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            self.result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            self.result_handler.create_job_summary_log(job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)
예제 #5
0
    def start_job(self):
        """Run every executable of the job template and record the outcome.

        A start-of-job log is sent, the job input is validated against the
        template's input schema (when one is defined), and each executable
        listed in the template is launched as a child process. The job
        summary file is updated to INPROGRESS when the child is started and
        to FAILED or COMPLETED according to its return code.

        A child that exceeds ``self.executable_timeout`` is killed and the
        job is failed with a JobException. Any error is logged and, when a
        result handler is available, recorded as a job failure. The Sandesh
        connection is always closed before returning, and the process exits
        via ``sys.exit`` with the error message when one was recorded.
        """
        self._logger.info("Starting Executable")
        job_error_msg = None
        job_template = self.job_template
        try:
            # create job UVE and log
            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)

            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name,
                                            msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            executable_list = job_template.get_job_template_executables()\
                .get_executable_info()
            for executable in executable_list:
                exec_path = executable.get_executable_path()
                # NOTE(review): exec_args is read but never passed to the
                # child process - confirm whether it should be.
                exec_args = executable.get_executable_args()
                job_input_args = self.gather_job_args()

                # Reset per iteration so the timeout handler never sees an
                # unbound name or the process of a previous executable.
                exec_process = None
                try:
                    exec_process = subprocess32.Popen(
                        [exec_path,
                         "--job-input",
                         json.dumps(job_input_args),
                         '--debug', 'True'],
                        close_fds=True, cwd='/',
                        stdout=subprocess32.PIPE,
                        stderr=subprocess32.PIPE)
                    self.job_file_write.write_to_file(
                        self.job_execution_id,
                        "job_summary",
                        JobFileWrite.JOB_LOG,
                        {"job_status": "INPROGRESS"})
                    msg = "Child process pid = " + str(exec_process.pid)
                    self._logger.info(msg)
                    (out, err) = exec_process.communicate(
                        timeout=self.executable_timeout)

                    self._logger.notice(str(out))
                    self._logger.notice(str(err))
                except subprocess32.TimeoutExpired as timeout_exp:
                    # communicate() does not kill the child on timeout:
                    # kill it and reap it so no zombie is left behind.
                    if exec_process is not None:
                        exec_process.kill()
                        exec_process.wait()
                    msg = MsgBundle.getMessage(
                        MsgBundle.RUN_EXECUTABLE_PROCESS_TIMEOUT,
                        exec_path=exec_path,
                        exc_msg=repr(timeout_exp))
                    raise JobException(msg, self.job_execution_id)

                self._logger.info(exec_process.returncode)
                self._logger.info("Executable Completed")
                if exec_process.returncode != 0:
                    self.job_file_write.write_to_file(
                        self.job_execution_id,
                        "job_summary",
                        JobFileWrite.JOB_LOG,
                        {"job_status": "FAILED"})
                    msg = MsgBundle.getMessage(
                        MsgBundle.EXECUTABLE_RETURN_WITH_ERROR,
                        exec_uri=exec_path)
                    self._logger.error(msg)
                else:
                    self.job_file_write.write_to_file(
                        self.job_execution_id,
                        "job_summary",
                        JobFileWrite.JOB_LOG,
                        {"job_status": "COMPLETED"})

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # The handler/template may be missing if the failure happened
            # early; never let the error path itself raise.
            result_handler = getattr(self, 'result_handler', None)
            if result_handler is not None:
                result_handler.update_job_status(JobStatus.FAILURE, err_msg)
                if job_template:
                    result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # Same guards as above: an exception raised here would mask the
            # original error and skip the sys.exit() below.
            result_handler = getattr(self, 'result_handler', None)
            if result_handler is not None:
                result_handler.update_job_status(JobStatus.FAILURE, err_msg)
                if job_template:
                    result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)
예제 #6
0
class WFManager(object):
    """Workflow manager that runs the playbooks of a job template in order.

    The job input dictionary must carry ``job_template_id`` and
    ``job_execution_id``; ``input`` and ``fabric_fq_name`` are optional.
    """

    def __init__(self, logger, vnc_api, job_input, job_log_utils):
        """Store the collaborators and parse the identifiers from job_input.

        Raises:
            Exception: when job_input lacks ``job_template_id`` or
                ``job_execution_id`` (raised by parse_job_input).
        """
        self._logger = logger
        self._vnc_api = vnc_api
        self.job_input = job_input
        self.job_log_utils = job_log_utils
        self.job_execution_id = None
        self.job_template_id = None
        self.device_json = None
        self.result_handler = None
        self.job_data = None
        self.fabric_fq_name = None
        # Fills in the identifiers above from the job input.
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id, self.job_template_id,
                                  self._logger, self._vnc_api)
        logger.debug("Job manager initialized")

    def parse_job_input(self, job_input_json):
        """Copy template id, execution id, input and fabric name to self.

        Raises:
            Exception: when ``job_template_id`` or ``job_execution_id`` is
                missing (or None) in job_input_json.
        """
        # job input should have job_template_id and execution_id field
        if job_input_json.get('job_template_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_TEMPLATE_MISSING)
            raise Exception(msg)

        if job_input_json.get('job_execution_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_EXECUTION_ID_MISSING)
            raise Exception(msg)

        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json['job_execution_id']
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')

    def _validate_job_input(self, input_schema, ip_json):
        """Validate ip_json against input_schema with jsonschema.

        input_schema may be a JSON string or an already-decoded schema.

        Raises:
            JobException: when ip_json is None, or when decoding the schema
                or validating the input fails for any reason.
        """
        if ip_json is None:
            msg = MsgBundle.getMessage(MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg, self.job_execution_id)
        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, str):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            self._logger.debug("Input Schema Validation Successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def start_job(self):
        """Run the job template's playbooks one after another.

        The output of each playbook is merged into the job input so the next
        playbook can use it. The workflow stops at the first playbook that
        leaves the result handler in FAILURE state. A job summary log is
        created at the end.

        Errors are logged and recorded as a job failure when a result
        handler is available. The Sandesh connection is always closed, and
        the process exits via ``sys.exit`` with the error message when the
        job failed.
        """
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            msg = MsgBundle.getMessage(MsgBundle.START_JOB_MESSAGE,
                                       job_execution_id=self.job_execution_id)
            self._logger.debug(msg)

            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)

            job_template = self.job_utils.read_job_template()

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name,
                                            msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()
            num_playbooks = len(playbook_list)

            job_percent = None
            # per-playbook weightage is only used when playbooks are chained
            task_weightage_array = None
            if num_playbooks > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list
                ]

            for i in range(num_playbooks):

                if num_playbooks > 1:
                    # get the job percentage based on weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            num_playbooks, buffer_task_percent=True,
                            total_percent=100, task_seq_number=i + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            num_playbooks, buffer_task_percent=True,
                            total_percent=100)[0]  # using equal weightage

                job_mgr = JobManager(self._logger, self._vnc_api,
                                     self.job_input, self.job_log_utils,
                                     job_template, self.result_handler,
                                     self.job_utils, i, job_percent)

                job_mgr.start_job()

                # stop the workflow if playbook failed
                if self.result_handler.job_result_status == JobStatus.FAILURE:
                    self._logger.error(
                        "Stop the workflow on the failed Playbook.")
                    break

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                # read the device_data output of the playbook
                # and update the job input so that it can be used in next
                # iteration
                if not self.job_input.get('device_json'):
                    device_json = pb_output.get('device_json')
                    self.job_input['device_json'] = device_json

                if not self.job_input.get('prev_pb_output'):
                    self.job_input['prev_pb_output'] = pb_output
                else:
                    self.job_input['prev_pb_output'].update(pb_output)
                self.job_input.get('input', {}).update(pb_output)

            # create job completion log and update job UVE
            self.result_handler.create_job_summary_log(job_template.fq_name)

            # in case of failures, exit the job manager process with failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # The handler/template may not exist yet if the failure happened
            # early; never let the error path itself raise.
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # Same guards as above: an exception raised here would mask the
            # original error and skip the sys.exit() below.
            if self.result_handler is not None:
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                if job_template:
                    self.result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)
예제 #7
0
    def start_job(self):
        """Run the job template's playbooks in order, retrying devices.

        Each playbook is re-run while the result handler reports retry
        devices and the job has not failed. On failure the workflow stops
        when there is no ``device_json`` in the job input or when every
        device has a failed job; otherwise (and with no retry devices
        pending) the status is reset to SUCCESS and the workflow continues.
        Playbook output is merged into the job input for the next playbook.

        Errors are logged and recorded as a job failure when a result
        handler is available. The Sandesh connection is always closed, and
        the process exits via ``sys.exit`` with the error message when the
        job failed.
        """
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)

            job_template = self.job_utils.read_job_template()

            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name,
                                            msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()

            job_percent = None
            # calculate job percentage for each playbook
            if len(playbook_list) > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list]

            for i in range(0, len(playbook_list)):

                if len(playbook_list) > 1:
                    # get the job percentage based on weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100, task_seq_number=i + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100)[0]  # using equal weightage

                retry_devices = None
                while True:
                    job_mgr = JobManager(self._logger, self._vnc_api,
                                         self.job_input, self.job_log_utils,
                                         job_template,
                                         self.result_handler, self.job_utils, i,
                                         job_percent, self._zk_client,
                                         self.db_init_params,
                                         self.cluster_id)
                    job_mgr.start_job()

                    # retry the playbook execution if retry_devices is added to
                    # the playbook output
                    job_status = self.result_handler.job_result_status
                    retry_devices = self.result_handler.get_retry_devices()
                    if job_status == JobStatus.FAILURE or not retry_devices:
                        break
                    self.job_input['device_json'] = retry_devices

                # stop the workflow if playbook failed
                if self.result_handler.job_result_status == JobStatus.FAILURE:

                    # stop workflow only if its a single device job or
                    # it is a multi device playbook
                    # and all the devices have failed some job execution
                    # declare it as failure and the stop the workflow

                    if self.job_input.get('device_json') is None or\
                        len(self.result_handler.failed_device_jobs)\
                            == len(self.job_input.get('device_json')):
                        self._logger.error(
                            "Stop the workflow on the failed Playbook.")
                        break

                    elif not retry_devices:
                        # it is a multi device playbook but one of the device
                        # jobs have failed. This means we should still declare
                        # the operation as success. We declare workflow as
                        # success even if one of the devices has succeeded
                        # the job

                        self.result_handler.job_result_status = \
                            JobStatus.SUCCESS

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                # read the device_data output of the playbook
                # and update the job input so that it can be used in next
                # iteration
                if not self.job_input.get('device_json'):
                    device_json = pb_output.pop('device_json', None)
                    self.job_input['device_json'] = device_json

                self.job_input.get('input', {}).update(pb_output)

            # create job completion log and update job UVE
            self.result_handler.create_job_summary_log(
                job_template.fq_name)

            # in case of failures, exit the job manager process with failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # The handler/template may be missing if the failure happened
            # early; never let the error path itself raise.
            result_handler = getattr(self, 'result_handler', None)
            if result_handler is not None:
                result_handler.update_job_status(JobStatus.FAILURE, err_msg)
                if job_template:
                    result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # Same guards as above: an exception raised here would mask the
            # original error and skip the sys.exit() below.
            result_handler = getattr(self, 'result_handler', None)
            if result_handler is not None:
                result_handler.update_job_status(JobStatus.FAILURE, err_msg)
                if job_template:
                    result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)
예제 #8
0
    def start_job(self):
        """Run the job template's playbooks with retry, recovery and abort.

        Playbooks are walked by index. Recovery playbooks are skipped until a
        playbook fails; after a failure the walk restarts from the first
        playbook and only recovery playbooks are run. Each playbook is
        re-run while the result handler reports retry devices, the job has
        not failed and no abort was requested. A playbook output containing
        ``early_exit`` ends the workflow, as does ``self.abort_flag``.

        If any recovery playbook ran, the job is marked as FAILURE again
        before the summary log is created. Errors are logged and recorded as
        a job failure when a result handler is available. The Sandesh
        connection is always closed, and the process exits via ``sys.exit``
        with the error message when the job failed.
        """
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            job_template = self.job_utils.read_job_template()
            self.job_template = job_template
            self.job_description = self.job_template.display_name
            if not self.job_transaction_descr:
                self.job_transaction_descr = self._generate_transaction_descr()

            self.result_handler = JobResultHandler(
                self.job_template_id, self.job_execution_id,
                self.fabric_fq_name, self._logger, self.job_utils,
                self.job_log_utils, self.device_name, self.job_description,
                self.job_transaction_id, self.job_transaction_descr)

            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)

            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(
                job_template.fq_name,
                self.job_execution_id,
                self.fabric_fq_name,
                msg,
                JobStatus.STARTING.value,
                timestamp=timestamp,
                device_name=self.device_name,
                description=self.job_description,
                transaction_id=self.job_transaction_id,
                transaction_descr=self.job_transaction_descr)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()

            job_percent = None
            # calculate job percentage for each playbook
            if len(playbook_list) > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list
                ]

            cleanup_in_progress = False
            cleanup_completed = False
            pb_idx = 0

            while pb_idx < len(playbook_list):

                # check if its a multi device playbook
                playbooks = job_template.get_job_template_playbooks()
                play_info = playbooks.playbook_info[pb_idx]
                multi_device_playbook = play_info.multi_device_playbook
                playbook_name = play_info.playbook_uri.split('/')[-1]

                if cleanup_in_progress:
                    # If we need to cleanup due to a previous error, ignore
                    # any playbooks that don't perform recovery
                    if not play_info.recovery_playbook:
                        self._logger.info("Ignoring playbook %s since it "
                                          "does not perform recovery" %
                                          playbook_name)
                        pb_idx += 1
                        continue

                    # If we are running a recovery playbook, then
                    # cleanup_completed needs to be set irrespective of
                    # a success or error in recovery playbook execution
                    else:
                        self._logger.info("Running recovery playbook %s" %
                                          playbook_name)
                        cleanup_completed = True
                else:
                    # Don't run a recovery playbook if we haven't hit an error
                    if play_info.recovery_playbook:
                        self._logger.info(
                            "Ignoring recovery playbook %s since we "
                            "haven't hit an error" % playbook_name)
                        pb_idx += 1
                        continue

                if len(playbook_list) > 1:
                    # get the job percentage based on weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100, task_seq_number=pb_idx + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100)[0]  # using equal weightage

                retry_devices = None
                while True:
                    job_mgr = JobManager(self._logger, self._vnc_api,
                                         self.job_input, self.job_log_utils,
                                         job_template, self.result_handler,
                                         self.job_utils, pb_idx, job_percent,
                                         self._zk_client, self.job_description,
                                         self.job_transaction_id,
                                         self.job_transaction_descr)
                    self.job_mgr = job_mgr
                    job_mgr.start_job()

                    # retry the playbook execution if retry_devices is added to
                    # the playbook output
                    job_status = self.result_handler.job_result_status
                    retry_devices = self.result_handler.get_retry_devices()
                    failed_device_list = self.result_handler\
                        .get_failed_device_list()
                    if job_status == JobStatus.FAILURE or not retry_devices \
                            or self.abort_flag:
                        break
                    self.job_input['device_json'] = retry_devices
                    self.job_input['input']['failed_list'] = failed_device_list

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                if pb_output.get('early_exit'):
                    break

                # stop the workflow if playbook failed
                if self.result_handler.job_result_status == JobStatus.FAILURE:

                    # If it is a single device job or
                    # if it is a multi device playbook
                    # and all the devices have failed some job execution,
                    # declare it as failure, perform cleanup if possible
                    # and then stop the workflow

                    if not multi_device_playbook or \
                            (len(self.result_handler.failed_device_jobs) ==
                             len(self.job_input.get('device_json'))):
                        if not cleanup_in_progress:
                            # restart from the first playbook, this time
                            # running only the recovery playbooks
                            cleanup_in_progress = True
                            pb_idx = 0
                            self._logger.info("Stop the workflow on the failed"
                                              " Playbook and start cleanup")
                        else:
                            pb_idx += 1
                        continue

                    elif not retry_devices:
                        # it is a multi device playbook but one of
                        # the device jobs have failed. This means we should
                        # still declare the operation as success. We declare
                        # workflow as success even if one of the devices has
                        # succeeded the job

                        self.result_handler.job_result_status =\
                            JobStatus.SUCCESS

                if self.abort_flag:
                    err_msg = "ABORTING NOW..."
                    self._logger.info(err_msg)
                    self.result_handler.update_job_status(
                        JobStatus.FAILURE, err_msg)
                    break

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                # read the device_data output of the playbook
                # and update the job input so that it can be used in next
                # iteration
                if not multi_device_playbook:
                    device_json = pb_output.pop('device_json', None)
                    self.job_input['device_json'] = device_json

                self.job_input.get('input', {}).update(pb_output)

                pb_idx += 1

            # A successful recovery playbook execution might
            # set JobStatus to success but this does not indicate a
            # success in the workflow. Set JobStatus to failure again.
            if cleanup_completed:
                err_msg = "Finished cleaning up after the error"
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)

            # create job completion log and update job UVE
            self.result_handler.create_job_summary_log(job_template.fq_name)

            # in case of failures, exit the job manager process with failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            err_msg = "Job Exception recieved: %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # The result handler is only created after the template has been
            # read, so it can still be missing here; never let the error
            # path itself raise.
            result_handler = getattr(self, 'result_handler', None)
            if result_handler is not None:
                result_handler.update_job_status(JobStatus.FAILURE, err_msg)
                if job_template:
                    result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s " % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            # Same guards as above: an exception raised here would mask the
            # original error and skip the sys.exit() below.
            result_handler = getattr(self, 'result_handler', None)
            if result_handler is not None:
                result_handler.update_job_status(JobStatus.FAILURE, err_msg)
                if job_template:
                    result_handler.create_job_summary_log(
                        job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)
예제 #9
0
class WFManager(object):
    """Run the playbooks of a job template, in order, for one job execution.

    The manager parses the job input, optionally validates it against the
    template's input schema, executes each playbook through a ``JobManager``
    and reports status through ``JobResultHandler`` / ``job_log_utils``.
    When a playbook fails, the playbook list is walked again from the start
    and only playbooks flagged as ``recovery_playbook`` are executed.
    """

    def __init__(self, logger, vnc_api, job_input, job_log_utils, zk_client):
        """Initialize workflow manager.

        :param logger: logger used for all diagnostics
        :param vnc_api: API handle passed through to JobUtils and JobManager
        :param job_input: dict describing the job; must contain
            'job_template_id' and 'job_execution_id'
        :param job_log_utils: helper used to send job logs and to compute
            job percentages
        :param zk_client: client handle passed through to JobManager
        :raises Exception: if a mandatory job input field is missing
        """
        self._logger = logger
        self._vnc_api = vnc_api
        self.job_input = job_input
        self.job_log_utils = job_log_utils
        self.job_execution_id = None
        self.job_description = None
        self.job_transaction_id = None
        self.job_transaction_descr = None
        self.job_template_id = None
        self.device_json = None
        self.device_name = ""
        self.result_handler = None
        self.job_data = None
        self.fabric_fq_name = None
        # Fills in the attributes initialised to None above.
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id, self.job_template_id,
                                  self._logger, self._vnc_api)
        self._zk_client = zk_client
        self.job_mgr = None
        self.job_template = None
        self.abort_flag = False
        # SIGABRT requests a forced abort, SIGUSR1 a graceful one; see
        # job_mgr_abort_signal_handler.
        signal.signal(signal.SIGABRT, self.job_mgr_abort_signal_handler)
        signal.signal(signal.SIGUSR1, self.job_mgr_abort_signal_handler)
        logger.debug("Job manager initialized")

    def parse_job_input(self, job_input_json):
        """Copy the fields of the job input dict onto this instance.

        :param job_input_json: dict holding the job request
        :raises Exception: if 'job_template_id' or 'job_execution_id' is
            missing (or None)
        """
        # job input should have job_template_id and execution_id field
        if job_input_json.get('job_template_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_TEMPLATE_MISSING)
            raise Exception(msg)

        if job_input_json.get('job_execution_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_EXECUTION_ID_MISSING)
            raise Exception(msg)

        self.device_json = job_input_json.get('device_json')
        self.job_description = job_input_json.get('job_description', "")
        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json.get('job_execution_id')
        # The transaction id falls back to the execution id when not given.
        self.job_transaction_id = \
            job_input_json.get('job_transaction_id', self.job_execution_id)
        self.job_transaction_descr = \
            job_input_json.get('job_transaction_descr')
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')
        # Depends on self.device_json, so it must be computed last.
        self.device_name = self._get_device_name()

    def _validate_job_input(self, input_schema, ip_json):
        """Validate the job input against the template's input schema.

        :param input_schema: JSON schema, either already decoded or as a
            JSON string
        :param ip_json: the job input to validate
        :raises JobException: if no input was supplied, or if decoding the
            schema or validating the input fails
        """
        if ip_json is None:
            msg = MsgBundle.getMessage(MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg, self.job_execution_id)

        # ``basestring`` is a Python 2 builtin. Fall back to ``str`` when it
        # is not available so that the NameError is not swallowed below and
        # misreported as an invalid schema.
        try:
            string_types = basestring
        except NameError:
            string_types = str

        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, string_types):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            self._logger.debug("Input Schema Validation Successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def _generate_transaction_descr(self):
        """Build a transaction description from the template and devices.

        The result is the template's display name, followed by " for " and
        the space-terminated name of every device that has a
        'device_fqname'. When the job targets exactly one device,
        ``self.device_name`` is set to that device's name as a side effect.

        :returns: the generated description string
        """
        transaction_descr = self.job_template.display_name
        if self.device_json:
            transaction_descr += " for "
            single_device = len(self.device_json) == 1
            for device_info in self.device_json.values():
                device_fqname = device_info.get('device_fqname')
                if not device_fqname:
                    continue
                device_name = device_fqname[-1]
                transaction_descr += device_name + " "
                if single_device:
                    self.device_name = device_name
        return transaction_descr

    def _get_device_name(self):
        """Return the device name for a single-device job.

        :returns: the last element of the only device's 'device_fqname',
            or "" when there is not exactly one device or it has no fq name
        """
        if self.device_json:
            device_uuid_list = list(self.device_json.keys())
            if len(device_uuid_list) == 1:
                device_info = self.device_json[device_uuid_list[0]]
                device_fqname = device_info.get('device_fqname')
                if device_fqname:
                    return device_fqname[-1]
        return ""

    def _report_job_failure(self, err_msg, job_template):
        """Log a job failure and publish it through the result handler.

        Intended to be called from inside an ``except`` block so that
        ``traceback.format_exc()`` picks up the active exception. The result
        handler and the job template are only used when they exist, because
        the failure may have happened before either of them was created.

        :param err_msg: failure message to log and publish
        :param job_template: the job template read so far, or None
        """
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        if self.result_handler is None:
            return
        self.result_handler.update_job_status(JobStatus.FAILURE, err_msg)
        if job_template:
            self.result_handler.create_job_summary_log(job_template.fq_name)

    def start_job(self):
        """Execute the job template's playbooks and report the outcome.

        Reads the job template, sends the job start log, validates the job
        input, then runs the playbooks one by one. A failure of a single
        device playbook, or of a multi device playbook on all devices,
        restarts the walk in cleanup mode where only recovery playbooks are
        executed. The Sandesh connection is always closed at the end; if the
        job failed the process exits through ``sys.exit`` with the failure
        message.
        """
        job_error_msg = None
        job_template = None
        try:
            job_template = self.job_utils.read_job_template()
            self.job_template = job_template
            self.job_description = self.job_template.display_name
            if not self.job_transaction_descr:
                self.job_transaction_descr = self._generate_transaction_descr()

            # create the result handler used to record job status and logs
            self.result_handler = JobResultHandler(
                self.job_template_id, self.job_execution_id,
                self.fabric_fq_name, self._logger, self.job_utils,
                self.job_log_utils, self.device_name, self.job_description,
                self.job_transaction_id, self.job_transaction_descr)

            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)

            # timestamp in milliseconds
            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(
                job_template.fq_name,
                self.job_execution_id,
                self.fabric_fq_name,
                msg,
                JobStatus.STARTING.value,
                timestamp=timestamp,
                device_name=self.device_name,
                description=self.job_description,
                transaction_id=self.job_transaction_id,
                transaction_descr=self.job_transaction_descr)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()

            job_percent = None
            # calculate job percentage for each playbook; the weightage
            # array is only needed (and only read) for chained playbooks
            if len(playbook_list) > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list
                ]

            cleanup_in_progress = False
            cleanup_completed = False
            pb_idx = 0

            while pb_idx < len(playbook_list):

                # check if its a multi device playbook
                playbooks = job_template.get_job_template_playbooks()
                play_info = playbooks.playbook_info[pb_idx]
                multi_device_playbook = play_info.multi_device_playbook
                playbook_name = play_info.playbook_uri.split('/')[-1]

                if cleanup_in_progress:
                    # If we need to cleanup due to a previous error, ignore
                    # any playbooks that don't perform recovery
                    if not play_info.recovery_playbook:
                        self._logger.info("Ignoring playbook %s since it "
                                          "does not perform recovery" %
                                          playbook_name)
                        pb_idx += 1
                        continue

                    # If we are running a recovery playbook, then
                    # cleanup_completed needs to be set irrespective of
                    # a success or error in recovery playbook execution
                    self._logger.info("Running recovery playbook %s" %
                                      playbook_name)
                    cleanup_completed = True
                elif play_info.recovery_playbook:
                    # Don't run a recovery playbook if we haven't hit an error
                    self._logger.info(
                        "Ignoring recovery playbook %s since we "
                        "haven't hit an error" % playbook_name)
                    pb_idx += 1
                    continue

                if len(playbook_list) > 1:
                    # get the job percentage based on weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100, task_seq_number=pb_idx + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100)[0]  # using equal weightage

                retry_devices = None
                while True:
                    job_mgr = JobManager(self._logger, self._vnc_api,
                                         self.job_input, self.job_log_utils,
                                         job_template, self.result_handler,
                                         self.job_utils, pb_idx, job_percent,
                                         self._zk_client, self.job_description,
                                         self.job_transaction_id,
                                         self.job_transaction_descr)
                    # kept on the instance so the abort signal handler can
                    # reach the running playbook
                    self.job_mgr = job_mgr
                    job_mgr.start_job()

                    # retry the playbook execution if retry_devices is added to
                    # the playbook output
                    job_status = self.result_handler.job_result_status
                    retry_devices = self.result_handler.get_retry_devices()
                    failed_device_list = self.result_handler\
                        .get_failed_device_list()
                    if job_status == JobStatus.FAILURE or not retry_devices \
                            or self.abort_flag:
                        break
                    self.job_input['device_json'] = retry_devices
                    self.job_input['input']['failed_list'] = failed_device_list

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                if pb_output.get('early_exit'):
                    break

                # stop the workflow if playbook failed
                if self.result_handler.job_result_status == JobStatus.FAILURE:

                    # If it is a single device job or
                    # if it is a multi device playbook
                    # and all the devices have failed some job execution,
                    # declare it as failure, perform cleanup if possible
                    # and then stop the workflow

                    if not multi_device_playbook or \
                            (len(self.result_handler.failed_device_jobs) ==
                             len(self.job_input.get('device_json'))):
                        if not cleanup_in_progress:
                            # restart from the first playbook in cleanup mode
                            cleanup_in_progress = True
                            pb_idx = 0
                            self._logger.info("Stop the workflow on the failed"
                                              " Playbook and start cleanup")
                        else:
                            pb_idx += 1
                        continue

                    elif not retry_devices:
                        # it is a multi device playbook but one of
                        # the device jobs have failed. This means we should
                        # still declare the operation as success. We declare
                        # workflow as success even if one of the devices has
                        # succeeded the job

                        self.result_handler.job_result_status =\
                            JobStatus.SUCCESS

                if self.abort_flag:
                    # graceful abort requested via SIGUSR1
                    err_msg = "ABORTING NOW..."
                    self._logger.info(err_msg)
                    self.result_handler.update_job_status(
                        JobStatus.FAILURE, err_msg)
                    break

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                # read the device_data output of the playbook
                # and update the job input so that it can be used in next
                # iteration
                if not multi_device_playbook:
                    device_json = pb_output.pop('device_json', None)
                    self.job_input['device_json'] = device_json

                self.job_input.get('input', {}).update(pb_output)

                pb_idx += 1

            # A successful recovery playbook execution might
            # set JobStatus to success but this does not indicate a
            # success in the workflow. Set JobStatus to failure again.
            if cleanup_completed:
                err_msg = "Finished cleaning up after the error"
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                cleanup_completed = False
                cleanup_in_progress = False

            # create job completion log and update job UVE
            self.result_handler.create_job_summary_log(job_template.fq_name)

            # in case of failures, exit the job manager process with failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            # The message is recorded before reporting so that the process
            # still exits with it even if reporting itself fails.
            job_error_msg = "Job Exception recieved: %s " % repr(exp)
            self._report_job_failure(job_error_msg, job_template)
        except Exception as exp:
            job_error_msg = "Error while executing job %s " % repr(exp)
            self._report_job_failure(job_error_msg, job_template)
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)

    def job_mgr_abort_signal_handler(self, signalnum, frame):
        """Handle the abort signals registered in ``__init__``.

        SIGABRT forces an abort: the running playbook is aborted, the job is
        marked as failed, the summary log is written and the process exits.
        SIGUSR1 requests a graceful abort by setting ``abort_flag``, which
        ``start_job`` checks once the current playbook has completed.

        :param signalnum: number of the received signal
        :param frame: current stack frame (unused)
        """
        if signalnum == signal.SIGABRT:
            # Force abort; kill all playbooks, then exit
            err_msg = "Job aborting..."
            self._logger.info(err_msg)
            try:
                self.job_mgr.job_handler.playbook_abort()
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                self.result_handler.create_job_summary_log(
                    self.job_template.fq_name)
                # SystemExit is not an Exception subclass, so it is not
                # caught by the handler below.
                sys.exit()
            except Exception:
                self._logger.error("Failed to force abort")
        elif signalnum == signal.SIGUSR1:
            # Graceful abort; Exit after current playbook
            self._logger.info("Job will abort upon playbook completion...")
            self.abort_flag = True
예제 #10
0
class WFManager(object):
    """Run the playbooks of a job template, in order, for one job execution.

    The manager parses the job input, optionally validates it against the
    template's input schema, executes each playbook through a ``JobManager``
    and reports status through ``JobResultHandler`` / ``job_log_utils``.
    The workflow stops at the first playbook that is considered failed.
    """

    def __init__(self, logger, vnc_api, job_input, job_log_utils, zk_client):
        """Initialize the workflow manager from the job input.

        :param logger: logger used for all diagnostics
        :param vnc_api: API handle passed through to JobUtils and JobManager
        :param job_input: dict describing the job; must contain
            'job_template_id' and 'job_execution_id'
        :param job_log_utils: helper used to send job logs and to compute
            job percentages
        :param zk_client: client handle passed through to JobManager
        :raises Exception: if a mandatory job input field is missing
        """
        self._logger = logger
        self._vnc_api = vnc_api
        self.job_input = job_input
        self.job_log_utils = job_log_utils
        self.job_execution_id = None
        self.job_template_id = None
        self.device_json = None
        self.result_handler = None
        self.job_data = None
        self.fabric_fq_name = None
        # Fills in the ids, job data and fabric name initialised above.
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id,
                                  self.job_template_id,
                                  self._logger, self._vnc_api)
        self._zk_client = zk_client
        logger.debug("Job manager initialized")

    def parse_job_input(self, job_input_json):
        """Copy the fields of the job input dict onto this instance.

        :param job_input_json: dict holding the job request
        :raises Exception: if 'job_template_id' or 'job_execution_id' is
            missing (or None)
        """
        # job input should have job_template_id and execution_id field
        if job_input_json.get('job_template_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_TEMPLATE_MISSING)
            raise Exception(msg)

        if job_input_json.get('job_execution_id') is None:
            msg = MsgBundle.getMessage(
                MsgBundle.JOB_EXECUTION_ID_MISSING)
            raise Exception(msg)

        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json.get('job_execution_id')
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')

    def _validate_job_input(self, input_schema, ip_json):
        """Validate the job input against the template's input schema.

        :param input_schema: JSON schema, either already decoded or as a
            JSON string
        :param ip_json: the job input to validate
        :raises JobException: if no input was supplied, or if decoding the
            schema or validating the input fails
        """
        if ip_json is None:
            msg = MsgBundle.getMessage(
                MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg,
                               self.job_execution_id)

        # ``basestring`` is a Python 2 builtin. Fall back to ``str`` when it
        # is not available so that the NameError is not swallowed below and
        # misreported as an invalid schema.
        try:
            string_types = basestring
        except NameError:
            string_types = str

        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, string_types):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            self._logger.debug("Input Schema Validation Successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def _report_job_failure(self, err_msg, job_template):
        """Log a job failure and publish it through the result handler.

        Intended to be called from inside an ``except`` block so that
        ``traceback.format_exc()`` picks up the active exception. The result
        handler and the job template are only used when they exist, because
        the failure may have happened before either of them was created.

        :param err_msg: failure message to log and publish
        :param job_template: the job template read so far, or None
        """
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        if self.result_handler is None:
            return
        self.result_handler.update_job_status(JobStatus.FAILURE, err_msg)
        if job_template:
            self.result_handler.create_job_summary_log(job_template.fq_name)

    def start_job(self):
        """Execute the job template's playbooks and report the outcome.

        Creates the result handler, reads the job template, sends the job
        start log, validates the job input and then runs the playbooks one
        by one, feeding each playbook's output into the next one's input.
        The Sandesh connection is always closed at the end; if the job
        failed the process exits through ``sys.exit`` with the failure
        message.
        """
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)

            job_template = self.job_utils.read_job_template()

            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)

            # timestamp in milliseconds
            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name,
                                            msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate job input if required by job_template input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()

            job_percent = None
            # calculate job percentage for each playbook; the weightage
            # array is only needed (and only read) for chained playbooks
            if len(playbook_list) > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list]

            for i in range(len(playbook_list)):

                # check if its a multi device playbook
                playbooks = job_template.get_job_template_playbooks()
                play_info = playbooks.playbook_info[i]
                multi_device_playbook = play_info.multi_device_playbook

                if len(playbook_list) > 1:
                    # get the job percentage based on weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100, task_seq_number=i + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100)[0]  # using equal weightage

                retry_devices = None
                while True:
                    job_mgr = JobManager(self._logger, self._vnc_api,
                                         self.job_input, self.job_log_utils,
                                         job_template,
                                         self.result_handler, self.job_utils,
                                         i, job_percent, self._zk_client)
                    job_mgr.start_job()

                    # retry the playbook execution if retry_devices is added to
                    # the playbook output
                    job_status = self.result_handler.job_result_status
                    retry_devices = self.result_handler.get_retry_devices()
                    if job_status == JobStatus.FAILURE or not retry_devices:
                        break
                    self.job_input['device_json'] = retry_devices

                # update the job input with marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                if pb_output.get('early_exit'):
                    break

                # stop the workflow if playbook failed
                if self.result_handler.job_result_status == JobStatus.FAILURE:

                    # stop workflow only if its a single device job or
                    # it is a multi device playbook
                    # and all the devices have failed some job execution
                    # declare it as failure and the stop the workflow

                    if not multi_device_playbook or \
                            (len(self.result_handler.failed_device_jobs) ==
                             len(self.job_input.get('device_json'))):
                        self._logger.error(
                            "Stop the workflow on the failed Playbook.")
                        break

                    elif not retry_devices:
                        # it is a multi device playbook but one of the device
                        # jobs have failed. This means we should still declare
                        # the operation as success. We declare workflow as
                        # success even if one of the devices has succeeded
                        # the job

                        self.result_handler.job_result_status = \
                            JobStatus.SUCCESS

                # read the device_data output of the playbook
                # and update the job input so that it can be used in next
                # iteration
                if not multi_device_playbook:
                    device_json = pb_output.pop('device_json', None)
                    self.job_input['device_json'] = device_json

                self.job_input.get('input', {}).update(pb_output)

            # create job completion log and update job UVE
            self.result_handler.create_job_summary_log(
                job_template.fq_name)

            # in case of failures, exit the job manager process with failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            # The message is recorded before reporting so that the process
            # still exits with it even if reporting itself fails.
            job_error_msg = "Job Exception recieved: %s " % repr(exp)
            self._report_job_failure(job_error_msg, job_template)
        except Exception as exp:
            job_error_msg = "Error while executing job %s " % repr(exp)
            self._report_job_failure(job_error_msg, job_template)
        finally:
            # need to wait for the last job log and uve update to complete
            # via sandesh and then close sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
            if job_error_msg is not None:
                sys.exit(job_error_msg)