                            repr(exp))

    # initialize ZooKeeper client
    zk_client = None
    try:
        zk_client = initialize_zookeeper_client(logger._args)
        logger.info("Zookeeper client is initialized.")
    except Exception as exp:
        handle_init_failure(job_input_json, MsgBundle.ZK_INIT_FAILURE,
                            repr(exp))

    job_utils = JobUtils(job_input_json.get('job_execution_id'),
                         job_input_json.get('job_template_id'),
                         logger, vnc_api)
    job_template = job_utils.read_job_template()
    template_type = job_template.get_job_template_type()
    logger.info(template_type)

    # invoke the job manager matching the template type
    try:
        if template_type == "executable":
            exec_manager = ExecutableManager(logger, vnc_api, job_input_json,
                                             job_log_utils)
            logger.info("Executable Manager is initialized. Starting job.")
            exec_manager.start_job()
        else:
            workflow_manager = WFManager(logger, vnc_api, job_input_json,
                                         job_log_utils, zk_client)
            logger.info("Workflow Manager is initialized. Starting job.")
            workflow_manager.start_job()
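# --- Illustrative sketch (not part of the original module) ---
# WFManager.parse_job_input() below requires 'job_template_id' and
# 'job_execution_id' and optionally reads 'input', 'fabric_fq_name',
# 'db_init_params' and 'cluster_id'. A minimal job_input_json, with
# hypothetical placeholder values, could therefore look like this:
example_job_input_json = {
    'job_template_id': 'hypothetical-template-uuid',
    'job_execution_id': 'hypothetical-execution-id',
    'input': {},                # payload validated against the template's
                                # input_schema, if one is defined
    'fabric_fq_name': 'hypothetical-fabric-fq-name',
    'db_init_params': None,     # passed through to JobManager
    'cluster_id': None,
}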
class WFManager(object):

    def __init__(self, logger, vnc_api, job_input, job_log_utils, zk_client):
        self._logger = logger
        self._vnc_api = vnc_api
        self.job_input = job_input
        self.db_init_params = None
        self.job_log_utils = job_log_utils
        self.job_execution_id = None
        self.job_template_id = None
        self.device_json = None
        self.result_handler = None
        self.job_data = None
        self.cluster_id = None
        self.fabric_fq_name = None
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id,
                                  self.job_template_id,
                                  self._logger, self._vnc_api)
        self._zk_client = zk_client
        logger.debug("Job manager initialized")

    def parse_job_input(self, job_input_json):
        # job input must have job_template_id and job_execution_id fields
        if job_input_json.get('job_template_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_TEMPLATE_MISSING)
            raise Exception(msg)
        if job_input_json.get('job_execution_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_EXECUTION_ID_MISSING)
            raise Exception(msg)
        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json.get('job_execution_id')
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')
        self.db_init_params = job_input_json.get('db_init_params')
        self.cluster_id = job_input_json.get('cluster_id')

    def _validate_job_input(self, input_schema, ip_json):
        if ip_json is None:
            msg = MsgBundle.getMessage(
                MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg, self.job_execution_id)
        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, str):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            self._logger.debug("Input schema validation successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def start_job(self):
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)
            job_template = self.job_utils.read_job_template()
            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)
            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name, msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate the job input if required by the job_template
            # input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()

            job_percent = None
            # calculate the job percentage for each playbook
            if len(playbook_list) > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list]

            for i in range(0, len(playbook_list)):
                if len(playbook_list) > 1:
                    # get the job percentage based on the weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100, task_seq_number=i + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100)[0]  # using equal weightage
                retry_devices = None
                while True:
                    job_mgr = JobManager(self._logger, self._vnc_api,
                                         self.job_input, self.job_log_utils,
                                         job_template, self.result_handler,
                                         self.job_utils, i, job_percent,
                                         self._zk_client,
                                         self.db_init_params,
                                         self.cluster_id)
                    job_mgr.start_job()

                    # retry the playbook execution if retry_devices is added
                    # to the playbook output
                    job_status = self.result_handler.job_result_status
                    retry_devices = self.result_handler.get_retry_devices()
                    if job_status == JobStatus.FAILURE or not retry_devices:
                        break
                    self.job_input['device_json'] = retry_devices

                # stop the workflow if the playbook failed
                if self.result_handler.job_result_status == JobStatus.FAILURE:
                    # Stop the workflow only if it is a single-device job, or
                    # if it is a multi-device playbook and all the devices
                    # have failed some job execution; declare it as a failure
                    # and then stop the workflow.
                    if self.job_input.get('device_json') is None or \
                            len(self.result_handler.failed_device_jobs) \
                            == len(self.job_input.get('device_json')):
                        self._logger.error(
                            "Stop the workflow on the failed playbook.")
                        break
                    elif not retry_devices:
                        # It is a multi-device playbook, but only some of the
                        # device jobs have failed, so we should still declare
                        # the operation as success. The workflow is a success
                        # even if only one of the devices succeeded the job.
                        self.result_handler.job_result_status = \
                            JobStatus.SUCCESS

                # update the job input with the marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                # read the device_data output of the playbook and update the
                # job input so that it can be used in the next iteration
                if not self.job_input.get('device_json'):
                    device_json = pb_output.pop('device_json', None)
                    self.job_input['device_json'] = device_json

                self.job_input.get('input', {}).update(pb_output)

            # create the job completion log and update the job UVE
            self.result_handler.create_job_summary_log(job_template.fq_name)

            # in case of failures, exit the job manager process with failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            err_msg = "Job exception received: %s" % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            self.result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s" % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            self.result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            self.result_handler.create_job_summary_log(job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and UVE update to complete
            # via sandesh, and then close the sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
        if job_error_msg is not None:
            sys.exit(job_error_msg)
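# --- Illustrative sketch (not part of the original module) ---
# JobLogUtils.calculate_job_percentage() is not shown in this file. The loop
# above relies on it to split a 100-percent budget across chained playbooks
# according to their job_completion_weightage. A plausible standalone version
# of that arithmetic (an assumption about the real helper, not its code):
def split_percentage_by_weightage(task_weightage_array, total_percent=100):
    # each playbook's share is proportional to its weightage
    total_weight = sum(task_weightage_array)
    return [total_percent * weight / float(total_weight)
            for weight in task_weightage_array]

# e.g. weightages [20, 30, 50] -> [20.0, 30.0, 50.0] percent per playbook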
class WFManager(object):

    def __init__(self, logger, vnc_api, job_input, job_log_utils, zk_client):
        """Initializes workflow manager."""
        self._logger = logger
        self._vnc_api = vnc_api
        self.job_input = job_input
        self.job_log_utils = job_log_utils
        self.job_execution_id = None
        self.job_description = None
        self.job_transaction_id = None
        self.job_transaction_descr = None
        self.job_template_id = None
        self.device_json = None
        self.device_name = ""
        self.result_handler = None
        self.job_data = None
        self.fabric_fq_name = None
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id,
                                  self.job_template_id,
                                  self._logger, self._vnc_api)
        self._zk_client = zk_client
        self.job_mgr = None
        self.job_template = None
        self.abort_flag = False
        signal.signal(signal.SIGABRT, self.job_mgr_abort_signal_handler)
        signal.signal(signal.SIGUSR1, self.job_mgr_abort_signal_handler)
        logger.debug("Job manager initialized")

    def parse_job_input(self, job_input_json):
        # job input must have job_template_id and job_execution_id fields
        if job_input_json.get('job_template_id') is None:
            msg = MsgBundle.getMessage(MsgBundle.JOB_TEMPLATE_MISSING)
            raise Exception(msg)
        if job_input_json.get('job_execution_id') is None:
            msg = MsgBundle.getMessage(
                MsgBundle.JOB_EXECUTION_ID_MISSING)
            raise Exception(msg)
        self.device_json = job_input_json.get('device_json')
        self.job_description = job_input_json.get('job_description', "")
        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json.get('job_execution_id')
        self.job_transaction_id = \
            job_input_json.get('job_transaction_id', self.job_execution_id)
        self.job_transaction_descr = \
            job_input_json.get('job_transaction_descr')
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')
        self.device_name = self._get_device_name()

    def _validate_job_input(self, input_schema, ip_json):
        if ip_json is None:
            msg = MsgBundle.getMessage(
                MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg, self.job_execution_id)
        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, basestring):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            self._logger.debug("Input schema validation successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def _generate_transaction_descr(self):
        transaction_descr = self.job_template.display_name
        if self.device_json:
            transaction_descr += " for "
            device_uuid_list = list(self.device_json.keys())
            for device_uuid in device_uuid_list:
                device_info = self.device_json[device_uuid]
                device_fqname = device_info.get('device_fqname')
                if device_fqname:
                    device_name = device_fqname[-1]
                    transaction_descr += device_name + " "
                    if len(device_uuid_list) == 1:
                        self.device_name = device_name
        return transaction_descr

    def _get_device_name(self):
        if self.device_json:
            device_uuid_list = list(self.device_json.keys())
            if len(device_uuid_list) == 1:
                device_info = self.device_json[device_uuid_list[0]]
                device_fqname = device_info.get('device_fqname')
                if device_fqname:
                    return device_fqname[-1]
        return ""

    def start_job(self):
        job_error_msg = None
        job_template = None
        try:
            # create job UVE and log
            job_template = self.job_utils.read_job_template()
            self.job_template = job_template
            self.job_description = self.job_template.display_name
            if not self.job_transaction_descr:
                self.job_transaction_descr = \
                    self._generate_transaction_descr()
            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils,
                                                   self.device_name,
                                                   self.job_description,
                                                   self.job_transaction_id,
                                                   self.job_transaction_descr)
            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)
            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(
                job_template.fq_name,
                self.job_execution_id,
                self.fabric_fq_name, msg,
                JobStatus.STARTING.value,
                timestamp=timestamp,
                device_name=self.device_name,
                description=self.job_description,
                transaction_id=self.job_transaction_id,
                transaction_descr=self.job_transaction_descr)

            # validate the job input if required by the job_template
            # input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            playbook_list = job_template.get_job_template_playbooks()\
                .get_playbook_info()

            job_percent = None
            # calculate the job percentage for each playbook
            if len(playbook_list) > 1:
                task_weightage_array = [
                    pb_info.job_completion_weightage
                    for pb_info in playbook_list]

            for i in range(0, len(playbook_list)):
                # check if this is a multi-device playbook
                playbooks = job_template.get_job_template_playbooks()
                play_info = playbooks.playbook_info[i]
                multi_device_playbook = play_info.multi_device_playbook

                if len(playbook_list) > 1:
                    # get the job percentage based on the weightage of each
                    # playbook when they are chained
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100, task_seq_number=i + 1,
                            task_weightage_array=task_weightage_array)[0]
                else:
                    job_percent = \
                        self.job_log_utils.calculate_job_percentage(
                            len(playbook_list), buffer_task_percent=True,
                            total_percent=100)[0]  # using equal weightage

                retry_devices = None
                while True:
                    job_mgr = JobManager(self._logger, self._vnc_api,
                                         self.job_input, self.job_log_utils,
                                         job_template, self.result_handler,
                                         self.job_utils, i, job_percent,
                                         self._zk_client,
                                         self.job_description,
                                         self.job_transaction_id,
                                         self.job_transaction_descr)
                    self.job_mgr = job_mgr
                    job_mgr.start_job()

                    # retry the playbook execution if retry_devices is added
                    # to the playbook output
                    job_status = self.result_handler.job_result_status
                    retry_devices = self.result_handler.get_retry_devices()
                    failed_device_list = \
                        self.result_handler.get_failed_device_list()
                    if job_status == JobStatus.FAILURE or not retry_devices \
                            or self.abort_flag:
                        break
                    self.job_input['device_json'] = retry_devices
                    self.job_input['input']['failed_list'] = \
                        failed_device_list

                # check the marked playbook output json for an early exit
                pb_output = self.result_handler.playbook_output or {}
                if pb_output.get('early_exit'):
                    break

                # stop the workflow if the playbook failed
                if self.result_handler.job_result_status == JobStatus.FAILURE:
                    # Stop the workflow only if it is a single-device job, or
                    # if it is a multi-device playbook and all the devices
                    # have failed some job execution; declare it as a failure
                    # and then stop the workflow.
                    if not multi_device_playbook or \
                            (multi_device_playbook and
                             len(self.result_handler.failed_device_jobs) ==
                             len(self.job_input.get('device_json'))):
                        self._logger.error(
                            "Stop the workflow on the failed playbook.")
                        break
                    elif not retry_devices:
                        # It is a multi-device playbook, but only some of the
                        # device jobs have failed. This means we should still
                        # declare the operation as success.
                        # We declare the workflow as success even if only one
                        # of the devices succeeded the job.
                        self.result_handler.job_result_status = \
                            JobStatus.SUCCESS

                if self.abort_flag:
                    err_msg = "ABORTING NOW..."
                    self._logger.info(err_msg)
                    self.result_handler.update_job_status(JobStatus.FAILURE,
                                                          err_msg)
                    break

                # update the job input with the marked playbook output json
                pb_output = self.result_handler.playbook_output or {}

                # read the device_data output of the playbook and update the
                # job input so that it can be used in the next iteration
                if not multi_device_playbook:
                    device_json = pb_output.pop('device_json', None)
                    self.job_input['device_json'] = device_json

                self.job_input.get('input', {}).update(pb_output)

            # create the job completion log and update the job UVE
            self.result_handler.create_job_summary_log(
                job_template.fq_name)

            # in case of failures, exit the job manager process with failure
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                job_error_msg = self.result_handler.job_summary_message

        except JobException as exp:
            err_msg = "Job exception received: %s" % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            self.result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s" % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            self.result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            self.result_handler.create_job_summary_log(job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and UVE update to complete
            # via sandesh, and then close the sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
        if job_error_msg is not None:
            sys.exit(job_error_msg)

    def job_mgr_abort_signal_handler(self, signalnum, frame):
        if signalnum == signal.SIGABRT:
            # force abort: kill all playbooks, then exit
            err_msg = "Job aborting..."
            self._logger.info(err_msg)
            try:
                self.job_mgr.job_handler.playbook_abort()
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                self.result_handler.create_job_summary_log(
                    self.job_template.fq_name)
                sys.exit()
            except Exception:
                self._logger.error("Failed to force abort")
        elif signalnum == signal.SIGUSR1:
            # graceful abort: exit after the current playbook completes
            self._logger.info("Job will abort upon playbook completion...")
            self.abort_flag = True
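# --- Illustrative sketch (not part of the original module) ---
# The handler above distinguishes a force abort (SIGABRT: kill the playbooks
# and exit immediately) from a graceful abort (SIGUSR1: set a flag that the
# start_job() loop checks between playbooks). A minimal self-contained
# version of that pattern, with a hypothetical do_one_step() stand-in for a
# playbook run (imports repeated so the sketch runs on its own):
import signal
import time

class GracefulAbortDemo(object):
    def __init__(self):
        self.abort_flag = False
        # deliver SIGUSR1 to this process to request a graceful stop
        signal.signal(signal.SIGUSR1, self._on_signal)

    def _on_signal(self, signalnum, frame):
        # defer the abort; the work loop honors it at a safe point
        self.abort_flag = True

    def run(self):
        for step in range(5):
            if self.abort_flag:
                print("aborting before step %d" % step)
                break
            time.sleep(1)  # stand-in for do_one_step()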
class ExecutableManager(object):

    def __init__(self, logger, vnc_api, job_input, job_log_utils):
        self._logger = logger
        self.vnc_api = vnc_api
        self.vnc_api_init_params = None
        self.api_server_host = None
        self.auth_token = None
        self.contrail_cluster_id = None
        self.sandesh_args = None
        self.job_log_utils = job_log_utils
        self.job_input = job_input
        self.job_utils = None
        self.executable_timeout = 1800
        self.job_template = None
        self.job_execution_id = None
        self.job_template_id = None
        self.result_handler = None
        self.parse_job_input(job_input)
        self.job_utils = JobUtils(self.job_execution_id,
                                  self.job_template_id,
                                  self._logger, self.vnc_api)
        self.job_template = self.job_utils.read_job_template()
        self.job_file_write = JobFileWrite(self._logger)

    def parse_job_input(self, job_input_json):
        # job input should have job_template_id and job_execution_id fields
        self.job_template_id = job_input_json.get('job_template_id')
        self.job_execution_id = job_input_json.get('job_execution_id')
        self.job_data = job_input_json.get('input')
        self.fabric_fq_name = job_input_json.get('fabric_fq_name')
        self.auth_token = job_input_json.get('auth_token')
        self.contrail_cluster_id = job_input_json.get('contrail_cluster_id')
        self.sandesh_args = job_input_json.get('args')
        self.vnc_api_init_params = job_input_json.get('vnc_api_init_params')
        self.api_server_host = job_input_json.get('api_server_host')

    def _validate_job_input(self, input_schema, ip_json):
        if ip_json is None:
            msg = MsgBundle.getMessage(
                MsgBundle.INPUT_SCHEMA_INPUT_NOT_FOUND)
            raise JobException(msg, self.job_execution_id)
        try:
            ip_schema_json = input_schema
            if isinstance(input_schema, basestring):
                ip_schema_json = json.loads(input_schema)
            jsonschema.validate(ip_json, ip_schema_json)
            # log the success message at debug level, not error
            self._logger.debug("Input schema validation successful "
                               "for template %s" % self.job_template_id)
        except Exception as exp:
            msg = MsgBundle.getMessage(MsgBundle.INVALID_SCHEMA,
                                       job_template_id=self.job_template_id,
                                       exc_obj=exp)
            raise JobException(msg, self.job_execution_id)

    def gather_job_args(self):
        extra_vars = {
            'input': self.job_data,
            'job_template_id': self.job_template.get_uuid(),
            'job_template_fqname': self.job_template.fq_name,
            'fabric_fq_name': self.fabric_fq_name,
            'auth_token': self.auth_token,
            'contrail_cluster_id': self.contrail_cluster_id,
            'api_server_host': self.api_server_host,
            'job_execution_id': self.job_execution_id,
            'sandesh_args': self.sandesh_args,
            'vnc_api_init_params': self.vnc_api_init_params,
        }
        return extra_vars

    def start_job(self):
        self._logger.info("Starting Executable")
        job_error_msg = None
        job_template = self.job_template
        try:
            # create job UVE and log
            self.result_handler = JobResultHandler(self.job_template_id,
                                                   self.job_execution_id,
                                                   self.fabric_fq_name,
                                                   self._logger,
                                                   self.job_utils,
                                                   self.job_log_utils)
            msg = MsgBundle.getMessage(
                MsgBundle.START_JOB_MESSAGE,
                job_execution_id=self.job_execution_id,
                job_template_name=job_template.fq_name[-1])
            self._logger.debug(msg)
            timestamp = int(round(time.time() * 1000))
            self.job_log_utils.send_job_log(job_template.fq_name,
                                            self.job_execution_id,
                                            self.fabric_fq_name, msg,
                                            JobStatus.STARTING.value,
                                            timestamp=timestamp)

            # validate the job input if required by the job_template
            # input_schema
            input_schema = job_template.get_job_template_input_schema()
            if input_schema:
                self._validate_job_input(input_schema, self.job_data)

            executable_list = job_template.get_job_template_executables()\
                .get_executable_info()
            for executable in executable_list:
                exec_path = executable.get_executable_path()
                exec_args = executable.get_executable_args()
                job_input_args = self.gather_job_args()
                try:
                    exec_process = subprocess32.Popen(
                        [exec_path, exec_args, "-i",
                         json.dumps(job_input_args)],
                        close_fds=True, cwd='/')
                    self.job_file_write.write_to_file(
                        self.job_execution_id, "job_summary",
                        JobFileWrite.JOB_LOG,
                        {"job_status": "INPROGRESS"})
                    msg = "Child process pid = " + str(exec_process.pid)
                    self._logger.info(msg)
                    exec_process.wait(timeout=self.executable_timeout)
                except subprocess32.TimeoutExpired as timeout_exp:
                    if exec_process is not None:
                        os.kill(exec_process.pid, 9)
                    msg = MsgBundle.getMessage(
                        MsgBundle.RUN_EXECUTABLE_PROCESS_TIMEOUT,
                        exec_path=exec_path,
                        exc_msg=repr(timeout_exp))
                    raise JobException(msg, self.job_execution_id)

                self._logger.info(exec_process.returncode)
                self._logger.info("Executable Completed")
                if exec_process.returncode != 0:
                    self.job_file_write.write_to_file(
                        self.job_execution_id, "job_summary",
                        JobFileWrite.JOB_LOG,
                        {"job_status": "FAILED"})
                    msg = MsgBundle.getMessage(
                        MsgBundle.EXECUTABLE_RETURN_WITH_ERROR,
                        exec_uri=exec_path)
                    raise JobException(msg, self.job_execution_id)
                # record COMPLETED only after the return-code check, so a
                # failed run is not first marked COMPLETED and then FAILED
                self.job_file_write.write_to_file(
                    self.job_execution_id, "job_summary",
                    JobFileWrite.JOB_LOG,
                    {"job_status": "COMPLETED"})

        except JobException as exp:
            err_msg = "Job exception received: %s" % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            self.result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            if job_template:
                self.result_handler.create_job_summary_log(
                    job_template.fq_name)
            job_error_msg = err_msg
        except Exception as exp:
            err_msg = "Error while executing job %s" % repr(exp)
            self._logger.error(err_msg)
            self._logger.error("%s" % traceback.format_exc())
            self.result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            self.result_handler.create_job_summary_log(job_template.fq_name)
            job_error_msg = err_msg
        finally:
            # need to wait for the last job log and UVE update to complete
            # via sandesh, and then close the sandesh connection
            sandesh_util = SandeshUtils(self._logger)
            sandesh_util.close_sandesh_connection()
            self._logger.info("Closed Sandesh connection")
        if job_error_msg is not None:
            sys.exit(job_error_msg)
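# --- Illustrative sketch (not part of the original module) ---
# start_job() above launches each executable with subprocess32 (the Python 2
# backport of Python 3's subprocess timeout support) and kills it if it runs
# past executable_timeout. The same pattern on Python 3's standard library,
# with a hypothetical command (imports repeated so the sketch runs on its
# own):
import os
import signal
import subprocess

def run_with_timeout(cmd_args, timeout_secs):
    # launch the child and wait up to timeout_secs for it to finish
    proc = subprocess.Popen(cmd_args, close_fds=True)
    try:
        proc.wait(timeout=timeout_secs)
    except subprocess.TimeoutExpired:
        # same effect as os.kill(pid, 9) in the code above
        os.kill(proc.pid, signal.SIGKILL)
        raise
    return proc.returncode

# e.g. run_with_timeout(["/bin/sleep", "2"], timeout_secs=5) returns 0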