def wrapper(*args, **kwargs):
    """Run the wrapped callable with a job-scoped sandesh logger.

    ``args[0]`` is the module object. Its job context is validated, a
    ``JobLogUtils`` instance is attached to it as ``job_log_util`` and the
    wrapped ``function`` (a name taken from the enclosing scope, which is
    not part of this block) is called with the original arguments.

    Failures are never raised to the caller: they are recorded in
    ``module.results`` (``msg`` and ``failed``) and logged through
    ``module.logger``. The sandesh connection is closed on every path on
    which a ``job_log_util`` was created. The return value of ``function``
    is discarded.
    """
    module = args[0]
    try:
        module._validate_job_ctx()
        module.job_log_util = JobLogUtils(
            sandesh_instance_id=str(uuid.uuid4()),
            config_args=json.dumps(module.job_ctx['config_args']))
        function(*args, **kwargs)
    except ValueError as verr:
        module.results['msg'] = str(verr)
        module.results['failed'] = True
        module.logger.error(str(verr))
    except Exception as ex:
        # Read the context defensively: a lookup failure in this handler
        # would escape the wrapper and hide the original error.
        job_ctx = getattr(module, 'job_ctx', None)
        if not isinstance(job_ctx, dict):
            job_ctx = {}
        # Adjacent literals are concatenated by the parser, so no source
        # indentation leaks into the message text.
        msg = ("Failed object log due to error: %s\n\t "
               "job name: %s\n\t "
               "job execution id: %s\n"
               % (str(ex),
                  job_ctx.get('job_template_fqname'),
                  job_ctx.get('job_execution_id')))
        module.results['msg'] = msg
        module.results['failed'] = True
        module.logger.error(msg)
    finally:
        try:
            # job_log_util is missing when validation failed before it
            # could be created; there is nothing to close in that case.
            job_log_util = getattr(module, 'job_log_util', None)
            if job_log_util:
                sandesh_util = SandeshUtils(
                    job_log_util.get_config_logger())
                sandesh_util.close_sandesh_connection()
        except Exception as ex:
            module.logger.error("Unable to close sandesh connection: %s",
                                str(ex))
def initialize_sandesh_logger(self, config_args, sandesh=True,
                              sandesh_instance=None):
    """Create the config ``JobLogger`` and optionally wait for sandesh.

    The logger arguments are parsed from ``config_args`` and stored on
    ``self.args``; a shuffled copy of the collector list is kept in
    ``random_collectors``. When no ``sandesh_instance`` is supplied and
    ``sandesh`` is true, the call waits for the sandesh connection to be
    established.

    Returns the created logger.

    Raises ``JobException`` with the initialization-timeout message when
    waiting for the connection fails with a ``JobException``.
    """
    parsed_args = self.parse_logger_args(config_args)
    collector_list = parsed_args.collectors
    # shuffle only a non-empty collector list; otherwise keep it as it is
    parsed_args.random_collectors = (
        random.sample(collector_list, len(collector_list))
        if collector_list else collector_list)
    self.args = parsed_args

    config_logger = JobLogger(
        args=parsed_args,
        sandesh_instance_id=self.sandesh_instance_id,
        sandesh_instance=sandesh_instance)

    # nothing to wait for when an instance was handed in or sandesh is off
    if sandesh_instance or not sandesh:
        return config_logger

    try:
        SandeshUtils(config_logger).wait_for_connection_establish()
    except JobException:
        raise JobException(MsgBundle.getMessage(
            MsgBundle.SANDESH_INITIALIZATION_TIMEOUT_ERROR))

    config_logger.info("Sandesh is initialized."
                       " Config logger instance created.")
    return config_logger
def start_job(self):
    """Run every playbook of the job template in sequence.

    Creates the result handler, reads the job template, sends the
    "starting" job log, validates the job input against the template's
    input schema (when it has one) and then runs one ``JobManager`` per
    playbook. A playbook is re-run for as long as the result handler
    reports retry devices. The marked playbook output is merged into
    ``self.job_input`` for the next playbook.

    On every path the sandesh connection is closed at the end. When the
    job failed, the process is terminated with
    ``sys.exit(<error message>)``.
    """
    job_error_msg = None
    job_template = None
    try:
        # create job UVE and log
        self.result_handler = JobResultHandler(self.job_template_id,
                                               self.job_execution_id,
                                               self.fabric_fq_name,
                                               self._logger,
                                               self.job_utils,
                                               self.job_log_utils)
        job_template = self.job_utils.read_job_template()
        self.job_template = job_template

        msg = MsgBundle.getMessage(
            MsgBundle.START_JOB_MESSAGE,
            job_execution_id=self.job_execution_id,
            job_template_name=job_template.fq_name[-1])
        self._logger.debug(msg)

        timestamp = int(round(time.time() * 1000))
        self.job_log_utils.send_job_log(job_template.fq_name,
                                        self.job_execution_id,
                                        self.fabric_fq_name,
                                        msg,
                                        JobStatus.STARTING.value,
                                        timestamp=timestamp)

        # validate job input if required by job_template input_schema
        input_schema = job_template.get_job_template_input_schema()
        if input_schema:
            self._validate_job_input(input_schema, self.job_data)

        playbook_list = job_template.get_job_template_playbooks()\
            .get_playbook_info()

        job_percent = None
        # calculate job percentage for each playbook
        if len(playbook_list) > 1:
            task_weightage_array = [
                pb_info.job_completion_weightage
                for pb_info in playbook_list]

        for i in range(0, len(playbook_list)):

            # check if its a multi device playbook
            playbooks = job_template.get_job_template_playbooks()
            play_info = playbooks.playbook_info[i]
            multi_device_playbook = play_info.multi_device_playbook

            if len(playbook_list) > 1:
                # get the job percentage based on weightage of each
                # playbook when they are chained
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        len(playbook_list), buffer_task_percent=True,
                        total_percent=100, task_seq_number=i + 1,
                        task_weightage_array=task_weightage_array)[0]
            else:
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        len(playbook_list), buffer_task_percent=True,
                        total_percent=100)[0]  # using equal weightage

            retry_devices = None
            while True:
                job_mgr = JobManager(self._logger, self._vnc_api,
                                     self.job_input, self.job_log_utils,
                                     job_template,
                                     self.result_handler, self.job_utils, i,
                                     job_percent, self._zk_client)
                self.job_mgr = job_mgr
                job_mgr.start_job()

                # retry the playbook execution if retry_devices is added to
                # the playbook output
                job_status = self.result_handler.job_result_status
                retry_devices = self.result_handler.get_retry_devices()
                if job_status == JobStatus.FAILURE or not retry_devices \
                        or self.abort_flag:
                    break
                self.job_input['device_json'] = retry_devices

            # update the job input with marked playbook output json
            pb_output = self.result_handler.playbook_output or {}

            if pb_output.get('early_exit'):
                break

            # stop the workflow if playbook failed
            if self.result_handler.job_result_status == JobStatus.FAILURE:

                # stop workflow only if its a single device job or
                # it is a multi device playbook
                # and all the devices have failed some job execution
                # declare it as failure and the stop the workflow
                if not multi_device_playbook or \
                        (multi_device_playbook and
                         len(self.result_handler.failed_device_jobs) ==
                         len(self.job_input.get('device_json'))):
                    self._logger.error(
                        "Stop the workflow on the failed Playbook.")
                    break

                elif not retry_devices:
                    # it is a multi device playbook but one of
                    # the device jobs have failed. This means we should
                    # still declare the operation as success. We declare
                    # workflow as success even if one of the devices has
                    # succeeded the job
                    self.result_handler.job_result_status =\
                        JobStatus.SUCCESS

            if self.abort_flag:
                err_msg = "ABORTING NOW..."
                self._logger.info(err_msg)
                self.result_handler.update_job_status(JobStatus.FAILURE,
                                                      err_msg)
                break

            # update the job input with marked playbook output json
            pb_output = self.result_handler.playbook_output or {}

            # read the device_data output of the playbook
            # and update the job input so that it can be used in next
            # iteration
            if not multi_device_playbook:
                device_json = pb_output.pop('device_json', None)
                self.job_input['device_json'] = device_json

            self.job_input.get('input', {}).update(pb_output)

        # create job completion log and update job UVE
        self.result_handler.create_job_summary_log(
            job_template.fq_name)

        # in case of failures, exit the job manager process with failure
        if self.result_handler.job_result_status == JobStatus.FAILURE:
            job_error_msg = self.result_handler.job_summary_message

    except JobException as exp:
        err_msg = "Job Exception recieved: %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # The failure may have happened before the result handler or the
        # job template became available; reporting must not raise again.
        result_handler = getattr(self, 'result_handler', None)
        if result_handler is not None:
            result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            if job_template:
                result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    except Exception as exp:
        err_msg = "Error while executing job %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # Same guards as above: job_template is still None when
        # read_job_template() itself failed, and dereferencing it here
        # would replace the real error with an AttributeError.
        result_handler = getattr(self, 'result_handler', None)
        if result_handler is not None:
            result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            if job_template:
                result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    finally:
        # need to wait for the last job log and uve update to complete
        # via sandesh and then close sandesh connection
        sandesh_util = SandeshUtils(self._logger)
        sandesh_util.close_sandesh_connection()
        self._logger.info("Closed Sandesh connection")
        if job_error_msg is not None:
            sys.exit(job_error_msg)
def start_job(self):
    """Run every executable of the job template as a child process.

    Creates the result handler, sends the "starting" job log, validates
    the job input against the template's input schema (when it has one)
    and then starts each executable with the gathered job arguments
    passed as JSON through ``--job-input``. The job status
    (``INPROGRESS``, ``FAILED`` or ``COMPLETED``) is written to the job
    summary file for every executable.

    A child that does not finish within ``self.executable_timeout`` is
    killed and reported as a ``JobException``. On every path the sandesh
    connection is closed at the end. When an exception was handled, the
    process is terminated with ``sys.exit(<error message>)``.
    """
    self._logger.info("Starting Executable")
    job_error_msg = None
    job_template = self.job_template
    try:
        # create job UVE and log
        self.result_handler = JobResultHandler(self.job_template_id,
                                               self.job_execution_id,
                                               self.fabric_fq_name,
                                               self._logger,
                                               self.job_utils,
                                               self.job_log_utils)

        msg = MsgBundle.getMessage(
            MsgBundle.START_JOB_MESSAGE,
            job_execution_id=self.job_execution_id,
            job_template_name=job_template.fq_name[-1])
        self._logger.debug(msg)

        timestamp = int(round(time.time() * 1000))
        self.job_log_utils.send_job_log(job_template.fq_name,
                                        self.job_execution_id,
                                        self.fabric_fq_name,
                                        msg,
                                        JobStatus.STARTING.value,
                                        timestamp=timestamp)

        # validate job input if required by job_template input_schema
        input_schema = job_template.get_job_template_input_schema()
        if input_schema:
            self._validate_job_input(input_schema, self.job_data)

        executable_list = job_template.get_job_template_executables()\
            .get_executable_info()
        for executable in executable_list:
            exec_path = executable.get_executable_path()
            # read from the template but currently not passed to the child
            exec_args = executable.get_executable_args()
            job_input_args = self.gather_job_args()
            # reset per executable so the timeout handler never sees the
            # process object of a previous iteration
            exec_process = None
            try:
                exec_process = subprocess32.Popen(
                    [exec_path,
                     "--job-input", json.dumps(job_input_args),
                     '--debug', 'True'],
                    close_fds=True, cwd='/',
                    stdout=subprocess32.PIPE,
                    stderr=subprocess32.PIPE)
                self.job_file_write.write_to_file(
                    self.job_execution_id,
                    "job_summary",
                    JobFileWrite.JOB_LOG,
                    {"job_status": "INPROGRESS"})
                msg = "Child process pid = " + str(exec_process.pid)
                self._logger.info(msg)
                (out, err) = exec_process.communicate(
                    timeout=self.executable_timeout)

                self._logger.notice(str(out))
                self._logger.notice(str(err))
            except subprocess32.TimeoutExpired as timeout_exp:
                # communicate() does not stop the child on timeout
                if exec_process is not None:
                    os.kill(exec_process.pid, 9)
                msg = MsgBundle.getMessage(
                    MsgBundle.RUN_EXECUTABLE_PROCESS_TIMEOUT,
                    exec_path=exec_path,
                    exc_msg=repr(timeout_exp))
                raise JobException(msg, self.job_execution_id)

            self._logger.info(exec_process.returncode)
            self._logger.info("Executable Completed")
            if exec_process.returncode != 0:
                self.job_file_write.write_to_file(
                    self.job_execution_id,
                    "job_summary",
                    JobFileWrite.JOB_LOG,
                    {"job_status": "FAILED"})
                msg = MsgBundle.getMessage(
                    MsgBundle.EXECUTABLE_RETURN_WITH_ERROR,
                    exec_uri=exec_path)
                self._logger.error(msg)
            else:
                self.job_file_write.write_to_file(
                    self.job_execution_id,
                    "job_summary",
                    JobFileWrite.JOB_LOG,
                    {"job_status": "COMPLETED"})

    except JobException as exp:
        err_msg = "Job Exception recieved: %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # The failure may have happened before the result handler was
        # created; reporting must not raise again.
        result_handler = getattr(self, 'result_handler', None)
        if result_handler is not None:
            result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            if job_template:
                result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    except Exception as exp:
        err_msg = "Error while executing job %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # Same guards as above: dereferencing a missing job template here
        # would replace the real error with an AttributeError.
        result_handler = getattr(self, 'result_handler', None)
        if result_handler is not None:
            result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            if job_template:
                result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    finally:
        # need to wait for the last job log and uve update to complete
        # via sandesh and then close sandesh connection
        sandesh_util = SandeshUtils(self._logger)
        sandesh_util.close_sandesh_connection()
        self._logger.info("Closed Sandesh connection")
        if job_error_msg is not None:
            sys.exit(job_error_msg)
def start_job(self):
    """Run every playbook of the job template in sequence.

    Creates the result handler, reads the job template, sends the
    "starting" job log, validates the job input against the template's
    input schema (when it has one) and then runs one ``JobManager`` per
    playbook. The workflow stops at the first playbook that leaves the
    job result status at ``JobStatus.FAILURE``; otherwise the marked
    playbook output is merged into ``self.job_input`` for the next
    playbook.

    On every path the sandesh connection is closed at the end. When the
    job failed, the process is terminated with
    ``sys.exit(<error message>)``.
    """
    job_error_msg = None
    job_template = None
    try:
        # create job UVE and log
        msg = MsgBundle.getMessage(MsgBundle.START_JOB_MESSAGE,
                                   job_execution_id=self.job_execution_id)
        self._logger.debug(msg)
        self.result_handler = JobResultHandler(self.job_template_id,
                                               self.job_execution_id,
                                               self.fabric_fq_name,
                                               self._logger,
                                               self.job_utils,
                                               self.job_log_utils)

        job_template = self.job_utils.read_job_template()

        timestamp = int(round(time.time() * 1000))
        self.job_log_utils.send_job_log(job_template.fq_name,
                                        self.job_execution_id,
                                        self.fabric_fq_name,
                                        msg,
                                        JobStatus.STARTING.value,
                                        timestamp=timestamp)

        # validate job input if required by job_template input_schema
        input_schema = job_template.get_job_template_input_schema()
        if input_schema:
            self._validate_job_input(input_schema, self.job_data)

        playbook_list = job_template.get_job_template_playbooks()\
            .get_playbook_info()

        job_percent = None
        # calculate job percentage for each playbook
        if len(playbook_list) > 1:
            task_weightage_array = [
                pb_info.job_completion_weightage
                for pb_info in playbook_list
            ]

        for i in range(0, len(playbook_list)):
            if len(playbook_list) > 1:
                # get the job percentage based on weightage of each
                # playbook when they are chained
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        len(playbook_list), buffer_task_percent=True,
                        total_percent=100, task_seq_number=i + 1,
                        task_weightage_array=task_weightage_array)[0]
            else:
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        len(playbook_list), buffer_task_percent=True,
                        total_percent=100)[0]  # using equal weightage

            job_mgr = JobManager(self._logger, self._vnc_api,
                                 self.job_input, self.job_log_utils,
                                 job_template,
                                 self.result_handler, self.job_utils, i,
                                 job_percent)
            job_mgr.start_job()

            # stop the workflow if playbook failed
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                self._logger.error(
                    "Stop the workflow on the failed Playbook.")
                break

            # update the job input with marked playbook output json
            pb_output = self.result_handler.playbook_output or {}

            # read the device_data output of the playbook
            # and update the job input so that it can be used in next
            # iteration
            if not self.job_input.get('device_json'):
                device_json = pb_output.get('device_json')
                self.job_input['device_json'] = device_json

            if not self.job_input.get('prev_pb_output'):
                self.job_input['prev_pb_output'] = pb_output
            else:
                self.job_input['prev_pb_output'].update(pb_output)

            self.job_input.get('input', {}).update(pb_output)

        # create job completion log and update job UVE
        self.result_handler.create_job_summary_log(job_template.fq_name)

        # in case of failures, exit the job manager process with failure
        if self.result_handler.job_result_status == JobStatus.FAILURE:
            job_error_msg = self.result_handler.job_summary_message

    except JobException as exp:
        err_msg = "Job Exception recieved: %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # The failure may have happened before the result handler or the
        # job template became available; reporting must not raise again.
        result_handler = getattr(self, 'result_handler', None)
        if result_handler is not None:
            result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            if job_template:
                result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    except Exception as exp:
        err_msg = "Error while executing job %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # Same guards as above: job_template is still None when the
        # failure happened before or inside read_job_template(), and
        # dereferencing it here would replace the real error with an
        # AttributeError.
        result_handler = getattr(self, 'result_handler', None)
        if result_handler is not None:
            result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            if job_template:
                result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    finally:
        # need to wait for the last job log and uve update to complete
        # via sandesh and then close sandesh connection
        sandesh_util = SandeshUtils(self._logger)
        sandesh_util.close_sandesh_connection()
        self._logger.info("Closed Sandesh connection")
        if job_error_msg is not None:
            sys.exit(job_error_msg)
def start_job(self):
    """Run the playbooks of the job template, with recovery on failure.

    Reads the job template, creates the result handler, sends the
    "starting" job log, validates the job input against the template's
    input schema (when it has one) and then walks the playbook list:

    * In the normal pass, playbooks flagged ``recovery_playbook`` are
      skipped and every other playbook is run through a ``JobManager``,
      re-running it for as long as the result handler reports retry
      devices.
    * When a playbook fails the workflow, the walk restarts from the
      first playbook in cleanup mode, in which only recovery playbooks
      are run. After cleanup the job status is set back to failure.

    On every path the sandesh connection is closed at the end. When the
    job failed, the process is terminated with
    ``sys.exit(<error message>)``.
    """
    job_error_msg = None
    job_template = None
    try:
        # create job UVE and log
        job_template = self.job_utils.read_job_template()
        self.job_template = job_template
        self.job_description = self.job_template.display_name
        if not self.job_transaction_descr:
            self.job_transaction_descr = self._generate_transaction_descr()

        self.result_handler = JobResultHandler(
            self.job_template_id,
            self.job_execution_id,
            self.fabric_fq_name,
            self._logger,
            self.job_utils,
            self.job_log_utils,
            self.device_name,
            self.job_description,
            self.job_transaction_id,
            self.job_transaction_descr)

        msg = MsgBundle.getMessage(
            MsgBundle.START_JOB_MESSAGE,
            job_execution_id=self.job_execution_id,
            job_template_name=job_template.fq_name[-1])
        self._logger.debug(msg)

        timestamp = int(round(time.time() * 1000))
        self.job_log_utils.send_job_log(
            job_template.fq_name,
            self.job_execution_id,
            self.fabric_fq_name,
            msg,
            JobStatus.STARTING.value,
            timestamp=timestamp,
            device_name=self.device_name,
            description=self.job_description,
            transaction_id=self.job_transaction_id,
            transaction_descr=self.job_transaction_descr)

        # validate job input if required by job_template input_schema
        input_schema = job_template.get_job_template_input_schema()
        if input_schema:
            self._validate_job_input(input_schema, self.job_data)

        playbook_list = job_template.get_job_template_playbooks()\
            .get_playbook_info()

        job_percent = None
        # calculate job percentage for each playbook
        if len(playbook_list) > 1:
            task_weightage_array = [
                pb_info.job_completion_weightage
                for pb_info in playbook_list
            ]

        cleanup_in_progress = False
        cleanup_completed = False

        pb_idx = 0
        while pb_idx < len(playbook_list):
            # check if its a multi device playbook
            playbooks = job_template.get_job_template_playbooks()
            play_info = playbooks.playbook_info[pb_idx]
            multi_device_playbook = play_info.multi_device_playbook
            playbook_name = play_info.playbook_uri.split('/')[-1]

            if cleanup_in_progress:
                # If we need to cleanup due to a previous error, ignore
                # any playbooks that don't perform recovery
                if not play_info.recovery_playbook:
                    self._logger.info("Ignoring playbook %s since it "
                                      "does not perform recovery"
                                      % playbook_name)
                    pb_idx += 1
                    continue
                # If we are running a recovery playbook, then
                # cleanup_completed needs to be set irrespective of
                # a success or error in recovery playbook execution
                else:
                    self._logger.info("Running recovery playbook %s"
                                      % playbook_name)
                    cleanup_completed = True
            else:
                # Don't run a recovery playbook if we haven't hit an error
                if play_info.recovery_playbook:
                    self._logger.info(
                        "Ignoring recovery playbook %s since we "
                        "haven't hit an error" % playbook_name)
                    pb_idx += 1
                    continue

            if len(playbook_list) > 1:
                # get the job percentage based on weightage of each
                # playbook when they are chained
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        len(playbook_list), buffer_task_percent=True,
                        total_percent=100, task_seq_number=pb_idx + 1,
                        task_weightage_array=task_weightage_array)[0]
            else:
                job_percent = \
                    self.job_log_utils.calculate_job_percentage(
                        len(playbook_list), buffer_task_percent=True,
                        total_percent=100)[0]  # using equal weightage

            retry_devices = None
            while True:
                job_mgr = JobManager(self._logger, self._vnc_api,
                                     self.job_input, self.job_log_utils,
                                     job_template,
                                     self.result_handler, self.job_utils,
                                     pb_idx, job_percent, self._zk_client,
                                     self.job_description,
                                     self.job_transaction_id,
                                     self.job_transaction_descr)
                self.job_mgr = job_mgr
                job_mgr.start_job()

                # retry the playbook execution if retry_devices is added to
                # the playbook output
                job_status = self.result_handler.job_result_status
                retry_devices = self.result_handler.get_retry_devices()
                failed_device_list = self.result_handler\
                    .get_failed_device_list()
                if job_status == JobStatus.FAILURE or not retry_devices \
                        or self.abort_flag:
                    break
                self.job_input['device_json'] = retry_devices
                self.job_input['input']['failed_list'] = failed_device_list

            # update the job input with marked playbook output json
            pb_output = self.result_handler.playbook_output or {}

            if pb_output.get('early_exit'):
                break

            # stop the workflow if playbook failed
            if self.result_handler.job_result_status == JobStatus.FAILURE:
                # If it is a single device job or
                # if it is a multi device playbook
                # and all the devices have failed some job execution,
                # declare it as failure, perform cleanup if possible
                # and then stop the workflow
                if not multi_device_playbook or \
                        (multi_device_playbook and
                         len(self.result_handler.failed_device_jobs) ==
                         len(self.job_input.get('device_json'))):
                    if not cleanup_in_progress:
                        # restart from the first playbook in cleanup mode
                        cleanup_in_progress = True
                        pb_idx = 0
                        self._logger.info("Stop the workflow on the failed"
                                          " Playbook and start cleanup")
                    else:
                        pb_idx += 1
                    continue
                elif not retry_devices:
                    # it is a multi device playbook but one of
                    # the device jobs have failed. This means we should
                    # still declare the operation as success. We declare
                    # workflow as success even if one of the devices has
                    # succeeded the job
                    self.result_handler.job_result_status =\
                        JobStatus.SUCCESS

            if self.abort_flag:
                err_msg = "ABORTING NOW..."
                self._logger.info(err_msg)
                self.result_handler.update_job_status(
                    JobStatus.FAILURE, err_msg)
                break

            # update the job input with marked playbook output json
            pb_output = self.result_handler.playbook_output or {}

            # read the device_data output of the playbook
            # and update the job input so that it can be used in next
            # iteration
            if not multi_device_playbook:
                device_json = pb_output.pop('device_json', None)
                self.job_input['device_json'] = device_json

            self.job_input.get('input', {}).update(pb_output)
            pb_idx += 1

        # A successful recovery playbook execution might
        # set JobStatus to success but this does not indicate a
        # success in the workflow. Set JobStatus to failure again.
        if cleanup_completed:
            err_msg = "Finished cleaning up after the error"
            self.result_handler.update_job_status(JobStatus.FAILURE,
                                                  err_msg)

        # create job completion log and update job UVE
        self.result_handler.create_job_summary_log(job_template.fq_name)

        # in case of failures, exit the job manager process with failure
        if self.result_handler.job_result_status == JobStatus.FAILURE:
            job_error_msg = self.result_handler.job_summary_message

    except JobException as exp:
        err_msg = "Job Exception recieved: %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # The job template is read before the result handler is created,
        # so either may be unavailable here; reporting must not raise.
        result_handler = getattr(self, 'result_handler', None)
        if result_handler is not None:
            result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            if job_template:
                result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    except Exception as exp:
        err_msg = "Error while executing job %s " % repr(exp)
        self._logger.error(err_msg)
        self._logger.error("%s" % traceback.format_exc())
        # Same guards as above: job_template is still None when
        # read_job_template() itself failed, and dereferencing it here
        # would replace the real error with an AttributeError.
        result_handler = getattr(self, 'result_handler', None)
        if result_handler is not None:
            result_handler.update_job_status(JobStatus.FAILURE, err_msg)
            if job_template:
                result_handler.create_job_summary_log(
                    job_template.fq_name)
        job_error_msg = err_msg
    finally:
        # need to wait for the last job log and uve update to complete
        # via sandesh and then close sandesh connection
        sandesh_util = SandeshUtils(self._logger)
        sandesh_util.close_sandesh_connection()
        self._logger.info("Closed Sandesh connection")
        if job_error_msg is not None:
            sys.exit(job_error_msg)
def close_sandesh_conn(self):
    """Close the sandesh connection behind this object's config logger.

    Best effort: any failure is logged through the ``logging`` module and
    swallowed instead of being raised to the caller.
    """
    try:
        SandeshUtils(
            self.job_log_util.get_config_logger()
        ).close_sandesh_connection()
    except Exception as exc:
        logging.error("Unable to close sandesh connection: %s", str(exc))