def test_workflow_fails_leaf(self):
    """A failed leaf node with no error-handling path marks the workflow done and failed."""
    wfj = self.workflow_job(states=['successful', 'successful', 'failed', None, None])
    dag = WorkflowDAG(workflow_job=wfj)
    dag.mark_dnr_nodes()
    is_done = dag.is_workflow_done()
    # BUG FIX: has_workflow_failed() returns a (bool, reason) tuple (see the
    # sibling tests); asserting truthiness of the tuple itself always passes,
    # even when the workflow did NOT fail. Unpack and assert the flag.
    has_failed, reason = dag.has_workflow_failed()
    self.assertTrue(is_done)
    self.assertTrue(has_failed)
def test_workflow_not_finished(self):
    """A node still in the 'new' state means the workflow is neither done nor failed."""
    workflow_job = self.workflow_job(states=['new', None, None, None, None])
    dag = WorkflowDAG(workflow_job=workflow_job)
    dag.mark_dnr_nodes()
    done = dag.is_workflow_done()
    failed, failure_reason = dag.has_workflow_failed()
    self.assertFalse(done)
    self.assertFalse(failed)
    assert failure_reason is None
def test_workflow_dnr_because_parent(self, workflow_job_fn):
    """Children reachable only through a path that will not run are marked do-not-run."""
    workflow_job, all_nodes = workflow_job_fn(
        states=['successful', None, None, None, None, None])
    dag = WorkflowDAG(workflow_job=workflow_job)
    dnr_nodes = dag.mark_dnr_nodes()
    assert len(dnr_nodes) == 2
    for expected_node in (all_nodes[3], all_nodes[4]):
        assert expected_node in dnr_nodes
def process_finished_workflow_jobs(self, workflow_jobs):
    """Finalize workflow jobs whose DAG has reached a terminal state.

    For each workflow job: if cancellation was requested, cancel spawned node
    jobs and mark the workflow 'canceled' once they have all concluded;
    otherwise persist do-not-run markings and, when the DAG is complete,
    transition the workflow job to 'failed' or 'successful'. Emits websocket
    status and notifications for any workflow whose status changed.

    :param workflow_jobs: iterable of WorkflowJob model instances to examine
    :returns: list of ids of workflow jobs that reached a terminal status
    """
    result = []
    for workflow_job in workflow_jobs:
        dag = WorkflowDAG(workflow_job)
        status_changed = False
        if workflow_job.cancel_flag:
            # Nodes that never spawned a job can no longer run once the
            # workflow is canceled; flag them do-not-run in bulk.
            workflow_job.workflow_nodes.filter(
                do_not_run=False, job__isnull=True).update(do_not_run=True)
            logger.debug(
                'Canceling spawned jobs of %s due to cancel flag.',
                workflow_job.log_format)
            cancel_finished = dag.cancel_node_jobs()
            if cancel_finished:
                logger.info(
                    'Marking %s as canceled, all spawned jobs have concluded.',
                    workflow_job.log_format)
                workflow_job.status = 'canceled'
                workflow_job.start_args = ''  # blank field to remove encrypted passwords
                workflow_job.save(update_fields=['status', 'start_args'])
                status_changed = True
        else:
            # Persist any newly computed do-not-run markings node by node.
            workflow_nodes = dag.mark_dnr_nodes()
            for n in workflow_nodes:
                n.save(update_fields=['do_not_run'])
            is_done = dag.is_workflow_done()
            if not is_done:
                continue
            has_failed, reason = dag.has_workflow_failed()
            logger.debug('Marking %s as %s.', workflow_job.log_format,
                         'failed' if has_failed else 'successful')
            result.append(workflow_job.id)
            new_status = 'failed' if has_failed else 'successful'
            logger.debug("Transitioning {} to {} status.".format(
                workflow_job.log_format, new_status))
            update_fields = ['status', 'start_args']
            workflow_job.status = new_status
            if reason:
                # A non-empty reason indicates a structural failure reported
                # by the DAG itself, not merely a failed node job.
                logger.info(
                    f'Workflow job {workflow_job.id} failed due to reason: {reason}'
                )
                workflow_job.job_explanation = gettext_noop(
                    "No error handling paths found, marking workflow as failed"
                )
                update_fields.append('job_explanation')
            workflow_job.start_args = ''  # blank field to remove encrypted passwords
            workflow_job.save(update_fields=update_fields)
            status_changed = True
        if status_changed:
            workflow_job.websocket_emit_status(workflow_job.status)
            # Operations whose queries rely on modifications made during the atomic scheduling session
            workflow_job.send_notification_templates(
                'succeeded' if workflow_job.status == 'successful' else 'failed')
            if workflow_job.spawned_by_workflow:
                # Parent workflow may now be unblocked; nudge the task manager.
                schedule_task_manager()
    return result
def test_workflow_done(self):
    """A workflow whose remaining nodes are all DNR is done; relaunch with deleted JTs fails."""
    wfj = self.workflow_job(states=['failed', None, None, 'successful', None])
    dag = WorkflowDAG(workflow_job=wfj)
    assert 3 == len(dag.mark_dnr_nodes())
    is_done = dag.is_workflow_done()
    has_failed, reason = dag.has_workflow_failed()
    self.assertTrue(is_done)
    self.assertFalse(has_failed)
    assert reason is None

    # verify that relaunched WFJ fails if a JT leaf is deleted
    for jt in JobTemplate.objects.all():
        jt.delete()
    relaunched = wfj.create_relaunch_workflow_job()
    dag = WorkflowDAG(workflow_job=relaunched)
    dag.mark_dnr_nodes()
    is_done = dag.is_workflow_done()
    has_failed, reason = dag.has_workflow_failed()
    self.assertTrue(is_done)
    self.assertTrue(has_failed)
    # BUG FIX: the original asserted a bare non-empty format() result, which
    # is always truthy and checked nothing. Verify the returned failure
    # reason actually reports the missing unified job template.
    assert reason is not None
    assert "related unified job template missing" in reason
def process_finished_workflow_jobs(self, workflow_jobs):
    """Finalize workflow jobs whose DAG has reached a terminal state.

    For each workflow job: if cancellation was requested, cancel spawned node
    jobs and mark the workflow 'canceled' once they have all concluded;
    otherwise persist do-not-run markings and, when the DAG is complete,
    transition the workflow job to 'failed' or 'successful'.

    :param workflow_jobs: iterable of WorkflowJob model instances to examine
    :returns: list of ids of workflow jobs that reached a terminal status
    """
    result = []
    for workflow_job in workflow_jobs:
        dag = WorkflowDAG(workflow_job)
        status_changed = False
        if workflow_job.cancel_flag:
            # Nodes that never spawned a job can no longer run once the
            # workflow is canceled; flag them do-not-run in bulk.
            workflow_job.workflow_nodes.filter(
                do_not_run=False, job__isnull=True).update(do_not_run=True)
            logger.debug(
                'Canceling spawned jobs of %s due to cancel flag.',
                workflow_job.log_format)
            cancel_finished = dag.cancel_node_jobs()
            if cancel_finished:
                logger.info(
                    'Marking %s as canceled, all spawned jobs have concluded.',
                    workflow_job.log_format)
                workflow_job.status = 'canceled'
                workflow_job.start_args = ''  # blank field to remove encrypted passwords
                workflow_job.save(update_fields=['status', 'start_args'])
                status_changed = True
        else:
            workflow_nodes = dag.mark_dnr_nodes()
            # BUG FIX: map() is lazy in Python 3, so the original
            # map(lambda n: n.save(...), workflow_nodes) never executed and
            # the do_not_run markings were silently dropped. Iterate
            # explicitly so each node is persisted.
            for node in workflow_nodes:
                node.save(update_fields=['do_not_run'])
            is_done = dag.is_workflow_done()
            if not is_done:
                continue
            has_failed, reason = dag.has_workflow_failed()
            logger.info('Marking %s as %s.', workflow_job.log_format,
                        'failed' if has_failed else 'successful')
            result.append(workflow_job.id)
            new_status = 'failed' if has_failed else 'successful'
            logger.debug(
                six.text_type("Transitioning {} to {} status.").format(
                    workflow_job.log_format, new_status))
            update_fields = ['status', 'start_args']
            workflow_job.status = new_status
            if reason:
                # A non-empty reason indicates a structural failure reported
                # by the DAG itself, not merely a failed node job.
                logger.info(reason)
                workflow_job.job_explanation = "No error handling paths found, marking workflow as failed"
                update_fields.append('job_explanation')
            workflow_job.start_args = ''  # blank field to remove encrypted passwords
            workflow_job.save(update_fields=update_fields)
            status_changed = True
        if status_changed:
            workflow_job.websocket_emit_status(workflow_job.status)
            if workflow_job.spawned_by_workflow:
                # Parent workflow may now be unblocked; nudge the task manager.
                schedule_task_manager()
    return result
def spawn_workflow_graph_jobs(self):
    """Advance every running workflow job: finalize finished ones and spawn ready nodes.

    Walks ``self.all_tasks``; for each workflow job it honors cancellation,
    persists do-not-run markings, transitions completed workflows to
    'failed'/'successful', and for still-running workflows spawns a unified
    job for each node the DAG reports as ready (BFS order). Bails out early
    and reschedules the manager if the processing time budget is exhausted.

    :returns: list of ids of workflow jobs that reached a terminal status
    """
    result = []
    for workflow_job in self.all_tasks:
        if self.timed_out():
            logger.warning(
                "Workflow manager has reached time out while processing running workflows, exiting loop early"
            )
            ScheduleWorkflowManager().schedule()
            # Do not process any more workflow jobs. Stop here.
            # Maybe we should schedule another WorkflowManager run
            break
        dag = WorkflowDAG(workflow_job)
        status_changed = False
        if workflow_job.cancel_flag:
            # Nodes that never spawned a job can no longer run once the
            # workflow is canceled; flag them do-not-run in bulk.
            workflow_job.workflow_nodes.filter(
                do_not_run=False, job__isnull=True).update(do_not_run=True)
            logger.debug(
                'Canceling spawned jobs of %s due to cancel flag.',
                workflow_job.log_format)
            cancel_finished = dag.cancel_node_jobs()
            if cancel_finished:
                logger.info(
                    'Marking %s as canceled, all spawned jobs have concluded.',
                    workflow_job.log_format)
                workflow_job.status = 'canceled'
                workflow_job.start_args = ''  # blank field to remove encrypted passwords
                workflow_job.save(update_fields=['status', 'start_args'])
                status_changed = True
        else:
            # Persist do-not-run markings in a single query.
            workflow_nodes = dag.mark_dnr_nodes()
            WorkflowJobNode.objects.bulk_update(workflow_nodes, ['do_not_run'])
            # If workflow is now done, we do special things to mark it as done.
            is_done = dag.is_workflow_done()
            if is_done:
                has_failed, reason = dag.has_workflow_failed()
                logger.debug('Marking %s as %s.', workflow_job.log_format,
                             'failed' if has_failed else 'successful')
                result.append(workflow_job.id)
                new_status = 'failed' if has_failed else 'successful'
                logger.debug("Transitioning {} to {} status.".format(
                    workflow_job.log_format, new_status))
                update_fields = ['status', 'start_args']
                workflow_job.status = new_status
                if reason:
                    # A non-empty reason indicates a structural failure
                    # reported by the DAG itself, not a node job result.
                    logger.info(
                        f'Workflow job {workflow_job.id} failed due to reason: {reason}'
                    )
                    workflow_job.job_explanation = gettext_noop(
                        "No error handling paths found, marking workflow as failed"
                    )
                    update_fields.append('job_explanation')
                workflow_job.start_args = ''  # blank field to remove encrypted passwords
                workflow_job.save(update_fields=update_fields)
                status_changed = True
        if status_changed:
            if workflow_job.spawned_by_workflow:
                # Parent workflow may now be unblocked; reschedule the manager.
                ScheduleWorkflowManager().schedule()
            workflow_job.websocket_emit_status(workflow_job.status)
            # Operations whose queries rely on modifications made during the atomic scheduling session
            workflow_job.send_notification_templates(
                'succeeded' if workflow_job.status == 'successful' else 'failed')
        if workflow_job.status == 'running':
            # Spawn a unified job for each node the DAG reports as ready.
            spawn_nodes = dag.bfs_nodes_to_run()
            if spawn_nodes:
                logger.debug('Spawning jobs for %s', workflow_job.log_format)
            else:
                logger.debug('No nodes to spawn for %s', workflow_job.log_format)
            for spawn_node in spawn_nodes:
                # Node's template was deleted; nothing can be spawned for it.
                if spawn_node.unified_job_template is None:
                    continue
                kv = spawn_node.get_job_kwargs()
                job = spawn_node.unified_job_template.create_unified_job(**kv)
                spawn_node.job = job
                spawn_node.save()
                logger.debug('Spawned %s in %s for node %s',
                             job.log_format, workflow_job.log_format, spawn_node.pk)
                can_start = True
                if isinstance(spawn_node.unified_job_template, WorkflowJobTemplate):
                    # Guard against workflow-in-workflow recursion: refuse to
                    # start a workflow that is its own ancestor.
                    workflow_ancestors = job.get_ancestor_workflows()
                    if spawn_node.unified_job_template in set(workflow_ancestors):
                        can_start = False
                        logger.info(
                            'Refusing to start recursive workflow-in-workflow id={}, wfjt={}, ancestors={}'
                            .format(job.id, spawn_node.unified_job_template.pk,
                                    [wa.pk for wa in workflow_ancestors]))
                        display_list = [spawn_node.unified_job_template] + workflow_ancestors
                        job.job_explanation = gettext_noop(
                            "Workflow Job spawned from workflow could not start because it "
                            "would result in recursion (spawn order, most recent first: {})"
                        ).format(', '.join('<{}>'.format(tmp) for tmp in display_list))
                    else:
                        logger.debug(
                            'Starting workflow-in-workflow id={}, wfjt={}, ancestors={}'
                            .format(job.id, spawn_node.unified_job_template.pk,
                                    [wa.pk for wa in workflow_ancestors]))
                if not job._resources_sufficient_for_launch():
                    can_start = False
                    job.job_explanation = gettext_noop(
                        "Job spawned from workflow could not start because it was missing a related resource such as project or inventory"
                    )
                if can_start:
                    # Pass through the workflow's (decrypted) launch arguments.
                    if workflow_job.start_args:
                        start_args = json.loads(
                            decrypt_field(workflow_job, 'start_args'))
                    else:
                        start_args = {}
                    can_start = job.signal_start(**start_args)
                    if not can_start:
                        job.job_explanation = gettext_noop(
                            "Job spawned from workflow could not start because it was not in the right state or required manual credentials"
                        )
                if not can_start:
                    job.status = 'failed'
                    job.save(update_fields=['status', 'job_explanation'])
                    job.websocket_emit_status('failed')
                # TODO: should we emit a status on the socket here similar to tasks.py awx_periodic_scheduler() ?
                # emit_websocket_notification('/socket.io/jobs', '', dict(id=))
    return result