def _simulate(self):
    """Simulate execution of active jobs."""
    tokens = self._store.read_tokens()
    satisfied_deps = set()
    executed_jobs = []
    jobs = {}
    for token in tokens:
        event_name = Name.from_event_token_name(token.name)
        if event_name.event:
            satisfied_deps.add((event_name.input, event_name.job))
        else:
            job_name = Name.from_job_token_name(token.name)
            if job_name.job:
                job = pickle.loads(token.data)
                jobs[job.name] = job
    dep_counts = collections.defaultdict(int)
    while satisfied_deps:
        last_satisfied_deps = satisfied_deps
        satisfied_deps = set()
        for (_, job_name) in last_satisfied_deps:
            dep_counts[job_name] += 1
            if dep_counts[job_name] == 2:
                executed_jobs.append(job_name)
                job = jobs[job_name]
                for output in job.outputs:
                    satisfied_deps.add((job_name, output))
    return executed_jobs
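# A walkthrough of the dependency-counting loop above on a hypothetical
# graph (job names invented for illustration).  Note that a job is
# considered executed only once its dep_count reaches exactly 2, i.e. the
# simulation assumes every job in the workflow has two inputs:
#
#   satisfied_deps = {('A', 'C'), ('B', 'C')}
#   round 1: dep_counts['C'] reaches 2, so 'C' executes and adds
#            ('C', output) to satisfied_deps for each of its outputs.
#   round 2: the deps emitted by 'C' are processed the same way, until a
#            round produces no new satisfied deps.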
def _get_jobs(self, workflow, job):
    """Get job definitions from the store across all workflow instances.

    Args:
        workflow: The name of the job workflow.
        job: The name of the job.
    Returns:
        A list of matching job definitions.
    """
    name = Name(workflow=workflow)
    name_prefix = name.get_workflow_prefix()
    # This is a bit hacky since we bypass the Name module where all the
    # token naming logic is supposed to be located.
    # TODO(pawel): extend the Name module to support abstractions needed
    # here.
    name_infix = '/job/'
    name_suffix = '/%s' % job
    job_tokens = self._store.read_tokens(name_prefix=name_prefix,
                                         name_infix=name_infix,
                                         name_suffix=name_suffix)
    result = []
    for job_token in job_tokens:
        job_record = pickle.loads(job_token.data)
        result.append(job_record)
    return result
def _get_instance_using_cache(self, workflow, instance):
    """Get workflow instance, preferably from the cache.

    As a side effect, if the instance is archived and it does not exist
    in the cache, it will be added to the cache.

    Args:
        workflow: The name of the workflow whose instance we are
            interested in.
        instance: The instance we are interested in.
    Returns:
        The workflow instance or None if it was not found.
    """
    name = Name(workflow=workflow, instance=instance)
    instance_prefix = name.get_instance_prefix()
    data = self._store.get_cached_data(instance_prefix)
    if data:
        instance_data = pickle.loads(data)
    else:
        # Cache only archived instances.
        if self._store.read_archived_token_names(
                name_prefix=instance_prefix):
            # The ordering of operations is important.  We need to make
            # sure that we add to the cache instance data constructed
            # from the archived tokens.
            instance_data = self._get_instance_no_cache(workflow,
                                                        instance)
            self._store.set_cached_data(instance_prefix,
                                        pickle.dumps(instance_data))
        else:
            instance_data = self._get_instance_no_cache(workflow,
                                                        instance)
    return instance_data
def _get_instances_using_cache(self, workflow):
    """Get workflow instances, preferably from the cache.

    As a side effect, archived instances that do not exist in the cache
    will be added to the cache.

    Args:
        workflow: The name of the workflow whose instances we are
            interested in.
    Returns:
        List of instances for the given workflow.
    """
    name = Name(workflow=workflow)
    workflow_prefix = name.get_workflow_prefix()
    workflow_token_names = self._store.read_token_names(
        name_prefix=workflow_prefix)
    instances_prefixes = DataBuilder._get_instance_prefixes(
        workflow_token_names)
    result = []
    for prefix in instances_prefixes:
        name = Name.from_instance_prefix(prefix)
        assert name.workflow and name.instance, (
            'Expected instance prefix, found %s' % prefix)
        result.append(self.get_instance(name.workflow, name.instance))
    return result
def _is_done(self, workflow, instance):
    """Check if the workflow instance is done.

    A workflow is done if it does not have runnable jobs.

    Args:
        workflow: The name of the workflow to check.
        instance: The workflow instance to check.
    Returns:
        True if we are certain that the workflow is not running.
        Otherwise False.  If there were any errors during communication
        with the master, the return value is False.
    """
    # Attempt to make the workflow runnable and verify that no WAITING
    # job tokens were changed in the meantime.
    name = Name(workflow=workflow, instance=instance,
                job_state=Name.WAITING_STATE)
    query = Query(namePrefix=name.get_job_state_prefix())
    request = QueryRequest(queries=[query])
    try:
        snapshot = Snapshot(self._client, request)
    except TokenMasterException:
        LOG.exception('error sending request %s', request)
        return False
    if not self._make_runnable(workflow, instance):
        return False
    if not self._has_no_runnable_jobs(workflow, instance):
        return False
    try:
        # If the snapshot changed, a WAITING job token was modified
        # while we were looking, so we cannot claim the workflow is
        # done.
        return not snapshot.refresh()
    except TokenMasterException:
        LOG.exception('error sending request %s', request)
        return False
def _query_and_own_runnable_job_token(self, workflow, instance):
    """Attempt to own a runnable job token from a given workflow instance.

    Try to own a runnable job token in a given workflow instance.  The
    ownership of the qualifying job token lasts for a limited time so it
    has to be periodically renewed.

    Args:
        workflow: The name of the workflow whose jobs should be
            considered.
        instance: The workflow instance whose jobs should be considered.
    """
    assert not self._owned_job_token
    name = Name(workflow=workflow, instance=instance,
                job_state=Name.RUNNABLE_STATE)
    query = Query()
    query.namePrefix = name.get_job_state_prefix()
    query.maxTokens = 1
    request = QueryAndOwnRequest()
    request.query = query
    request.expirationTime = time.time() + Worker._LEASE_TIME_SEC
    request.owner = self._name
    try:
        response = self._client.query_and_own(request)
        if response.tokens:
            assert len(response.tokens) == 1
            self._owned_job_token = response.tokens[0]
    except TokenMasterException:
        LOG.exception('error sending request %s', request)
def get_schedule(self, workflow):
    """Get workflow schedule data from the store.

    Args:
        workflow: The name of the workflow whose schedule should be
            retrieved.
    Returns:
        The workflow schedule or None if it was not found.
    """
    name = Name(workflow=workflow)
    schedule_token_name = name.get_workflow_schedule_token_name()
    tokens = self._store.read_tokens(name_prefix=schedule_token_name)
    if tokens:
        for token in tokens:
            if token.name == schedule_token_name:
                schedule = pickle.loads(token.data)
                overrun_policy_help = OverrunPolicy.get_help(
                    schedule.overrun_policy)
                return WorkflowScheduleData(
                    next_run_time=schedule.next_run_time,
                    recurrence_seconds=schedule.recurrence_seconds,
                    overrun_policy=schedule.overrun_policy,
                    overrun_policy_help=overrun_policy_help,
                    workflow=schedule.workflow,
                    parser_params=schedule.parser_params,
                    emails=schedule.emails,
                    max_running_instances=schedule.max_running_instances)
    return None
def _make_runnable(self, workflow, instance):
    """Attempt to make jobs in a given workflow instance runnable.

    Go over all waiting jobs in a given workflow instance and try to
    make them runnable.

    Args:
        workflow: The name of the workflow whose jobs should be
            considered.
        instance: The workflow instance whose jobs should be considered.
    Returns:
        True if there were no errors during communication with the
        master, otherwise False.
    """
    name = Name()
    name.workflow = workflow
    name.instance = instance
    name.job_state = Name.WAITING_STATE
    query = Query(namePrefix=name.get_job_state_prefix())
    # TODO(pawel): to prevent multiple workers from trying to make the
    # same job runnable at the same time, this should be a
    # QueryAndOwnRequest.  Note that the current implementation is
    # correct, just inefficient.
    request = QueryRequest(queries=[query])
    try:
        response = self._client.query(request)
    except TokenMasterException:
        LOG.exception('error sending request %s', request)
        return False
    assert len(response.tokens) == 1
    for token in response.tokens[0]:
        if not self._make_job_runnable(token):
            return False
    return True
def get_workflow_tokens(self):
    """Create Pinball tokens representing a workflow instance.

    Convert workflow jobs to tokens and create event tokens in inputs of
    top-level jobs.

    Returns:
        A list of job and event tokens representing a workflow instance.
    """
    all_jobs = self._get_transitive_deps()
    instance = get_unique_workflow_instance()
    result = []
    for job in all_jobs:
        result.append(job.get_job_token(self.name, instance))
    top_level_jobs = self._get_top_level_jobs()
    for job in top_level_jobs:
        event = Event(creator='parser')
        event_name = Name(workflow=self.name,
                          instance=instance,
                          job=job.name,
                          input_name=Name.WORKFLOW_START_INPUT,
                          event='workflow_start_event')
        result.append(Token(name=event_name.get_event_token_name(),
                            data=pickle.dumps(event)))
    return result
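# Sketch of the token set produced above for a hypothetical workflow with
# a single top-level job 'parent_job' (name layout inferred from the Name
# prefix tests elsewhere in this code; the job state is whatever
# get_job_token assigns, presumably waiting):
#   - one job token per job, e.g.
#       /workflow/<name>/<instance>/job/<state>/parent_job
#   - one workflow start event per top-level job, e.g.
#       /workflow/<name>/<instance>/input/parent_job/
#           <Name.WORKFLOW_START_INPUT>/workflow_start_event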
def is_signal_set(self, workflow, instance, action):
    """Check if a signal is set.

    Args:
        workflow: The workflow whose signal should be checked.  If None,
            signals at the global level are checked.
        instance: The workflow instance whose signal should be checked.
            If not None, a matching workflow name must be provided.  If
            None, signals at the workflow and the global level are
            checked.
        action: The signal action to check.
    Returns:
        True iff the signal exists in the specified context.
    """
    for (workflow_name, instance_name) in [(workflow, instance),
                                           (workflow, None),
                                           (None, None)]:
        name = Name(workflow=workflow_name, instance=instance_name,
                    signal=Signal.action_to_string(action))
        token_name = name.get_signal_token_name()
        tokens = self._store.read_tokens(token_name)
        assert len(tokens) <= 1
        if tokens:
            return True
    return False
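# Usage sketch (hypothetical values).  The loop above widens the scope
# from (workflow, instance) to (workflow, None) to (None, None), so a
# signal set at the workflow or global level is also visible to every
# instance:
#
#   checker.is_signal_set('some_workflow', '12345', Signal.ABORT)
#
# returns True if an abort signal token exists for instance '12345', for
# 'some_workflow' as a whole, or globally.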
def set_action(self, action):
    """Send a signal with a specific action to the master.

    Local signal store gets updated with the new action if it is
    successfully submitted to the master.  If the communication with the
    master fails, locally stored signals get refreshed.

    Args:
        action: The action to set.
    """
    attributes = {}
    if action == Signal.ABORT:
        attributes[Signal.TIMESTAMP_ATTR] = time.time()
    elif action == Signal.EXIT:
        attributes[Signal.GENERATION_ATTR] = PinballConfig.GENERATION
    signal = self._signals.get(action)
    if signal and signal.attributes == attributes:
        return
    # A signal with the same action but different data may already exist
    # in the master.
    signal_token = self._get_signal_token(action)
    if not signal_token:
        name = Name(workflow=self._workflow, instance=self._instance,
                    signal=Signal.action_to_string(action))
        signal_token = Token(name=name.get_signal_token_name())
    signal = Signal(action, attributes)
    signal_token.data = pickle.dumps(signal)
    request = ModifyRequest(updates=[signal_token])
    if self._send_request(request):
        self._signals[action] = signal
def _read_tokens_from_store(self, store):
    """Read archived job tokens from the store.

    Args:
        store: The store to read tokens from.
    """
    name = Name(workflow=self._workflow, instance=self._instance)
    tokens = store.read_archived_tokens(
        name_prefix=name.get_instance_prefix())
    self._filter_job_tokens(tokens)
def _post_signal_tokens(self):
    """Add some signal tokens to the master."""
    request = ModifyRequest(updates=[])

    signal = Signal(action=Signal.EXIT)
    name = Name(signal='exit')
    signal_token = Token(name=name.get_signal_token_name())
    signal_token.data = pickle.dumps(signal)
    request.updates.append(signal_token)

    signal = Signal(action=Signal.DRAIN)
    name.signal = 'drain'
    name.workflow = 'some_workflow'
    signal_token = Token(name=name.get_signal_token_name())
    signal_token.data = pickle.dumps(signal)
    request.updates.append(signal_token)

    name.instance = '123'
    signal_token = Token(name=name.get_signal_token_name())
    signal_token.data = pickle.dumps(signal)
    request.updates.append(signal_token)

    signal = Signal(action=Signal.ABORT)
    name.signal = 'abort'
    signal_token = Token(name=name.get_signal_token_name())
    signal_token.data = pickle.dumps(signal)
    request.updates.append(signal_token)

    client = self._factory.get_client()
    client.modify(request)
def _get_schedule_token():
    name = Name(workflow='workflow_0')
    now = int(time.time())
    token = Token(name=name.get_workflow_schedule_token_name(),
                  owner='some_owner',
                  expirationTime=now - 10)
    schedule = WorkflowSchedule(next_run_time=now - 10,
                                recurrence_seconds=10,
                                workflow='workflow_0')
    token.data = pickle.dumps(schedule)
    return token
def _post_workflow_start_event_token(self):
    name = Name(workflow='some_workflow',
                instance='12345',
                job='parent_job',
                input_name=Name.WORKFLOW_START_INPUT,
                event='workflow_start_event')
    event = Event(creator='SimpleWorkflowTest')
    token = Token(name=name.get_event_token_name(),
                  data=pickle.dumps(event))
    request = ModifyRequest(updates=[token])
    self._client.modify(request)
def _get_child_job_token(self):
    name = Name(workflow='some_workflow',
                instance='12345',
                job_state=Name.WAITING_STATE,
                job='child_job')
    job = ShellJob(name=name.job,
                   inputs=['parent_job'],
                   outputs=[],
                   command='echo child',
                   emails=['*****@*****.**'])
    return Token(name=name.get_job_token_name(), data=pickle.dumps(job))
def _generate_signal_tokens(workflows):
    result = []
    for w in range(0, workflows, 2):
        workflow = 'workflow_%d' % w
        signal = Signal(Signal.DRAIN)
        name = Name(workflow=workflow,
                    signal=Signal.action_to_string(signal.action))
        result.append(Token(name=name.get_signal_token_name(),
                            version=10000000000 * w,
                            data=pickle.dumps(signal)))
    return result
def _get_output_event_tokens(self, job):
    """Create output event tokens for the owned job token.

    Args:
        job: The job whose output tokens should be generated.
    Returns:
        A list of event tokens corresponding to the outputs of the owned
        job token.
    """
    assert self._owned_job_token
    job_name = Name.from_job_token_name(self._owned_job_token.name)
    output_name = Name()
    output_name.workflow = job_name.workflow
    output_name.instance = job_name.instance
    output_name.input = job_name.job
    event_tokens = []
    for output in job.outputs:
        output_name.job = output
        output_name.event = get_unique_name()
        event = Event(creator=self._name)
        assert job.history
        execution_record = job.history[-1]
        event.attributes = execution_record.get_event_attributes()
        event_tokens.append(Token(name=output_name.get_event_token_name(),
                                  data=pickle.dumps(event)))
    return event_tokens
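# Example of an event token name produced above, assuming the owned job
# is 'parent_job' in instance '12345' of 'some_workflow' and one of its
# outputs is 'child_job' (layout inferred from the input-prefix test in
# this code; the trailing component comes from get_unique_name()):
#
#   /workflow/some_workflow/12345/input/child_job/parent_job/<unique name>
#
# i.e. the event lands in the downstream job's input, keyed by the
# upstream job that produced it.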
def _post_event_tokens(self):
    """Add some event tokens to the master."""
    request = ModifyRequest(updates=[])
    name = Name(workflow='some_workflow', instance='12345')
    for job_id in range(0, 2):
        for input_id in range(0, 2):
            for event_id in range(0, 2):
                name.job = 'some_job_%d' % job_id
                name.input = 'some_input_%d' % input_id
                name.event = 'some_event_%d' % event_id
                event_token = Token(name=name.get_event_token_name())
                request.updates.append(event_token)
    client = self._factory.get_client()
    client.modify(request)
def _post_job_tokens(self):
    """Add some job tokens to the master."""
    request = ModifyRequest(updates=[])
    name = Name(workflow='some_workflow', instance='12345')
    for job_id in range(0, 2):
        if job_id % 2 == 0:
            name.job_state = Name.WAITING_STATE
        else:
            name.job_state = Name.RUNNABLE_STATE
        name.job = 'some_job_%d' % job_id
        job_token = Token(name=name.get_job_token_name())
        request.updates.append(job_token)
    client = self._factory.get_client()
    client.modify(request)
def _read_tokens_from_client(self, client):
    """Read job and event tokens of this workflow instance from the client.

    Args:
        client: The client to read tokens from.
    """
    name = Name(workflow=self._workflow, instance=self._instance)
    query = Query(namePrefix=name.get_workflow_prefix())
    request = QueryRequest(queries=[query])
    response = client.query(request)
    assert len(response.tokens) == 1
    tokens = response.tokens[0]
    self._filter_job_tokens(tokens)
    self._filter_event_tokens(tokens)
def _has_abort_token(self, tokens):
    """Check if a list of tokens contains an abort token.

    Args:
        tokens: The list of tokens to check.
    Returns:
        True iff the list contains an abort token.
    """
    abort_signal = Signal.action_to_string(Signal.ABORT)
    abort_name = Name(workflow=self._workflow,
                      instance=self._instance,
                      signal=abort_signal)
    abort_token_name = abort_name.get_signal_token_name()
    for token in tokens:
        if token.name == abort_token_name:
            return True
    return False
def get_workflow_instances(self, workflow_name):
    """Return a list of instances of a given workflow."""
    request = GroupRequest()
    name = Name()
    name.workflow = workflow_name
    request.namePrefix = name.get_workflow_prefix()
    request.groupSuffix = Name.DELIMITER
    response = self._client.group(request)
    instance_names = []
    if response.counts:
        for prefix in response.counts.keys():
            name = Name.from_instance_prefix(prefix)
            if name.instance:
                instance_names.append(name.instance)
    return instance_names
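# Usage sketch (hypothetical token layout).  Grouping token names by the
# delimiter that follows the workflow prefix yields one group per
# instance:
#
#   get_workflow_instances('some_workflow')
#   # -> ['12345', '12346'] if tokens exist under
#   #    /workflow/some_workflow/12345/... and
#   #    /workflow/some_workflow/12346/...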
def _get_signal(self, workflow, instance, action, active):
    signal = Signal.action_to_string(action)
    name = Name(workflow=workflow, instance=instance, signal=signal)
    signal_token_name = name.get_signal_token_name()
    if active:
        tokens = self._store.read_active_tokens(
            name_prefix=signal_token_name)
    else:
        tokens = self._store.read_archived_tokens(
            name_prefix=signal_token_name)
    if not tokens:
        return None
    assert len(tokens) == 1
    assert tokens[0].name == signal_token_name
    return pickle.loads(tokens[0].data)
def _execute_job(self):
    """Execute the owned job."""
    assert self._owned_job_token
    job = pickle.loads(self._owned_job_token.data)
    name = Name.from_job_token_name(self._owned_job_token.name)
    self._executor = JobExecutor.from_job(name.workflow,
                                          name.instance,
                                          name.job,
                                          job,
                                          self._data_builder,
                                          self._emailer)
    success = self._executor.prepare()
    if success:
        self._owned_job_token.data = pickle.dumps(self._executor.job)
        success = self._update_owned_job_token()
        if success:
            self._start_renew_ownership()
            success = self._executor.execute()
            self._stop_renew_ownership()
    if success:
        self._move_job_token_to_waiting(self._executor.job, True)
    elif self._executor.job.retry():
        self._keep_job_token_in_runnable(self._executor.job)
    else:
        signaller = Signaller(self._client, name.workflow, name.instance)
        # If ARCHIVE is not set, this is the first failed job in the
        # workflow.
        first_failure = not signaller.is_action_set(Signal.ARCHIVE)
        self._move_job_token_to_waiting(self._executor.job, False)
        self._send_job_failure_emails(first_failure)
    self._executor = None
    self._owned_job_token = None
    # If needed, archive the workflow.
    self._process_signals(name.workflow, name.instance)
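# The control flow above, summarized (as read from the code): prepare the
# executor, persist the updated job data, execute under a periodically
# renewed ownership lease, then route the job token based on the outcome:
#   success            -> back to the WAITING state,
#   retryable failure  -> kept in the RUNNABLE state,
#   permanent failure  -> WAITING state plus failure emails, where the
#                         first failure in the workflow is detected by
#                         the absence of an ARCHIVE signal.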
def _send_job_failure_emails(self, first_failure):
    assert self._owned_job_token
    name = Name.from_job_token_name(self._owned_job_token.name)
    job = self._executor.job
    emails = set(job.emails)
    if first_failure:
        schedule_data = self._data_builder.get_schedule(name.workflow)
        if schedule_data:
            emails.update(schedule_data.emails)
        else:
            LOG.warning('no schedule found for workflow %s',
                        name.workflow)
    if emails:
        execution = len(job.history) - 1
        job_execution_data = self._data_builder.get_execution(
            name.workflow, name.instance, name.job, execution)
        try:
            self._emailer.send_job_execution_end_message(
                list(emails), job_execution_data)
        except Exception:
            LOG.exception('error sending job failure email for '
                          'workflow %s instance %s job %s execution %d',
                          name.workflow, name.instance, name.job,
                          execution)
def test_job_state_prefix(self):
    PREFIX = "/workflow/some_workflow/some_instance/job/waiting/"
    name = Name.from_job_state_prefix(PREFIX)
    self.assertEqual("some_workflow", name.workflow)
    self.assertEqual("some_instance", name.instance)
    self.assertEqual("waiting", name.job_state)
    self.assertEqual(PREFIX, name.get_job_state_prefix())
def _generate_schedule_tokens(workflows):
    result = []
    for w in range(workflows):
        next_run_time = time.time() + (365 + w) * 24 * 60 * 60
        recurrence = min(365 * 24 * 60 * 60, 60 ** w)
        workflow = 'workflow_%d' % w
        schedule = WorkflowSchedule(next_run_time,
                                    recurrence_seconds=recurrence,
                                    overrun_policy=w % 4,
                                    workflow=workflow)
        name = Name(workflow=workflow)
        result.append(Token(name=name.get_workflow_schedule_token_name(),
                            version=100000000 * w,
                            owner='some_owner',
                            expirationTime=next_run_time,
                            data=pickle.dumps(schedule)))
    return result
def get_workflow_jobs_from_parser(workflow):
    config_parser = load_path(PinballConfig.PARSER)(
        PinballConfig.PARSER_PARAMS)
    tokens = config_parser.get_workflow_tokens(workflow)
    jobs_data = []
    for token in tokens:
        name = Name.from_job_token_name(token.name)
        if name.job:
            assert name.workflow == workflow
            job = pickle.loads(token.data)
            jobs_data.append(
                JobData(workflow=workflow,
                        instance=None,
                        job=name.job,
                        job_type=job.__class__.__name__,
                        is_condition=job.IS_CONDITION,
                        info=job.info(),
                        inputs=job.inputs,
                        outputs=job.outputs,
                        emails=job.emails,
                        max_attempts=job.max_attempts,
                        retry_delay_sec=job.retry_delay_sec,
                        warn_timeout_sec=job.warn_timeout_sec,
                        abort_timeout_sec=job.abort_timeout_sec,
                        priority=token.priority,
                        status=Status.NEVER_RUN))
    return jobs_data
def test_input_prefix(self):
    PREFIX = ("/workflow/some_workflow/some_instance/input/some_job/"
              "some_input/")
    name = Name.from_input_prefix(PREFIX)
    self.assertEqual("some_workflow", name.workflow)
    self.assertEqual("some_instance", name.instance)
    self.assertEqual("some_job", name.job)
    self.assertEqual("some_input", name.input)
    self.assertEqual(PREFIX, name.get_input_prefix())
def test_move_job_token_to_runnable(self):
    self._post_job_tokens()
    self._post_workflow_start_event_token()
    job_name = Name(workflow='some_workflow',
                    instance='12345',
                    job_state=Name.WAITING_STATE,
                    job='parent_job')
    job_token = self._get_token(job_name.get_job_token_name())
    event_name = Name(workflow='some_workflow',
                      instance='12345',
                      job='parent_job',
                      input_name=Name.WORKFLOW_START_INPUT,
                      event='workflow_start_event')
    event_token = self._get_token(event_name.get_event_token_name())
    self._worker._move_job_token_to_runnable(job_token, [event_token])
    # The event token should have been removed and the parent job should
    # be runnable.
    self._verify_parent_job_runnable()
def get_schedule_token(self, workflow):
    schedule_config = self._repository.get_schedule(workflow)
    timestamp = schedule_to_timestamp(schedule_config.time,
                                      schedule_config.start_date)
    recurrence = recurrence_str_to_sec(schedule_config.recurrence)
    overrun_policy = OverrunPolicy.from_string(
        schedule_config.overrun_policy)
    schedule = WorkflowSchedule(
        next_run_time=timestamp,
        recurrence_seconds=recurrence,
        overrun_policy=overrun_policy,
        workflow=schedule_config.workflow,
        emails=schedule_config.emails,
        # TODO(mao): make this flexible so that users can specify it
        # through the UI.
        max_running_instances=PinballConfig.
        DEFAULT_MAX_WORKFLOW_RUNNING_INSTANCES)
    schedule.advance_next_run_time()
    timestamp = schedule.next_run_time
    token_name = (Name(workflow=schedule_config.workflow).
                  get_workflow_schedule_token_name())
    return Token(name=token_name,
                 owner='parser',
                 expirationTime=timestamp,
                 data=pickle.dumps(schedule))
def test_workflow_schedule_token_name(self):
    NAME = '/schedule/workflow/some_workflow'
    name = Name.from_workflow_schedule_token_name(NAME)
    self.assertEqual('some_workflow', name.workflow)
    self.assertEqual(NAME, name.get_workflow_schedule_token_name())
def _instance_data_from_job_tokens(self, job_tokens):
    """Extract instance data from job tokens in that instance.

    Args:
        job_tokens: Job tokens that belong to a single workflow
            instance.
    Returns:
        Workflow data describing the workflow instance identified by the
        input job tokens.
    """
    assert job_tokens
    start_time = time.time()
    end_time = 0
    failed = False
    for job_token in job_tokens:
        job = pickle.loads(job_token.data)
        if job.history:
            first_execution_record = job.history[0]
            if (first_execution_record.start_time and
                    first_execution_record.start_time < start_time):
                start_time = first_execution_record.start_time
            last_execution_record = job.history[-1]
            if not last_execution_record.end_time:
                end_time = sys.maxint
            else:
                if last_execution_record.end_time > end_time:
                    end_time = last_execution_record.end_time
            if (not job.disabled and
                    last_execution_record.exit_code != 0):
                failed = True
    if not job_tokens:
        is_active = False
    else:
        is_active = True
        job_name = job_tokens[0].name
        archived_tokens = self._store.read_archived_tokens(
            name_prefix=job_name)
        for token in archived_tokens:
            if token.name == job_name:
                is_active = False
                break
    name = Name.from_job_token_name(job_tokens[0].name)
    is_scheduled_for_archive = False
    abort_signal = None
    if is_active:
        archive_signal = self._get_signal(name.workflow,
                                          name.instance,
                                          Signal.ARCHIVE,
                                          True)
        is_scheduled_for_archive = (
            archive_signal and
            Signal.TIMESTAMP_ATTR in archive_signal.attributes)
    else:
        abort_signal = self._get_signal(name.workflow,
                                        name.instance,
                                        Signal.ABORT,
                                        False)
    if abort_signal:
        status = Status.ABORTED
        if end_time == 0:
            # This can happen only if all jobs have an empty history.
            timestamp = abort_signal.attributes.get(Signal.TIMESTAMP_ATTR)
            start_time = timestamp
            end_time = timestamp
    elif (end_time == 0 or end_time == sys.maxint or
          (is_active and not is_scheduled_for_archive)):
        status = Status.RUNNING
        end_time = None
    elif failed:
        status = Status.FAILURE
    else:
        status = Status.SUCCESS
    return WorkflowInstanceData(name.workflow,
                                name.instance,
                                status,
                                start_time,
                                end_time)
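# Status derivation in _instance_data_from_job_tokens, summarized (as
# read from the code above):
#   ABORTED : an archived ABORT signal exists for the instance.
#   RUNNING : no job has finished yet (end_time == 0), some job is still
#             running (end_time == sys.maxint), or the instance is
#             active and not scheduled for archiving.
#   FAILURE : an enabled job's last execution exited non-zero.
#   SUCCESS : everything else.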
def _parse_job_token_name(token_name):
    name = Name.from_job_token_name(token_name)
    if name.workflow:
        return name
    return None
def test_job_prefix(self):
    PREFIX = '/workflow/some_workflow/some_instance/job/'
    name = Name.from_job_prefix(PREFIX)
    self.assertEqual('some_workflow', name.workflow)
    self.assertEqual('some_instance', name.instance)
    self.assertEqual(PREFIX, name.get_job_prefix())
def test_workflow_prefix(self):
    PREFIX = '/workflow/some_workflow/'
    name = Name.from_workflow_prefix(PREFIX)
    self.assertEqual('some_workflow', name.workflow)
    self.assertEqual(PREFIX, name.get_workflow_prefix())