Example no. 1
 def is_failed(self, store):
     data_builder = DataBuilder(store, use_cache=True)
     workflow_data = data_builder.get_workflow(self.workflow)
     if not workflow_data:
         return False
     return (workflow_data.status != Status.RUNNING and
             workflow_data.status != Status.SUCCESS)
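
The pattern above (build a DataBuilder over the given store, then inspect the returned workflow data) also works for ad-hoc checks. A minimal sketch, with DataBuilder and Status as in the examples; the helper name is hypothetical:

def workflow_finished_successfully(store, workflow_name):
    # Hypothetical helper mirroring is_failed() above: report whether the
    # workflow's most relevant instance completed with SUCCESS.
    data_builder = DataBuilder(store, use_cache=True)
    workflow_data = data_builder.get_workflow(workflow_name)
    if not workflow_data:
        return False
    return workflow_data.status == Status.SUCCESS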
Example no. 2
def _compute_workflow(dbstore):
    """Cache thread's target callable that computes the workflow.

    This runnable is called by the thread's run() method when the thread
    starts. It computes the workflows data, serializes it, and stores it
    in _WORKFLOWS_JSON. The computation repeats indefinitely, refreshing
    _WORKFLOWS_JSON until the pinball_ui server stops.

    Args:
        dbstore: The store used to retrieve run status.
    """
    global _WORKFLOWS_JSON
    data_builder = DataBuilder(dbstore, use_cache=True)
    while True:
        try:
            LOG.info("Workflow data computation starting.")
            workflows_data = data_builder.get_workflows()
            schedules_data = data_builder.get_schedules()
            _WORKFLOWS_JSON = _serialize(workflows_data, schedules_data)
            LOG.info("Workflow data computation complete.")
            # TODO(mao): Tune this parameter depending on future
            # pinball user experience.
            # TODO(mao): Make this computation run at scheduled time intervals
            # and cancel the next execution if the previous job hasn't
            # finished.
            time.sleep(60 * 20)
        except Exception as e:
            LOG.exception(e)
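
As the docstring above explains, _compute_workflow is a thread target that keeps _WORKFLOWS_JSON up to date. A minimal sketch of starting it as a daemon thread (assumed wiring; the actual pinball_ui startup code may differ):

import threading

def start_workflow_cache_thread(dbstore):
    # Refresh _WORKFLOWS_JSON in the background while the UI server runs.
    thread = threading.Thread(target=_compute_workflow, args=(dbstore,))
    thread.daemon = True
    thread.start()
    return thread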
Example no. 3
 def get_context_data(self, **kwargs):
     context = super(TokenView, self).get_context_data(**kwargs)
     token_name = self.request.GET['path']
     data_builder = DataBuilder(DbStore())
     token_data = data_builder.get_token(token_name)
     token_format = token_data.format()
     for key, value in token_format.items():
         context[key] = value
     return context
Example no. 4
 def get_context_data(self, **kwargs):
     context = super(ScheduleView, self).get_context_data(**kwargs)
     workflow = self.request.GET['workflow']
     data_builder = DataBuilder(DbStore())
     schedule_data = data_builder.get_schedule(workflow)
     formatted_schedule = schedule_data.format()
     for key, value in formatted_schedule.items():
         context[key] = value
     context['emails'] = ' '.join(schedule_data.emails)
     return context
Example no. 5
def schedules(_):
    try:
        data_builder = DataBuilder(DbStore())
        schedules_data = data_builder.get_schedules()
        schedules_json = _serialize(schedules_data)
    except:
        LOG.exception('')
        return HttpResponseServerError(traceback.format_exc())
    else:
        return HttpResponse(schedules_json, content_type='application/json')
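
The _serialize helper used by these views is not shown in the snippets. A hypothetical stand-in consistent with how it is called here (data objects expose a format() method and the result is returned as a JSON string); the real implementation may differ:

import json

def _serialize(*data_lists):
    # Flatten one or more lists of data objects and JSON-encode their
    # formatted dictionaries.
    rows = []
    for data_list in data_lists:
        rows.extend(item.format() for item in data_list)
    return json.dumps(rows)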
Example no. 6
def instances(request):
    try:
        workflow = request.GET['workflow']
        data_builder = DataBuilder(DbStore(), use_cache=True)
        instances_data = data_builder.get_instances(workflow)
        instances_json = _serialize(instances_data)
    except:
        LOG.exception('')
        return HttpResponseServerError(traceback.format_exc())
    else:
        return HttpResponse(instances_json, mimetype='application/json')
Example no. 7
def token_paths(request):
    try:
        path = request.GET['path']
        data_builder = DataBuilder(DbStore())
        tokens_data = data_builder.get_token_paths(path)
        tokens_json = _serialize(tokens_data)
    except:
        LOG.exception('')
        return HttpResponseServerError(traceback.format_exc())
    else:
        return HttpResponse(tokens_json, mimetype='application/json')
Example no. 8
 def __init__(self, client, store, emailer):
     self._client = client
     self._emailer = emailer
     self._data_builder = DataBuilder(store)
     self._owned_job_token = None
     self._name = get_unique_name()
     self._inspector = Inspector(client)
     # The lock synchronizes access to shared attributes between the worker
     # thread and the lease renewer thread.
     self._lock = threading.Lock()
     self._lease_renewer = None
     self._executor = None
     self._test_only_end_if_no_runnable = False
Example no. 9
def jobs(request):
    try:
        data_builder = DataBuilder(DbStore(), use_cache=True)
        workflow = request.GET['workflow']
        instance = request.GET['instance']
        if instance == 'latest':
            instance = data_builder.get_latest_instance(workflow).instance
        jobs_data = data_builder.get_jobs(workflow, instance)
        jobs_json = _serialize(jobs_data)
    except:
        LOG.exception('')
        return HttpResponseServerError(traceback.format_exc())
    else:
        return HttpResponse(jobs_json, mimetype='application/json')
Example no. 10
    def _get_running_instances(self, store):
        """Find running instances of the workflow.

        Args:
            store: The store to query for workflow instance status.
        Returns:
            List of running workflow instance names.
        """
        data_builder = DataBuilder(store, use_cache=True)
        instances = data_builder.get_instances(self.workflow)
        result = []
        for instance in instances:
            if instance.status == Status.RUNNING:
                result.append(instance.instance)
        return result
Example no. 11
def file_content(request):
    try:
        workflow = request.GET['workflow']
        instance = request.GET['instance']
        job = request.GET['job']
        execution = int(request.GET['execution'])
        log_type = request.GET['log_type']
        if execution < 0:
            return HttpResponseServerError(
                'execution must not be negative; got ' + execution)
        data_builder = DataBuilder(DbStore())
        file_data = data_builder.get_file_content(workflow, instance, job,
                                                  execution, log_type)
    except:
        LOG.exception('')
        return HttpResponseServerError(traceback.format_exc())
    else:
        return HttpResponse(file_data, mimetype='text/plain')
Example no. 12
def executions(request):
    try:
        workflow = request.GET['workflow']
        instance = request.GET.get('instance')
        job = request.GET['job']
        data_builder = DataBuilder(DbStore())
        if instance:
            executions_data = data_builder.get_executions(
                workflow, instance, job)
        else:
            executions_data = data_builder.get_executions_across_instances(
                workflow, job)
        executions_json = _serialize(executions_data)
    except:
        LOG.exception('')
        return HttpResponseServerError(traceback.format_exc())
    else:
        return HttpResponse(executions_json, content_type='application/json')
Example no. 13
def graph(request):
    try:
        data_builder = DataBuilder(DbStore(), use_cache=True)
        workflow = request.GET['workflow']
        if 'instance' in request.GET:
            instance = request.GET['instance']
            if instance == 'latest':
                instance = data_builder.get_latest_instance(workflow).instance
            jobs_data = data_builder.get_jobs(workflow=workflow,
                                              instance=instance)
            instance_data = data_builder.get_instance(workflow=workflow,
                                                      instance=instance)
            workflow_graph = WorkflowGraph(jobs_data, instance_data)
        else:
            workflow_graph = WorkflowGraph.from_parser(workflow)
    except:
        LOG.exception('')
        return HttpResponseServerError(traceback.format_exc())
    else:
        return HttpResponse(workflow_graph.get_svg(), mimetype='image/svg+xml')
Example no. 14
def status(request):
    try:
        workflow = request.GET.get('workflow')
        instance = request.GET.get('instance')
        data_builder = DataBuilder(DbStore())
        status = []
        if data_builder.is_signal_set(workflow, instance, Signal.EXIT):
            status = ['exiting']
        elif data_builder.is_signal_set(workflow, instance, Signal.ABORT):
            status = ['aborting']
        elif data_builder.is_signal_set(workflow, instance, Signal.DRAIN):
            status = ['draining']
        if not _is_master_alive():
            status.append('no master at %s:%d' % (socket.gethostname(),
                                                  PinballConfig.MASTER_PORT))
        status_json = json.dumps(status)
    except:
        LOG.exception('')
        return HttpResponseServerError(traceback.format_exc())
    else:
        return HttpResponse(status_json, mimetype='application/json')
Example no. 15
 def get_context_data(self, **kwargs):
     context = super(ExecutionView, self).get_context_data(**kwargs)
     workflow = self.request.GET['workflow']
     instance = self.request.GET['instance']
     job = self.request.GET['job']
     execution = int(self.request.GET['execution'])
     data_builder = DataBuilder(DbStore())
     execution_data = data_builder.get_execution(workflow, instance, job,
                                                 execution)
     formatted_data = execution_data.format()
     for key, value in formatted_data.items():
         context[key] = value
     properties = []
     for key, value in execution_data.properties.items():
         properties.append('%s=%s' % (key, value))
     context['properties'] = ', '.join(properties)
     if not execution_data.end_time:
         context['end_time'] = ''
     if execution_data.exit_code is None:
         context['exit_code'] = ''
     return context
Example no. 16
 def test_workflow_data_from_instances_data4(self):
     wf_instance_list = [
         WorkflowInstanceData('wf', '22346', Status.ABORTED, 12345, 12392),
         WorkflowInstanceData('wf', '22347', Status.SUCCESS, 12346, 12393),
         WorkflowInstanceData('wf', '22345', Status.FAILURE, 12391, sys.maxint),
         ]
     wf_data = DataBuilder._workflow_data_from_instances_data(
         wf_instance_list)
     self.assertEquals(wf_data.workflow, 'wf')
     self.assertEquals(wf_data.status, Status.SUCCESS)
     self.assertEquals(wf_data.last_instance, '22347')
     self.assertEquals(wf_data.last_start_time, 12346)
     self.assertEquals(wf_data.last_end_time, 12393)
     self.assertEquals(wf_data.running_instance_number, 0)
Example no. 17
 def test_workflow_data_from_instances_data2(self):
     wf_instance_list = [
         WorkflowInstanceData('wf', '22346', Status.ABORTED, 12355, sys.maxint),
         WorkflowInstanceData('wf', '22347', Status.SUCCESS, 12365, 12390),
         WorkflowInstanceData('wf', '22345', Status.RUNNING, 12345, None),
     ]
     wf_data = DataBuilder._workflow_data_from_instances_data(
         wf_instance_list)
     self.assertEquals(wf_data.workflow, 'wf')
     self.assertEquals(wf_data.status, Status.RUNNING)
     self.assertEquals(wf_data.last_instance, '22345')
     self.assertEquals(wf_data.last_start_time, 12345)
     self.assertEquals(wf_data.last_end_time, None)
     self.assertEquals(wf_data.running_instance_number, 1)
Example no. 18
 def is_running(self, store):
     data_builder = DataBuilder(store, use_cache=True)
     workflow_data = data_builder.get_workflow(self.workflow)
     if not workflow_data:
         return False
     return workflow_data.status == Status.RUNNING
Example no. 19
class Worker(object):
    # Worker renews the ownership of the job token it owns every so often.
    _LEASE_TIME_SEC = 20 * 60  # 20 minutes

    # Delay between subsequent queries to the master.
    _INTER_QUERY_DELAY_SEC = 5

    def __init__(self, client, store, emailer):
        self._client = client
        self._emailer = emailer
        self._data_builder = DataBuilder(store)
        self._owned_job_token = None
        self._name = get_unique_name()
        self._inspector = Inspector(client)
        # The lock synchronizes access to shared attributes between the worker
        # thread and the lease renewer thread.
        self._lock = threading.Lock()
        self._lease_renewer = None
        self._executor = None
        self._test_only_end_if_no_runnable = False

    @staticmethod
    def _get_triggering_events(inputs):
        """Get a list of triggering events.

        Args:
            inputs: A list of lists where the elements of the outer list
                represent inputs of a job, while the elements of inner lists
                are names of events in those inputs.

        Returns:
            A list of event tokens, one per input, that may be used to trigger
            the job.  If any of the inputs has no events in it, the result list
            will be empty.

        Example:
            inputs = [[token('/workflows/wf/events/j/i1/e1'),
                       token('/workflows/wf/events/j/i1/e2')],
                      [token('/workflows/wf/events/j/i2/e3')]]
            return: [token('/workflows/wf/events/j/i1/e1'),
                     token('/workflows/wf/events/j/i2/e3')]

            inputs = [[token('/workflows/wf/events/j/i1/e1'),
                       token('/workflows/wf/events/j/i1/e2')],
                      []]
            return: []
        """
        triggering_events = []
        for events in inputs:
            if not events:
                return []
            triggering_events.append(events[0])
        return triggering_events

    def _move_job_token_to_runnable(self, job_token, triggering_event_tokens):
        """Move a job token to the runnable branch of the token tree.

        Token tree is the global, hierarchically structured token namespace.
        Args:
            job_token: The job token to make runnable.
            triggering_event_tokens: The list of events used to trigger the
                job.  These events will be removed from the master in the same
                call that makes the job token runnable.
        Returns:
            True on success, otherwise False.
        """
        name = Name.from_job_token_name(job_token.name)
        name.job_state = Name.RUNNABLE_STATE
        job = pickle.loads(job_token.data)
        Worker._add_events_to_job(job, triggering_event_tokens)
        runnable_job_token = Token(name=name.get_job_token_name(),
                                   priority=job_token.priority,
                                   data=pickle.dumps(job))
        request = ModifyRequest(updates=[runnable_job_token],
                                deletes=triggering_event_tokens + [job_token])
        return self._send_request(request)

    @staticmethod
    def _add_events_to_job(job, triggering_event_tokens):
        """Put triggering events inside the job.

        Args:
            job: The job which should be augmented with the events.
            triggering_event_tokens: List of event tokens that triggered the
                job.
        """
        assert not job.events
        for event_token in triggering_event_tokens:
            if event_token.data:
                event = pickle.loads(event_token.data)
                # Optimization to make the job data structure smaller: do not
                # append events with no attributes.
                if event.attributes:
                    job.events.append(event)
            else:
                # This logic is here for backwards compatibility.
                # TODO(pawel): remove this logic after the transition to the
                # new model has been completed.
                name = Name.from_event_token_name(event_token.name)
                assert name.input == Name.WORKFLOW_START_INPUT

    def _make_job_runnable(self, job_token):
        """Attempt to make a job runnable.

        Query event tokens in job inputs.  If a combination of triggering
        events exist, remove those events and make the job runnable.
        Otherwise, do nothing.

        Args:
            job_token: The job token to make runnable.
        Returns:
            True if there were no errors during communication with the master,
            otherwise False.
        """
        job = pickle.loads(job_token.data)
        name = Name.from_job_token_name(job_token.name)
        request = QueryRequest(queries=[])
        # TODO(pawel): handle jobs with no dependencies
        assert job.inputs
        for input_name in job.inputs:
            prefix = Name()
            prefix.workflow = name.workflow
            prefix.instance = name.instance
            prefix.job = name.job
            prefix.input = input_name
            query = Query()
            query.namePrefix = prefix.get_input_prefix()
            query.maxTokens = 1
            request.queries.append(query)
        try:
            response = self._client.query(request)
        except TokenMasterException:
            # TODO(pawel): add a retry count and fail if a limit is reached.
            LOG.exception('error sending request %s', request)
            return False
        triggering_events = Worker._get_triggering_events(response.tokens)
        if triggering_events:
            return self._move_job_token_to_runnable(job_token,
                                                    triggering_events)
        return True

    def _make_runnable(self, workflow, instance):
        """Attempt to make jobs in a given workflow instance runnable.

        Go over all waiting jobs in a given workflow instance and try to make
        them runnable.

        Args:
            workflow: The name of the workflow whose jobs should be considered.
            instance: The workflow instance whose jobs should be considered.
        Returns:
            True if there were no errors during communication with the master,
            otherwise False.
        """
        name = Name()
        name.workflow = workflow
        name.instance = instance
        name.job_state = Name.WAITING_STATE
        query = Query(namePrefix=name.get_job_state_prefix())
        # TODO(pawel): to prevent multiple workers from trying to make the
        # same job runnable at the same time, this should be a
        # QueryAndOwnRequest.  Note that the current implementation is correct,
        # just inefficient.
        request = QueryRequest(queries=[query])
        try:
            response = self._client.query(request)
        except TokenMasterException:
            LOG.exception('error sending request %s', request)
            return False
        assert len(response.tokens) == 1
        for token in response.tokens[0]:
            if not self._make_job_runnable(token):
                return False
        return True

    def _has_no_runnable_jobs(self, workflow, instance):
        """Check if the workflow instance does not contain runnable jobs.

        Returns:
            True if we are certain that the workflow has no runnable jobs.
            Otherwise False.  If there were any errors during communication
            with the master, the return value is False.
        """
        name = Name(workflow=workflow,
                    instance=instance,
                    job_state=Name.RUNNABLE_STATE)
        query = Query(namePrefix=name.get_job_state_prefix())
        request = QueryRequest(queries=[query])
        try:
            response = self._client.query(request)
        except TokenMasterException:
            LOG.exception('error sending request %s', request)
            return False
        assert len(response.tokens) == 1
        if response.tokens[0]:
            return False
        return True

    def _is_done(self, workflow, instance):
        """Check if the workflow instance is done.

        A workflow is done if it does not have runnable jobs.

        Returns:
            True if we are certain that the workflow is not running.  Otherwise
            False.  If there were any errors during communication with the
            master, the return value is False.
        """
        # Attempt to make the workflow runnable and verify that no WAITING job
        # tokens were changed in the meantime.
        name = Name(workflow=workflow,
                    instance=instance,
                    job_state=Name.WAITING_STATE)
        query = Query(namePrefix=name.get_job_state_prefix())
        request = QueryRequest(queries=[query])
        try:
            snapshot = Snapshot(self._client, request)
        except:
            LOG.exception('error sending request %s', request)
            return False
        if not self._make_runnable(workflow, instance):
            return False
        if not self._has_no_runnable_jobs(workflow, instance):
            return False
        try:
            return not snapshot.refresh()
        except:
            LOG.exception('error sending request %s', request)
            return False

    def _process_signals(self, workflow, instance):
        """Process signals for a given workflow instance.

        Args:
            workflow: The workflow whose signals should be processed.
            instance: The instance whose signals should be processed.
        Returns:
            True if the worker should execute jobs in this instance.  Otherwise
            False.
        """
        signaller = Signaller(self._client, workflow, instance)
        archiver = Archiver(self._client, workflow, instance)
        if signaller.is_action_set(Signal.EXIT):
            return False
        if (signaller.is_action_set(Signal.ARCHIVE) and
                self._is_done(workflow, instance)):
            # TODO(pawel): enable this for all workflows after we gain
            # confidence that the master has enough memory to delay workflow
            # archiving.
            if workflow == 'indexing':
                ARCHIVE_DELAY_SEC = 7 * 24 * 60 * 60  # 7 days
            else:
                ARCHIVE_DELAY_SEC = 12 * 60 * 60  # 12 hours
            expiration_timestamp = int(time.time()) + ARCHIVE_DELAY_SEC
            if signaller.set_attribute_if_missing(Signal.ARCHIVE,
                                                  Signal.TIMESTAMP_ATTR,
                                                  expiration_timestamp):
                self._send_instance_end_email(workflow, instance)
            else:
                expiration_timestamp = signaller.get_attribute(
                    Signal.ARCHIVE, Signal.TIMESTAMP_ATTR)
                archiver.archive_if_expired(expiration_timestamp)
            return False
        if signaller.is_action_set(Signal.ABORT):
            if archiver.archive_if_aborted():
                self._send_instance_end_email(workflow, instance)
            return False
        if signaller.is_action_set(Signal.DRAIN):
            return False
        return True

    def _query_and_own_runnable_job_token(self, workflow, instance):
        """Attempt to own a runnable job token from a given workflow instance.

        Try to own a runnable job token in a given workflow instance.  The
        ownership of the qualifying job token lasts for a limited time so it
        has to be periodically renewed.

        Args:
            workflow: The name of the workflow whose jobs should be considered.
            instance: The workflow instance whose jobs should be considered.
        """
        assert not self._owned_job_token
        name = Name(workflow=workflow,
                    instance=instance,
                    job_state=Name.RUNNABLE_STATE)
        query = Query()
        query.namePrefix = name.get_job_state_prefix()
        query.maxTokens = 1
        request = QueryAndOwnRequest()
        request.query = query
        request.expirationTime = time.time() + Worker._LEASE_TIME_SEC
        request.owner = self._name
        try:
            response = self._client.query_and_own(request)
            if response.tokens:
                assert len(response.tokens) == 1
                self._owned_job_token = response.tokens[0]
        except TokenMasterException:
            LOG.exception('error sending request %s', request)

    def _own_runnable_job_token(self):
        """Attempt to own a runnable job token from any workflow."""
        assert not self._owned_job_token
        workflow_names = self._inspector.get_workflow_names()
        # Shuffle workflows to address starvation.
        random.shuffle(workflow_names)
        for workflow in workflow_names:
            instances = self._inspector.get_workflow_instances(workflow)
            time.sleep(Worker._INTER_QUERY_DELAY_SEC)
            random.shuffle(instances)
            for instance in instances:
                if self._process_signals(workflow, instance):
                    self._make_runnable(workflow, instance)
                    self._query_and_own_runnable_job_token(workflow, instance)
                    if self._owned_job_token:
                        return
            time.sleep(Worker._INTER_QUERY_DELAY_SEC)

    def _abort(self):
        """Abort the running job."""
        assert self._executor
        self._executor.abort()

    def _process_abort_signals(self):
        """Check if the running job should be aborted.

        Returns:
            False iff the job has been aborted.
        """
        name = Name.from_job_token_name(self._owned_job_token.name)
        abort = False
        try:
            signaller = Signaller(self._client, name.workflow, name.instance)
            abort = signaller.is_action_set(Signal.ABORT)
        except (TTransport.TTransportException, socket.timeout, socket.error):
            # We need this exception handler only in logic located in the
            # Timer thread.  If that thread fails, we should abort the process
            # and let the main thread decide what to do.
            LOG.exception('')
            abort = True
        if abort:
            self._abort()
        return not abort

    def _refresh_job_properties(self):
        """Record job properties in the master if they changed.

        If there are communication issues with the master, the running job
        gets aborted.

        Returns:
            False iff there was an error during communication with the master.
        """
        assert self._executor
        if self._executor.job_dirty:
            # The ordering here is important - we need to reset the changed
            # flag before updating the token.
            self._executor.job_dirty = False
            self._owned_job_token.data = pickle.dumps(self._executor.job)
            if not self._update_owned_job_token():
                self._abort()
                return False
        return True

    def _renew_ownership(self):
        """Periodic job token ownership renewal routine."""
        assert self._owned_job_token

        if not self._process_abort_signals():
            return

        if not self._refresh_job_properties():
            return

        now = time.time()
        if (self._owned_job_token.expirationTime <
                now + Worker._LEASE_TIME_SEC / 2):
            self._owned_job_token.expirationTime = (now +
                                                    Worker._LEASE_TIME_SEC)
            if not self._update_owned_job_token():
                self._abort()
                return

        with self._lock:
            if self._lease_renewer:
                self._lease_renewer = threading.Timer(
                    Worker._randomized_worker_polling_time(),
                    self._renew_ownership)
                self._lease_renewer.start()

    def _start_renew_ownership(self):
        """Start periodic renewal of the claimed job token ownership."""
        assert not self._lease_renewer
        self._lease_renewer = threading.Timer(
            Worker._randomized_worker_polling_time(),
            self._renew_ownership)
        self._lease_renewer.start()

    def _stop_renew_ownership(self):
        """Stop periodic renewal of the claimed job token ownership."""
        with self._lock:
            assert self._lease_renewer
            self._lease_renewer.cancel()
            lease_renewer = self._lease_renewer
            self._lease_renewer = None
        lease_renewer.join()

    def _send_request(self, request):
        """Send a modify request to the master.

        Args:
            request: The modify request to send.
        Returns:
            True on success, otherwise False.
        """
        try:
            self._client.modify(request)
            return True
        except TokenMasterException:
            LOG.exception('error sending request %s', request)
            return False

    def _get_output_event_tokens(self, job):
        """Create output event tokens for the owned job token.

        Args:
            job: The job whose output tokens should be generated.
        Returns:
            A list of event tokens corresponding to the outputs of the owned
            job token.
        """
        assert self._owned_job_token
        job_name = Name.from_job_token_name(self._owned_job_token.name)
        output_name = Name()
        output_name.workflow = job_name.workflow
        output_name.instance = job_name.instance
        output_name.input = job_name.job
        event_tokens = []
        for output in job.outputs:
            output_name.job = output
            output_name.event = get_unique_name()
            event = Event(creator=self._name)
            assert job.history
            execution_record = job.history[-1]
            event.attributes = execution_record.get_event_attributes()
            event_tokens.append(Token(name=output_name.get_event_token_name(),
                                      data=pickle.dumps(event)))
        return event_tokens

    def _move_job_token_to_waiting(self, job, succeeded):
        """Move the owned job token to the waiting group.

        If the job succeeded, also post events to job outputs.  If the job
        failed or it is the final job (a job with no outputs),  post an archive
        signal to finish the workflow.

        Args:
            job: The job that should be stored in the data field of the waiting
                job token.
            succeeded: True if the job succeeded, otherwise False.
        """
        assert self._owned_job_token
        name = Name.from_job_token_name(self._owned_job_token.name)
        name.job_state = Name.WAITING_STATE
        waiting_job_token = Token(name=name.get_job_token_name(),
                                  priority=self._owned_job_token.priority,
                                  data=pickle.dumps(job))
        request = ModifyRequest(deletes=[self._owned_job_token],
                                updates=[waiting_job_token])
        if succeeded:
            request.updates.extend(self._get_output_event_tokens(job))
        if not job.outputs or not succeeded:
            # This is either the only job in the workflow with no outputs or a
            # failed job.  In either case, the workflow is done.
            signaller = Signaller(self._client,
                                  workflow=name.workflow,
                                  instance=name.instance)
            if not signaller.is_action_set(Signal.ARCHIVE):
                signal_name = Name(
                    workflow=name.workflow,
                    instance=name.instance,
                    signal=Signal.action_to_string(Signal.ARCHIVE))
                signal = Signal(Signal.ARCHIVE)
                signal_token = Token(name=signal_name.get_signal_token_name())
                signal_token.data = pickle.dumps(signal)
                request.updates.append(signal_token)
        self._send_request(request)

    def _unown(self, token):
        """Reset the ownership of a token.

        Args:
            token: The token whose ownership should be reset.
        """
        token.owner = None
        token.expirationTime = None

    def _keep_job_token_in_runnable(self, job):
        """Keep the owned job token in the runnable group.

        Refresh the job token data field with the provided job object, release
        the ownership of the token, and return it to the runnable group.

        Args:
            job: The job that should be stored in the data field of the job
                token.
        """
        assert self._owned_job_token
        request = ModifyRequest()
        self._owned_job_token.data = pickle.dumps(job)
        retry_delay_sec = job.retry_delay_sec
        if retry_delay_sec > 0:
            self._owned_job_token.expirationTime = (time.time() +
                                                    retry_delay_sec)
        else:
            self._unown(self._owned_job_token)
        request.updates = [self._owned_job_token]
        self._send_request(request)

    def _update_owned_job_token(self):
        """Update owned job token in the master.

        Returns:
            True if the update was successful, otherwise False.
        """
        assert self._owned_job_token
        request = ModifyRequest()
        request.updates = [self._owned_job_token]
        try:
            response = self._client.modify(request)
        except TokenMasterException:
            LOG.exception('error sending request %s', request)
            return False
        assert len(response.updates) == 1
        self._owned_job_token = response.updates[0]
        return True

    def _execute_job(self):
        """Execute the owned job."""
        assert self._owned_job_token
        job = pickle.loads(self._owned_job_token.data)
        name = Name.from_job_token_name(self._owned_job_token.name)
        self._executor = JobExecutor.from_job(name.workflow,
                                              name.instance,
                                              name.job,
                                              job,
                                              self._data_builder,
                                              self._emailer)
        success = self._executor.prepare()
        if success:
            self._owned_job_token.data = pickle.dumps(self._executor.job)
            success = self._update_owned_job_token()
            if success:
                self._start_renew_ownership()
                success = self._executor.execute()
                self._stop_renew_ownership()
        if success:
            self._move_job_token_to_waiting(self._executor.job, True)
        elif self._executor.job.retry():
            self._keep_job_token_in_runnable(self._executor.job)
        else:
            signaller = Signaller(self._client, name.workflow, name.instance)
            # If ARCHIVE is not set, this is the first failed job in the
            # workflow.
            first_failure = not signaller.is_action_set(Signal.ARCHIVE)
            self._move_job_token_to_waiting(self._executor.job, False)
            self._send_job_failure_emails(first_failure)
        self._executor = None
        self._owned_job_token = None
        # If needed, archive the workflow.
        self._process_signals(name.workflow, name.instance)

    def _send_instance_end_email(self, workflow, instance):
        try:
            schedule_data = self._data_builder.get_schedule(workflow)
            if not schedule_data:
                LOG.warning('no schedule found for workflow %s', workflow)
            elif schedule_data.emails:
                instance_data = self._data_builder.get_instance(workflow,
                                                                instance)
                jobs_data = self._data_builder.get_jobs(workflow, instance)
                self._emailer.send_instance_end_message(schedule_data.emails,
                                                        instance_data,
                                                        jobs_data)
        except:
            LOG.exception('error sending instance end email for workflow %s '
                          'instance %s', workflow, instance)

    def _send_job_failure_emails(self, first_failure):
        assert self._owned_job_token
        name = Name.from_job_token_name(self._owned_job_token.name)
        job = self._executor.job
        emails = set(job.emails)
        if first_failure:
            schedule_data = self._data_builder.get_schedule(name.workflow)
            if schedule_data:
                emails.update(schedule_data.emails)
            else:
                LOG.warning('no schedule found for workflow %s', name.workflow)
        if emails:
            execution = len(job.history) - 1
            job_execution_data = self._data_builder.get_execution(
                name.workflow, name.instance, name.job, execution)
            try:
                self._emailer.send_job_execution_end_message(
                    list(emails), job_execution_data)
            except:
                LOG.exception('error sending job failure email for '
                              'workflow %s instance %s job %s execution %d',
                              name.workflow,
                              name.instance,
                              name.job,
                              execution)

    @staticmethod
    def _randomized_worker_polling_time():
        """Generate random worker polling time."""
        return (1.0 + random.random()) * PinballConfig.WORKER_POLL_TIME_SEC

    def run(self):
        """Run the worker."""
        LOG.info('Running worker ' + self._name)
        while True:
            signaller = Signaller(self._client)
            if signaller.is_action_set(Signal.EXIT):
                return
            if not signaller.is_action_set(Signal.DRAIN):
                self._own_runnable_job_token()
            if self._owned_job_token:
                self._execute_job()
            elif self._test_only_end_if_no_runnable:
                return
            else:
                time.sleep(Worker._randomized_worker_polling_time())
        LOG.info('Exiting worker ' + self._name)
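
A minimal sketch of how a worker might be constructed and started; the client, store, and emailer objects are assumed to be created by the caller, and the function name is hypothetical:

def run_worker(client, store, emailer):
    # client talks to the token master, store persists tokens, and emailer
    # sends notifications.  Worker.run() blocks until an EXIT signal is set.
    worker = Worker(client, store, emailer)
    worker.run()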
Example no. 20
class DataBuilderTestCase(unittest.TestCase):
    def setUp(self):
        self._store = EphemeralStore()
        self._data_builder = DataBuilder(self._store)

    @mock.patch('os.makedirs')
    @mock.patch('__builtin__.open')
    def _add_tokens(self, _, __):
        generate_workflows(2, 2, 2, 2, 2, self._store)

    def test_get_workflows_empty(self):
        self.assertEqual([], self._data_builder.get_workflows())

    def _get_workflows(self):
        self._add_tokens()
        workflows = self._data_builder.get_workflows()
        self.assertEqual(4, len(workflows))
        workflow_status = {'workflow_0': Status.RUNNING,
                           'workflow_1': Status.RUNNING,
                           'workflow_2': Status.SUCCESS,
                           'workflow_3': Status.FAILURE}
        for workflow in workflows:
            self.assertEqual(workflow_status[workflow.workflow],
                             workflow.status)
            self.assertEqual('instance_1', workflow.last_instance)
            del workflow_status[workflow.workflow]
        self.assertEqual({}, workflow_status)

    def test_get_workflows(self):
        self._get_workflows()

    def test_get_workflows_using_cache(self):
        self._data_builder.use_cache = True
        self._get_workflows()
        # Only finished (archived) workflow instances should have been cached.
        expected_cached_names = ['/workflow/workflow_2/instance_0/',
                                 '/workflow/workflow_2/instance_1/',
                                 '/workflow/workflow_3/instance_0/',
                                 '/workflow/workflow_3/instance_1/']
        cached_names = sorted(self._store.read_cached_data_names())
        self.assertEqual(expected_cached_names, cached_names)

    def test_get_workflow_empty(self):
        self.assertIsNone(self._data_builder.get_workflow('does_not_exist'))

    def _get_workflow(self):
        self._add_tokens()
        workflow = self._data_builder.get_workflow('workflow_0')
        self.assertEqual('workflow_0', workflow.workflow)
        self.assertEqual(Status.RUNNING, workflow.status)
        self.assertEqual('instance_1', workflow.last_instance)

    def test_get_workflow(self):
        self._get_workflow()

    def test_get_workflow_using_cache(self):
        self._data_builder.use_cache = True
        self._get_workflow()
        # Instances of a running workflow should not have been cached.
        self.assertEqual([], self._store.read_cached_data_names())

    def test_get_instances_empty(self):
        self.assertEqual([],
                         self._data_builder.get_instances('does_not_exist'))

    def _get_instances(self):
        self._add_tokens()
        instances = self._data_builder.get_instances('workflow_2')
        self.assertEqual(2, len(instances))
        instance_status = [Status.SUCCESS, Status.FAILURE]
        for instance in instances:
            self.assertEqual('workflow_2', instance.workflow)
            instance_status.remove(instance.status)
        self.assertEqual([], instance_status)

    def test_get_instances(self):
        self._get_instances()

    def test_get_instances_using_cache(self):
        self._data_builder.use_cache = True
        self._get_instances()
        expected_cached_names = ['/workflow/workflow_2/instance_0/',
                                 '/workflow/workflow_2/instance_1/']
        cached_names = sorted(self._store.read_cached_data_names())
        self.assertEqual(expected_cached_names, cached_names)

    def test_get_instance_empty(self):
        self.assertIsNone(
            self._data_builder.get_instance('does_not_exist', 'instance_0'))

    def _get_instance(self):
        self._add_tokens()
        instance = self._data_builder.get_instance('workflow_0', 'instance_0')
        self.assertEqual('workflow_0', instance.workflow)
        self.assertEqual('instance_0', instance.instance)

    def test_get_instance(self):
        self._get_instance()

    def test_get_instance_using_cache(self):
        self._data_builder.use_cache = True
        self._get_instance()
        # Running instance should not have been cached.
        self.assertEqual([], self._store.read_cached_data_names())

    def test_get_jobs_empty(self):
        self.assertEqual([],
                         self._data_builder.get_jobs('does_not_exist',
                                                     'does_not_exist'))

    def test_get_jobs(self):
        self._add_tokens()
        jobs = self._data_builder.get_jobs('workflow_0', 'instance_0')
        self.assertEqual(2, len(jobs))
        for job in jobs:
            self.assertEqual('workflow_0', job.workflow)
            self.assertEqual('instance_0', job.instance)
            self.assertEqual('ShellJob', job.job_type)
            self.assertTrue(job.info.startswith('command=some command'))
            self.assertEqual(Status.FAILURE, job.status)
        self.assertEqual([(0, ''), (1, 'SUCCESS'), (9, 'FAILURE')],
                         jobs[0].progress)
        self.assertEqual([(89, ''), (1, 'SUCCESS'), (9, 'FAILURE')],
                         jobs[1].progress)

    def test_get_executions_empty(self):
        self.assertEqual([],
                         self._data_builder.get_executions('does_not_exist',
                                                           'does_not_exist',
                                                           'does_not_exist'))

    def test_get_executions(self):
        self._add_tokens()
        executions = self._data_builder.get_executions('workflow_0',
                                                       'instance_0',
                                                       'job_0')
        self.assertEqual(2, len(executions))
        exit_codes = [0, 1]
        for execution in executions:
            self.assertEqual('workflow_0', execution.workflow)
            self.assertEqual('instance_0', execution.instance)
            self.assertEqual('job_0', execution.job)
            self.assertTrue(execution.info.startswith('some_command'))
            exit_codes.remove(execution.exit_code)
            self.assertEqual(2, len(execution.logs))

    def test_get_executions_across_instances_empty(self):
        self.assertEqual([],
                         self._data_builder.get_executions_across_instances(
                             'does_not_exist',
                             'does_not_exist'))

    def test_get_executions_across_instances(self):
        self._add_tokens()
        executions = self._data_builder.get_executions_across_instances(
            'workflow_0', 'job_0')
        self.assertEqual(2 * 2, len(executions))
        exit_codes = [0, 0, 1, 1]
        for execution in executions:
            self.assertEqual('workflow_0', execution.workflow)
            self.assertEqual('job_0', execution.job)
            self.assertTrue(execution.info.startswith('some_command'))
            exit_codes.remove(execution.exit_code)
            self.assertEqual(2, len(execution.logs))

    def test_get_execution_empty(self):
        self.assertIsNone(self._data_builder.get_execution('does_not_exist',
                                                           'does_not_exist',
                                                           'does_not_exist',
                                                           0))

    def test_get_execution(self):
        self._add_tokens()
        execution = self._data_builder.get_execution('workflow_0',
                                                     'instance_0',
                                                     'job_0',
                                                     1)
        self.assertEqual('workflow_0', execution.workflow)
        self.assertEqual('instance_0', execution.instance)
        self.assertEqual('job_0', execution.job)
        self.assertEqual(1, execution.execution)
        self.assertEqual('some_command 1 some_args 1', execution.info)
        self.assertEqual(1, execution.exit_code)
        self.assertEqual(2, execution.start_time)
        self.assertEqual(13, execution.end_time)
        self.assertEqual(2, len(execution.logs))

    @mock.patch('__builtin__.open')
    def test_get_file_content_no_file(self, _):
        self.assertEqual('',
                         self._data_builder.get_file_content('does_not_exist',
                                                             'does_not_exist',
                                                             'does_not_exist',
                                                             'does_not_exist',
                                                             'does_not_exist'))

    @mock.patch('os.makedirs')
    @mock.patch('__builtin__.open')
    def test_get_file_content(self, open_mock, _):
        generate_workflows(2, 2, 2, 2, 2, self._store)

        file_mock = mock.MagicMock()
        open_mock.return_value = file_mock
        file_mock.__enter__.return_value = file_mock
        file_mock.read.return_value = 'some_content'

        content = self._data_builder.get_file_content('workflow_0',
                                                      'instance_0',
                                                      'job_0',
                                                      0,
                                                      'info')
        self.assertEqual('some_content', content)

    def test_get_token_paths_empty(self):
        self.assertRaises(PinballException,
                          self._data_builder.get_token_paths,
                          '')

    def test_get_token_paths(self):
        self._add_tokens()
        token_paths = self._data_builder.get_token_paths(
            '/workflow/workflow_0/instance_0/job/waiting/')
        self.assertEqual(2, len(token_paths))
        paths = ['/workflow/workflow_0/instance_0/job/waiting/job_0',
                 '/workflow/workflow_0/instance_0/job/waiting/job_1']
        for token_path in token_paths:
            self.assertEqual(1, token_path.count)
            paths.remove(token_path.path)
        self.assertEqual([], paths)

    def test_get_token_empty(self):
        self.assertRaises(PinballException,
                          self._data_builder.get_token,
                          '/does_not_exist')

    def test_get_token(self):
        self._add_tokens()
        token = self._data_builder.get_token(
            '/workflow/workflow_0/instance_0/job/waiting/job_0')
        self.assertEqual('/workflow/workflow_0/instance_0/job/waiting/job_0',
                         token.name)
        self.assertIsNone(token.owner)
        self.assertIsNone(token.expiration_time)
        self.assertEqual(0, token.priority)
        self.assertIsNotNone(token.data)

    def test_signal_not_set(self):
        self.assertFalse(self._data_builder.is_signal_set('does_not_exist', 0,
                                                          Signal.DRAIN))

    def test_signal_set(self):
        self._add_tokens()
        self.assertTrue(self._data_builder.is_signal_set('workflow_0', 0,
                                                         Signal.DRAIN))

    # Workflow status should be that of the running instance
    def test_workflow_data_from_instances_data1(self):
        wf_instance_list = [
            WorkflowInstanceData('wf', '22346', Status.ABORTED, 12346, 54321),
            WorkflowInstanceData('wf', '22345', Status.RUNNING, 12345, None),
            WorkflowInstanceData('wf', '22347', Status.SUCCESS, 12347, 12390),
            WorkflowInstanceData('wf', '22348', Status.RUNNING, 12348, None),
        ]
        wf_data = DataBuilder._workflow_data_from_instances_data(
            wf_instance_list)
        self.assertEqual(wf_data.workflow, 'wf')
        self.assertEqual(wf_data.status, Status.RUNNING)
        self.assertEqual(wf_data.last_instance, '22348')
        self.assertEqual(wf_data.last_start_time, 12348)
        self.assertEqual(wf_data.last_end_time, None)
        self.assertEqual(wf_data.running_instance_number, 2)

    # Workflow status should be that of the running instance even if some
    # instance ended at sys.maxint time
    def test_workflow_data_from_instances_data2(self):
        wf_instance_list = [
            WorkflowInstanceData('wf', '22346', Status.ABORTED, 12355, sys.maxint),
            WorkflowInstanceData('wf', '22347', Status.SUCCESS, 12365, 12390),
            WorkflowInstanceData('wf', '22345', Status.RUNNING, 12345, None),
        ]
        wf_data = DataBuilder._workflow_data_from_instances_data(
            wf_instance_list)
        self.assertEqual(wf_data.workflow, 'wf')
        self.assertEqual(wf_data.status, Status.RUNNING)
        self.assertEqual(wf_data.last_instance, '22345')
        self.assertEqual(wf_data.last_start_time, 12345)
        self.assertEqual(wf_data.last_end_time, None)
        self.assertEqual(wf_data.running_instance_number, 1)

    # Workflow status should be that of the last finished instance
    def test_workflow_data_from_instances_data3(self):
        wf_instance_list = [
            WorkflowInstanceData('wf', '22346', Status.ABORTED, 12345, 12392),
            WorkflowInstanceData('wf', '22347', Status.SUCCESS, 12346, 12393),
            WorkflowInstanceData('wf', '22345', Status.FAILURE, 12347, 12391),
            ]
        wf_data = DataBuilder._workflow_data_from_instances_data(
            wf_instance_list)
        self.assertEqual(wf_data.workflow, 'wf')
        self.assertEqual(wf_data.status, Status.SUCCESS)
        self.assertEqual(wf_data.last_instance, '22347')
        self.assertEqual(wf_data.last_start_time, 12346)
        self.assertEqual(wf_data.last_end_time, 12393)
        self.assertEqual(wf_data.running_instance_number, 0)

    # Workflow status should be that of the last finished instance even if
    # some instance ended with sys.maxint time
    def test_workflow_data_from_instances_data4(self):
        wf_instance_list = [
            WorkflowInstanceData('wf', '22346', Status.ABORTED, 12345, 12392),
            WorkflowInstanceData('wf', '22347', Status.SUCCESS, 12346, 12393),
            WorkflowInstanceData('wf', '22345', Status.FAILURE, 12391, sys.maxint),
            ]
        wf_data = DataBuilder._workflow_data_from_instances_data(
            wf_instance_list)
        self.assertEqual(wf_data.workflow, 'wf')
        self.assertEqual(wf_data.status, Status.SUCCESS)
        self.assertEqual(wf_data.last_instance, '22347')
        self.assertEqual(wf_data.last_start_time, 12346)
        self.assertEqual(wf_data.last_end_time, 12393)
        self.assertEqual(wf_data.running_instance_number, 0)
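
# Taken together, the four tests above pin down the instance-selection rule of
# DataBuilder._workflow_data_from_instances_data.  The sketch below is a
# hedged, self-contained restatement of that rule (Instance,
# pick_last_instance and the status constants are illustrative stand-ins, not
# the project's API): a RUNNING instance always wins, otherwise the finished
# instance with the latest end time wins, and an end time of sys.maxint is
# treated as "still open" rather than "finished last".
import collections
import sys

Instance = collections.namedtuple(
    'Instance', 'workflow instance status start_time end_time')
RUNNING, SUCCESS, FAILURE, ABORTED = 'RUNNING', 'SUCCESS', 'FAILURE', 'ABORTED'


def pick_last_instance(instances):
    """Return (last_instance, number_of_running_instances)."""
    running = [i for i in instances if i.status == RUNNING]
    if running:
        # Among running instances, the most recently started one is reported.
        return max(running, key=lambda i: i.start_time), len(running)
    # With no running instance, the latest real end time decides; sys.maxint
    # end times are excluded because they mark instances that never closed.
    finished = [i for i in instances
                if i.end_time is not None and i.end_time != sys.maxint]
    return max(finished, key=lambda i: i.end_time), 0


# Mirrors test_workflow_data_from_instances_data3 above.
last, running_count = pick_last_instance([
    Instance('wf', '22346', ABORTED, 12345, 12392),
    Instance('wf', '22347', SUCCESS, 12346, 12393),
    Instance('wf', '22345', FAILURE, 12347, 12391),
])
assert last.instance == '22347' and running_count == 0
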
Exemplo n.º 34
0
 def setUp(self):
     self._store = EphemeralStore()
     self._data_builder = DataBuilder(self._store)
Exemplo n.º 35
0
class Worker(object):
    # Worker renews the ownership of the job token it owns every so often.
    _LEASE_TIME_SEC = 20 * 60  # 20 minutes

    # Delay between subsequent queries to the master.
    _INTER_QUERY_DELAY_SEC = 5

    def __init__(self, client, store, emailer):
        self._client = client
        self._emailer = emailer
        self._data_builder = DataBuilder(store)
        self._owned_job_token = None
        self._name = get_unique_name()
        self._inspector = Inspector(client)
        # The lock synchronizes access to shared attributes between the worker
        # thread and the lease renewer thread.
        self._lock = threading.Lock()
        self._lease_renewer = None
        self._executor = None
        self._test_only_end_if_no_runnable = False

    @staticmethod
    def _get_triggering_events(inputs):
        """Get a list of triggering events.

        Args:
            inputs: A list of lists where the elements of the outer list
                represent inputs of a job, while the elements of inner lists
                are names of events in those inputs.

        Returns:
            A list of event tokens, one per input, that may be used to trigger
            the job.  If any of the inputs has no events in it, the result list
            will be empty.

        Example:
            inputs = [[token('/workflows/wf/events/j/i1/e1'),
                       token('/workflows/wf/events/j/i1/e2')],
                      [token('/workflows/wf/events/j/i2/e3')]]
            return: [token('/workflows/wf/events/j/i1/e1'),
                     token('/workflows/wf/events/j/i2/e3')]

            inputs = [[token('/workflows/wf/events/j/i1/e1'),
                       token('/workflows/wf/events/j/i1/e2')],
                      []]
            return: []
        """
        triggering_events = []
        for events in inputs:
            if not events:
                return []
            triggering_events.append(events[0])
        return triggering_events

    def _move_job_token_to_runnable(self, job_token, triggering_event_tokens):
        """Move a job token to the runnable branch of the token tree.

        The token tree is the global, hierarchically structured token
        namespace.
        Args:
            job_token: The job token to make runnable.
            triggering_event_tokens: The list of events used to trigger the
                job.  These events will be removed from the master in the same
                call that makes the job token runnable.
        Returns:
            True on success, otherwise False.
        """
        name = Name.from_job_token_name(job_token.name)
        name.job_state = Name.RUNNABLE_STATE
        job = pickle.loads(job_token.data)
        Worker._add_events_to_job(job, triggering_event_tokens)
        runnable_job_token = Token(name=name.get_job_token_name(),
                                   priority=job_token.priority,
                                   data=pickle.dumps(job))
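        # A single ModifyRequest both inserts the RUNNABLE token and deletes
        # the consumed event tokens together with the WAITING job token, so
        # the whole transition takes one round trip to the master.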
        request = ModifyRequest(updates=[runnable_job_token],
                                deletes=triggering_event_tokens + [job_token])
        return self._send_request(request)

    @staticmethod
    def _add_events_to_job(job, triggering_event_tokens):
        """Put triggering events inside the job.

        Args:
            job: The job which should be augmented with the events.
            triggering_event_tokens: List of event tokens that triggered the
                job.
        """
        assert not job.events
        for event_token in triggering_event_tokens:
            if event_token.data:
                event = pickle.loads(event_token.data)
                # Optimization to make the job data structure smaller: do not
                # append events with no attributes.
                if event.attributes:
                    job.events.append(event)
            else:
                # This logic is here for backwards compatibility.
                # TODO(pawel): remove this logic after the transition to the
                # new model has been completed.
                name = Name.from_event_token_name(event_token.name)
                assert name.input == Name.WORKFLOW_START_INPUT

    def _make_job_runnable(self, job_token):
        """Attempt to make a job runnable.

        Query event tokens in job inputs.  If a combination of triggering
        events exist, remove those events and make the job runnable.
        Otherwise, do nothing.

        Args:
            job_token: The job token to make runnable.
        Returns:
            True if there were no errors during communication with the master,
            otherwise False.
        """
        job = pickle.loads(job_token.data)
        name = Name.from_job_token_name(job_token.name)
        request = QueryRequest(queries=[])
        # TODO(pawel): handle jobs with no dependencies
        assert job.inputs
        for input_name in job.inputs:
            prefix = Name()
            prefix.workflow = name.workflow
            prefix.instance = name.instance
            prefix.job = name.job
            prefix.input = input_name
            query = Query()
            query.namePrefix = prefix.get_input_prefix()
            query.maxTokens = 1
            request.queries.append(query)
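        # One query per declared input, each capped at a single token: one
        # event per input is all _get_triggering_events needs to decide
        # whether the job can be triggered.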
        try:
            response = self._client.query(request)
        except TokenMasterException:
            # TODO(pawel): add a retry count and fail if a limit is reached.
            LOG.exception('error sending request %s', request)
            return False
        triggering_events = Worker._get_triggering_events(response.tokens)
        if triggering_events:
            return self._move_job_token_to_runnable(job_token,
                                                    triggering_events)
        return True

    def _make_runnable(self, workflow, instance):
        """Attempt to make jobs in a given workflow instance runnable.

        Go over all waiting jobs in a given workflow instance and try to make
        them runnable.

        Args:
            workflow: The name of the workflow whose jobs should be considered.
            instance: The workflow instance whose jobs should be considered.
        Returns:
            True if there were no errors during communication with the master,
            otherwise False.
        """
        name = Name()
        name.workflow = workflow
        name.instance = instance
        name.job_state = Name.WAITING_STATE
        query = Query(namePrefix=name.get_job_state_prefix())
        # TODO(pawel): to prevent multiple workers from trying to make the
        # same job runnable at the same time, this should be a
        # QueryAndOwnRequest.  Note that the current implementation is correct,
        # just inefficient.
        request = QueryRequest(queries=[query])
        try:
            response = self._client.query(request)
        except TokenMasterException:
            LOG.exception('error sending request %s', request)
            return False
        assert len(response.tokens) == 1
        for token in response.tokens[0]:
            if not self._make_job_runnable(token):
                return False
        return True

    def _has_no_runnable_jobs(self, workflow, instance):
        """Check if the workflow instance does not contain runnable jobs.

        Returns:
            True if we are certain that the workflow has no runnable jobs.
            Otherwise False.  If there were any errors during communication
            with the master, the return value is False.
        """
        name = Name(workflow=workflow,
                    instance=instance,
                    job_state=Name.RUNNABLE_STATE)
        query = Query(namePrefix=name.get_job_state_prefix())
        request = QueryRequest(queries=[query])
        try:
            response = self._client.query(request)
        except TokenMasterException:
            LOG.exception('error sending request %s', request)
            return False
        assert len(response.tokens) == 1
        if response.tokens[0]:
            return False
        return True

    def _is_done(self, workflow, instance):
        """Check if the workflow instance is done.

        A workflow is done if it does not have runnable jobs.

        Returns:
            True if we are certain that the workflow is not running.  Otherwise
            False.  If there were any errors during communication with the
            master, the return value is False.
        """
        # Attempt to make the workflow runnable and verify that no WAITING job
        # tokens were changed in the meantime.
        name = Name(workflow=workflow,
                    instance=instance,
                    job_state=Name.WAITING_STATE)
        query = Query(namePrefix=name.get_job_state_prefix())
        request = QueryRequest(queries=[query])
        try:
            snapshot = Snapshot(self._client, request)
        except:
            LOG.exception('error sending request %s', request)
            return False
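        # The snapshot taken above records the WAITING job tokens before the
        # attempt to make them runnable; at the end, refresh() reveals whether
        # any of them changed in the meantime, in which case the instance is
        # conservatively reported as not done.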
        if not self._make_runnable(workflow, instance):
            return False
        if not self._has_no_runnable_jobs(workflow, instance):
            return False
        try:
            return not snapshot.refresh()
        except:
            LOG.exception('error sending request %s', request)
            return False

    def _process_signals(self, workflow, instance):
        """Process signals for a given workflow instance.

        Args:
            workflow: The workflow whose signals should be processed.
            instance: The instance whose signals should be processed.
        Returns:
            True if the worker should execute jobs in this instance.  Otherwise
            False.
        """
        signaller = Signaller(self._client, workflow, instance)
        archiver = Archiver(self._client, workflow, instance)
        if signaller.is_action_set(Signal.EXIT):
            return False
        if (signaller.is_action_set(Signal.ARCHIVE)
                and self._is_done(workflow, instance)):
            # TODO(pawel): enable this for all workflows after we gain
            # confidence that the master has enough memory to delay workflow
            # archiving.
            if workflow == 'indexing':
                ARCHIVE_DELAY_SEC = 7 * 24 * 60 * 60  # 7 days
            else:
                ARCHIVE_DELAY_SEC = 12 * 60 * 60  # 12 hours
            expiration_timestamp = int(time.time()) + ARCHIVE_DELAY_SEC
            if signaller.set_attribute_if_missing(Signal.ARCHIVE,
                                                  Signal.TIMESTAMP_ATTR,
                                                  expiration_timestamp):
                self._send_instance_end_email(workflow, instance)
            else:
                expiration_timestamp = signaller.get_attribute(
                    Signal.ARCHIVE, Signal.TIMESTAMP_ATTR)
                archiver.archive_if_expired(expiration_timestamp)
            return False
        if signaller.is_action_set(Signal.ABORT):
            if archiver.archive_if_aborted():
                self._send_instance_end_email(workflow, instance)
            return False
        if signaller.is_action_set(Signal.DRAIN):
            return False
        return True

    def _query_and_own_runnable_job_token(self, workflow, instance):
        """Attempt to own a runnable job token from a given workflow instance.

        Try to own a runnable job token in a given workflow instance.  The
        ownership of the qualifying job token lasts for a limited time so it
        has to be periodically renewed.

        Args:
            workflow: The name of the workflow whose jobs should be considered.
            instance: The workflow instance whose jobs should be considered.
        """
        assert not self._owned_job_token
        name = Name(workflow=workflow,
                    instance=instance,
                    job_state=Name.RUNNABLE_STATE)
        query = Query()
        query.namePrefix = name.get_job_state_prefix()
        query.maxTokens = 1
        request = QueryAndOwnRequest()
        request.query = query
        request.expirationTime = time.time() + Worker._LEASE_TIME_SEC
        request.owner = self._name
        try:
            response = self._client.query_and_own(request)
            if response.tokens:
                assert len(response.tokens) == 1
                self._owned_job_token = response.tokens[0]
        except TokenMasterException:
            LOG.exception('error sending request %s', request)

    def _own_runnable_job_token(self):
        """Attempt to own a runnable job token from any workflow."""
        assert not self._owned_job_token
        workflow_names = self._inspector.get_workflow_names()
        # Shuffle workflows to address starvation.
        random.shuffle(workflow_names)
        for workflow in workflow_names:
            instances = self._inspector.get_workflow_instances(workflow)
            time.sleep(Worker._INTER_QUERY_DELAY_SEC)
            random.shuffle(instances)
            for instance in instances:
                if self._process_signals(workflow, instance):
                    self._make_runnable(workflow, instance)
                    self._query_and_own_runnable_job_token(workflow, instance)
                    if self._owned_job_token:
                        return
            time.sleep(Worker._INTER_QUERY_DELAY_SEC)

    def _abort(self):
        """Abort the running job."""
        assert self._executor
        self._executor.abort()

    def _process_abort_signals(self):
        """Check if the running job should be aborted.

        Returns:
            False iff the job has been aborted.
        """
        name = Name.from_job_token_name(self._owned_job_token.name)
        abort = False
        try:
            signaller = Signaller(self._client, name.workflow, name.instance)
            abort = signaller.is_action_set(Signal.ABORT)
        except (TTransport.TTransportException, socket.timeout, socket.error):
            # We need this exception handler only in logic located in the
            # Timer thread.  If that thread fails, we should abort the process
            # and let the main thread decide what to do.
            LOG.exception('')
            abort = True
        if abort:
            self._abort()
        return not abort

    def _refresh_job_properties(self):
        """Record job properties in the master if they changed.

        If there are communication issues with the master, the running job
        gets aborted.

        Returns:
            False iff there was an error during communication with the master.
        """
        assert self._executor
        if self._executor.job_dirty:
            # The ordering here is important - we need to reset the changed
            # flag before updating the token.
            self._executor.job_dirty = False
            self._owned_job_token.data = pickle.dumps(self._executor.job)
            if not self._update_owned_job_token():
                self._abort()
                return False
        return True

    def _renew_ownership(self):
        """Periodic job token ownership renewal routine."""
        assert self._owned_job_token

        if not self._process_abort_signals():
            return

        if not self._refresh_job_properties():
            return

        now = time.time()
        if (self._owned_job_token.expirationTime <
                now + Worker._LEASE_TIME_SEC / 2):
            self._owned_job_token.expirationTime = (now +
                                                    Worker._LEASE_TIME_SEC)
            if not self._update_owned_job_token():
                self._abort()
                return

        with self._lock:
            if self._lease_renewer:
                self._lease_renewer = threading.Timer(
                    Worker._randomized_worker_polling_time(),
                    self._renew_ownership)
                self._lease_renewer.start()

    def _start_renew_ownership(self):
        """Start periodic renewal of the claimed job token ownership."""
        assert not self._lease_renewer
        self._lease_renewer = threading.Timer(
            Worker._randomized_worker_polling_time(), self._renew_ownership)
        self._lease_renewer.start()

    def _stop_renew_ownership(self):
        """Stop periodic renewal of the claimed job token ownership."""
        with self._lock:
            assert self._lease_renewer
            self._lease_renewer.cancel()
            lease_renewer = self._lease_renewer
            self._lease_renewer = None
        lease_renewer.join()

    def _send_request(self, request):
        """Send a modify request to the master.

        Args:
            request: The modify request to send.
        Returns:
            True on success, otherwise False.
        """
        try:
            self._client.modify(request)
            return True
        except TokenMasterException:
            LOG.exception('error sending request %s', request)
            return False

    def _get_output_event_tokens(self, job):
        """Create output event tokens for the owned job token.

        Args:
            job: The job whose output tokens should be generated.
        Returns:
            A list of event tokens corresponding to the outputs of the owned
            job token.
        """
        assert self._owned_job_token
        job_name = Name.from_job_token_name(self._owned_job_token.name)
        output_name = Name()
        output_name.workflow = job_name.workflow
        output_name.instance = job_name.instance
        output_name.input = job_name.job
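        # The producing job's name becomes the "input" component of the event
        # token name, so a downstream job that lists this job among its inputs
        # will find the event under its input prefix in _make_job_runnable.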
        event_tokens = []
        for output in job.outputs:
            output_name.job = output
            output_name.event = get_unique_name()
            event = Event(creator=self._name)
            assert job.history
            execution_record = job.history[-1]
            event.attributes = execution_record.get_event_attributes()
            event_tokens.append(
                Token(name=output_name.get_event_token_name(),
                      data=pickle.dumps(event)))
        return event_tokens

    def _move_job_token_to_waiting(self, job, succeeded):
        """Move the owned job token to the waiting group.

        If the job succeeded, also post events to job outputs.  If the job
        failed or it is the final job (a job with no outputs), post an archive
        signal to finish the workflow.

        Args:
            job: The job that should be stored in the data field of the waiting
                job token.
            succeeded: True if the job succeeded, otherwise False.
        """
        assert self._owned_job_token
        name = Name.from_job_token_name(self._owned_job_token.name)
        name.job_state = Name.WAITING_STATE
        waiting_job_token = Token(name=name.get_job_token_name(),
                                  priority=self._owned_job_token.priority,
                                  data=pickle.dumps(job))
        request = ModifyRequest(deletes=[self._owned_job_token],
                                updates=[waiting_job_token])
        if succeeded:
            request.updates.extend(self._get_output_event_tokens(job))
        if not job.outputs or not succeeded:
            # This is either a final job (one with no outputs) or a failed
            # job.  In either case, the workflow is done.
            signaller = Signaller(self._client,
                                  workflow=name.workflow,
                                  instance=name.instance)
            if not signaller.is_action_set(Signal.ARCHIVE):
                signal_name = Name(workflow=name.workflow,
                                   instance=name.instance,
                                   signal=Signal.action_to_string(
                                       Signal.ARCHIVE))
                signal = Signal(Signal.ARCHIVE)
                signal_token = Token(name=signal_name.get_signal_token_name())
                signal_token.data = pickle.dumps(signal)
                request.updates.append(signal_token)
        self._send_request(request)

    def _unown(self, token):
        """Reset the ownership of a token.

        Args:
            token: The token whose ownership should be reset.
        """
        token.owner = None
        token.expirationTime = None

    def _keep_job_token_in_runnable(self, job):
        """Keep the owned job token in the runnable group.

        Refresh the job token data field with the provided job object, release
        the ownership of the token, and return it to the runnable group.

        Args:
            job: The job that should be stored in the data field of the job
                token.
        """
        assert self._owned_job_token
        request = ModifyRequest()
        self._owned_job_token.data = pickle.dumps(job)
        retry_delay_sec = job.retry_delay_sec
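        # With a positive retry delay the token keeps its owner but its lease
        # is pushed retry_delay_sec into the future, so no worker should pick
        # the job up again before the delay elapses; with no delay ownership
        # is released immediately.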
        if retry_delay_sec > 0:
            self._owned_job_token.expirationTime = (time.time() +
                                                    retry_delay_sec)
        else:
            self._unown(self._owned_job_token)
        request.updates = [self._owned_job_token]
        self._send_request(request)

    def _update_owned_job_token(self):
        """Update owned job token in the master.

        Returns:
            True if the update was successful, otherwise False.
        """
        assert self._owned_job_token
        request = ModifyRequest()
        request.updates = [self._owned_job_token]
        try:
            response = self._client.modify(request)
        except TokenMasterException:
            LOG.exception('error sending request %s', request)
            return False
        assert len(response.updates) == 1
        self._owned_job_token = response.updates[0]
        return True

    def _execute_job(self):
        """Execute the owned job."""
        assert self._owned_job_token
        job = pickle.loads(self._owned_job_token.data)
        name = Name.from_job_token_name(self._owned_job_token.name)
        self._executor = JobExecutor.from_job(name.workflow, name.instance,
                                              name.job, job,
                                              self._data_builder,
                                              self._emailer)
        success = self._executor.prepare()
        if success:
            self._owned_job_token.data = pickle.dumps(self._executor.job)
            success = self._update_owned_job_token()
            if success:
                self._start_renew_ownership()
                success = self._executor.execute()
                self._stop_renew_ownership()
        if success:
            self._move_job_token_to_waiting(self._executor.job, True)
        elif self._executor.job.retry():
            self._keep_job_token_in_runnable(self._executor.job)
        else:
            signaller = Signaller(self._client, name.workflow, name.instance)
            # If ARCHIVE is not set, this is the first failed job in the
            # workflow.
            first_failure = not signaller.is_action_set(Signal.ARCHIVE)
            self._move_job_token_to_waiting(self._executor.job, False)
            self._send_job_failure_emails(first_failure)
        self._executor = None
        self._owned_job_token = None
        # If needed, archive the workflow.
        self._process_signals(name.workflow, name.instance)

    def _send_instance_end_email(self, workflow, instance):
        try:
            schedule_data = self._data_builder.get_schedule(workflow)
            if not schedule_data:
                LOG.warning('no schedule found for workflow %s', workflow)
            elif schedule_data.emails:
                instance_data = self._data_builder.get_instance(
                    workflow, instance)
                jobs_data = self._data_builder.get_jobs(workflow, instance)
                self._emailer.send_instance_end_message(
                    schedule_data.emails, instance_data, jobs_data)
        except:
            LOG.exception(
                'error sending instance end email for workflow %s '
                'instance %s', workflow, instance)

    def _send_job_failure_emails(self, first_failure):
        assert self._owned_job_token
        name = Name.from_job_token_name(self._owned_job_token.name)
        job = self._executor.job
        emails = set(job.emails)
        if first_failure:
            schedule_data = self._data_builder.get_schedule(name.workflow)
            if schedule_data:
                emails.update(schedule_data.emails)
            else:
                LOG.warning('no schedule found for workflow %s', name.workflow)
        if emails:
            execution = len(job.history) - 1
            job_execution_data = self._data_builder.get_execution(
                name.workflow, name.instance, name.job, execution)
            try:
                self._emailer.send_job_execution_end_message(
                    list(emails), job_execution_data)
            except:
                LOG.exception(
                    'error sending job failure email for '
                    'workflow %s instance %s job %s execution %d',
                    name.workflow, name.instance, name.job, execution)

    @staticmethod
    def _randomized_worker_polling_time():
        """Generate random worker polling time."""
        return (1.0 + random.random()) * PinballConfig.WORKER_POLL_TIME_SEC

    def run(self):
        """Run the worker."""
        LOG.info('Running worker ' + self._name)
        while True:
            signaller = Signaller(self._client)
            if signaller.is_action_set(Signal.EXIT):
                break
            if not signaller.is_action_set(Signal.DRAIN):
                self._own_runnable_job_token()
            if self._owned_job_token:
                self._execute_job()
            elif self._test_only_end_if_no_runnable:
                break
            else:
                time.sleep(Worker._randomized_worker_polling_time())
        LOG.info('Exiting worker ' + self._name)
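
# A hedged sketch of how a Worker instance might be started in the background;
# the client, store, and emailer arguments are placeholders for whatever the
# surrounding application already constructs, not the project's documented
# factories.
import threading


def start_worker_thread(client, store, emailer):
    worker = Worker(client, store, emailer)
    # run() loops until an EXIT signal is observed, so the thread is marked as
    # a daemon to let the process exit without joining it explicitly.
    thread = threading.Thread(target=worker.run, name='pinball-worker')
    thread.daemon = True
    thread.start()
    return worker, thread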