Example #1
    def Start(self):
        """Starts the Job and updates it in the Datastore.

        This method is designed to return fast, so that Job creation is responsive
        to the user. It schedules the Job on the task queue without running
        anything. It also posts a bug comment, and updates the Datastore.
        """
        if self.use_execution_engine:
            # Treat this as if it's a poll, and run the handler here.
            try:
                task_module.Evaluate(
                    self,
                    event_module.Event(type='initiate',
                                       target_task=None,
                                       payload={}),
                    task_evaluator.ExecutionEngine(self))
            except task_module.Error as error:
                logging.error('Failed: %s', error)
                self.Fail()
                self.put()
                return
        else:
            self._Schedule()
        self.started = True
        self.started_time = datetime.datetime.now()
        self.put()

        title = _ROUND_PUSHPIN + ' Pinpoint job started.'
        comment = '\n'.join((title, self.url))
        deferred.defer(_PostBugCommentDeferred,
                       self.bug_id,
                       comment,
                       send_email=True,
                       _retry_options=RETRY_OPTIONS)
Example #2
    def post(self, job_id):
        job = job_module.JobFromId(job_id)
        if job.use_execution_engine:
            event = event_module.Event(type='initiate',
                                       target_task=None,
                                       payload={})
            logging.info('Execution Engine: Evaluating initiate event.')
            task_module.Evaluate(job, event, evaluator.ExecutionEngine(job))
            logging.info('Execution Engine: Evaluation done.')
        else:
            job.Run()
Example #3
    def Run(self):
        """Runs this Job.

        Loops through all Attempts and checks the status of each one, kicking off
        tasks as needed. Does not block to wait for all tasks to finish. Also
        compares adjacent Changes' results and adds any additional Attempts or
        Changes as needed. If there are any incomplete tasks, schedules another
        Run() call on the task queue.
        """
        self.exception_details = None  # In case the Job succeeds on retry.
        self.task = None  # In case an exception is thrown.

        try:
            if self.use_execution_engine:
                # Treat this as if it's a poll, and run the handler here.
                context = task_module.Evaluate(
                    self,
                    event_module.Event(type='initiate',
                                       target_task=None,
                                       payload={}),
                    task_evaluator.ExecutionEngine(self))
                result_status = context.get('performance_bisection',
                                            {}).get('status')
                if result_status not in {'failed', 'completed'}:
                    return

                if result_status == 'failed':
                    execution_errors = context['performance_bisection'].get(
                        'errors', [])
                    if execution_errors:
                        self.exception_details = execution_errors[0]

                self._Complete()
                return

            if not self._IsTryJob():
                self.state.Explore()
            work_left = self.state.ScheduleWork()

            # Schedule another Run() if there is work left.
            if work_left:
                self._Schedule()
            else:
                self._Complete()

            self.retry_count = 0
        except errors.RecoverableError as e:
            try:
                if not self._MaybeScheduleRetry():
                    self.Fail(errors.JobRetryLimitExceededError(wrapped_exc=e))
            except errors.RecoverableError as e:
                self.Fail(errors.JobRetryFailed(wrapped_exc=e))
        except BaseException:
            self.Fail()
            raise
        finally:
            # Don't use `auto_now` for `updated`. When we do data migration, we need
            # to be able to modify the Job without changing the Job's completion time.
            self.updated = datetime.datetime.now()

            if self.completed:
                timing_record.RecordJobTiming(self)

            try:
                self.put()
            except (datastore_errors.Timeout,
                    datastore_errors.TransactionFailedError):
                # Retry once.
                self.put()
            except datastore_errors.BadRequestError:
                if self.task:
                    queue = taskqueue.Queue('job-queue')
                    queue.delete_tasks(taskqueue.Task(name=self.task))
                self.task = None

                # The _JobState is too large to fit in an ndb property.
                # Load the Job from before we updated it, and fail it.
                job = self.key.get(use_cache=False)
                job.task = None
                job.Fail()
                job.updated = datetime.datetime.now()
                job.put()
                raise
Example #4
    def post(self):
        """Handle push messages including information about the swarming task."""
        try:
            # Read the JSON body of the message, as delivered by Pub/Sub.
            body = json.loads(self.request.body)
            message = body.get('message')
            if not message:
                raise ValueError('Cannot find `message` in the request: %s' %
                                 (body, ))

            # Load the base64-encoded data in the message, which should include the
            # following information:
            #   - job id
            #   - task id
            #   - additional task-specific details
            swarming_data = json.loads(
                base64.standard_b64decode(message.get('data', '')))
            logging.debug('Received: %s', swarming_data)

            # From the swarming data, we can determine the job id and the task
            # id (if any), and handle them appropriately. Swarming sends a
            # message of the form:
            #
            #   {
            #     "task_id": <swarming task id>
            #     "userdata": <base64 encoded data>
            #   }
            #
            # The 'userdata' field then carries the details we need to drive
            # the execution engine, if the job is meant to be run with it.
            userdata = swarming_data.get('userdata')
            if not userdata:
                raise ValueError('Ill-formed swarming update: %s' %
                                 (swarming_data, ))

            pinpoint_data = json.loads(base64.urlsafe_b64decode(userdata))
            job_id = pinpoint_data.get('job_id')
            if not job_id:
                raise ValueError('Missing job_id from pinpoint data.')

            job = job_module.JobFromId(job_id)
            if not job:
                raise ValueError('Failed to find job with ID = %s' %
                                 (job_id, ))

            # If we're not meant to use the execution engine, bail out early.
            if not job.use_execution_engine:
                self.response.status = 204
                self.response.write('')
                return

            task_data = pinpoint_data.get('task')
            if not task_data:
                raise ValueError('Missing "task" field in the payload')

            # For build events, we follow the convention used by the evaluators that
            # react to build events.
            task_type = task_data.get('type')
            task_id = task_data.get('id')
            payload = {}
            if task_type == 'build':
                payload = {'status': 'build_completed'}
            event = event_module.Event(type='update',
                                       target_task=task_id,
                                       payload=payload)

            # From here, we have enough information to evaluate the task graph.
            try:
                accumulator = task_module.Evaluate(
                    job, event, evaluator.ExecutionEngine(job))

                # Then decide to update the Job if we find a terminal state from the
                # root 'find_culprit' node.
                if 'find_culprit' not in accumulator:
                    raise ValueError(
                        'Missing "find_culprit" in task graph for job with ID = %s'
                        % (job_id, ))

                result_status = accumulator['find_culprit']['status']
                if result_status in {'failed', 'completed'}:
                    # TODO(dberris): Formalise the error collection/propagation mechanism
                    # for exposing all errors in the UX, when we need it.
                    execution_errors = accumulator['find_culprit'].get(
                        'errors', [])
                    if execution_errors:
                        job.exception_details = execution_errors[0]
                    job._Complete()
            except task_module.Error as error:
                logging.error('Failed: %s', error)
                job.Fail()
                job.put()
                raise

        except (ValueError, binascii.Error) as error:
            logging.error('Failed: %s', error)

        self.response.status = 204
        self.response.write('')
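
The webapp handler above expects a doubly-encoded payload: Pub/Sub's 'data' field is standard base64 wrapping the swarming JSON, whose 'userdata' field is in turn URL-safe base64 wrapping the Pinpoint JSON. A minimal sketch of building such a request body for testing might look like the following; all identifiers are made up for illustration.

import base64
import json

# Hypothetical IDs, for illustration only.
pinpoint_data = {
    'job_id': '17a5b2f4d2c000',
    'task': {'type': 'build', 'id': 'some-build-task-id'},
}
# Inner layer: URL-safe base64 of the Pinpoint JSON.
userdata = base64.urlsafe_b64encode(
    json.dumps(pinpoint_data).encode('utf-8')).decode('utf-8')
swarming_data = {'task_id': '4f99a0b1c2d3e411', 'userdata': userdata}
# Outer layer: standard base64 of the swarming JSON, as Pub/Sub delivers it.
request_body = json.dumps({
    'message': {
        'data': base64.standard_b64encode(
            json.dumps(swarming_data).encode('utf-8')).decode('utf-8'),
    },
})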
Example #5
def HandleTaskUpdate(request_body):
    # Read the JSON body of the message, as delivered by Pub/Sub.
    try:
        body = json.loads(request_body)
    except ValueError as error:
        raise ValueError('Failed JSON parsing request body: %s (%s)' %
                         (error, request_body[:40] + '...'))

    message = body.get('message')
    if not message:
        raise ValueError('Cannot find `message` in the request: %s' % (body, ))

    # Load the base64-encoded data in the message, which should include the
    # following information:
    #   - job id
    #   - task id
    #   - additional task-specific details
    data = message.get('data', '')
    if not data:
        raise ValueError('Missing data field in `message`: %s' % (message, ))

    try:
        decoded_data = base64.urlsafe_b64decode(data.encode('utf-8'))
    except TypeError as error:
        raise ValueError('Failed decoding `data` field in `message`: %s (%s)' %
                         (error, data))

    try:
        swarming_data = json.loads(decoded_data)
    except ValueError as error:
        raise ValueError(
            'Failed JSON parsing `data` field in `message`: %s (%s)' %
            (error, data))
    logging.debug('Received: %s', swarming_data)

    # From the swarming data, we can determine the job id and the task id (if
    # any), and handle them appropriately. Swarming sends a message of the
    # form:
    #
    #   {
    #     "task_id": <swarming task id>
    #     "userdata": <base64 encoded data>
    #   }
    #
    # The 'userdata' field then carries the details we need to drive the
    # execution engine, if the job is meant to be run with it.
    userdata = swarming_data.get('userdata')
    if not userdata:
        raise ValueError('Ill-formed swarming update: %s' % (swarming_data, ))

    pinpoint_data = json.loads(userdata)
    job_id = pinpoint_data.get('job_id')
    if not job_id:
        raise ValueError('Missing job_id from pinpoint data.')

    job = job_module.JobFromId(job_id)
    if not job:
        raise ValueError('Failed to find job with ID = %s' % (job_id, ))

    # If we're not meant to use the execution engine, bail out early.
    if not job.use_execution_engine:
        return

    task_data = pinpoint_data.get('task')
    if not task_data:
        raise ValueError('Missing "task" field in the payload')

    # For build events, we follow the convention used by the evaluators that
    # react to build events.
    task_type = task_data.get('type')
    task_id = task_data.get('id')
    payload = {}
    if task_type == 'build':
        payload = {'status': 'build_completed'}
    event = event_module.Event(type='update',
                               target_task=task_id,
                               payload=payload)

    # From here, we have enough information to evaluate the task graph.
    try:
        accumulator = task_module.Evaluate(job, event,
                                           evaluator.ExecutionEngine(job))

        # Then decide to update the Job if we find a terminal state from the
        # root 'performance_bisection' node.
        if 'performance_bisection' not in accumulator:
            raise ValueError(
                'Missing "performance_bisection" in task graph for job with ID = %s'
                % (job_id, ))

        result_status = accumulator['performance_bisection'].get('status')
        if result_status in {'failed', 'completed'}:
            # TODO(dberris): Formalise the error collection/propagation mechanism
            # for exposing all errors in the UX, when we need it.
            execution_errors = accumulator['performance_bisection'].get(
                'errors', [])
            if execution_errors:
                job.Fail(errors.ExecutionEngineErrors(execution_errors))
            elif job_module.IsDone(job.job_id):
                job._Complete()

        # At this point, update the job's updated field transactionally.
        job_module.UpdateTime(job.job_id)
    except task_module.Error as error:
        logging.error('Failed: %s', error)
        job.Fail()
        raise
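
Note that this version decodes differently from Example #4: the outer 'data' field is URL-safe base64, and 'userdata' goes straight to json.loads, so it must be plain JSON rather than base64. A hedged sketch of exercising HandleTaskUpdate end-to-end, assuming a datastore that actually contains the referenced job (all IDs below are made up):

import base64
import json

# Hypothetical job and task IDs; HandleTaskUpdate looks the job up via
# job_module.JobFromId, so this only works against a populated datastore.
pinpoint_data = json.dumps({
    'job_id': '17a5b2f4d2c000',
    'task': {'type': 'build', 'id': 'some-build-task-id'},
})
# 'userdata' stays plain JSON; only the outer envelope is base64-encoded.
swarming_data = {'task_id': '4f99a0b1c2d3e411', 'userdata': pinpoint_data}
request_body = json.dumps({
    'message': {
        'data': base64.urlsafe_b64encode(
            json.dumps(swarming_data).encode('utf-8')).decode('utf-8'),
    },
})
HandleTaskUpdate(request_body)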