예제 #1
0
  def _GetResult(self, batch):
    """Reports the final outcome of a batch job once polling is done.

    Overrides the base implementation. This runs after the batch has reached
    one of the complete states, or with a falsy batch when no resource was
    obtained.

    Args:
      batch: The batch resource. A falsy value is treated as a waiter timeout.

    Returns:
      None. Outcomes are reported through log warnings or raised exceptions.

    Raises:
      JobTimeoutError: When no batch resource is available (waiter timed out).
      JobError: When the batch state is neither SUCCEEDED nor CANCELLED.
    """
    if not batch:
      # Polling is considered done but there is no batch resource; this only
      # happens when the waiter timed out.
      raise exceptions.JobTimeoutError(
          'Timed out while waiting for batch job.')

    batch_states = self.dataproc.messages.Batch.StateValueValuesEnum

    if batch.state == batch_states.SUCCEEDED:
      streamer = self.driver_log_streamer
      if not streamer:
        log.warning('Expected batch job output not found.')
      elif streamer.open:
        # The remote output did not end correctly.
        log.warning('Batch job terminated, but output did not finish '
                    'streaming.')
      # Nothing to return, since the result is directly output to users.
      return None

    if batch.state == batch_states.CANCELLED:
      log.warning('Batch job is CANCELLED.')
      return None

    # Every remaining state is reported as a failure.
    failure_text = 'Batch job is FAILED.'
    detail = batch.stateMessage
    if detail:
      failure_text = '{} Detail: {}'.format(failure_text, detail)
      # Make sure the detail sentence is terminated before appending the hint.
      if not failure_text.endswith('.'):
        failure_text += '.'

    diagnostics_hint = (
        'Running auto diagnostics on the batch. It may take few minutes '
        'before diagnostics output is available. Please check diagnostics '
        "output by running 'gcloud dataproc batches describe' command.")
    raise exceptions.JobError('\n'.join([failure_text, diagnostics_hint]))
예제 #2
0
def WaitForJobTermination(job,
                          context,
                          message,
                          goal_state,
                          stream_driver_log=False,
                          log_poll_period_s=1,
                          dataproc_poll_period_s=10,
                          timeout_s=None):
    """Poll dataproc Job until its status is terminal or timeout reached.

    Args:
      job: The job to wait to finish.
      context: dict, dataproc Command context. The 'dataproc_client' entry is
        used to issue the jobs.get requests.
      message: str, message to display to user while polling. Only used when
        stream_driver_log is False.
      goal_state: JobStatus.StateValueValuesEnum, the state to define success
      stream_driver_log: bool, Whether to show the Job's driver's output.
      log_poll_period_s: number, delay in seconds between checking on the log.
      dataproc_poll_period_s: number, delay in seconds between requests to
        the Dataproc API.
      timeout_s: number, time out for job completion. A falsy value (None or
        0) means no timeout.

    Returns:
      Job: the result of the last successful jobs.get request (or the job
      passed in when no request succeeded or none was needed), once it is in
      goal_state.

    Raises:
      JobError: if the job reaches a terminal state other than goal_state.
      JobTimeoutError: if polling stops while the job is still in a
        non-terminal state.
    """
    client = context['dataproc_client']
    job_ref = ParseJob(job.reference.jobId, context)
    request = client.MESSAGES_MODULE.DataprocProjectsRegionsJobsGetRequest(
        projectId=job_ref.projectId,
        region=job_ref.region,
        jobId=job_ref.jobId)
    driver_log_stream = None
    last_job_poll_time = 0
    job_complete = False
    driver_output_uri = None

    def ReadDriverLogIfPresent():
        if driver_log_stream and driver_log_stream.open:
            # TODO(b/36049794): Don't read all output.
            driver_log_stream.ReadIntoWritable(log.err)

    def PrintEqualsLine():
        # Draw a separator spanning the full terminal width.
        attr = console_attr.GetConsoleAttr()
        log.err.Print('=' * attr.GetTermSize()[0])

    if stream_driver_log:
        log.status.Print('Waiting for job output...')
        wait_display = NoOpProgressDisplay()
    else:
        wait_display = progress_tracker.ProgressTracker(message, autotick=True)
    start_time = now = time.time()
    with wait_display:
        while not timeout_s or timeout_s > (now - start_time):
            # Poll logs first to see if it closed.
            ReadDriverLogIfPresent()
            log_stream_closed = driver_log_stream and not driver_log_stream.open
            if not job_complete and job.status.state in constants.TERMINAL_JOB_STATES:
                job_complete = True
                # Wait an extra 10s to get trailing output; this replaces any
                # caller-supplied timeout from here on.
                timeout_s = now - start_time + 10

            if job_complete and (not stream_driver_log or log_stream_closed):
                # Nothing left to wait for
                break

            regular_job_poll = (
                not job_complete
                # Poll less frequently on dataproc API
                and now >= last_job_poll_time + dataproc_poll_period_s)
            # Poll at regular frequency before output has streamed and after it has
            # finished.
            expecting_output_stream = stream_driver_log and not driver_log_stream
            expecting_job_done = not job_complete and log_stream_closed
            if regular_job_poll or expecting_output_stream or expecting_job_done:
                last_job_poll_time = now
                try:
                    job = client.projects_regions_jobs.Get(request)
                except apitools_exceptions.HttpError as error:
                    # Format the message explicitly: the previous '{1}'
                    # placeholder passed as a logging argument never rendered
                    # the error text.
                    log.warn('GetJob failed:\n{0}'.format(error))
                    # Keep trying until we timeout in case error is transient.
                    # `job` still holds the last successfully fetched value.
                if (stream_driver_log and job.driverOutputResourceUri
                        and job.driverOutputResourceUri != driver_output_uri):
                    if driver_output_uri:
                        # The output location changed after streaming had
                        # started, so a new attempt is being run.
                        PrintEqualsLine()
                        log.warn(
                            "Job attempt failed. Streaming new attempt's output."
                        )
                        PrintEqualsLine()
                    driver_output_uri = job.driverOutputResourceUri
                    driver_log_stream = storage_helpers.StorageObjectSeriesStream(
                        job.driverOutputResourceUri)
            time.sleep(log_poll_period_s)
            now = time.time()

    # TODO(b/34836493): Get better test coverage of the next 20 lines.
    state = job.status.state
    if state is not goal_state and job.status.details:
        # Just log details, because the state will be in the error message.
        log.info(job.status.details)

    if state in constants.TERMINAL_JOB_STATES:
        if stream_driver_log:
            if not driver_log_stream:
                log.warn('Expected job output not found.')
            elif driver_log_stream.open:
                log.warn(
                    'Job terminated, but output did not finish streaming.')
        if state is goal_state:
            return job
        raise exceptions.JobError(
            'Job [{0}] entered state [{1}] while waiting for [{2}].'.format(
                job_ref.jobId, state, goal_state))
    # The loop ended without the job reaching a terminal state.
    raise exceptions.JobTimeoutError(
        'Job [{0}] timed out while in state [{1}].'.format(
            job_ref.jobId, state))
예제 #3
0
def WaitForJobTermination(dataproc,
                          job,
                          job_ref,
                          message,
                          goal_state,
                          error_state=None,
                          stream_driver_log=False,
                          log_poll_period_s=1,
                          dataproc_poll_period_s=10,
                          timeout_s=None):
    """Wait for a Dataproc job to reach a terminal state, or time out.

    Args:
      dataproc: wrapper for dataproc resources, client and messages.
      job: the job to wait on.
      job_ref: parsed dataproc.projects.regions.jobs resource containing a
        projectId, region, and jobId.
      message: str, shown to the user while polling (used only when the
        driver log is not being streamed).
      goal_state: JobStatus.StateValueValuesEnum, the state that counts as
        success.
      error_state: JobStatus.StateValueValuesEnum, the state that counts as
        failure.
      stream_driver_log: bool, whether to show the job driver's output.
      log_poll_period_s: number, seconds to sleep between log checks.
      dataproc_poll_period_s: number, seconds between regular requests to the
        Dataproc API.
      timeout_s: number, time out for job completion. A falsy value means no
        timeout.

    Returns:
      Job: the return value of the last successful jobs.get request.

    Raises:
      JobError: if the job ends in a terminal state other than goal_state.
      JobTimeoutError: if waiting stops while the job is not in a terminal
        state.
      apitools_exceptions.HttpError: re-raised from jobs.get when
        IsClientHttpException reports a client error.
    """
    get_request = dataproc.messages.DataprocProjectsRegionsJobsGetRequest(
        projectId=job_ref.projectId,
        region=job_ref.region,
        jobId=job_ref.jobId)
    log_stream = None
    last_poll_at = 0
    finished = False
    streamed_uri = None

    def _DrainDriverLog():
        if log_stream and log_stream.open:
            # TODO(b/36049794): Don't read all output.
            log_stream.ReadIntoWritable(log.err)

    def _PrintSeparator():
        term_width = console_attr.GetConsoleAttr().GetTermSize()[0]
        log.err.Print('=' * term_width)

    if stream_driver_log:
        log.status.Print('Waiting for job output...')
        display = NoOpProgressDisplay()
    else:
        display = progress_tracker.ProgressTracker(message, autotick=True)

    started_at = time.time()
    now = started_at
    with display:
        while True:
            # Give up once a configured timeout has elapsed.
            if timeout_s and not timeout_s > (now - started_at):
                break

            # Read the log first so we notice when the stream has closed.
            _DrainDriverLog()
            stream_closed = bool(log_stream) and not log_stream.open

            if (not finished
                    and job.status.state in dataproc.terminal_job_states):
                finished = True
                # Allow 10 more seconds for trailing output.
                timeout_s = now - started_at + 10

            if finished and (not stream_driver_log or stream_closed):
                # Nothing left to wait for.
                break

            poll_reasons = (
                # Regular, less frequent poll of the Dataproc API.
                not finished
                and now >= last_poll_at + dataproc_poll_period_s,
                # Output is wanted but no stream has been opened yet.
                stream_driver_log and not log_stream,
                # Output has ended but the job is not yet known to be done.
                not finished and stream_closed,
            )
            if any(poll_reasons):
                last_poll_at = now
                try:
                    job = dataproc.client.projects_regions_jobs.Get(
                        get_request)
                except apitools_exceptions.HttpError as error:
                    log.warning('GetJob failed:\n{}'.format(
                        six.text_type(error)))
                    # Do not retry on 4xx errors.
                    if IsClientHttpException(error):
                        raise

                if stream_driver_log:
                    latest_uri = job.driverOutputResourceUri
                    if latest_uri and latest_uri != streamed_uri:
                        if streamed_uri:
                            # A different output location means a new attempt.
                            _PrintSeparator()
                            log.warning(
                                "Job attempt failed. Streaming new attempt's output."
                            )
                            _PrintSeparator()
                        streamed_uri = job.driverOutputResourceUri
                        log_stream = storage_helpers.StorageObjectSeriesStream(
                            job.driverOutputResourceUri)

            time.sleep(log_poll_period_s)
            now = time.time()

    # TODO(b/34836493): Get better test coverage of the next 20 lines.
    state = job.status.state

    # goal_state and error_state will always be terminal, so anything
    # non-terminal here means we stopped waiting early.
    if state not in dataproc.terminal_job_states:
        raise exceptions.JobTimeoutError(
            'Job [{0}] timed out while in state [{1}].'.format(
                job_ref.jobId, state))

    if stream_driver_log:
        if not log_stream:
            log.warning('Expected job output not found.')
        elif log_stream.open:
            log.warning(
                'Job terminated, but output did not finish streaming.')

    if state is goal_state:
        return job

    if error_state and state is error_state:
        if job.status.details:
            failure = 'Job [{0}] failed with error:\n{1}'.format(
                job_ref.jobId, job.status.details)
        else:
            failure = 'Job [{0}] failed.'.format(job_ref.jobId)
        raise exceptions.JobError(failure)

    if job.status.details:
        log.info('Details:\n' + job.status.details)
    raise exceptions.JobError(
        'Job [{0}] entered state [{1}] while waiting for [{2}].'.format(
            job_ref.jobId, state, goal_state))