Example #1
File: local.py Project: wanglt311/mrjob
    def _wait_for_process(self, proc_dict, step_num):
        # handle counters, status msgs, and other stuff on stderr
        proc = proc_dict['proc']

        stderr_lines = self._process_stderr_from_script(proc.stderr,
                                                        step_num=step_num)
        tb_lines = _find_python_traceback(stderr_lines)

        # proc.stdout isn't always defined
        if proc.stdout:
            proc.stdout.close()
        proc.stderr.close()

        returncode = proc.wait()

        if returncode != 0:
            # show counters before raising exception
            counters = self._counters[step_num]
            if counters:
                log.info(_format_counters(counters))

            # try to throw a useful exception
            if tb_lines:
                for line in tb_lines:
                    log.error(line.rstrip('\r\n'))

            reason = str(CalledProcessError(returncode, proc_dict['args']))
            raise StepFailedException(reason=reason,
                                      step_num=step_num,
                                      num_steps=len(self._get_steps()))
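The runners in these examples signal failure by raising StepFailedException rather than exiting. As a rough, hypothetical sketch (not taken from the mrjob source), a caller driving a runner could catch it like this; run_my_job is an invented wrapper, and the import path is assumed from mrjob's test suite:

    from mrjob.step import StepFailedException  # assumed import path

    def run_my_job(runner):
        # run_my_job is a hypothetical wrapper; runner is any mrjob runner
        try:
            runner.run()
        except StepFailedException as ex:
            # str(ex) yields messages such as 'Step 1 of 4 failed: ...'
            print('Job did not finish: %s' % ex)
            raise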
Example #2
    def _wait_for_step_to_complete(self,
                                   job_id,
                                   step_num=None,
                                   num_steps=None):
        """Helper for _wait_for_step_to_complete(). Wait for
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job_result = self._api_job_get(job_id)
            job_state = job_result['status']['state']

            log.info('%s => %s' % (job_id, job_state))

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            if job_state in _DATAPROC_JOB_STATES_ACTIVE:
                self._wait_for_api('job completion')
                continue

            # we're done, will return at the end of this
            elif job_state == 'DONE':
                break

            raise StepFailedException(step_num=step_num, num_steps=num_steps)
Example #3
File: dataproc.py Project: niejn/mrjob
    def _wait_for_step_to_complete(self,
                                   job_id,
                                   step_num=None,
                                   num_steps=None):
        """Helper for _wait_for_step_to_complete(). Wait for
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job = self._get_job(job_id)

            job_state = job.status.State.Name(job.status.state)

            log.info('%s => %s' % (job_id, job_state))

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            # these are the states covered by the ACTIVE job state matcher,
            # plus SETUP_DONE
            if job_state in ('PENDING', 'RUNNING', 'CANCEL_PENDING',
                             'SETUP_DONE'):
                self._wait_for_api('job completion')
                continue

            # we're done, will return at the end of this
            elif job_state == 'DONE':
                break

            raise StepFailedException(step_num=step_num, num_steps=num_steps)
Example #4
    def _run_step_on_spark(self, step, step_num):
        if self._opts['upload_archives']:
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        step_args = self._args_for_spark_step(step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        log.debug('> %s' % cmd_line(step_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        def _log_line(line):
            log.info('  %s' % to_unicode(line).strip('\r\n'))

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # spark-submit is running
            log.debug('No PTY available, using Popen() to invoke spark-submit')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            # parse log4j records from spark-submit's stderr
            for record in _parse_hadoop_log4j_records(
                    _yield_lines_from_pty_or_pipe(step_proc.stderr)):
                _log_log4j_record(record)

            # there shouldn't be much output on STDOUT
            for line in step_proc.stdout:
                _log_line_from_driver(line)

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking spark-submit via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    for record in _parse_hadoop_log4j_records(
                            _yield_lines_from_pty_or_pipe(master)):
                        _log_log4j_record(record)
                    _, returncode = os.waitpid(pid, 0)

        if returncode:
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(reason=reason,
                                      step_num=step_num,
                                      num_steps=self._num_steps())
Example #5
    def _wait_for_step_to_complete(self, job_id, step_num, num_steps):
        """Helper for _wait_for_step_to_complete(). Wait for
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        log_interpretation['step'] = {}
        step_type = self._get_step(step_num)['type']

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job = self._get_job(job_id)

            job_state = job.status.State.Name(job.status.state)

            log.info('%s => %s' % (job_id, job_state))

            log_interpretation['step']['driver_output_uri'] = (
                job.driver_output_resource_uri)

            self._interpret_step_logs(log_interpretation, step_type)

            progress = log_interpretation['step'].get('progress')
            if progress:
                log.info(' ' + progress['message'])

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            # these are the states covered by the ACTIVE job state matcher,
            # plus SETUP_DONE
            if job_state in ('PENDING', 'RUNNING', 'CANCEL_PENDING',
                             'SETUP_DONE'):
                self._wait_for_api('job completion')
                continue

            # print counters if job wasn't CANCELLED
            if job_state != 'CANCELLED':
                self._log_counters(log_interpretation, step_num)

            if job_state == 'ERROR':
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n\n' %
                              _format_error(error))

            # we're done, will return at the end of this
            if job_state == 'DONE':
                break
            else:
                raise StepFailedException(step_num=step_num,
                                          num_steps=num_steps)
Example #6
    def _wait_for_step_to_complete(self,
                                   job_id,
                                   step_num=None,
                                   num_steps=None):
        """Helper for _wait_for_step_to_complete(). Wait for
        step with the given ID to complete, and fetch counters.
        If it fails, attempt to diagnose the error, and raise an
        exception.

        This also adds an item to self._log_interpretations
        """
        log_interpretation = dict(job_id=job_id)
        self._log_interpretations.append(log_interpretation)

        step_interpretation = {}
        log_interpretation['step'] = step_interpretation

        while True:
            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
            job = self._get_job(job_id)

            job_state = job.status.State.Name(job.status.state)

            driver_output_uri = job.driver_output_resource_uri

            log.info('%s => %s' % (job_id, job_state))

            # interpret driver output so far
            if driver_output_uri:
                self._update_step_interpretation(step_interpretation,
                                                 driver_output_uri)

            if step_interpretation.get('progress'):
                log.info(' ' + step_interpretation['progress']['message'])

            # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
            # these are the states covered by the ACTIVE job state matcher,
            # plus SETUP_DONE
            if job_state in ('PENDING', 'RUNNING', 'CANCEL_PENDING',
                             'SETUP_DONE'):
                self._wait_for_api('job completion')
                continue

            # print counters if job wasn't CANCELLED
            if job_state != 'CANCELLED':
                self._log_counters(log_interpretation, step_num)

            # we're done, will return at the end of this
            if job_state == 'DONE':
                break
            else:
                raise StepFailedException(step_num=step_num,
                                          num_steps=num_steps)
Example #7
    def _run_step_on_spark(self, step, step_num, last_step_num=None):
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode, step_interpretation = self._run_spark_submit(
            spark_submit_args, env, record_callback=_log_log4j_record)

        counters = None
        if step['type'] == 'streaming':
            counter_file = self.fs.join(self._counter_output_dir(step_num),
                                        'part-*')
            counter_json = b''.join(self.fs.cat(counter_file))
            if counter_json.strip():
                # json.loads() on Python 3.4/3.5 can't take bytes
                counters = json.loads(to_unicode(counter_json))

        if isinstance(counters, list):
            self._counters.extend(counters)

            # desc_num is 1-indexed user-readable step num
            for desc_num, counter_dict in enumerate(counters,
                                                    start=(step_num + 1)):
                if counter_dict:
                    log.info(
                        _format_counters(counter_dict,
                                         desc=('Counters for step %d' %
                                               desc_num)))

        # for non-streaming steps, there are no counters.
        # pad self._counters to match number of steps
        while len(self._counters) < (last_step_num or step_num) + 1:
            self._counters.append({})

        if returncode:
            error = _pick_error(dict(step=step_interpretation))
            if error:
                _log_probable_cause_of_failure(log, error)

            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(reason=reason,
                                      step_num=step_num,
                                      last_step_num=last_step_num,
                                      num_steps=self._num_steps())
Example #8
    def _run_step_on_spark(self, step, step_num):
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode = self._run_spark_submit(spark_submit_args, env,
                                            record_callback=_log_log4j_record)

        if returncode:
            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
Example #9
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_type = step['type']
            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            if self._step_type_uses_spark(step_type):
                returncode, step_interpretation = self._run_spark_submit(
                    step_args, env, record_callback=_log_log4j_record)
            else:
                returncode, step_interpretation = self._run_hadoop(
                    step_args, env, record_callback=_log_record_from_hadoop)

            # make sure output_dir is filled (used for history log)
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            self._log_counters(log_interpretation, step_num)

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    _log_probable_cause_of_failure(log, error)

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())
Example #10
File: local.py Project: zhoushibo-111/-
    def _run_step_on_spark(self, step, step_num):
        if self._opts['upload_archives']:
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode, step_interpretation = self._run_spark_submit(
            spark_submit_args, env, record_callback=_log_log4j_record)

        if returncode:
            error = _pick_error(dict(step=step_interpretation))
            if error:
                _log_probable_cause_of_failure(log, error)

            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(reason=reason,
                                      step_num=step_num,
                                      num_steps=self._num_steps())
Example #11
 def test_step_desc(self):
     ex = StepFailedException(step_desc='Master node setup step')
     self.assertEqual(str(ex), 'Master node setup step failed')
     self.assertEqual(
         repr(ex),
         "StepFailedException(step_desc='Master node setup step')")
Example #12
 def test_num_steps_with_no_step_num(self):
     ex = StepFailedException(num_steps=4)
     self.assertEqual(str(ex), 'Step failed')
     self.assertEqual(repr(ex), 'StepFailedException(num_steps=4)')
Example #13
 def test_step_num_with_num_steps(self):
     ex = StepFailedException(step_num=0, num_steps=4)
     self.assertEqual(str(ex), 'Step 1 of 4 failed')
     self.assertEqual(repr(ex),
                      'StepFailedException(step_num=0, num_steps=4)')
Example #14
 def test_reason(self):
     ex = StepFailedException('Hadoop is feeling sad today')
     self.assertEqual(str(ex), 'Step failed: Hadoop is feeling sad today')
     self.assertEqual(
         repr(ex),
         "StepFailedException(reason='Hadoop is feeling sad today')")
Example #15
 def test_empty(self):
     ex = StepFailedException()
     self.assertEqual(str(ex), 'Step failed')
     self.assertEqual(repr(ex), 'StepFailedException()')
Example #16
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = self._env_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr, record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            step_type = step['type']

            if not _is_spark_step_type(step_type):
                counters = self._pick_counters(log_interpretation, step_type)
                if counters:
                    log.info(_format_counters(counters))
                else:
                    log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(reason=reason,
                                          step_num=step_num,
                                          num_steps=self._num_steps())
Example #17
 def test_step_num(self):
     ex = StepFailedException(step_num=0)
     self.assertEqual(str(ex), 'Step 1 failed')
     self.assertEqual(repr(ex), 'StepFailedException(step_num=0)')
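Taken together, the test examples (#11 through #17) pin down the exception's message format: step_num is stored 0-indexed but displayed 1-indexed, num_steps is only shown alongside a step number, and reason is appended after a colon. The class below is a simplified sketch consistent with those assertions, not the actual implementation in mrjob.step:

    # Simplified reconstruction of the formatting behavior asserted above;
    # the real class lives in mrjob.step and may differ in details.
    class StepFailedException(Exception):

        def __init__(self, reason=None, step_num=None, num_steps=None,
                     step_desc=None):
            self.reason = reason
            self.step_num = step_num
            self.num_steps = num_steps
            self.step_desc = step_desc

        def __str__(self):
            if self.step_desc:
                desc = self.step_desc  # e.g. 'Master node setup step'
            elif self.step_num is not None:
                desc = 'Step %d' % (self.step_num + 1)  # display 1-indexed
                if self.num_steps is not None:
                    desc += ' of %d' % self.num_steps
            else:
                desc = 'Step'
            msg = desc + ' failed'
            if self.reason:
                msg += ': %s' % self.reason
            return msg

        def __repr__(self):
            kwargs = ('reason', 'step_num', 'num_steps', 'step_desc')
            args = ', '.join(
                '%s=%r' % (k, getattr(self, k))
                for k in kwargs if getattr(self, k) is not None)
            return '%s(%s)' % (self.__class__.__name__, args)

Running the assertions from test_step_num_with_num_steps against this sketch reproduces 'Step 1 of 4 failed' and 'StepFailedException(step_num=0, num_steps=4)'.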