Example #1
File: mixin.py Project: Affirm/mrjob
    def _ls_task_logs(self, step_type,
                      application_id=None, job_id=None, output_dir=None,
                      error_attempt_ids=None, attempt_to_container_id=None):
        """Yield task log matches."""
        if not self._read_logs():
            return

        if _is_spark_step_type(step_type):
            ls_func = _ls_spark_task_logs
        else:
            ls_func = _ls_task_logs

        # logging messages are handled by a callback in _interpret_task_logs()
        matches = ls_func(
            self.fs,
            self._stream_task_log_dirs(
                application_id=application_id, output_dir=output_dir),
            application_id=application_id,
            job_id=job_id,
            error_attempt_ids=error_attempt_ids,
            attempt_to_container_id=attempt_to_container_id,
        )

        for match in matches:
            yield match
Example #2
    def _run_step(self, step, step_num):
        """Run an individual step. You can assume that setup wrapper scripts
        are created and self._counters has a dictionary for that step already.
        """
        if _is_spark_step_type(step['type']):
            self._run_step_on_spark(step, step_num)
        else:
            self._run_streaming_step(step, step_num)
Example #3
File: runner.py Project: Affirm/mrjob
    def _has_spark_steps(self):
        """Are any of our steps Spark steps? (e.g. spark, spark_jar,
        spark_script)

        Generally used to determine if we need to install Spark on a cluster.
        """
        return any(_is_spark_step_type(step['type'])
                   for step in self._get_steps())
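The docstring above names the three Spark step types (spark, spark_jar, spark_script). For orientation, here is a minimal sketch of what the shared _is_spark_step_type helper could look like, assuming it simply checks membership in that set; the real definition ships inside mrjob, so treat this as an illustration rather than the library's actual source.

# Hypothetical sketch only: mirrors the step types named in the docstring
# above. The real helper is defined in mrjob, not here.
_SPARK_STEP_TYPES = ('spark', 'spark_jar', 'spark_script')


def _is_spark_step_type(step_type):
    """Return True if *step_type* is one of the Spark step types."""
    return step_type in _SPARK_STEP_TYPES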
Example #4
    def _has_spark_steps(self):
        """Are any of our steps Spark steps? (e.g. spark, spark_jar,
        spark_script)

        Generally used to determine if we need to install Spark on a cluster.
        """
        return any(
            _is_spark_step_type(step['type']) for step in self._get_steps())
Example #5
File: sim.py Project: Affirm/mrjob
    def _run_step(self, step, step_num):
        """Run an individual step. You can assume that setup wrapper scripts
        are created and self._counters has a dictionary for that step already.
        """
        if _is_spark_step_type(step['type']):
            self._run_step_on_spark(step, step_num)
        else:
            self._run_streaming_step(step, step_num)
Example #6
    def _warn_about_spark_archives(self, step):
        """If *step* is a Spark step, the *upload_archives* option is set,
        and *spark_master* is not ``'yarn'``, warn that *upload_archives*
        will be ignored by Spark."""
        if (_is_spark_step_type(step['type'])
                and self._opts['spark_master'] != 'yarn'
                and self._opts['upload_archives']):
            log.warning('Spark will probably ignore archives because'
                        " spark_master is not set to 'yarn'")
Example #7
File: hadoop.py Project: okomestudio/mrjob
    def _warn_about_spark_archives(self, step):
        """If *step* is a Spark step, the *upload_archives* option is set,
        and *spark_master* is not ``'yarn'``, warn that *upload_archives*
        will be ignored by Spark."""
        if (_is_spark_step_type(step['type']) and
                self._opts['spark_master'] != 'yarn' and
                self._opts['upload_archives']):
            log.warning('Spark will probably ignore archives because'
                        " spark_master is not set to 'yarn'")
Example #8
    def _spark_submit_args(self, step_num):
        """Build a list of extra args to the spark-submit binary for
        the given spark or spark_script step."""
        step = self._get_step(step_num)

        if not _is_spark_step_type(step['type']):
            raise TypeError('non-Spark step: %r' % step)

        args = []

        # add --master
        if self._spark_master():
            args.extend(['--master', self._spark_master()])

        # add --deploy-mode
        if self._spark_deploy_mode():
            args.extend(['--deploy-mode', self._spark_deploy_mode()])

        # add --class (JAR steps)
        if step.get('main_class'):
            args.extend(['--class', step['main_class']])

        # add --jars, if any
        libjar_paths = self._libjar_paths()
        if libjar_paths:
            args.extend(['--jars', ','.join(libjar_paths)])

        # --conf arguments include python bin, cmdenv, jobconf. Make sure
        # that we can always override these manually
        jobconf = {}
        for key, value in self._spark_cmdenv(step_num).items():
            jobconf['spark.executorEnv.%s' % key] = value
            jobconf['spark.yarn.appMasterEnv.%s' % key] = value

        jobconf.update(self._jobconf_for_step(step_num))

        for key, value in sorted(jobconf.items()):
            args.extend(['--conf', '%s=%s' % (key, value)])

        # --files and --archives
        args.extend(self._spark_upload_args())

        # --py-files (Python only)
        if step['type'] in ('spark', 'spark_script'):
            py_file_uris = self._upload_uris(self._py_files())
            if py_file_uris:
                args.extend(['--py-files', ','.join(py_file_uris)])

        # spark_args option
        args.extend(self._opts['spark_args'])

        # step spark_args
        if step.get('spark_args'):
            args.extend(step['spark_args'])

        return args
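To make the --conf handling above concrete, here is a small standalone reproduction of just the cmdenv-to-jobconf loop; the cmdenv dictionary and the PYSPARK_PYTHON value are made-up inputs for illustration, not values produced by mrjob.

# Toy reproduction of the cmdenv -> --conf mapping from _spark_submit_args.
# The cmdenv dict below is an assumed example input.
cmdenv = {'PYSPARK_PYTHON': '/usr/bin/python3'}

jobconf = {}
for key, value in cmdenv.items():
    # expose each environment variable to both the executors and the
    # YARN application master
    jobconf['spark.executorEnv.%s' % key] = value
    jobconf['spark.yarn.appMasterEnv.%s' % key] = value

args = []
for key, value in sorted(jobconf.items()):
    args.extend(['--conf', '%s=%s' % (key, value)])

# args is now:
# ['--conf', 'spark.executorEnv.PYSPARK_PYTHON=/usr/bin/python3',
#  '--conf', 'spark.yarn.appMasterEnv.PYSPARK_PYTHON=/usr/bin/python3']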
Example #9
    def _env_for_step(self, step_num):
        step = self._get_step(step_num)

        env = dict(os.environ)

        # when running spark-submit, set its environment directly. See #1464
        if _is_spark_step_type(step['type']):
            env.update(self._spark_cmdenv(step_num))

        return env
Example #10
File: mixin.py Project: zhiaozhou/mrjob
    def _log_counters(self, log_interpretation, step_num):
        """Utility for logging counters (if any) for a step."""
        step_type = self._get_step(step_num)['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')
Example #11
File: hadoop.py Project: okomestudio/mrjob
    def _env_for_step(self, step_num):
        step = self._get_step(step_num)

        env = dict(os.environ)

        # when running spark-submit, set its environment directly. See #1464
        if _is_spark_step_type(step['type']):
            env.update(self._spark_cmdenv(step_num))

        return env
Example #12
    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        elif _is_spark_step_type(step['type']):
            return self._args_for_spark_step(step_num)
        else:
            raise AssertionError('Bad step type: %r' % (step['type'], ))
Example #13
File: hadoop.py Project: okomestudio/mrjob
    def _args_for_step(self, step_num):
        step = self._get_step(step_num)

        if step['type'] == 'streaming':
            return self._args_for_streaming_step(step_num)
        elif step['type'] == 'jar':
            return self._args_for_jar_step(step_num)
        elif _is_spark_step_type(step['type']):
            return self._args_for_spark_step(step_num)
        else:
            raise AssertionError('Bad step type: %r' % (step['type'],))
Example #14
File: mixin.py Project: espenwiik91/BDEM
    def _log_counters(self, log_interpretation, step_num):
        """Utility for logging counters (if any) for a step."""
        step_type = self._get_step(step_num)['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(
                log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            elif self._read_logs():
                # should only log this if we actually looked for counters
                log.warning('No counters found')
Example #15
File: bin.py Project: okomestudio/mrjob
    def _spark_submit_args(self, step_num):
        """Build a list of extra args to the spark-submit binary for
        the given spark or spark_script step."""
        step = self._get_step(step_num)

        if not _is_spark_step_type(step['type']):
            raise TypeError('non-Spark step: %r' % step)

        args = []

        # add runner-specific args
        args.extend(self._spark_submit_arg_prefix())

        # add --class (JAR steps)
        if step.get('main_class'):
            args.extend(['--class', step['main_class']])

        # add --jars, if any
        libjar_paths = self._libjar_paths()
        if libjar_paths:
            args.extend(['--jars', ','.join(libjar_paths)])

        # --conf arguments include python bin, cmdenv, jobconf. Make sure
        # that we can always override these manually
        jobconf = {}
        for key, value in self._spark_cmdenv(step_num).items():
            jobconf['spark.executorEnv.%s' % key] = value
            jobconf['spark.yarn.appMasterEnv.%s' % key] = value

        jobconf.update(self._jobconf_for_step(step_num))

        for key, value in sorted(jobconf.items()):
            if value is not None:
                args.extend(['--conf', '%s=%s' % (key, value)])

        # --files and --archives
        args.extend(self._spark_upload_args())

        # --py-files (Python only)
        if step['type'] in ('spark', 'spark_script'):
            py_files_arg = ','.join(self._spark_py_files())
            if py_files_arg:
                args.extend(['--py-files', py_files_arg])

        # spark_args option
        args.extend(self._opts['spark_args'])

        # step spark_args
        args.extend(step['spark_args'])

        return args
Example #16
File: mixin.py Project: espenwiik91/BDEM
    def _interpret_task_logs(
            self, log_interpretation, step_type, error_attempt_ids=(),
            partial=True):
        """Fetch task syslogs and stderr, and add 'task' to interpretation."""
        if 'task' in log_interpretation and (
                partial or not log_interpretation['task'].get('partial')):
            return   # already interpreted

        if not self._read_logs():
            return

        step_interpretation = log_interpretation.get('step') or {}

        application_id = step_interpretation.get('application_id')
        job_id = step_interpretation.get('job_id')
        output_dir = step_interpretation.get('output_dir')

        yarn = uses_yarn(self.get_hadoop_version())

        attempt_to_container_id = log_interpretation.get('history', {}).get(
            'attempt_to_container_id', {})

        if yarn:
            if not application_id:
                if not log_interpretation.get('no_job'):
                    log.warning(
                        "Can't fetch task logs; missing application ID")
                return
        else:
            if not job_id:
                if not log_interpretation.get('no_job'):
                    log.warning("Can't fetch task logs; missing job ID")
                return

        if _is_spark_step_type(step_type):
            interpret_func = _interpret_spark_task_logs
        else:
            interpret_func = _interpret_task_logs

        log_interpretation['task'] = interpret_func(
            self.fs,
            self._ls_task_logs(
                step_type,
                application_id=application_id,
                job_id=job_id,
                output_dir=output_dir,
                error_attempt_ids=error_attempt_ids,
                attempt_to_container_id=attempt_to_container_id,
            ),
            partial=partial,
            log_callback=_log_parsing_task_log)
Example #17
File: mixin.py Project: davidmarin/mrjob
    def _ls_task_logs(self, step_type,
                      application_id=None, job_id=None, output_dir=None):
        """Yield task log matches."""
        if _is_spark_step_type(step_type):
            ls_func = _ls_spark_task_logs
        else:
            ls_func = _ls_task_logs

        # logging messages are handled by a callback in _interpret_task_logs()
        for match in ls_func(
                self.fs,
                self._stream_task_log_dirs(
                    application_id=application_id, output_dir=output_dir),
                application_id=application_id,
                job_id=job_id):
            yield match
Example #18
File: mixin.py Project: zhiaozhou/mrjob
    def _pick_counters(self, log_interpretation, step_type):
        """Pick counters from our log interpretation, interpreting
        history logs if need be."""
        if _is_spark_step_type(step_type):
            return {}

        counters = _pick_counters(log_interpretation)

        if not counters:
            log.info('Attempting to fetch counters from logs...')
            self._interpret_step_logs(log_interpretation, step_type)
            counters = _pick_counters(log_interpretation)

        if not counters:
            self._interpret_history_log(log_interpretation)
            counters = _pick_counters(log_interpretation)

        return counters
Example #19
File: mixin.py Project: davidmarin/mrjob
    def _pick_counters(self, log_interpretation, step_type):
        """Pick counters from our log interpretation, interpreting
        history logs if need be."""
        if _is_spark_step_type(step_type):
            return {}

        counters = _pick_counters(log_interpretation)

        if not counters:
            log.info('Attempting to fetch counters from logs...')
            self._interpret_step_logs(log_interpretation, step_type)
            counters = _pick_counters(log_interpretation)

        if not counters:
            self._interpret_history_log(log_interpretation)
            counters = _pick_counters(log_interpretation)

        return counters
Example #20
    def _ls_task_logs(self,
                      step_type,
                      application_id=None,
                      job_id=None,
                      output_dir=None):
        """Yield task log matches."""
        if _is_spark_step_type(step_type):
            ls_func = _ls_spark_task_logs
        else:
            ls_func = _ls_task_logs

        # logging messages are handled by a callback in _interpret_task_logs()
        for match in ls_func(self.fs,
                             self._stream_task_log_dirs(
                                 application_id=application_id,
                                 output_dir=output_dir),
                             application_id=application_id,
                             job_id=job_id):
            yield match
Example #21
File: hadoop.py Project: okomestudio/mrjob
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr,
                    record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_unicode(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            step_type = step['type']

            if not _is_spark_step_type(step_type):
                counters = self._pick_counters(log_interpretation, step_type)
                if counters:
                    log.info(_format_counters(counters))
                else:
                    log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())
Example #22
    def _has_spark_steps(self):
        """Are any of our steps Spark steps (either spark or spark_script)"""
        return any(
            _is_spark_step_type(step['type']) for step in self._get_steps())
Example #23
File: runner.py Project: okomestudio/mrjob
    def _has_spark_steps(self):
        """Are any of our steps Spark steps (either spark or spark_script)"""
        return any(_is_spark_step_type(step['type'])
                   for step in self._get_steps())
Example #24
    def _run_step(self, step, step_num):
        if _is_spark_step_type(step['type']):
            self._run_step_on_spark(step, step_num)
        else:
            super(LocalMRJobRunner, self)._run_step(step, step_num)
Example #25
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = self._env_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr, record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            step_type = step['type']

            if not _is_spark_step_type(step_type):
                counters = self._pick_counters(log_interpretation, step_type)
                if counters:
                    log.info(_format_counters(counters))
                else:
                    log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(reason=reason,
                                          step_num=step_num,
                                          num_steps=self._num_steps())