示例#1
0
文件: sim.py 项目: Affirm/mrjob
    def _env_for_task(self, task_type, step_num, task_num, map_split=None):
        """Set up environment variables for a subprocess (mapper, etc.)

        This combines, in decreasing order of priority:

        * environment variables set by the **cmdenv** option
        * **jobconf** environment variables set by our job (e.g.
          ``mapreduce.task.ismap`)
        * environment variables from **jobconf** options, translated to
          whatever version of Hadoop we're emulating
        * the current environment
        * PYTHONPATH set to current working directory

        We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
        environment variables are handled specially.
        """
        user_jobconf = self._jobconf_for_step(step_num)

        simulated_jobconf = self._simulate_jobconf_for_step(
            task_type, step_num, task_num, map_split)

        def to_env(jobconf):
            return dict((k.replace('.', '_'), str(v))
                        for k, v in jobconf.items())

        # keep the current environment because we need PATH to find binaries
        # and make PYTHONPATH work
        return combine_local_envs(os.environ,
                                  to_env(user_jobconf),
                                  to_env(simulated_jobconf),
                                  self._opts['cmdenv'])
示例#2
0
文件: bin.py 项目: Affirm/mrjob
    def _load_steps(self):
        args = (self._executable(True) + ['--steps'] +
                self._mr_job_extra_args(local=True))
        log.debug('> %s' % cmd_line(args))

        # add . to PYTHONPATH (in case mrjob isn't actually installed)
        env = combine_local_envs(os.environ,
                                 {'PYTHONPATH': os.path.abspath('.')})
        steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
        stdout, stderr = steps_proc.communicate()

        if steps_proc.returncode != 0:
            raise Exception(
                'error getting step information: \n%s' % stderr)

        # on Python 3, convert stdout to str so we can json.loads() it
        if not isinstance(stdout, str):
            stdout = stdout.decode('utf_8')

        try:
            steps = json.loads(stdout)
        except ValueError:
            raise ValueError("Bad --steps response: \n%s" % stdout)

        # verify that this is a proper step description
        if not steps or not stdout:
            raise ValueError('step description is empty!')

        return steps
示例#3
0
文件: bin.py 项目: suzil/mrjob
    def _load_steps(self):
        args = (self._executable(True) + ['--steps'] +
                self._mr_job_extra_args(local=True))
        log.debug('> %s' % cmd_line(args))

        # add . to PYTHONPATH (in case mrjob isn't actually installed)
        env = combine_local_envs(os.environ,
                                 {'PYTHONPATH': os.path.abspath('.')})
        steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
        stdout, stderr = steps_proc.communicate()

        if steps_proc.returncode != 0:
            raise Exception('error getting step information: \n%s' % stderr)

        # on Python 3, convert stdout to str so we can json.loads() it
        if not isinstance(stdout, str):
            stdout = stdout.decode('utf_8')

        try:
            steps = json.loads(stdout)
        except ValueError:
            raise ValueError("Bad --steps response: \n%s" % stdout)

        # verify that this is a proper step description
        if not steps or not stdout:
            raise ValueError('step description is empty!')

        return steps
示例#4
0
文件: sim.py 项目: wanglt311/mrjob
    def _subprocess_env(self, step_num, step_type, task_num, working_dir,
                        **split_kwargs):
        """Set up environment variables for a subprocess (mapper, etc.)

        This combines, in decreasing order of priority:

        * environment variables set by the **cmdenv** option
        * **jobconf** environment variables set by our job (e.g.
          ``mapreduce.task.ismap`)
        * environment variables from **jobconf** options, translated to
          whatever version of Hadoop we're emulating
        * the current environment
        * PYTHONPATH set to current working directory

        We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
        environment variables are handled specially.
        """
        user_jobconf = self._jobconf_for_step(step_num)

        simulated_jobconf = self._simulate_jobconf_for_step(
            step_num, step_type, task_num, working_dir, **split_kwargs)

        def to_env(jobconf):
            return dict(
                (k.replace('.', '_'), str(v)) for k, v in jobconf.items())

        # keep the current environment because we need PATH to find binaries
        # and make PYTHONPATH work
        return combine_local_envs(os.environ, to_env(user_jobconf),
                                  to_env(simulated_jobconf),
                                  self._opts['cmdenv'])
示例#5
0
 def test_paths(self):
     self.assertEqual(combine_local_envs(
         {'PATH': '/bin:/usr/bin',
          'PYTHONPATH': '/usr/lib/python/site-packages',
          'PS1': '> '},
         {'PATH': '/home/dave/bin',
          'PYTHONPATH': '/home/dave/python',
          'CLASSPATH': '/home/dave/java',
          'PS1': '\w> '}),
         {'PATH': '/home/dave/bin;/bin:/usr/bin',
          'PYTHONPATH': '/home/dave/python;/usr/lib/python/site-packages',
          'CLASSPATH': '/home/dave/java',
          'PS1': '\w> '})
示例#6
0
文件: conf_test.py 项目: gimlids/LTPM
 def test_paths(self):
     assert_equal(combine_local_envs(
         {'PATH': '/bin:/usr/bin',
          'PYTHONPATH': '/usr/lib/python/site-packages',
          'PS1': '> '},
         {'PATH': '/home/dave/bin',
          'PYTHONPATH': '/home/dave/python',
          'CLASSPATH': '/home/dave/java',
          'PS1': '\w> '}),
         {'PATH': '/home/dave/bin;/bin:/usr/bin',
          'PYTHONPATH': '/home/dave/python;/usr/lib/python/site-packages',
          'CLASSPATH': '/home/dave/java',
          'PS1': '\w> '})
示例#7
0
    def _get_steps(self):
        """Call the job script to find out how many steps it has, and whether
        there are mappers and reducers for each step. Validate its
        output.

        Returns output as described in :ref:`steps-format`.

        Results are cached, so call this as many times as you want.
        """
        if self._steps is None:
            if not self._script_path:
                self._steps = []
            else:
                args = (self._executable(True) + ['--steps'] +
                        self._mr_job_extra_args(local=True))
                log.debug('> %s' % cmd_line(args))
                # add . to PYTHONPATH (in case mrjob isn't actually installed)
                env = combine_local_envs(os.environ,
                                         {'PYTHONPATH': os.path.abspath('.')})
                steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
                stdout, stderr = steps_proc.communicate()

                if steps_proc.returncode != 0:
                    raise Exception(
                        'error getting step information: \n%s' % stderr)

                # on Python 3, convert stdout to str so we can json.loads() it
                if not isinstance(stdout, str):
                    stdout = stdout.decode('utf_8')

                try:
                    steps = json.loads(stdout)
                except ValueError:
                    raise ValueError("Bad --steps response: \n%s" % stdout)

                # verify that this is a proper step description
                if not steps or not stdout:
                    raise ValueError('step description is empty!')
                for step in steps:
                    if step['type'] not in STEP_TYPES:
                        raise ValueError(
                            'unexpected step type %r in steps %r' % (
                                step['type'], stdout))

                self._steps = steps

        return self._steps
示例#8
0
    def _get_steps(self):
        """Call the mr_job to find out how many steps it has, and whether
        there are mappers and reducers for each step. Validate its
        output.

        Returns output like ['MR', 'M']
        (two steps, second only has a mapper)

        We'll cache the result (so you can call _get_steps() as many times
        as you want)
        """
        if self._steps is None:
            if not self._script:
                self._steps = []
            else:
                # don't use self._opts['python_bin'] because that
                # refers to the python binary to use inside Hadoop
                python_bin = sys.executable or 'python'
                args = ([python_bin, self._script['path'], '--steps'] +
                        self._mr_job_extra_args(local=True))
                log.debug('> %s' % cmd_line(args))
                # add . to PYTHONPATH (in case mrjob isn't actually installed)
                env = combine_local_envs(os.environ,
                                         {'PYTHONPATH': os.path.abspath('.')})
                steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
                stdout, stderr = steps_proc.communicate()

                if steps_proc.returncode != 0:
                    raise Exception('error getting step information: %s',
                                    stderr)

                steps = stdout.strip().split(' ')

                # verify that this is a proper step description
                if not steps:
                    raise ValueError('step description is empty!')
                for step in steps:
                    if step not in ('MR', 'M'):
                        raise ValueError(
                            'unexpected step type %r in steps %r' %
                            (step, stdout))

                self._steps = steps

        return self._steps
示例#9
0
文件: runner.py 项目: AntonKast/mrjob
    def _get_steps(self):
        """Call the mr_job to find out how many steps it has, and whether
        there are mappers and reducers for each step. Validate its
        output.

        Returns output like ['MR', 'M']
        (two steps, second only has a mapper)

        We'll cache the result (so you can call _get_steps() as many times
        as you want)
        """
        if self._steps is None:
            if not self._script:
                self._steps = []
            else:
                # don't use self._opts['python_bin'] because that
                # refers to the python binary to use inside Hadoop
                python_bin = sys.executable or 'python'
                args = ([python_bin, self._script['path'], '--steps'] +
                        self._mr_job_extra_args(local=True))
                log.debug('> %s' % cmd_line(args))
                # add . to PYTHONPATH (in case mrjob isn't actually installed)
                env = combine_local_envs(os.environ,
                                   {'PYTHONPATH': os.path.abspath('.')})
                steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
                stdout, stderr = steps_proc.communicate()

                if steps_proc.returncode != 0:
                    raise Exception(
                        'error getting step information: %s', stderr)

                steps = stdout.strip().split(' ')

                # verify that this is a proper step description
                if not steps:
                    raise ValueError('step description is empty!')
                for step in steps:
                    if step not in ('MR', 'M'):
                        raise ValueError(
                            'unexpected step type %r in steps %r' %
                                         (step, stdout))

                self._steps = steps

        return self._steps
示例#10
0
文件: test_conf.py 项目: nyccto/mrjob
 def test_paths(self):
     self.assertEqual(
         combine_local_envs(
             {"PATH": "/bin:/usr/bin", "PYTHONPATH": "/usr/lib/python/site-packages", "PS1": "> "},
             {
                 "PATH": "/home/dave/bin",
                 "PYTHONPATH": "/home/dave/python",
                 "CLASSPATH": "/home/dave/java",
                 "PS1": "\w> ",
             },
         ),
         {
             "PATH": "/home/dave/bin;/bin:/usr/bin",
             "PYTHONPATH": "/home/dave/python;/usr/lib/python/site-packages",
             "CLASSPATH": "/home/dave/java",
             "PS1": "\w> ",
         },
     )
示例#11
0
    def _subprocess_env(self,
                        step_type,
                        step_num,
                        task_num,
                        input_file=None,
                        input_start=None,
                        input_length=None):
        """Set up environment variables for a subprocess (mapper, etc.)

        This combines, in decreasing order of priority:

        * environment variables set by the **cmdenv** option
        * **jobconf** environment variables set by our job (e.g.
          ``mapreduce.task.ismap`)
        * environment variables from **jobconf** options, translated to
          whatever version of Hadoop we're emulating
        * the current environment
        * PYTHONPATH set to current working directory

        We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
        environment variables are handled specially.
        """
        version = self.get_hadoop_version()

        jobconf_env = dict(
            (translate_jobconf(k, version).replace('.', '_'), str(v))
            for (k, v) in self._opts['jobconf'].iteritems())

        internal_jobconf = self._simulate_jobconf_for_step(
            step_type,
            step_num,
            task_num,
            input_file=input_file,
            input_start=input_start,
            input_length=input_length)

        internal_jobconf_env = dict(
            (translate_jobconf(k, version).replace('.', '_'), str(v))
            for (k, v) in internal_jobconf.iteritems())

        # keep the current environment because we need PATH to find binaries
        # and make PYTHONPATH work
        return combine_local_envs(os.environ, jobconf_env,
                                  internal_jobconf_env, self._get_cmdenv())
示例#12
0
    def _run_step(self, step_num, step_type, input_path, output_path,
                  working_dir, env):
        step = self._get_step(step_num)

        if step_type == 'mapper':
            procs_args = self._mapper_arg_chain(step, step_num, input_path)
        elif step_type == 'reducer':
            procs_args = self._reducer_arg_chain(step, step_num, input_path)

        # add . to PYTHONPATH (in case mrjob isn't actually installed)
        # we need this to access mrjob.cat

        # if we wanted, we could move read_file() and read_input()
        # to mrjob.cat and make it a standalone script
        env = combine_local_envs(env, {'PYTHONPATH': abspath('.')})

        proc_dicts = self._invoke_processes(procs_args, output_path,
                                            working_dir, env)
        self._all_proc_dicts.extend(proc_dicts)
示例#13
0
文件: runner.py 项目: derwiki/mrjob
    def _get_steps(self):
        """Call the job script to find out how many steps it has, and whether
        there are mappers and reducers for each step. Validate its
        output.

        Returns output as described in :ref:`steps-format`. Results are
        cached to avoid round trips to a subprocess.
        """
        if self._steps is None:
            if not self._script:
                self._steps = []
            else:
                args = (self._opts['steps_python_bin'] +
                        [self._script['path'], '--steps'] +
                        self._mr_job_extra_args(local=True))
                log.debug('> %s' % cmd_line(args))
                # add . to PYTHONPATH (in case mrjob isn't actually installed)
                env = combine_local_envs(os.environ,
                                   {'PYTHONPATH': os.path.abspath('.')})
                steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
                stdout, stderr = steps_proc.communicate()

                if steps_proc.returncode != 0:
                    raise Exception(
                        'error getting step information: %s', stderr)

                try:
                    steps = json.loads(stdout)
                except json.JSONDecodeError:
                    raise ValueError("Bad --steps response: \n%s" % stdout)

                # verify that this is a proper step description
                if not steps or not stdout:
                    raise ValueError('step description is empty!')
                for step in steps:
                    if step['type'] not in STEP_TYPES:
                        raise ValueError(
                            'unexpected step type %r in steps %r' %
                                         (step['type'], stdout))

                self._steps = steps

        return self._steps
示例#14
0
文件: local.py 项目: nyccto/mrjob
    def _subprocess_env(self, step_type, step_num, task_num, input_file=None,
                        input_start=None, input_length=None):
        """Set up environment variables for a subprocess (mapper, etc.)

        This combines, in decreasing order of priority:

        * environment variables set by the **cmdenv** option
        * **jobconf** environment variables set by our job (e.g.
          ``mapreduce.task.ismap`)
        * environment variables from **jobconf** options, translated to
          whatever version of Hadoop we're emulating
        * the current environment
        * PYTHONPATH set to current working directory

        We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
        environment variables are handled specially.
        """
        version = self.get_hadoop_version()

        jobconf_env = dict(
            (translate_jobconf(k, version).replace('.', '_'), str(v))
            for (k, v) in self._opts['jobconf'].iteritems())

        internal_jobconf = self._simulate_jobconf_for_step(
            step_type, step_num, task_num, input_file=input_file,
            input_start=input_start, input_length=input_length)

        internal_jobconf_env = dict(
            (translate_jobconf(k, version).replace('.', '_'), str(v))
            for (k, v) in internal_jobconf.iteritems())

        ironpython_env = {'IRONPYTHONPATH': os.getcwd()} if is_ironpython \
                         else {}

        # keep the current environment because we need PATH to find binaries
        # and make PYTHONPATH work
        return combine_local_envs({'PYTHONPATH': os.getcwd()},
                                  ironpython_env,
                                  os.environ,
                                  jobconf_env,
                                  internal_jobconf_env,
                                  self._get_cmdenv())
示例#15
0
    def _get_steps(self):
        """Call the job script to find out how many steps it has, and whether
        there are mappers and reducers for each step. Validate its
        output.

        Returns output as described in :ref:`steps-format`.

        Results are cached, so call this as many times as you want.
        """
        if self._steps is None:
            if not self._script_path:
                self._steps = []
            else:
                args = self._executable(True) + ["--steps"] + self._mr_job_extra_args(local=True)
                log.debug("> %s" % cmd_line(args))
                # add . to PYTHONPATH (in case mrjob isn't actually installed)
                env = combine_local_envs(os.environ, {"PYTHONPATH": os.path.abspath(".")})
                steps_proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
                stdout, stderr = steps_proc.communicate()

                if steps_proc.returncode != 0:
                    raise Exception("error getting step information: \n%s" % stderr)

                try:
                    steps = json.loads(stdout)
                except JSONDecodeError:
                    raise ValueError("Bad --steps response: \n%s" % stdout)

                # verify that this is a proper step description
                if not steps or not stdout:
                    raise ValueError("step description is empty!")
                for step in steps:
                    if step["type"] not in STEP_TYPES:
                        raise ValueError("unexpected step type %r in steps %r" % (step["type"], stdout))

                self._steps = steps

        return self._steps
示例#16
0
文件: local.py 项目: boursier/mrjob
    def _invoke_step(self, args, outfile_name, env=None):
        """Run the given command, outputting into outfile, and reading
        from the previous outfile (or, for the first step, from our
        original output files).

        outfile is a path relative to our local tmp dir. commands are run
        inside self._working_dir

        We'll intelligently handle stderr from the process.
        """
        # keep the current environment because we need PATH to find binaries
        # and make PYTHONPATH work
        env = combine_local_envs(
            {'PYTHONPATH': os.getcwd()},
            os.environ,
            self._get_cmdenv(),
            env or {})

        # decide where to get input
        if self._prev_outfile is not None:
            input_paths = [self._prev_outfile]
        else:
            input_paths = []
            for path in self._input_paths:
                if path == '-':
                    input_paths.append(self._dump_stdin_to_local_file())
                else:
                    input_paths.append(path)

        # add input to the command line
        for path in input_paths:
            args.append(os.path.abspath(path))

        log.info('> %s' % cmd_line(args))

        # set up outfile
        outfile = os.path.join(self._get_local_tmp_dir(), outfile_name)
        log.info('writing to %s' % outfile)
        log.debug('')

        self._prev_outfile = outfile
        write_to = open(outfile, 'w')

        # run the process
        proc = Popen(args, stdout=write_to, stderr=PIPE,
                     cwd=self._working_dir, env=env)

        # handle counters, status msgs, and other stuff on stderr
        stderr_lines = self._process_stderr_from_script(proc.stderr)
        tb_lines = find_python_traceback(stderr_lines)

        self._print_counters()

        returncode = proc.wait()
        if returncode != 0:
            # try to throw a useful exception
            if tb_lines:
                raise Exception(
                    'Command %r returned non-zero exit status %d:\n%s' %
                    (args, returncode, ''.join(tb_lines)))
            else:
                raise Exception(
                    'Command %r returned non-zero exit status %d: %s' %
                    (args, returncode))

        # flush file descriptors
        write_to.flush()
示例#17
0
    def _invoke_step(self, args, outfile_name, env=None):
        """Run the given command, outputting into outfile, and reading
        from the previous outfile (or, for the first step, from our
        original output files).

        outfile is a path relative to our local tmp dir. commands are run
        inside self._working_dir

        We'll intelligently handle stderr from the process.
        """
        # keep the current environment because we need PATH to find binaries
        # and make PYTHONPATH work
        env = combine_local_envs({'PYTHONPATH': os.getcwd()}, os.environ,
                                 self._get_cmdenv(), env or {})

        # decide where to get input
        if self._prev_outfile is not None:
            input_paths = [self._prev_outfile]
        else:
            input_paths = []
            for path in self._input_paths:
                if path == '-':
                    input_paths.append(self._dump_stdin_to_local_file())
                else:
                    input_paths.append(path)

        # add input to the command line
        for path in input_paths:
            args.append(os.path.abspath(path))

        log.info('> %s' % cmd_line(args))

        # set up outfile
        outfile = os.path.join(self._get_local_tmp_dir(), outfile_name)
        log.info('writing to %s' % outfile)
        log.debug('')

        self._prev_outfile = outfile
        write_to = open(outfile, 'w')

        # run the process
        proc = Popen(args,
                     stdout=write_to,
                     stderr=PIPE,
                     cwd=self._working_dir,
                     env=env)

        # handle counters, status msgs, and other stuff on stderr
        stderr_lines = self._process_stderr_from_script(proc.stderr)
        tb_lines = find_python_traceback(stderr_lines)

        self._print_counters()

        returncode = proc.wait()
        if returncode != 0:
            # try to throw a useful exception
            if tb_lines:
                raise Exception(
                    'Command %r returned non-zero exit status %d:\n%s' %
                    (args, returncode, ''.join(tb_lines)))
            else:
                raise Exception(
                    'Command %r returned non-zero exit status %d: %s' %
                    (args, returncode))

        # flush file descriptors
        write_to.flush()